{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 37680, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005307855626326964, "grad_norm": 149.0, "learning_rate": 3.025477707006369e-06, "loss": 15.7984, "mean_token_accuracy": 2.1743857450928773e-05, "num_tokens": 737280.0, "step": 20 }, { "epoch": 0.010615711252653927, "grad_norm": 149.0, "learning_rate": 6.210191082802547e-06, "loss": 15.7261, "mean_token_accuracy": 3.2615786130918424e-05, "num_tokens": 1474560.0, "step": 40 }, { "epoch": 0.01592356687898089, "grad_norm": 146.0, "learning_rate": 9.394904458598725e-06, "loss": 15.4598, "mean_token_accuracy": 0.00011279626050963998, "num_tokens": 2211840.0, "step": 60 }, { "epoch": 0.021231422505307854, "grad_norm": 142.0, "learning_rate": 1.2579617834394903e-05, "loss": 14.8954, "mean_token_accuracy": 0.0035388127696933227, "num_tokens": 2949120.0, "step": 80 }, { "epoch": 0.02653927813163482, "grad_norm": 129.0, "learning_rate": 1.5764331210191083e-05, "loss": 14.5311, "mean_token_accuracy": 0.009695042506791651, "num_tokens": 3686400.0, "step": 100 }, { "epoch": 0.03184713375796178, "grad_norm": 92.5, "learning_rate": 1.894904458598726e-05, "loss": 13.8317, "mean_token_accuracy": 0.009828223614022136, "num_tokens": 4423680.0, "step": 120 }, { "epoch": 0.037154989384288746, "grad_norm": 58.25, "learning_rate": 2.2133757961783435e-05, "loss": 13.1268, "mean_token_accuracy": 0.013046314427629113, "num_tokens": 5160960.0, "step": 140 }, { "epoch": 0.04246284501061571, "grad_norm": 31.125, "learning_rate": 2.5318471337579613e-05, "loss": 12.4425, "mean_token_accuracy": 0.02042291797697544, "num_tokens": 5898240.0, "step": 160 }, { "epoch": 0.04777070063694268, "grad_norm": 16.625, "learning_rate": 2.8503184713375795e-05, "loss": 11.8726, "mean_token_accuracy": 0.03195395730435848, "num_tokens": 6635520.0, "step": 180 }, { "epoch": 0.05307855626326964, "grad_norm": 6.4375, "learning_rate": 3.1687898089171976e-05, "loss": 11.434, "mean_token_accuracy": 0.0435475105419755, "num_tokens": 7372800.0, "step": 200 }, { "epoch": 0.058386411889596604, "grad_norm": 2.78125, "learning_rate": 3.487261146496815e-05, "loss": 10.9717, "mean_token_accuracy": 0.05855892561376095, "num_tokens": 8110080.0, "step": 220 }, { "epoch": 0.06369426751592357, "grad_norm": 2.0, "learning_rate": 3.8057324840764325e-05, "loss": 10.698, "mean_token_accuracy": 0.0695708304643631, "num_tokens": 8847360.0, "step": 240 }, { "epoch": 0.06900212314225053, "grad_norm": 1.6171875, "learning_rate": 4.124203821656051e-05, "loss": 10.4693, "mean_token_accuracy": 0.07921287231147289, "num_tokens": 9584640.0, "step": 260 }, { "epoch": 0.07430997876857749, "grad_norm": 1.90625, "learning_rate": 4.442675159235668e-05, "loss": 10.305, "mean_token_accuracy": 0.08840101212263107, "num_tokens": 10321920.0, "step": 280 }, { "epoch": 0.07961783439490445, "grad_norm": 1.6640625, "learning_rate": 4.761146496815286e-05, "loss": 10.0555, "mean_token_accuracy": 0.09631985165178776, "num_tokens": 11059200.0, "step": 300 }, { "epoch": 0.08492569002123142, "grad_norm": 1.1875, "learning_rate": 5.079617834394904e-05, "loss": 9.8517, "mean_token_accuracy": 0.10118368156254291, "num_tokens": 11796480.0, "step": 320 }, { "epoch": 0.09023354564755838, "grad_norm": 1.515625, "learning_rate": 5.398089171974521e-05, "loss": 9.6325, "mean_token_accuracy": 0.10604479275643826, "num_tokens": 12533760.0, "step": 340 }, { "epoch": 0.09554140127388536, "grad_norm": 1.234375, "learning_rate": 5.71656050955414e-05, "loss": 9.4321, "mean_token_accuracy": 0.11175663135945797, "num_tokens": 13271040.0, "step": 360 }, { "epoch": 0.10084925690021232, "grad_norm": 1.28125, "learning_rate": 6.0350318471337575e-05, "loss": 9.2114, "mean_token_accuracy": 0.1188886173069477, "num_tokens": 14008320.0, "step": 380 }, { "epoch": 0.10615711252653928, "grad_norm": 2.078125, "learning_rate": 6.353503184713376e-05, "loss": 9.0332, "mean_token_accuracy": 0.1140614815056324, "num_tokens": 14745600.0, "step": 400 }, { "epoch": 0.11146496815286625, "grad_norm": 1.5, "learning_rate": 6.671974522292994e-05, "loss": 8.5291, "mean_token_accuracy": 0.12357306033372879, "num_tokens": 15482880.0, "step": 420 }, { "epoch": 0.11677282377919321, "grad_norm": 1.1328125, "learning_rate": 6.99044585987261e-05, "loss": 8.1325, "mean_token_accuracy": 0.13222032003104686, "num_tokens": 16220160.0, "step": 440 }, { "epoch": 0.12208067940552017, "grad_norm": 1.1171875, "learning_rate": 7.308917197452229e-05, "loss": 7.8443, "mean_token_accuracy": 0.1393224075436592, "num_tokens": 16957440.0, "step": 460 }, { "epoch": 0.12738853503184713, "grad_norm": 0.75390625, "learning_rate": 7.627388535031847e-05, "loss": 7.5939, "mean_token_accuracy": 0.14508317038416862, "num_tokens": 17694720.0, "step": 480 }, { "epoch": 0.1326963906581741, "grad_norm": 0.96875, "learning_rate": 7.945859872611464e-05, "loss": 7.4083, "mean_token_accuracy": 0.14720863178372384, "num_tokens": 18432000.0, "step": 500 }, { "epoch": 0.13800424628450106, "grad_norm": 1.046875, "learning_rate": 8.264331210191082e-05, "loss": 7.2023, "mean_token_accuracy": 0.1542196661233902, "num_tokens": 19169280.0, "step": 520 }, { "epoch": 0.14331210191082802, "grad_norm": 0.96875, "learning_rate": 8.5828025477707e-05, "loss": 7.1189, "mean_token_accuracy": 0.1562459222972393, "num_tokens": 19906560.0, "step": 540 }, { "epoch": 0.14861995753715498, "grad_norm": 0.98828125, "learning_rate": 8.901273885350317e-05, "loss": 6.9864, "mean_token_accuracy": 0.16205968633294104, "num_tokens": 20643840.0, "step": 560 }, { "epoch": 0.15392781316348195, "grad_norm": 1.125, "learning_rate": 9.219745222929935e-05, "loss": 6.921, "mean_token_accuracy": 0.1635151669383049, "num_tokens": 21381120.0, "step": 580 }, { "epoch": 0.1592356687898089, "grad_norm": 0.83203125, "learning_rate": 9.538216560509553e-05, "loss": 6.7956, "mean_token_accuracy": 0.16828114837408065, "num_tokens": 22118400.0, "step": 600 }, { "epoch": 0.16454352441613587, "grad_norm": 0.82421875, "learning_rate": 9.856687898089171e-05, "loss": 6.7067, "mean_token_accuracy": 0.17232006937265396, "num_tokens": 22855680.0, "step": 620 }, { "epoch": 0.16985138004246284, "grad_norm": 0.8828125, "learning_rate": 0.00010175159235668788, "loss": 6.6241, "mean_token_accuracy": 0.17689715027809144, "num_tokens": 23592960.0, "step": 640 }, { "epoch": 0.1751592356687898, "grad_norm": 1.0078125, "learning_rate": 0.00010493630573248406, "loss": 6.5103, "mean_token_accuracy": 0.18290796726942063, "num_tokens": 24330240.0, "step": 660 }, { "epoch": 0.18046709129511676, "grad_norm": 0.875, "learning_rate": 0.00010812101910828024, "loss": 6.4558, "mean_token_accuracy": 0.18631903678178788, "num_tokens": 25067520.0, "step": 680 }, { "epoch": 0.18577494692144372, "grad_norm": 0.9140625, "learning_rate": 0.00011130573248407641, "loss": 6.3541, "mean_token_accuracy": 0.19223880246281624, "num_tokens": 25804800.0, "step": 700 }, { "epoch": 0.1910828025477707, "grad_norm": 0.875, "learning_rate": 0.0001144904458598726, "loss": 6.3029, "mean_token_accuracy": 0.19308001697063445, "num_tokens": 26542080.0, "step": 720 }, { "epoch": 0.19639065817409768, "grad_norm": 0.8828125, "learning_rate": 0.00011767515923566879, "loss": 6.2773, "mean_token_accuracy": 0.19540253281593323, "num_tokens": 27279360.0, "step": 740 }, { "epoch": 0.20169851380042464, "grad_norm": 0.85546875, "learning_rate": 0.00012085987261146497, "loss": 6.1874, "mean_token_accuracy": 0.19977848529815673, "num_tokens": 28016640.0, "step": 760 }, { "epoch": 0.2070063694267516, "grad_norm": 0.97265625, "learning_rate": 0.00012404458598726115, "loss": 6.1474, "mean_token_accuracy": 0.200519135594368, "num_tokens": 28753920.0, "step": 780 }, { "epoch": 0.21231422505307856, "grad_norm": 1.0234375, "learning_rate": 0.00012722929936305732, "loss": 6.113, "mean_token_accuracy": 0.20138345286250114, "num_tokens": 29491200.0, "step": 800 }, { "epoch": 0.21762208067940553, "grad_norm": 1.046875, "learning_rate": 0.00013041401273885349, "loss": 6.004, "mean_token_accuracy": 0.2075614258646965, "num_tokens": 30228480.0, "step": 820 }, { "epoch": 0.2229299363057325, "grad_norm": 0.91015625, "learning_rate": 0.00013359872611464968, "loss": 5.9263, "mean_token_accuracy": 0.21112470254302024, "num_tokens": 30965760.0, "step": 840 }, { "epoch": 0.22823779193205945, "grad_norm": 1.0703125, "learning_rate": 0.00013678343949044585, "loss": 5.8558, "mean_token_accuracy": 0.21528593376278876, "num_tokens": 31703040.0, "step": 860 }, { "epoch": 0.23354564755838642, "grad_norm": 0.88671875, "learning_rate": 0.00013996815286624202, "loss": 5.7999, "mean_token_accuracy": 0.21691536232829095, "num_tokens": 32440320.0, "step": 880 }, { "epoch": 0.23885350318471338, "grad_norm": 0.828125, "learning_rate": 0.0001431528662420382, "loss": 5.7186, "mean_token_accuracy": 0.22191373258829117, "num_tokens": 33177600.0, "step": 900 }, { "epoch": 0.24416135881104034, "grad_norm": 1.109375, "learning_rate": 0.00014633757961783438, "loss": 5.6406, "mean_token_accuracy": 0.22538051679730414, "num_tokens": 33914880.0, "step": 920 }, { "epoch": 0.2494692144373673, "grad_norm": 0.87109375, "learning_rate": 0.00014952229299363055, "loss": 5.5795, "mean_token_accuracy": 0.22849940359592438, "num_tokens": 34652160.0, "step": 940 }, { "epoch": 0.25477707006369427, "grad_norm": 0.85546875, "learning_rate": 0.00015270700636942674, "loss": 5.5371, "mean_token_accuracy": 0.23100402280688287, "num_tokens": 35389440.0, "step": 960 }, { "epoch": 0.26008492569002123, "grad_norm": 0.73046875, "learning_rate": 0.00015589171974522294, "loss": 5.5029, "mean_token_accuracy": 0.23145112991333008, "num_tokens": 36126720.0, "step": 980 }, { "epoch": 0.2653927813163482, "grad_norm": 1.09375, "learning_rate": 0.0001590764331210191, "loss": 5.4097, "mean_token_accuracy": 0.23701348155736923, "num_tokens": 36864000.0, "step": 1000 }, { "epoch": 0.27070063694267515, "grad_norm": 0.921875, "learning_rate": 0.0001622611464968153, "loss": 5.378, "mean_token_accuracy": 0.23914302065968512, "num_tokens": 37601280.0, "step": 1020 }, { "epoch": 0.2760084925690021, "grad_norm": 0.875, "learning_rate": 0.00016544585987261147, "loss": 5.3438, "mean_token_accuracy": 0.24019080102443696, "num_tokens": 38338560.0, "step": 1040 }, { "epoch": 0.2813163481953291, "grad_norm": 1.2109375, "learning_rate": 0.00016863057324840764, "loss": 5.2862, "mean_token_accuracy": 0.2443411596119404, "num_tokens": 39075840.0, "step": 1060 }, { "epoch": 0.28662420382165604, "grad_norm": 0.8984375, "learning_rate": 0.00017181528662420383, "loss": 5.2983, "mean_token_accuracy": 0.2426927037537098, "num_tokens": 39813120.0, "step": 1080 }, { "epoch": 0.291932059447983, "grad_norm": 0.76171875, "learning_rate": 0.000175, "loss": 5.2099, "mean_token_accuracy": 0.2485581122338772, "num_tokens": 40550400.0, "step": 1100 }, { "epoch": 0.29723991507430997, "grad_norm": 0.7578125, "learning_rate": 0.00017818471337579617, "loss": 5.1966, "mean_token_accuracy": 0.24811779707670212, "num_tokens": 41287680.0, "step": 1120 }, { "epoch": 0.30254777070063693, "grad_norm": 0.76171875, "learning_rate": 0.00018136942675159236, "loss": 5.1892, "mean_token_accuracy": 0.2481368213891983, "num_tokens": 42024960.0, "step": 1140 }, { "epoch": 0.3078556263269639, "grad_norm": 0.9453125, "learning_rate": 0.00018455414012738853, "loss": 5.1594, "mean_token_accuracy": 0.25068901032209395, "num_tokens": 42762240.0, "step": 1160 }, { "epoch": 0.31316348195329086, "grad_norm": 0.92578125, "learning_rate": 0.0001877388535031847, "loss": 5.0942, "mean_token_accuracy": 0.253795662522316, "num_tokens": 43499520.0, "step": 1180 }, { "epoch": 0.3184713375796178, "grad_norm": 0.8515625, "learning_rate": 0.0001909235668789809, "loss": 5.1219, "mean_token_accuracy": 0.2513644263148308, "num_tokens": 44236800.0, "step": 1200 }, { "epoch": 0.3237791932059448, "grad_norm": 0.6328125, "learning_rate": 0.00019410828025477706, "loss": 5.1095, "mean_token_accuracy": 0.25136578381061553, "num_tokens": 44974080.0, "step": 1220 }, { "epoch": 0.32908704883227174, "grad_norm": 0.84375, "learning_rate": 0.00019729299363057323, "loss": 5.0543, "mean_token_accuracy": 0.25736165419220924, "num_tokens": 45711360.0, "step": 1240 }, { "epoch": 0.3343949044585987, "grad_norm": 0.81640625, "learning_rate": 0.00020047770700636942, "loss": 5.0194, "mean_token_accuracy": 0.2589027531445026, "num_tokens": 46448640.0, "step": 1260 }, { "epoch": 0.33970276008492567, "grad_norm": 0.7734375, "learning_rate": 0.0002036624203821656, "loss": 4.9988, "mean_token_accuracy": 0.2601299241185188, "num_tokens": 47185920.0, "step": 1280 }, { "epoch": 0.34501061571125263, "grad_norm": 0.7265625, "learning_rate": 0.00020684713375796176, "loss": 4.9895, "mean_token_accuracy": 0.2598187118768692, "num_tokens": 47923200.0, "step": 1300 }, { "epoch": 0.3503184713375796, "grad_norm": 0.63671875, "learning_rate": 0.00021003184713375795, "loss": 4.9207, "mean_token_accuracy": 0.26470564156770704, "num_tokens": 48660480.0, "step": 1320 }, { "epoch": 0.35562632696390656, "grad_norm": 0.67578125, "learning_rate": 0.00021321656050955412, "loss": 4.951, "mean_token_accuracy": 0.2630558267235756, "num_tokens": 49397760.0, "step": 1340 }, { "epoch": 0.3609341825902335, "grad_norm": 0.765625, "learning_rate": 0.0002164012738853503, "loss": 4.9255, "mean_token_accuracy": 0.2637937605381012, "num_tokens": 50135040.0, "step": 1360 }, { "epoch": 0.3662420382165605, "grad_norm": 0.62890625, "learning_rate": 0.00021958598726114648, "loss": 4.8924, "mean_token_accuracy": 0.26640846133232116, "num_tokens": 50872320.0, "step": 1380 }, { "epoch": 0.37154989384288745, "grad_norm": 0.64453125, "learning_rate": 0.00022277070063694265, "loss": 4.8766, "mean_token_accuracy": 0.26638399958610537, "num_tokens": 51609600.0, "step": 1400 }, { "epoch": 0.37685774946921446, "grad_norm": 0.6328125, "learning_rate": 0.00022595541401273885, "loss": 4.8486, "mean_token_accuracy": 0.26806642562150956, "num_tokens": 52346880.0, "step": 1420 }, { "epoch": 0.3821656050955414, "grad_norm": 0.609375, "learning_rate": 0.00022914012738853501, "loss": 4.8501, "mean_token_accuracy": 0.2687771782279015, "num_tokens": 53084160.0, "step": 1440 }, { "epoch": 0.3874734607218684, "grad_norm": 0.65625, "learning_rate": 0.00023232484076433118, "loss": 4.8155, "mean_token_accuracy": 0.2720238074660301, "num_tokens": 53821440.0, "step": 1460 }, { "epoch": 0.39278131634819535, "grad_norm": 0.5859375, "learning_rate": 0.00023550955414012738, "loss": 4.7904, "mean_token_accuracy": 0.27355811148881914, "num_tokens": 54558720.0, "step": 1480 }, { "epoch": 0.3980891719745223, "grad_norm": 0.63671875, "learning_rate": 0.00023869426751592355, "loss": 4.7962, "mean_token_accuracy": 0.27195585817098616, "num_tokens": 55296000.0, "step": 1500 }, { "epoch": 0.4033970276008493, "grad_norm": 0.62890625, "learning_rate": 0.0002418789808917197, "loss": 4.7532, "mean_token_accuracy": 0.2762054234743118, "num_tokens": 56033280.0, "step": 1520 }, { "epoch": 0.40870488322717624, "grad_norm": 0.58984375, "learning_rate": 0.0002450636942675159, "loss": 4.8028, "mean_token_accuracy": 0.27046640664339067, "num_tokens": 56770560.0, "step": 1540 }, { "epoch": 0.4140127388535032, "grad_norm": 0.65625, "learning_rate": 0.0002482484076433121, "loss": 4.8153, "mean_token_accuracy": 0.2694199845194817, "num_tokens": 57507840.0, "step": 1560 }, { "epoch": 0.41932059447983017, "grad_norm": 0.671875, "learning_rate": 0.00025143312101910827, "loss": 4.7934, "mean_token_accuracy": 0.2714788541197777, "num_tokens": 58245120.0, "step": 1580 }, { "epoch": 0.42462845010615713, "grad_norm": 0.640625, "learning_rate": 0.0002546178343949044, "loss": 4.8166, "mean_token_accuracy": 0.2687323361635208, "num_tokens": 58982400.0, "step": 1600 }, { "epoch": 0.4299363057324841, "grad_norm": 0.640625, "learning_rate": 0.0002578025477707006, "loss": 4.7695, "mean_token_accuracy": 0.2725361481308937, "num_tokens": 59719680.0, "step": 1620 }, { "epoch": 0.43524416135881105, "grad_norm": 0.6640625, "learning_rate": 0.0002609872611464968, "loss": 4.7687, "mean_token_accuracy": 0.2704691231250763, "num_tokens": 60456960.0, "step": 1640 }, { "epoch": 0.440552016985138, "grad_norm": 0.6484375, "learning_rate": 0.00026417197452229294, "loss": 4.7456, "mean_token_accuracy": 0.2743137091398239, "num_tokens": 61194240.0, "step": 1660 }, { "epoch": 0.445859872611465, "grad_norm": 0.7109375, "learning_rate": 0.00026735668789808914, "loss": 4.729, "mean_token_accuracy": 0.27244509756565094, "num_tokens": 61931520.0, "step": 1680 }, { "epoch": 0.45116772823779194, "grad_norm": 0.54296875, "learning_rate": 0.00027054140127388533, "loss": 4.6863, "mean_token_accuracy": 0.2771771028637886, "num_tokens": 62668800.0, "step": 1700 }, { "epoch": 0.4564755838641189, "grad_norm": 0.53515625, "learning_rate": 0.0002737261146496815, "loss": 4.6683, "mean_token_accuracy": 0.2777737036347389, "num_tokens": 63406080.0, "step": 1720 }, { "epoch": 0.46178343949044587, "grad_norm": 0.6953125, "learning_rate": 0.00027691082802547767, "loss": 4.6585, "mean_token_accuracy": 0.2791965648531914, "num_tokens": 64143360.0, "step": 1740 }, { "epoch": 0.46709129511677283, "grad_norm": 0.5703125, "learning_rate": 0.00028009554140127386, "loss": 4.6173, "mean_token_accuracy": 0.28250434845685957, "num_tokens": 64880640.0, "step": 1760 }, { "epoch": 0.4723991507430998, "grad_norm": 0.5078125, "learning_rate": 0.00028328025477707, "loss": 4.6059, "mean_token_accuracy": 0.2823168128728867, "num_tokens": 65617920.0, "step": 1780 }, { "epoch": 0.47770700636942676, "grad_norm": 0.6015625, "learning_rate": 0.0002864649681528662, "loss": 4.5821, "mean_token_accuracy": 0.2838470295071602, "num_tokens": 66355200.0, "step": 1800 }, { "epoch": 0.4830148619957537, "grad_norm": 0.49609375, "learning_rate": 0.0002896496815286624, "loss": 4.5794, "mean_token_accuracy": 0.283673083782196, "num_tokens": 67092480.0, "step": 1820 }, { "epoch": 0.4883227176220807, "grad_norm": 0.49609375, "learning_rate": 0.00029283439490445854, "loss": 4.5296, "mean_token_accuracy": 0.2876399755477905, "num_tokens": 67829760.0, "step": 1840 }, { "epoch": 0.49363057324840764, "grad_norm": 0.49609375, "learning_rate": 0.00029601910828025473, "loss": 4.5423, "mean_token_accuracy": 0.28767259418964386, "num_tokens": 68567040.0, "step": 1860 }, { "epoch": 0.4989384288747346, "grad_norm": 0.44140625, "learning_rate": 0.0002992038216560509, "loss": 4.5124, "mean_token_accuracy": 0.288636115193367, "num_tokens": 69304320.0, "step": 1880 }, { "epoch": 0.5042462845010616, "grad_norm": 0.5390625, "learning_rate": 0.00029999987002061866, "loss": 4.5142, "mean_token_accuracy": 0.2884390622377396, "num_tokens": 70041600.0, "step": 1900 }, { "epoch": 0.5095541401273885, "grad_norm": 0.44921875, "learning_rate": 0.0002999992923349339, "loss": 4.4755, "mean_token_accuracy": 0.29082273244857787, "num_tokens": 70778880.0, "step": 1920 }, { "epoch": 0.5148619957537155, "grad_norm": 0.4921875, "learning_rate": 0.00029999825250257006, "loss": 4.4794, "mean_token_accuracy": 0.2912141218781471, "num_tokens": 71516160.0, "step": 1940 }, { "epoch": 0.5201698513800425, "grad_norm": 0.4765625, "learning_rate": 0.0002999967505267309, "loss": 4.4436, "mean_token_accuracy": 0.293876388669014, "num_tokens": 72253440.0, "step": 1960 }, { "epoch": 0.5254777070063694, "grad_norm": 0.4765625, "learning_rate": 0.0002999947864120441, "loss": 4.4601, "mean_token_accuracy": 0.2918039247393608, "num_tokens": 72990720.0, "step": 1980 }, { "epoch": 0.5307855626326964, "grad_norm": 0.4140625, "learning_rate": 0.00029999236016456095, "loss": 4.4465, "mean_token_accuracy": 0.29361953735351565, "num_tokens": 73728000.0, "step": 2000 }, { "epoch": 0.5360934182590233, "grad_norm": 0.408203125, "learning_rate": 0.0002999894717917568, "loss": 4.3995, "mean_token_accuracy": 0.29734588861465455, "num_tokens": 74465280.0, "step": 2020 }, { "epoch": 0.5414012738853503, "grad_norm": 0.470703125, "learning_rate": 0.00029998612130253054, "loss": 4.4187, "mean_token_accuracy": 0.2963769301772118, "num_tokens": 75202560.0, "step": 2040 }, { "epoch": 0.5467091295116773, "grad_norm": 0.44140625, "learning_rate": 0.0002999823087072052, "loss": 4.4002, "mean_token_accuracy": 0.29651418775320054, "num_tokens": 75939840.0, "step": 2060 }, { "epoch": 0.5520169851380042, "grad_norm": 0.404296875, "learning_rate": 0.0002999780340175273, "loss": 4.3938, "mean_token_accuracy": 0.2971433997154236, "num_tokens": 76677120.0, "step": 2080 }, { "epoch": 0.5573248407643312, "grad_norm": 0.4453125, "learning_rate": 0.0002999732972466671, "loss": 4.3665, "mean_token_accuracy": 0.3003248006105423, "num_tokens": 77414400.0, "step": 2100 }, { "epoch": 0.5626326963906582, "grad_norm": 0.423828125, "learning_rate": 0.00029996809840921845, "loss": 4.33, "mean_token_accuracy": 0.30245026051998136, "num_tokens": 78151680.0, "step": 2120 }, { "epoch": 0.5679405520169851, "grad_norm": 0.40625, "learning_rate": 0.00029996243752119915, "loss": 4.3416, "mean_token_accuracy": 0.3019419968128204, "num_tokens": 78888960.0, "step": 2140 }, { "epoch": 0.5732484076433121, "grad_norm": 0.3828125, "learning_rate": 0.00029995631460005015, "loss": 4.3532, "mean_token_accuracy": 0.30051233917474746, "num_tokens": 79626240.0, "step": 2160 }, { "epoch": 0.578556263269639, "grad_norm": 0.52734375, "learning_rate": 0.00029994972966463626, "loss": 4.3075, "mean_token_accuracy": 0.3041109457612038, "num_tokens": 80363520.0, "step": 2180 }, { "epoch": 0.583864118895966, "grad_norm": 0.388671875, "learning_rate": 0.0002999426827352456, "loss": 4.3133, "mean_token_accuracy": 0.30318003743886945, "num_tokens": 81100800.0, "step": 2200 }, { "epoch": 0.589171974522293, "grad_norm": 0.439453125, "learning_rate": 0.00029993517383358965, "loss": 4.3147, "mean_token_accuracy": 0.303795662522316, "num_tokens": 81838080.0, "step": 2220 }, { "epoch": 0.5944798301486199, "grad_norm": 0.3984375, "learning_rate": 0.00029992720298280327, "loss": 4.2959, "mean_token_accuracy": 0.3049263432621956, "num_tokens": 82575360.0, "step": 2240 }, { "epoch": 0.5997876857749469, "grad_norm": 0.37109375, "learning_rate": 0.0002999187702074447, "loss": 4.3119, "mean_token_accuracy": 0.30353881418704987, "num_tokens": 83312640.0, "step": 2260 }, { "epoch": 0.6050955414012739, "grad_norm": 0.427734375, "learning_rate": 0.0002999098755334951, "loss": 4.2393, "mean_token_accuracy": 0.3107591301202774, "num_tokens": 84049920.0, "step": 2280 }, { "epoch": 0.6104033970276008, "grad_norm": 0.37109375, "learning_rate": 0.000299900518988359, "loss": 4.2746, "mean_token_accuracy": 0.3070246264338493, "num_tokens": 84787200.0, "step": 2300 }, { "epoch": 0.6157112526539278, "grad_norm": 0.44921875, "learning_rate": 0.0002998907006008638, "loss": 4.2742, "mean_token_accuracy": 0.3060638129711151, "num_tokens": 85524480.0, "step": 2320 }, { "epoch": 0.6210191082802548, "grad_norm": 0.388671875, "learning_rate": 0.0002998804204012599, "loss": 4.2313, "mean_token_accuracy": 0.31023863703012466, "num_tokens": 86261760.0, "step": 2340 }, { "epoch": 0.6263269639065817, "grad_norm": 0.404296875, "learning_rate": 0.00029986967842122054, "loss": 4.2299, "mean_token_accuracy": 0.31158403903245924, "num_tokens": 86999040.0, "step": 2360 }, { "epoch": 0.6316348195329087, "grad_norm": 0.396484375, "learning_rate": 0.0002998584746938416, "loss": 4.2415, "mean_token_accuracy": 0.3100565329194069, "num_tokens": 87736320.0, "step": 2380 }, { "epoch": 0.6369426751592356, "grad_norm": 0.451171875, "learning_rate": 0.0002998468092536418, "loss": 4.2148, "mean_token_accuracy": 0.3117742985486984, "num_tokens": 88473600.0, "step": 2400 }, { "epoch": 0.6422505307855626, "grad_norm": 0.421875, "learning_rate": 0.0002998346821365621, "loss": 4.1916, "mean_token_accuracy": 0.31463905423879623, "num_tokens": 89210880.0, "step": 2420 }, { "epoch": 0.6475583864118896, "grad_norm": 0.396484375, "learning_rate": 0.00029982209337996624, "loss": 4.1967, "mean_token_accuracy": 0.3132610350847244, "num_tokens": 89948160.0, "step": 2440 }, { "epoch": 0.6528662420382165, "grad_norm": 0.3828125, "learning_rate": 0.00029980904302264, "loss": 4.1908, "mean_token_accuracy": 0.3144542261958122, "num_tokens": 90685440.0, "step": 2460 }, { "epoch": 0.6581740976645435, "grad_norm": 0.3671875, "learning_rate": 0.0002997955311047914, "loss": 4.2139, "mean_token_accuracy": 0.3113693192601204, "num_tokens": 91422720.0, "step": 2480 }, { "epoch": 0.6634819532908705, "grad_norm": 0.462890625, "learning_rate": 0.0002997815576680506, "loss": 4.1742, "mean_token_accuracy": 0.3150250047445297, "num_tokens": 92160000.0, "step": 2500 }, { "epoch": 0.6687898089171974, "grad_norm": 0.373046875, "learning_rate": 0.0002997671227554696, "loss": 4.1518, "mean_token_accuracy": 0.3168881818652153, "num_tokens": 92897280.0, "step": 2520 }, { "epoch": 0.6740976645435244, "grad_norm": 0.39453125, "learning_rate": 0.00029975222641152237, "loss": 4.1565, "mean_token_accuracy": 0.3171545445919037, "num_tokens": 93634560.0, "step": 2540 }, { "epoch": 0.6794055201698513, "grad_norm": 0.384765625, "learning_rate": 0.00029973686868210427, "loss": 4.174, "mean_token_accuracy": 0.31492715775966645, "num_tokens": 94371840.0, "step": 2560 }, { "epoch": 0.6847133757961783, "grad_norm": 0.375, "learning_rate": 0.0002997210496145325, "loss": 4.1579, "mean_token_accuracy": 0.31657969057559965, "num_tokens": 95109120.0, "step": 2580 }, { "epoch": 0.6900212314225053, "grad_norm": 0.400390625, "learning_rate": 0.00029970476925754544, "loss": 4.1423, "mean_token_accuracy": 0.3181710705161095, "num_tokens": 95846400.0, "step": 2600 }, { "epoch": 0.6953290870488322, "grad_norm": 0.34765625, "learning_rate": 0.0002996880276613027, "loss": 4.1304, "mean_token_accuracy": 0.31911285072565077, "num_tokens": 96583680.0, "step": 2620 }, { "epoch": 0.7006369426751592, "grad_norm": 0.45703125, "learning_rate": 0.000299670824877385, "loss": 4.1301, "mean_token_accuracy": 0.3186018690466881, "num_tokens": 97320960.0, "step": 2640 }, { "epoch": 0.7059447983014862, "grad_norm": 0.35546875, "learning_rate": 0.0002996531609587941, "loss": 4.1234, "mean_token_accuracy": 0.31881522834300996, "num_tokens": 98058240.0, "step": 2660 }, { "epoch": 0.7112526539278131, "grad_norm": 0.40234375, "learning_rate": 0.0002996350359599523, "loss": 4.104, "mean_token_accuracy": 0.322117580473423, "num_tokens": 98795520.0, "step": 2680 }, { "epoch": 0.7165605095541401, "grad_norm": 0.3671875, "learning_rate": 0.00029961644993670253, "loss": 4.1223, "mean_token_accuracy": 0.3196564495563507, "num_tokens": 99532800.0, "step": 2700 }, { "epoch": 0.721868365180467, "grad_norm": 0.44921875, "learning_rate": 0.0002995974029463084, "loss": 4.0966, "mean_token_accuracy": 0.3216229096055031, "num_tokens": 100270080.0, "step": 2720 }, { "epoch": 0.727176220806794, "grad_norm": 0.361328125, "learning_rate": 0.0002995778950474533, "loss": 4.1073, "mean_token_accuracy": 0.32084692269563675, "num_tokens": 101007360.0, "step": 2740 }, { "epoch": 0.732484076433121, "grad_norm": 0.33984375, "learning_rate": 0.0002995579263002411, "loss": 4.0722, "mean_token_accuracy": 0.3244618371129036, "num_tokens": 101744640.0, "step": 2760 }, { "epoch": 0.7377919320594479, "grad_norm": 0.357421875, "learning_rate": 0.00029953749676619524, "loss": 4.0869, "mean_token_accuracy": 0.32258099913597105, "num_tokens": 102481920.0, "step": 2780 }, { "epoch": 0.7430997876857749, "grad_norm": 0.337890625, "learning_rate": 0.00029951660650825903, "loss": 4.0888, "mean_token_accuracy": 0.32243150621652605, "num_tokens": 103219200.0, "step": 2800 }, { "epoch": 0.7484076433121019, "grad_norm": 0.330078125, "learning_rate": 0.0002994952555907951, "loss": 4.0631, "mean_token_accuracy": 0.32425527423620226, "num_tokens": 103956480.0, "step": 2820 }, { "epoch": 0.7537154989384289, "grad_norm": 0.35546875, "learning_rate": 0.0002994734440795856, "loss": 4.1119, "mean_token_accuracy": 0.318578764796257, "num_tokens": 104693760.0, "step": 2840 }, { "epoch": 0.7590233545647559, "grad_norm": 0.349609375, "learning_rate": 0.0002994511720418316, "loss": 4.0822, "mean_token_accuracy": 0.32164872586727145, "num_tokens": 105431040.0, "step": 2860 }, { "epoch": 0.7643312101910829, "grad_norm": 0.44140625, "learning_rate": 0.0002994284395461529, "loss": 4.0621, "mean_token_accuracy": 0.325258207321167, "num_tokens": 106168320.0, "step": 2880 }, { "epoch": 0.7696390658174098, "grad_norm": 0.34765625, "learning_rate": 0.0002994052466625884, "loss": 4.0452, "mean_token_accuracy": 0.32502310276031493, "num_tokens": 106905600.0, "step": 2900 }, { "epoch": 0.7749469214437368, "grad_norm": 0.34375, "learning_rate": 0.000299381593462595, "loss": 4.0703, "mean_token_accuracy": 0.3239005759358406, "num_tokens": 107642880.0, "step": 2920 }, { "epoch": 0.7802547770700637, "grad_norm": 0.330078125, "learning_rate": 0.00029935748001904826, "loss": 4.05, "mean_token_accuracy": 0.3265016883611679, "num_tokens": 108380160.0, "step": 2940 }, { "epoch": 0.7855626326963907, "grad_norm": 0.341796875, "learning_rate": 0.00029933290640624127, "loss": 4.0587, "mean_token_accuracy": 0.3245596900582314, "num_tokens": 109117440.0, "step": 2960 }, { "epoch": 0.7908704883227177, "grad_norm": 0.345703125, "learning_rate": 0.00029930787269988524, "loss": 4.0429, "mean_token_accuracy": 0.32607768177986146, "num_tokens": 109854720.0, "step": 2980 }, { "epoch": 0.7961783439490446, "grad_norm": 0.384765625, "learning_rate": 0.0002992823789771089, "loss": 4.0549, "mean_token_accuracy": 0.32446591556072235, "num_tokens": 110592000.0, "step": 3000 }, { "epoch": 0.8014861995753716, "grad_norm": 0.35546875, "learning_rate": 0.0002992564253164581, "loss": 4.0299, "mean_token_accuracy": 0.3260423466563225, "num_tokens": 111329280.0, "step": 3020 }, { "epoch": 0.8067940552016986, "grad_norm": 0.36328125, "learning_rate": 0.00029923001179789593, "loss": 3.9977, "mean_token_accuracy": 0.329715695977211, "num_tokens": 112066560.0, "step": 3040 }, { "epoch": 0.8121019108280255, "grad_norm": 0.341796875, "learning_rate": 0.0002992031385028022, "loss": 4.0165, "mean_token_accuracy": 0.3288092538714409, "num_tokens": 112803840.0, "step": 3060 }, { "epoch": 0.8174097664543525, "grad_norm": 0.33203125, "learning_rate": 0.00029917580551397344, "loss": 3.9876, "mean_token_accuracy": 0.33256821930408476, "num_tokens": 113541120.0, "step": 3080 }, { "epoch": 0.8227176220806794, "grad_norm": 0.3359375, "learning_rate": 0.00029914801291562224, "loss": 3.9868, "mean_token_accuracy": 0.33118748664855957, "num_tokens": 114278400.0, "step": 3100 }, { "epoch": 0.8280254777070064, "grad_norm": 0.400390625, "learning_rate": 0.00029911976079337744, "loss": 4.0026, "mean_token_accuracy": 0.33054604530334475, "num_tokens": 115015680.0, "step": 3120 }, { "epoch": 0.8333333333333334, "grad_norm": 0.359375, "learning_rate": 0.00029909104923428367, "loss": 3.9793, "mean_token_accuracy": 0.33128669410943984, "num_tokens": 115752960.0, "step": 3140 }, { "epoch": 0.8386411889596603, "grad_norm": 0.365234375, "learning_rate": 0.00029906187832680093, "loss": 3.9865, "mean_token_accuracy": 0.3321387812495232, "num_tokens": 116490240.0, "step": 3160 }, { "epoch": 0.8439490445859873, "grad_norm": 0.3515625, "learning_rate": 0.0002990322481608046, "loss": 3.9889, "mean_token_accuracy": 0.3321686774492264, "num_tokens": 117227520.0, "step": 3180 }, { "epoch": 0.8492569002123143, "grad_norm": 0.353515625, "learning_rate": 0.00029900215882758507, "loss": 3.9934, "mean_token_accuracy": 0.3307906612753868, "num_tokens": 117964800.0, "step": 3200 }, { "epoch": 0.8545647558386412, "grad_norm": 0.369140625, "learning_rate": 0.0002989716104198472, "loss": 3.9904, "mean_token_accuracy": 0.3313464894890785, "num_tokens": 118702080.0, "step": 3220 }, { "epoch": 0.8598726114649682, "grad_norm": 0.373046875, "learning_rate": 0.00029894060303171054, "loss": 3.9986, "mean_token_accuracy": 0.33004593253135683, "num_tokens": 119439360.0, "step": 3240 }, { "epoch": 0.8651804670912951, "grad_norm": 0.333984375, "learning_rate": 0.0002989091367587085, "loss": 3.9751, "mean_token_accuracy": 0.33293107599020005, "num_tokens": 120176640.0, "step": 3260 }, { "epoch": 0.8704883227176221, "grad_norm": 0.427734375, "learning_rate": 0.0002988772116977885, "loss": 3.9928, "mean_token_accuracy": 0.33070232570171354, "num_tokens": 120913920.0, "step": 3280 }, { "epoch": 0.8757961783439491, "grad_norm": 0.361328125, "learning_rate": 0.00029884482794731145, "loss": 3.9566, "mean_token_accuracy": 0.33489209711551665, "num_tokens": 121651200.0, "step": 3300 }, { "epoch": 0.881104033970276, "grad_norm": 0.345703125, "learning_rate": 0.00029881198560705133, "loss": 3.9568, "mean_token_accuracy": 0.33375054448843, "num_tokens": 122388480.0, "step": 3320 }, { "epoch": 0.886411889596603, "grad_norm": 0.328125, "learning_rate": 0.0002987786847781952, "loss": 3.9383, "mean_token_accuracy": 0.33672265857458117, "num_tokens": 123125760.0, "step": 3340 }, { "epoch": 0.89171974522293, "grad_norm": 0.318359375, "learning_rate": 0.0002987449255633427, "loss": 3.9483, "mean_token_accuracy": 0.33508643358945844, "num_tokens": 123863040.0, "step": 3360 }, { "epoch": 0.8970276008492569, "grad_norm": 0.345703125, "learning_rate": 0.0002987107080665057, "loss": 3.9763, "mean_token_accuracy": 0.3321999326348305, "num_tokens": 124600320.0, "step": 3380 }, { "epoch": 0.9023354564755839, "grad_norm": 0.330078125, "learning_rate": 0.00029867603239310794, "loss": 3.9124, "mean_token_accuracy": 0.3390614792704582, "num_tokens": 125337600.0, "step": 3400 }, { "epoch": 0.9076433121019108, "grad_norm": 0.345703125, "learning_rate": 0.000298640898649985, "loss": 3.9429, "mean_token_accuracy": 0.3345550671219826, "num_tokens": 126074880.0, "step": 3420 }, { "epoch": 0.9129511677282378, "grad_norm": 0.3359375, "learning_rate": 0.0002986053069453837, "loss": 3.9574, "mean_token_accuracy": 0.3337532624602318, "num_tokens": 126812160.0, "step": 3440 }, { "epoch": 0.9182590233545648, "grad_norm": 0.33203125, "learning_rate": 0.00029856925738896175, "loss": 3.9413, "mean_token_accuracy": 0.3354275390505791, "num_tokens": 127549440.0, "step": 3460 }, { "epoch": 0.9235668789808917, "grad_norm": 0.33203125, "learning_rate": 0.00029853275009178757, "loss": 3.9111, "mean_token_accuracy": 0.3389011204242706, "num_tokens": 128286720.0, "step": 3480 }, { "epoch": 0.9288747346072187, "grad_norm": 0.365234375, "learning_rate": 0.0002984957851663399, "loss": 3.9104, "mean_token_accuracy": 0.33805989772081374, "num_tokens": 129024000.0, "step": 3500 }, { "epoch": 0.9341825902335457, "grad_norm": 0.40625, "learning_rate": 0.00029845836272650734, "loss": 3.9297, "mean_token_accuracy": 0.33650793582201005, "num_tokens": 129761280.0, "step": 3520 }, { "epoch": 0.9394904458598726, "grad_norm": 0.326171875, "learning_rate": 0.0002984204828875882, "loss": 3.9209, "mean_token_accuracy": 0.33741302490234376, "num_tokens": 130498560.0, "step": 3540 }, { "epoch": 0.9447983014861996, "grad_norm": 0.32421875, "learning_rate": 0.0002983821457662899, "loss": 3.922, "mean_token_accuracy": 0.3376875385642052, "num_tokens": 131235840.0, "step": 3560 }, { "epoch": 0.9501061571125266, "grad_norm": 0.34375, "learning_rate": 0.0002983433514807289, "loss": 3.8807, "mean_token_accuracy": 0.3409817323088646, "num_tokens": 131973120.0, "step": 3580 }, { "epoch": 0.9554140127388535, "grad_norm": 0.341796875, "learning_rate": 0.0002983041001504301, "loss": 3.8952, "mean_token_accuracy": 0.3391090452671051, "num_tokens": 132710400.0, "step": 3600 }, { "epoch": 0.9607218683651805, "grad_norm": 0.330078125, "learning_rate": 0.0002982643918963265, "loss": 3.8939, "mean_token_accuracy": 0.3410986065864563, "num_tokens": 133447680.0, "step": 3620 }, { "epoch": 0.9660297239915074, "grad_norm": 0.36328125, "learning_rate": 0.0002982242268407589, "loss": 3.8973, "mean_token_accuracy": 0.33945286870002744, "num_tokens": 134184960.0, "step": 3640 }, { "epoch": 0.9713375796178344, "grad_norm": 0.373046875, "learning_rate": 0.0002981836051074756, "loss": 3.9093, "mean_token_accuracy": 0.33963089883327485, "num_tokens": 134922240.0, "step": 3660 }, { "epoch": 0.9766454352441614, "grad_norm": 0.33203125, "learning_rate": 0.00029814252682163183, "loss": 3.8854, "mean_token_accuracy": 0.3408594250679016, "num_tokens": 135659520.0, "step": 3680 }, { "epoch": 0.9819532908704883, "grad_norm": 0.359375, "learning_rate": 0.0002981009921097895, "loss": 3.8924, "mean_token_accuracy": 0.3408893242478371, "num_tokens": 136396800.0, "step": 3700 }, { "epoch": 0.9872611464968153, "grad_norm": 0.310546875, "learning_rate": 0.0002980590010999167, "loss": 3.9092, "mean_token_accuracy": 0.33803136348724366, "num_tokens": 137134080.0, "step": 3720 }, { "epoch": 0.9925690021231423, "grad_norm": 0.333984375, "learning_rate": 0.00029801655392138745, "loss": 3.8678, "mean_token_accuracy": 0.3428612187504768, "num_tokens": 137871360.0, "step": 3740 }, { "epoch": 0.9978768577494692, "grad_norm": 0.341796875, "learning_rate": 0.0002979736507049811, "loss": 3.8857, "mean_token_accuracy": 0.34208931475877763, "num_tokens": 138608640.0, "step": 3760 }, { "epoch": 1.0031847133757963, "grad_norm": 0.302734375, "learning_rate": 0.00029793029158288226, "loss": 3.8558, "mean_token_accuracy": 0.3440150052309036, "num_tokens": 139345920.0, "step": 3780 }, { "epoch": 1.0084925690021231, "grad_norm": 0.306640625, "learning_rate": 0.00029788647668867993, "loss": 3.7901, "mean_token_accuracy": 0.34982468783855436, "num_tokens": 140083200.0, "step": 3800 }, { "epoch": 1.0138004246284502, "grad_norm": 0.345703125, "learning_rate": 0.0002978422061573676, "loss": 3.7697, "mean_token_accuracy": 0.3517232045531273, "num_tokens": 140820480.0, "step": 3820 }, { "epoch": 1.019108280254777, "grad_norm": 0.32421875, "learning_rate": 0.0002977974801253423, "loss": 3.7704, "mean_token_accuracy": 0.35201402604579923, "num_tokens": 141557760.0, "step": 3840 }, { "epoch": 1.0244161358811041, "grad_norm": 0.298828125, "learning_rate": 0.0002977522987304047, "loss": 3.7903, "mean_token_accuracy": 0.3495963796973228, "num_tokens": 142295040.0, "step": 3860 }, { "epoch": 1.029723991507431, "grad_norm": 0.318359375, "learning_rate": 0.00029770666211175823, "loss": 3.8255, "mean_token_accuracy": 0.34629810601472855, "num_tokens": 143032320.0, "step": 3880 }, { "epoch": 1.035031847133758, "grad_norm": 0.34765625, "learning_rate": 0.00029766057041000915, "loss": 3.8028, "mean_token_accuracy": 0.3492783769965172, "num_tokens": 143769600.0, "step": 3900 }, { "epoch": 1.040339702760085, "grad_norm": 0.357421875, "learning_rate": 0.00029761402376716545, "loss": 3.8136, "mean_token_accuracy": 0.34817215502262117, "num_tokens": 144506880.0, "step": 3920 }, { "epoch": 1.045647558386412, "grad_norm": 0.326171875, "learning_rate": 0.000297567022326637, "loss": 3.7801, "mean_token_accuracy": 0.35125163197517395, "num_tokens": 145244160.0, "step": 3940 }, { "epoch": 1.0509554140127388, "grad_norm": 0.36328125, "learning_rate": 0.000297519566233235, "loss": 3.7783, "mean_token_accuracy": 0.35097303837537763, "num_tokens": 145981440.0, "step": 3960 }, { "epoch": 1.056263269639066, "grad_norm": 0.33203125, "learning_rate": 0.0002974716556331713, "loss": 3.7953, "mean_token_accuracy": 0.3500176668167114, "num_tokens": 146718720.0, "step": 3980 }, { "epoch": 1.0615711252653928, "grad_norm": 0.298828125, "learning_rate": 0.00029742329067405796, "loss": 3.8004, "mean_token_accuracy": 0.34820069521665575, "num_tokens": 147456000.0, "step": 4000 }, { "epoch": 1.0668789808917198, "grad_norm": 0.294921875, "learning_rate": 0.00029737447150490716, "loss": 3.7949, "mean_token_accuracy": 0.3504022628068924, "num_tokens": 148193280.0, "step": 4020 }, { "epoch": 1.0721868365180467, "grad_norm": 0.318359375, "learning_rate": 0.00029732519827613035, "loss": 3.7822, "mean_token_accuracy": 0.35130327194929123, "num_tokens": 148930560.0, "step": 4040 }, { "epoch": 1.0774946921443738, "grad_norm": 0.322265625, "learning_rate": 0.00029727547113953803, "loss": 3.7968, "mean_token_accuracy": 0.3487782686948776, "num_tokens": 149667840.0, "step": 4060 }, { "epoch": 1.0828025477707006, "grad_norm": 0.353515625, "learning_rate": 0.00029722529024833906, "loss": 3.7995, "mean_token_accuracy": 0.3496724843978882, "num_tokens": 150405120.0, "step": 4080 }, { "epoch": 1.0881104033970277, "grad_norm": 0.330078125, "learning_rate": 0.0002971746557571404, "loss": 3.7846, "mean_token_accuracy": 0.3500896915793419, "num_tokens": 151142400.0, "step": 4100 }, { "epoch": 1.0934182590233545, "grad_norm": 0.349609375, "learning_rate": 0.0002971235678219466, "loss": 3.7846, "mean_token_accuracy": 0.35166068822145463, "num_tokens": 151879680.0, "step": 4120 }, { "epoch": 1.0987261146496816, "grad_norm": 0.3515625, "learning_rate": 0.00029707202660015904, "loss": 3.7751, "mean_token_accuracy": 0.351664762198925, "num_tokens": 152616960.0, "step": 4140 }, { "epoch": 1.1040339702760085, "grad_norm": 0.33203125, "learning_rate": 0.000297020032250576, "loss": 3.7901, "mean_token_accuracy": 0.3495121240615845, "num_tokens": 153354240.0, "step": 4160 }, { "epoch": 1.1093418259023355, "grad_norm": 0.357421875, "learning_rate": 0.00029696758493339163, "loss": 3.7712, "mean_token_accuracy": 0.35191753655672076, "num_tokens": 154091520.0, "step": 4180 }, { "epoch": 1.1146496815286624, "grad_norm": 0.328125, "learning_rate": 0.00029691468481019565, "loss": 3.7694, "mean_token_accuracy": 0.35236192792654036, "num_tokens": 154828800.0, "step": 4200 }, { "epoch": 1.1199575371549895, "grad_norm": 0.365234375, "learning_rate": 0.00029686133204397304, "loss": 3.7934, "mean_token_accuracy": 0.35003397762775423, "num_tokens": 155566080.0, "step": 4220 }, { "epoch": 1.1252653927813163, "grad_norm": 0.35546875, "learning_rate": 0.00029680752679910315, "loss": 3.7577, "mean_token_accuracy": 0.35332545191049575, "num_tokens": 156303360.0, "step": 4240 }, { "epoch": 1.1305732484076434, "grad_norm": 0.3515625, "learning_rate": 0.00029675326924135965, "loss": 3.7688, "mean_token_accuracy": 0.3521173134446144, "num_tokens": 157040640.0, "step": 4260 }, { "epoch": 1.1358811040339702, "grad_norm": 0.328125, "learning_rate": 0.0002966985595379095, "loss": 3.7733, "mean_token_accuracy": 0.3522369012236595, "num_tokens": 157777920.0, "step": 4280 }, { "epoch": 1.1411889596602973, "grad_norm": 0.37109375, "learning_rate": 0.0002966433978573131, "loss": 3.777, "mean_token_accuracy": 0.35229125916957854, "num_tokens": 158515200.0, "step": 4300 }, { "epoch": 1.1464968152866242, "grad_norm": 0.361328125, "learning_rate": 0.000296587784369523, "loss": 3.7817, "mean_token_accuracy": 0.3508085966110229, "num_tokens": 159252480.0, "step": 4320 }, { "epoch": 1.1518046709129512, "grad_norm": 0.328125, "learning_rate": 0.00029653171924588404, "loss": 3.7808, "mean_token_accuracy": 0.350706672668457, "num_tokens": 159989760.0, "step": 4340 }, { "epoch": 1.157112526539278, "grad_norm": 0.306640625, "learning_rate": 0.0002964752026591323, "loss": 3.7561, "mean_token_accuracy": 0.35282262712717055, "num_tokens": 160727040.0, "step": 4360 }, { "epoch": 1.1624203821656052, "grad_norm": 0.341796875, "learning_rate": 0.0002964182347833951, "loss": 3.7552, "mean_token_accuracy": 0.35383235812187197, "num_tokens": 161464320.0, "step": 4380 }, { "epoch": 1.167728237791932, "grad_norm": 0.306640625, "learning_rate": 0.0002963608157941899, "loss": 3.776, "mean_token_accuracy": 0.35114155411720277, "num_tokens": 162201600.0, "step": 4400 }, { "epoch": 1.173036093418259, "grad_norm": 0.326171875, "learning_rate": 0.0002963029458684243, "loss": 3.793, "mean_token_accuracy": 0.34886660128831865, "num_tokens": 162938880.0, "step": 4420 }, { "epoch": 1.178343949044586, "grad_norm": 0.322265625, "learning_rate": 0.00029624462518439506, "loss": 3.7629, "mean_token_accuracy": 0.3531188860535622, "num_tokens": 163676160.0, "step": 4440 }, { "epoch": 1.183651804670913, "grad_norm": 0.306640625, "learning_rate": 0.0002961858539217877, "loss": 3.7504, "mean_token_accuracy": 0.35443574488162993, "num_tokens": 164413440.0, "step": 4460 }, { "epoch": 1.1889596602972399, "grad_norm": 0.328125, "learning_rate": 0.00029612663226167614, "loss": 3.7552, "mean_token_accuracy": 0.3532751649618149, "num_tokens": 165150720.0, "step": 4480 }, { "epoch": 1.194267515923567, "grad_norm": 0.31640625, "learning_rate": 0.00029606696038652177, "loss": 3.7513, "mean_token_accuracy": 0.35349260568618773, "num_tokens": 165888000.0, "step": 4500 }, { "epoch": 1.1995753715498938, "grad_norm": 0.318359375, "learning_rate": 0.0002960068384801732, "loss": 3.7727, "mean_token_accuracy": 0.3513331711292267, "num_tokens": 166625280.0, "step": 4520 }, { "epoch": 1.2048832271762209, "grad_norm": 0.34765625, "learning_rate": 0.0002959462667278656, "loss": 3.7601, "mean_token_accuracy": 0.3538540959358215, "num_tokens": 167362560.0, "step": 4540 }, { "epoch": 1.2101910828025477, "grad_norm": 0.322265625, "learning_rate": 0.00029588524531622, "loss": 3.7783, "mean_token_accuracy": 0.3522015675902367, "num_tokens": 168099840.0, "step": 4560 }, { "epoch": 1.2154989384288748, "grad_norm": 0.353515625, "learning_rate": 0.00029582377443324303, "loss": 3.7404, "mean_token_accuracy": 0.3550935000181198, "num_tokens": 168837120.0, "step": 4580 }, { "epoch": 1.2208067940552016, "grad_norm": 0.3046875, "learning_rate": 0.0002957618542683259, "loss": 3.7399, "mean_token_accuracy": 0.35597412586212157, "num_tokens": 169574400.0, "step": 4600 }, { "epoch": 1.2261146496815287, "grad_norm": 0.33203125, "learning_rate": 0.0002956994850122443, "loss": 3.7317, "mean_token_accuracy": 0.355904820561409, "num_tokens": 170311680.0, "step": 4620 }, { "epoch": 1.2314225053078556, "grad_norm": 0.3359375, "learning_rate": 0.00029563666685715733, "loss": 3.7262, "mean_token_accuracy": 0.3565883919596672, "num_tokens": 171048960.0, "step": 4640 }, { "epoch": 1.2367303609341826, "grad_norm": 0.326171875, "learning_rate": 0.0002955733999966072, "loss": 3.7358, "mean_token_accuracy": 0.3540579453110695, "num_tokens": 171786240.0, "step": 4660 }, { "epoch": 1.2420382165605095, "grad_norm": 0.32421875, "learning_rate": 0.0002955096846255187, "loss": 3.7354, "mean_token_accuracy": 0.3545865938067436, "num_tokens": 172523520.0, "step": 4680 }, { "epoch": 1.2473460721868366, "grad_norm": 0.36328125, "learning_rate": 0.0002954455209401985, "loss": 3.7544, "mean_token_accuracy": 0.3537209168076515, "num_tokens": 173260800.0, "step": 4700 }, { "epoch": 1.2526539278131634, "grad_norm": 0.306640625, "learning_rate": 0.00029538090913833426, "loss": 3.7102, "mean_token_accuracy": 0.3563709482550621, "num_tokens": 173998080.0, "step": 4720 }, { "epoch": 1.2579617834394905, "grad_norm": 0.373046875, "learning_rate": 0.00029531584941899455, "loss": 3.75, "mean_token_accuracy": 0.3536651983857155, "num_tokens": 174735360.0, "step": 4740 }, { "epoch": 1.2632696390658174, "grad_norm": 0.302734375, "learning_rate": 0.00029525034198262776, "loss": 3.7349, "mean_token_accuracy": 0.3555039197206497, "num_tokens": 175472640.0, "step": 4760 }, { "epoch": 1.2685774946921444, "grad_norm": 0.3515625, "learning_rate": 0.0002951843870310619, "loss": 3.717, "mean_token_accuracy": 0.358041150867939, "num_tokens": 176209920.0, "step": 4780 }, { "epoch": 1.2738853503184713, "grad_norm": 0.330078125, "learning_rate": 0.0002951179847675037, "loss": 3.762, "mean_token_accuracy": 0.35168243050575254, "num_tokens": 176947200.0, "step": 4800 }, { "epoch": 1.2791932059447984, "grad_norm": 0.30078125, "learning_rate": 0.00029505113539653795, "loss": 3.7303, "mean_token_accuracy": 0.356339693069458, "num_tokens": 177684480.0, "step": 4820 }, { "epoch": 1.2845010615711252, "grad_norm": 0.31640625, "learning_rate": 0.00029498383912412714, "loss": 3.6891, "mean_token_accuracy": 0.3587913140654564, "num_tokens": 178421760.0, "step": 4840 }, { "epoch": 1.2898089171974523, "grad_norm": 0.302734375, "learning_rate": 0.00029491609615761046, "loss": 3.7503, "mean_token_accuracy": 0.35429984629154204, "num_tokens": 179159040.0, "step": 4860 }, { "epoch": 1.2951167728237791, "grad_norm": 0.306640625, "learning_rate": 0.00029484790670570356, "loss": 3.7357, "mean_token_accuracy": 0.35489780455827713, "num_tokens": 179896320.0, "step": 4880 }, { "epoch": 1.3004246284501062, "grad_norm": 0.330078125, "learning_rate": 0.00029477927097849757, "loss": 3.7185, "mean_token_accuracy": 0.35750706791877745, "num_tokens": 180633600.0, "step": 4900 }, { "epoch": 1.305732484076433, "grad_norm": 0.337890625, "learning_rate": 0.00029471018918745867, "loss": 3.7204, "mean_token_accuracy": 0.3562255397439003, "num_tokens": 181370880.0, "step": 4920 }, { "epoch": 1.3110403397027601, "grad_norm": 0.3203125, "learning_rate": 0.00029464066154542736, "loss": 3.7231, "mean_token_accuracy": 0.35591025054454806, "num_tokens": 182108160.0, "step": 4940 }, { "epoch": 1.316348195329087, "grad_norm": 0.33984375, "learning_rate": 0.00029457068826661776, "loss": 3.7279, "mean_token_accuracy": 0.35606245696544647, "num_tokens": 182845440.0, "step": 4960 }, { "epoch": 1.321656050955414, "grad_norm": 0.3203125, "learning_rate": 0.0002945002695666169, "loss": 3.7144, "mean_token_accuracy": 0.3572216793894768, "num_tokens": 183582720.0, "step": 4980 }, { "epoch": 1.326963906581741, "grad_norm": 0.326171875, "learning_rate": 0.0002944294056623845, "loss": 3.726, "mean_token_accuracy": 0.3547877296805382, "num_tokens": 184320000.0, "step": 5000 }, { "epoch": 1.332271762208068, "grad_norm": 0.3046875, "learning_rate": 0.00029435809677225155, "loss": 3.7059, "mean_token_accuracy": 0.35740242451429366, "num_tokens": 185057280.0, "step": 5020 }, { "epoch": 1.3375796178343948, "grad_norm": 0.306640625, "learning_rate": 0.0002942863431159201, "loss": 3.7051, "mean_token_accuracy": 0.35800445526838304, "num_tokens": 185794560.0, "step": 5040 }, { "epoch": 1.342887473460722, "grad_norm": 0.31640625, "learning_rate": 0.00029421414491446286, "loss": 3.7126, "mean_token_accuracy": 0.35709257423877716, "num_tokens": 186531840.0, "step": 5060 }, { "epoch": 1.3481953290870488, "grad_norm": 0.310546875, "learning_rate": 0.00029414150239032177, "loss": 3.736, "mean_token_accuracy": 0.35560176223516465, "num_tokens": 187269120.0, "step": 5080 }, { "epoch": 1.3535031847133758, "grad_norm": 0.31640625, "learning_rate": 0.00029406841576730793, "loss": 3.7119, "mean_token_accuracy": 0.3567677766084671, "num_tokens": 188006400.0, "step": 5100 }, { "epoch": 1.3588110403397027, "grad_norm": 0.29296875, "learning_rate": 0.0002939948852706007, "loss": 3.7196, "mean_token_accuracy": 0.35653674602508545, "num_tokens": 188743680.0, "step": 5120 }, { "epoch": 1.3641188959660298, "grad_norm": 0.33203125, "learning_rate": 0.00029392091112674694, "loss": 3.6989, "mean_token_accuracy": 0.35854941308498384, "num_tokens": 189480960.0, "step": 5140 }, { "epoch": 1.3694267515923566, "grad_norm": 0.33203125, "learning_rate": 0.00029384649356366043, "loss": 3.6766, "mean_token_accuracy": 0.3608692094683647, "num_tokens": 190218240.0, "step": 5160 }, { "epoch": 1.3747346072186837, "grad_norm": 0.322265625, "learning_rate": 0.0002937716328106212, "loss": 3.7025, "mean_token_accuracy": 0.3594477042555809, "num_tokens": 190955520.0, "step": 5180 }, { "epoch": 1.3800424628450108, "grad_norm": 0.328125, "learning_rate": 0.00029369632909827454, "loss": 3.6702, "mean_token_accuracy": 0.36147396117448805, "num_tokens": 191692800.0, "step": 5200 }, { "epoch": 1.3853503184713376, "grad_norm": 0.3046875, "learning_rate": 0.0002936205826586307, "loss": 3.7123, "mean_token_accuracy": 0.35848689675331114, "num_tokens": 192430080.0, "step": 5220 }, { "epoch": 1.3906581740976645, "grad_norm": 0.30078125, "learning_rate": 0.0002935443937250638, "loss": 3.7094, "mean_token_accuracy": 0.35681942105293274, "num_tokens": 193167360.0, "step": 5240 }, { "epoch": 1.3959660297239915, "grad_norm": 0.294921875, "learning_rate": 0.0002934677625323115, "loss": 3.6718, "mean_token_accuracy": 0.3608773618936539, "num_tokens": 193904640.0, "step": 5260 }, { "epoch": 1.4012738853503186, "grad_norm": 0.330078125, "learning_rate": 0.0002933906893164738, "loss": 3.6949, "mean_token_accuracy": 0.3604642331600189, "num_tokens": 194641920.0, "step": 5280 }, { "epoch": 1.4065817409766455, "grad_norm": 0.3125, "learning_rate": 0.0002933131743150127, "loss": 3.7408, "mean_token_accuracy": 0.3542373314499855, "num_tokens": 195379200.0, "step": 5300 }, { "epoch": 1.4118895966029723, "grad_norm": 0.302734375, "learning_rate": 0.0002932352177667514, "loss": 3.7115, "mean_token_accuracy": 0.357599475979805, "num_tokens": 196116480.0, "step": 5320 }, { "epoch": 1.4171974522292994, "grad_norm": 0.3046875, "learning_rate": 0.00029315681991187335, "loss": 3.7172, "mean_token_accuracy": 0.356248639523983, "num_tokens": 196853760.0, "step": 5340 }, { "epoch": 1.4225053078556265, "grad_norm": 0.330078125, "learning_rate": 0.0002930779809919218, "loss": 3.6992, "mean_token_accuracy": 0.35984453111886977, "num_tokens": 197591040.0, "step": 5360 }, { "epoch": 1.4278131634819533, "grad_norm": 0.30859375, "learning_rate": 0.0002929987012497989, "loss": 3.6976, "mean_token_accuracy": 0.35915416330099104, "num_tokens": 198328320.0, "step": 5380 }, { "epoch": 1.4331210191082802, "grad_norm": 0.296875, "learning_rate": 0.00029291898092976485, "loss": 3.6988, "mean_token_accuracy": 0.35835779309272764, "num_tokens": 199065600.0, "step": 5400 }, { "epoch": 1.4384288747346072, "grad_norm": 0.333984375, "learning_rate": 0.0002928388202774374, "loss": 3.7124, "mean_token_accuracy": 0.35706539303064344, "num_tokens": 199802880.0, "step": 5420 }, { "epoch": 1.4437367303609343, "grad_norm": 0.29296875, "learning_rate": 0.0002927582195397909, "loss": 3.6935, "mean_token_accuracy": 0.3582422822713852, "num_tokens": 200540160.0, "step": 5440 }, { "epoch": 1.4490445859872612, "grad_norm": 0.302734375, "learning_rate": 0.0002926771789651556, "loss": 3.6846, "mean_token_accuracy": 0.36032969057559966, "num_tokens": 201277440.0, "step": 5460 }, { "epoch": 1.454352441613588, "grad_norm": 0.3203125, "learning_rate": 0.00029259569880321683, "loss": 3.6873, "mean_token_accuracy": 0.36046015173196794, "num_tokens": 202014720.0, "step": 5480 }, { "epoch": 1.459660297239915, "grad_norm": 0.29296875, "learning_rate": 0.0002925137793050145, "loss": 3.681, "mean_token_accuracy": 0.36028348505496977, "num_tokens": 202752000.0, "step": 5500 }, { "epoch": 1.4649681528662422, "grad_norm": 0.341796875, "learning_rate": 0.00029243142072294184, "loss": 3.6758, "mean_token_accuracy": 0.36076728701591493, "num_tokens": 203489280.0, "step": 5520 }, { "epoch": 1.470276008492569, "grad_norm": 0.30078125, "learning_rate": 0.0002923486233107451, "loss": 3.6965, "mean_token_accuracy": 0.3586866736412048, "num_tokens": 204226560.0, "step": 5540 }, { "epoch": 1.4755838641188959, "grad_norm": 0.337890625, "learning_rate": 0.00029226538732352237, "loss": 3.66, "mean_token_accuracy": 0.36305718719959257, "num_tokens": 204963840.0, "step": 5560 }, { "epoch": 1.480891719745223, "grad_norm": 0.3125, "learning_rate": 0.0002921817130177232, "loss": 3.7332, "mean_token_accuracy": 0.35452407896518706, "num_tokens": 205701120.0, "step": 5580 }, { "epoch": 1.48619957537155, "grad_norm": 0.3046875, "learning_rate": 0.0002920976006511474, "loss": 3.6622, "mean_token_accuracy": 0.36328413486480715, "num_tokens": 206438400.0, "step": 5600 }, { "epoch": 1.4915074309978769, "grad_norm": 0.3359375, "learning_rate": 0.00029201305048294463, "loss": 3.6918, "mean_token_accuracy": 0.3597670704126358, "num_tokens": 207175680.0, "step": 5620 }, { "epoch": 1.4968152866242037, "grad_norm": 0.291015625, "learning_rate": 0.00029192806277361324, "loss": 3.7095, "mean_token_accuracy": 0.35791747868061063, "num_tokens": 207912960.0, "step": 5640 }, { "epoch": 1.5021231422505308, "grad_norm": 0.333984375, "learning_rate": 0.00029184263778499973, "loss": 3.6785, "mean_token_accuracy": 0.3622159719467163, "num_tokens": 208650240.0, "step": 5660 }, { "epoch": 1.5074309978768579, "grad_norm": 0.349609375, "learning_rate": 0.00029175677578029786, "loss": 3.6522, "mean_token_accuracy": 0.3637366831302643, "num_tokens": 209387520.0, "step": 5680 }, { "epoch": 1.5127388535031847, "grad_norm": 0.314453125, "learning_rate": 0.00029167047702404776, "loss": 3.6857, "mean_token_accuracy": 0.36019922941923144, "num_tokens": 210124800.0, "step": 5700 }, { "epoch": 1.5180467091295116, "grad_norm": 0.296875, "learning_rate": 0.0002915837417821353, "loss": 3.6719, "mean_token_accuracy": 0.3615677312016487, "num_tokens": 210862080.0, "step": 5720 }, { "epoch": 1.5233545647558386, "grad_norm": 0.330078125, "learning_rate": 0.0002914965703217911, "loss": 3.6594, "mean_token_accuracy": 0.36377609223127366, "num_tokens": 211599360.0, "step": 5740 }, { "epoch": 1.5286624203821657, "grad_norm": 0.3515625, "learning_rate": 0.0002914089629115898, "loss": 3.6789, "mean_token_accuracy": 0.36013943552970884, "num_tokens": 212336640.0, "step": 5760 }, { "epoch": 1.5339702760084926, "grad_norm": 0.314453125, "learning_rate": 0.00029132091982144907, "loss": 3.6588, "mean_token_accuracy": 0.36205832958221434, "num_tokens": 213073920.0, "step": 5780 }, { "epoch": 1.5392781316348194, "grad_norm": 0.291015625, "learning_rate": 0.00029123244132262904, "loss": 3.6429, "mean_token_accuracy": 0.3639663517475128, "num_tokens": 213811200.0, "step": 5800 }, { "epoch": 1.5445859872611465, "grad_norm": 0.29296875, "learning_rate": 0.0002911435276877313, "loss": 3.6611, "mean_token_accuracy": 0.3630639806389809, "num_tokens": 214548480.0, "step": 5820 }, { "epoch": 1.5498938428874736, "grad_norm": 0.310546875, "learning_rate": 0.00029105417919069804, "loss": 3.6468, "mean_token_accuracy": 0.36436181515455246, "num_tokens": 215285760.0, "step": 5840 }, { "epoch": 1.5552016985138004, "grad_norm": 0.3359375, "learning_rate": 0.0002909643961068114, "loss": 3.6487, "mean_token_accuracy": 0.36384675949811934, "num_tokens": 216023040.0, "step": 5860 }, { "epoch": 1.5605095541401273, "grad_norm": 0.33203125, "learning_rate": 0.0002908741787126923, "loss": 3.6744, "mean_token_accuracy": 0.3607401058077812, "num_tokens": 216760320.0, "step": 5880 }, { "epoch": 1.5658174097664543, "grad_norm": 0.318359375, "learning_rate": 0.0002907835272862999, "loss": 3.6445, "mean_token_accuracy": 0.363505657017231, "num_tokens": 217497600.0, "step": 5900 }, { "epoch": 1.5711252653927814, "grad_norm": 0.318359375, "learning_rate": 0.0002906924421069305, "loss": 3.6383, "mean_token_accuracy": 0.36442433297634125, "num_tokens": 218234880.0, "step": 5920 }, { "epoch": 1.5764331210191083, "grad_norm": 0.3203125, "learning_rate": 0.0002906009234552168, "loss": 3.6623, "mean_token_accuracy": 0.3622513055801392, "num_tokens": 218972160.0, "step": 5940 }, { "epoch": 1.5817409766454351, "grad_norm": 0.3046875, "learning_rate": 0.0002905089716131272, "loss": 3.6602, "mean_token_accuracy": 0.3628071337938309, "num_tokens": 219709440.0, "step": 5960 }, { "epoch": 1.5870488322717622, "grad_norm": 0.30859375, "learning_rate": 0.0002904165868639646, "loss": 3.6467, "mean_token_accuracy": 0.3640927359461784, "num_tokens": 220446720.0, "step": 5980 }, { "epoch": 1.5923566878980893, "grad_norm": 0.33203125, "learning_rate": 0.0002903237694923656, "loss": 3.6442, "mean_token_accuracy": 0.36401391625404356, "num_tokens": 221184000.0, "step": 6000 }, { "epoch": 1.5976645435244161, "grad_norm": 0.3203125, "learning_rate": 0.0002902305197842999, "loss": 3.6566, "mean_token_accuracy": 0.3627894654870033, "num_tokens": 221921280.0, "step": 6020 }, { "epoch": 1.602972399150743, "grad_norm": 0.31640625, "learning_rate": 0.00029013683802706915, "loss": 3.6096, "mean_token_accuracy": 0.36791693717241286, "num_tokens": 222658560.0, "step": 6040 }, { "epoch": 1.60828025477707, "grad_norm": 0.322265625, "learning_rate": 0.00029004272450930613, "loss": 3.6647, "mean_token_accuracy": 0.3632963716983795, "num_tokens": 223395840.0, "step": 6060 }, { "epoch": 1.6135881104033971, "grad_norm": 0.31640625, "learning_rate": 0.0002899481795209738, "loss": 3.6573, "mean_token_accuracy": 0.36316862553358076, "num_tokens": 224133120.0, "step": 6080 }, { "epoch": 1.618895966029724, "grad_norm": 0.310546875, "learning_rate": 0.00028985320335336466, "loss": 3.6454, "mean_token_accuracy": 0.36454935520887377, "num_tokens": 224870400.0, "step": 6100 }, { "epoch": 1.6242038216560508, "grad_norm": 0.30859375, "learning_rate": 0.0002897577962990995, "loss": 3.6691, "mean_token_accuracy": 0.3610635489225388, "num_tokens": 225607680.0, "step": 6120 }, { "epoch": 1.629511677282378, "grad_norm": 0.291015625, "learning_rate": 0.0002896619586521267, "loss": 3.64, "mean_token_accuracy": 0.36392558068037034, "num_tokens": 226344960.0, "step": 6140 }, { "epoch": 1.634819532908705, "grad_norm": 0.302734375, "learning_rate": 0.00028956569070772136, "loss": 3.6201, "mean_token_accuracy": 0.3673937261104584, "num_tokens": 227082240.0, "step": 6160 }, { "epoch": 1.6401273885350318, "grad_norm": 0.306640625, "learning_rate": 0.0002894689927624843, "loss": 3.6435, "mean_token_accuracy": 0.3638549119234085, "num_tokens": 227819520.0, "step": 6180 }, { "epoch": 1.6454352441613587, "grad_norm": 0.2890625, "learning_rate": 0.0002893718651143411, "loss": 3.6434, "mean_token_accuracy": 0.3635260403156281, "num_tokens": 228556800.0, "step": 6200 }, { "epoch": 1.6507430997876857, "grad_norm": 0.3046875, "learning_rate": 0.0002892743080625413, "loss": 3.6314, "mean_token_accuracy": 0.365743912756443, "num_tokens": 229294080.0, "step": 6220 }, { "epoch": 1.6560509554140128, "grad_norm": 0.349609375, "learning_rate": 0.0002891763219076575, "loss": 3.6342, "mean_token_accuracy": 0.36500734090805054, "num_tokens": 230031360.0, "step": 6240 }, { "epoch": 1.6613588110403397, "grad_norm": 0.302734375, "learning_rate": 0.0002890779069515843, "loss": 3.643, "mean_token_accuracy": 0.36362796276807785, "num_tokens": 230768640.0, "step": 6260 }, { "epoch": 1.6666666666666665, "grad_norm": 0.314453125, "learning_rate": 0.0002889790634975374, "loss": 3.6463, "mean_token_accuracy": 0.36444199830293655, "num_tokens": 231505920.0, "step": 6280 }, { "epoch": 1.6719745222929936, "grad_norm": 0.306640625, "learning_rate": 0.00028887979185005273, "loss": 3.6472, "mean_token_accuracy": 0.3640682741999626, "num_tokens": 232243200.0, "step": 6300 }, { "epoch": 1.6772823779193207, "grad_norm": 0.337890625, "learning_rate": 0.00028878009231498545, "loss": 3.6247, "mean_token_accuracy": 0.3655101642012596, "num_tokens": 232980480.0, "step": 6320 }, { "epoch": 1.6825902335456475, "grad_norm": 0.3203125, "learning_rate": 0.00028867996519950915, "loss": 3.6276, "mean_token_accuracy": 0.3664247661828995, "num_tokens": 233717760.0, "step": 6340 }, { "epoch": 1.6878980891719744, "grad_norm": 0.3125, "learning_rate": 0.0002885794108121146, "loss": 3.6282, "mean_token_accuracy": 0.36596270799636843, "num_tokens": 234455040.0, "step": 6360 }, { "epoch": 1.6932059447983014, "grad_norm": 0.31640625, "learning_rate": 0.00028847842946260924, "loss": 3.6335, "mean_token_accuracy": 0.3658485531806946, "num_tokens": 235192320.0, "step": 6380 }, { "epoch": 1.6985138004246285, "grad_norm": 0.3125, "learning_rate": 0.0002883770214621157, "loss": 3.6465, "mean_token_accuracy": 0.36347031891345977, "num_tokens": 235929600.0, "step": 6400 }, { "epoch": 1.7038216560509554, "grad_norm": 0.314453125, "learning_rate": 0.0002882751871230714, "loss": 3.634, "mean_token_accuracy": 0.3653131127357483, "num_tokens": 236666880.0, "step": 6420 }, { "epoch": 1.7091295116772822, "grad_norm": 0.30078125, "learning_rate": 0.00028817292675922704, "loss": 3.6533, "mean_token_accuracy": 0.36318493634462357, "num_tokens": 237404160.0, "step": 6440 }, { "epoch": 1.7144373673036093, "grad_norm": 0.333984375, "learning_rate": 0.0002880702406856461, "loss": 3.618, "mean_token_accuracy": 0.3665987208485603, "num_tokens": 238141440.0, "step": 6460 }, { "epoch": 1.7197452229299364, "grad_norm": 0.310546875, "learning_rate": 0.00028796712921870353, "loss": 3.6155, "mean_token_accuracy": 0.3671273604035378, "num_tokens": 238878720.0, "step": 6480 }, { "epoch": 1.7250530785562632, "grad_norm": 0.318359375, "learning_rate": 0.000287863592676085, "loss": 3.6222, "mean_token_accuracy": 0.3669126436114311, "num_tokens": 239616000.0, "step": 6500 }, { "epoch": 1.73036093418259, "grad_norm": 0.314453125, "learning_rate": 0.0002877596313767858, "loss": 3.6326, "mean_token_accuracy": 0.3653402909636497, "num_tokens": 240353280.0, "step": 6520 }, { "epoch": 1.7356687898089171, "grad_norm": 0.359375, "learning_rate": 0.00028765524564111004, "loss": 3.6347, "mean_token_accuracy": 0.36520982831716536, "num_tokens": 241090560.0, "step": 6540 }, { "epoch": 1.7409766454352442, "grad_norm": 0.330078125, "learning_rate": 0.00028755043579066915, "loss": 3.6215, "mean_token_accuracy": 0.36663948446512223, "num_tokens": 241827840.0, "step": 6560 }, { "epoch": 1.746284501061571, "grad_norm": 0.33984375, "learning_rate": 0.00028744520214838164, "loss": 3.6324, "mean_token_accuracy": 0.36438899487257004, "num_tokens": 242565120.0, "step": 6580 }, { "epoch": 1.7515923566878981, "grad_norm": 0.298828125, "learning_rate": 0.0002873395450384715, "loss": 3.6012, "mean_token_accuracy": 0.368570613861084, "num_tokens": 243302400.0, "step": 6600 }, { "epoch": 1.7569002123142252, "grad_norm": 0.3046875, "learning_rate": 0.0002872334647864675, "loss": 3.6277, "mean_token_accuracy": 0.36555773168802264, "num_tokens": 244039680.0, "step": 6620 }, { "epoch": 1.762208067940552, "grad_norm": 0.328125, "learning_rate": 0.0002871269617192021, "loss": 3.624, "mean_token_accuracy": 0.3661189913749695, "num_tokens": 244776960.0, "step": 6640 }, { "epoch": 1.767515923566879, "grad_norm": 0.326171875, "learning_rate": 0.00028702003616481043, "loss": 3.6108, "mean_token_accuracy": 0.36727685630321505, "num_tokens": 245514240.0, "step": 6660 }, { "epoch": 1.772823779193206, "grad_norm": 0.310546875, "learning_rate": 0.00028691268845272924, "loss": 3.6228, "mean_token_accuracy": 0.3657085791230202, "num_tokens": 246251520.0, "step": 6680 }, { "epoch": 1.778131634819533, "grad_norm": 0.314453125, "learning_rate": 0.0002868049189136961, "loss": 3.6216, "mean_token_accuracy": 0.36733936965465547, "num_tokens": 246988800.0, "step": 6700 }, { "epoch": 1.78343949044586, "grad_norm": 0.314453125, "learning_rate": 0.0002866967278797482, "loss": 3.6195, "mean_token_accuracy": 0.3672619044780731, "num_tokens": 247726080.0, "step": 6720 }, { "epoch": 1.7887473460721868, "grad_norm": 0.314453125, "learning_rate": 0.00028658811568422116, "loss": 3.605, "mean_token_accuracy": 0.36786665618419645, "num_tokens": 248463360.0, "step": 6740 }, { "epoch": 1.7940552016985138, "grad_norm": 0.3125, "learning_rate": 0.0002864790826617484, "loss": 3.6014, "mean_token_accuracy": 0.3685189738869667, "num_tokens": 249200640.0, "step": 6760 }, { "epoch": 1.799363057324841, "grad_norm": 0.33984375, "learning_rate": 0.00028636962914825977, "loss": 3.6142, "mean_token_accuracy": 0.3671151340007782, "num_tokens": 249937920.0, "step": 6780 }, { "epoch": 1.8046709129511678, "grad_norm": 0.333984375, "learning_rate": 0.0002862597554809808, "loss": 3.6225, "mean_token_accuracy": 0.3672496736049652, "num_tokens": 250675200.0, "step": 6800 }, { "epoch": 1.8099787685774946, "grad_norm": 0.291015625, "learning_rate": 0.00028614946199843135, "loss": 3.6105, "mean_token_accuracy": 0.3675459325313568, "num_tokens": 251412480.0, "step": 6820 }, { "epoch": 1.8152866242038217, "grad_norm": 0.3359375, "learning_rate": 0.00028603874904042486, "loss": 3.5777, "mean_token_accuracy": 0.3716990098357201, "num_tokens": 252149760.0, "step": 6840 }, { "epoch": 1.8205944798301488, "grad_norm": 0.30078125, "learning_rate": 0.0002859276169480671, "loss": 3.6182, "mean_token_accuracy": 0.3671599790453911, "num_tokens": 252887040.0, "step": 6860 }, { "epoch": 1.8259023354564756, "grad_norm": 0.302734375, "learning_rate": 0.0002858160660637552, "loss": 3.5986, "mean_token_accuracy": 0.36904761642217637, "num_tokens": 253624320.0, "step": 6880 }, { "epoch": 1.8312101910828025, "grad_norm": 0.298828125, "learning_rate": 0.0002857040967311765, "loss": 3.581, "mean_token_accuracy": 0.37064035832881925, "num_tokens": 254361600.0, "step": 6900 }, { "epoch": 1.8365180467091295, "grad_norm": 0.3125, "learning_rate": 0.0002855917092953078, "loss": 3.6417, "mean_token_accuracy": 0.36494346857070925, "num_tokens": 255098880.0, "step": 6920 }, { "epoch": 1.8418259023354566, "grad_norm": 0.328125, "learning_rate": 0.00028547890410241385, "loss": 3.6028, "mean_token_accuracy": 0.3683069676160812, "num_tokens": 255836160.0, "step": 6940 }, { "epoch": 1.8471337579617835, "grad_norm": 0.337890625, "learning_rate": 0.00028536568150004654, "loss": 3.6006, "mean_token_accuracy": 0.3687241807579994, "num_tokens": 256573440.0, "step": 6960 }, { "epoch": 1.8524416135881103, "grad_norm": 0.314453125, "learning_rate": 0.00028525204183704403, "loss": 3.6017, "mean_token_accuracy": 0.3700627863407135, "num_tokens": 257310720.0, "step": 6980 }, { "epoch": 1.8577494692144374, "grad_norm": 0.3125, "learning_rate": 0.00028513798546352906, "loss": 3.6103, "mean_token_accuracy": 0.3677402690052986, "num_tokens": 258048000.0, "step": 7000 }, { "epoch": 1.8630573248407645, "grad_norm": 0.310546875, "learning_rate": 0.0002850235127309086, "loss": 3.6093, "mean_token_accuracy": 0.3687459215521812, "num_tokens": 258785280.0, "step": 7020 }, { "epoch": 1.8683651804670913, "grad_norm": 0.296875, "learning_rate": 0.0002849086239918722, "loss": 3.582, "mean_token_accuracy": 0.37090807408094406, "num_tokens": 259522560.0, "step": 7040 }, { "epoch": 1.8736730360934182, "grad_norm": 0.32421875, "learning_rate": 0.00028479331960039116, "loss": 3.609, "mean_token_accuracy": 0.36827978789806365, "num_tokens": 260259840.0, "step": 7060 }, { "epoch": 1.8789808917197452, "grad_norm": 0.30078125, "learning_rate": 0.00028467759991171747, "loss": 3.6238, "mean_token_accuracy": 0.3670186445116997, "num_tokens": 260997120.0, "step": 7080 }, { "epoch": 1.8842887473460723, "grad_norm": 0.328125, "learning_rate": 0.0002845614652823826, "loss": 3.6292, "mean_token_accuracy": 0.36672510355710985, "num_tokens": 261734400.0, "step": 7100 }, { "epoch": 1.8895966029723992, "grad_norm": 0.3125, "learning_rate": 0.0002844449160701965, "loss": 3.5721, "mean_token_accuracy": 0.3717411383986473, "num_tokens": 262471680.0, "step": 7120 }, { "epoch": 1.894904458598726, "grad_norm": 0.29296875, "learning_rate": 0.00028432795263424635, "loss": 3.6012, "mean_token_accuracy": 0.36936698108911514, "num_tokens": 263208960.0, "step": 7140 }, { "epoch": 1.900212314225053, "grad_norm": 0.328125, "learning_rate": 0.00028421057533489563, "loss": 3.5745, "mean_token_accuracy": 0.37161611169576647, "num_tokens": 263946240.0, "step": 7160 }, { "epoch": 1.9055201698513802, "grad_norm": 0.310546875, "learning_rate": 0.00028409278453378294, "loss": 3.6064, "mean_token_accuracy": 0.36713144183158875, "num_tokens": 264683520.0, "step": 7180 }, { "epoch": 1.910828025477707, "grad_norm": 0.30078125, "learning_rate": 0.0002839745805938208, "loss": 3.5861, "mean_token_accuracy": 0.37066889405250547, "num_tokens": 265420800.0, "step": 7200 }, { "epoch": 1.9161358811040339, "grad_norm": 0.310546875, "learning_rate": 0.0002838559638791946, "loss": 3.5738, "mean_token_accuracy": 0.37160795778036115, "num_tokens": 266158080.0, "step": 7220 }, { "epoch": 1.921443736730361, "grad_norm": 0.326171875, "learning_rate": 0.0002837369347553617, "loss": 3.5886, "mean_token_accuracy": 0.37043650448322296, "num_tokens": 266895360.0, "step": 7240 }, { "epoch": 1.926751592356688, "grad_norm": 0.345703125, "learning_rate": 0.0002836174935890498, "loss": 3.591, "mean_token_accuracy": 0.37067433148622514, "num_tokens": 267632640.0, "step": 7260 }, { "epoch": 1.9320594479830149, "grad_norm": 0.3125, "learning_rate": 0.00028349764074825617, "loss": 3.5932, "mean_token_accuracy": 0.3696741133928299, "num_tokens": 268369920.0, "step": 7280 }, { "epoch": 1.9373673036093417, "grad_norm": 0.30859375, "learning_rate": 0.00028337737660224664, "loss": 3.593, "mean_token_accuracy": 0.3701538383960724, "num_tokens": 269107200.0, "step": 7300 }, { "epoch": 1.9426751592356688, "grad_norm": 0.33203125, "learning_rate": 0.0002832567015215541, "loss": 3.5867, "mean_token_accuracy": 0.37056017518043516, "num_tokens": 269844480.0, "step": 7320 }, { "epoch": 1.9479830148619959, "grad_norm": 0.31640625, "learning_rate": 0.0002831356158779775, "loss": 3.5722, "mean_token_accuracy": 0.37230376303195956, "num_tokens": 270581760.0, "step": 7340 }, { "epoch": 1.9532908704883227, "grad_norm": 0.3125, "learning_rate": 0.0002830141200445808, "loss": 3.6065, "mean_token_accuracy": 0.3679481968283653, "num_tokens": 271319040.0, "step": 7360 }, { "epoch": 1.9585987261146496, "grad_norm": 0.310546875, "learning_rate": 0.00028289221439569166, "loss": 3.5813, "mean_token_accuracy": 0.3708673074841499, "num_tokens": 272056320.0, "step": 7380 }, { "epoch": 1.9639065817409767, "grad_norm": 0.34765625, "learning_rate": 0.0002827698993069007, "loss": 3.5786, "mean_token_accuracy": 0.3715372920036316, "num_tokens": 272793600.0, "step": 7400 }, { "epoch": 1.9692144373673037, "grad_norm": 0.31640625, "learning_rate": 0.00028264717515505953, "loss": 3.5989, "mean_token_accuracy": 0.3685148984193802, "num_tokens": 273530880.0, "step": 7420 }, { "epoch": 1.9745222929936306, "grad_norm": 0.30859375, "learning_rate": 0.0002825240423182804, "loss": 3.5722, "mean_token_accuracy": 0.37265166342258454, "num_tokens": 274268160.0, "step": 7440 }, { "epoch": 1.9798301486199574, "grad_norm": 0.333984375, "learning_rate": 0.00028240050117593477, "loss": 3.5902, "mean_token_accuracy": 0.3691169276833534, "num_tokens": 275005440.0, "step": 7460 }, { "epoch": 1.9851380042462845, "grad_norm": 0.29296875, "learning_rate": 0.00028227655210865185, "loss": 3.5669, "mean_token_accuracy": 0.372916667163372, "num_tokens": 275742720.0, "step": 7480 }, { "epoch": 1.9904458598726116, "grad_norm": 0.306640625, "learning_rate": 0.0002821521954983177, "loss": 3.5627, "mean_token_accuracy": 0.37321972399950026, "num_tokens": 276480000.0, "step": 7500 }, { "epoch": 1.9957537154989384, "grad_norm": 0.318359375, "learning_rate": 0.0002820274317280742, "loss": 3.5747, "mean_token_accuracy": 0.3720047831535339, "num_tokens": 277217280.0, "step": 7520 }, { "epoch": 2.0010615711252653, "grad_norm": 0.298828125, "learning_rate": 0.0002819022611823176, "loss": 3.5368, "mean_token_accuracy": 0.37617688477039335, "num_tokens": 277954560.0, "step": 7540 }, { "epoch": 2.0063694267515926, "grad_norm": 0.294921875, "learning_rate": 0.00028177668424669725, "loss": 3.458, "mean_token_accuracy": 0.3840033158659935, "num_tokens": 278691840.0, "step": 7560 }, { "epoch": 2.0116772823779194, "grad_norm": 0.306640625, "learning_rate": 0.0002816507013081149, "loss": 3.4599, "mean_token_accuracy": 0.38432811498641967, "num_tokens": 279429120.0, "step": 7580 }, { "epoch": 2.0169851380042463, "grad_norm": 0.32421875, "learning_rate": 0.00028152431275472295, "loss": 3.4669, "mean_token_accuracy": 0.38341079503297804, "num_tokens": 280166400.0, "step": 7600 }, { "epoch": 2.022292993630573, "grad_norm": 0.306640625, "learning_rate": 0.0002813975189759236, "loss": 3.4686, "mean_token_accuracy": 0.3841038852930069, "num_tokens": 280903680.0, "step": 7620 }, { "epoch": 2.0276008492569004, "grad_norm": 0.33203125, "learning_rate": 0.00028127032036236746, "loss": 3.4717, "mean_token_accuracy": 0.3828780695796013, "num_tokens": 281640960.0, "step": 7640 }, { "epoch": 2.0329087048832273, "grad_norm": 0.30859375, "learning_rate": 0.00028114271730595255, "loss": 3.4667, "mean_token_accuracy": 0.3834529250860214, "num_tokens": 282378240.0, "step": 7660 }, { "epoch": 2.038216560509554, "grad_norm": 0.291015625, "learning_rate": 0.0002810147101998229, "loss": 3.4591, "mean_token_accuracy": 0.3842696785926819, "num_tokens": 283115520.0, "step": 7680 }, { "epoch": 2.043524416135881, "grad_norm": 0.3125, "learning_rate": 0.00028088629943836743, "loss": 3.4724, "mean_token_accuracy": 0.3822298288345337, "num_tokens": 283852800.0, "step": 7700 }, { "epoch": 2.0488322717622083, "grad_norm": 0.30859375, "learning_rate": 0.0002807574854172187, "loss": 3.4699, "mean_token_accuracy": 0.3832110226154327, "num_tokens": 284590080.0, "step": 7720 }, { "epoch": 2.054140127388535, "grad_norm": 0.30859375, "learning_rate": 0.0002806282685332518, "loss": 3.489, "mean_token_accuracy": 0.38188193291425704, "num_tokens": 285327360.0, "step": 7740 }, { "epoch": 2.059447983014862, "grad_norm": 0.3125, "learning_rate": 0.0002804986491845829, "loss": 3.4704, "mean_token_accuracy": 0.3832504391670227, "num_tokens": 286064640.0, "step": 7760 }, { "epoch": 2.064755838641189, "grad_norm": 0.302734375, "learning_rate": 0.0002803686277705682, "loss": 3.4659, "mean_token_accuracy": 0.3844422698020935, "num_tokens": 286801920.0, "step": 7780 }, { "epoch": 2.070063694267516, "grad_norm": 0.30859375, "learning_rate": 0.0002802382046918027, "loss": 3.4647, "mean_token_accuracy": 0.3828019708395004, "num_tokens": 287539200.0, "step": 7800 }, { "epoch": 2.075371549893843, "grad_norm": 0.302734375, "learning_rate": 0.000280107380350119, "loss": 3.4668, "mean_token_accuracy": 0.38314171582460405, "num_tokens": 288276480.0, "step": 7820 }, { "epoch": 2.08067940552017, "grad_norm": 0.3046875, "learning_rate": 0.0002799761551485858, "loss": 3.4706, "mean_token_accuracy": 0.3828604057431221, "num_tokens": 289013760.0, "step": 7840 }, { "epoch": 2.0859872611464967, "grad_norm": 0.31640625, "learning_rate": 0.000279844529491507, "loss": 3.4826, "mean_token_accuracy": 0.3816223621368408, "num_tokens": 289751040.0, "step": 7860 }, { "epoch": 2.091295116772824, "grad_norm": 0.3046875, "learning_rate": 0.0002797125037844203, "loss": 3.5148, "mean_token_accuracy": 0.37814334630966184, "num_tokens": 290488320.0, "step": 7880 }, { "epoch": 2.096602972399151, "grad_norm": 0.30859375, "learning_rate": 0.0002795800784340958, "loss": 3.4657, "mean_token_accuracy": 0.38261171132326127, "num_tokens": 291225600.0, "step": 7900 }, { "epoch": 2.1019108280254777, "grad_norm": 0.306640625, "learning_rate": 0.000279447253848535, "loss": 3.4629, "mean_token_accuracy": 0.3826416090130806, "num_tokens": 291962880.0, "step": 7920 }, { "epoch": 2.1072186836518045, "grad_norm": 0.31640625, "learning_rate": 0.0002793140304369696, "loss": 3.4574, "mean_token_accuracy": 0.3840372905135155, "num_tokens": 292700160.0, "step": 7940 }, { "epoch": 2.112526539278132, "grad_norm": 0.314453125, "learning_rate": 0.0002791804086098596, "loss": 3.4972, "mean_token_accuracy": 0.3793705180287361, "num_tokens": 293437440.0, "step": 7960 }, { "epoch": 2.1178343949044587, "grad_norm": 0.333984375, "learning_rate": 0.0002790463887788932, "loss": 3.4893, "mean_token_accuracy": 0.38114535957574847, "num_tokens": 294174720.0, "step": 7980 }, { "epoch": 2.1231422505307855, "grad_norm": 0.31640625, "learning_rate": 0.00027891197135698423, "loss": 3.4753, "mean_token_accuracy": 0.3824119359254837, "num_tokens": 294912000.0, "step": 8000 }, { "epoch": 2.1284501061571124, "grad_norm": 0.32421875, "learning_rate": 0.0002787771567582718, "loss": 3.4571, "mean_token_accuracy": 0.3842071607708931, "num_tokens": 295649280.0, "step": 8020 }, { "epoch": 2.1337579617834397, "grad_norm": 0.298828125, "learning_rate": 0.0002786419453981187, "loss": 3.489, "mean_token_accuracy": 0.38154082745313644, "num_tokens": 296386560.0, "step": 8040 }, { "epoch": 2.1390658174097665, "grad_norm": 0.31640625, "learning_rate": 0.0002785063376931101, "loss": 3.4862, "mean_token_accuracy": 0.38022395968437195, "num_tokens": 297123840.0, "step": 8060 }, { "epoch": 2.1443736730360934, "grad_norm": 0.337890625, "learning_rate": 0.00027837033406105237, "loss": 3.4788, "mean_token_accuracy": 0.3817949563264847, "num_tokens": 297861120.0, "step": 8080 }, { "epoch": 2.1496815286624202, "grad_norm": 0.296875, "learning_rate": 0.0002782339349209715, "loss": 3.4892, "mean_token_accuracy": 0.38105838149786, "num_tokens": 298598400.0, "step": 8100 }, { "epoch": 2.1549893842887475, "grad_norm": 0.3125, "learning_rate": 0.0002780971406931123, "loss": 3.5056, "mean_token_accuracy": 0.38026065230369566, "num_tokens": 299335680.0, "step": 8120 }, { "epoch": 2.1602972399150744, "grad_norm": 0.302734375, "learning_rate": 0.00027795995179893674, "loss": 3.4904, "mean_token_accuracy": 0.3804699406027794, "num_tokens": 300072960.0, "step": 8140 }, { "epoch": 2.1656050955414012, "grad_norm": 0.298828125, "learning_rate": 0.0002778223686611228, "loss": 3.498, "mean_token_accuracy": 0.3794275939464569, "num_tokens": 300810240.0, "step": 8160 }, { "epoch": 2.170912951167728, "grad_norm": 0.291015625, "learning_rate": 0.000277684391703563, "loss": 3.4823, "mean_token_accuracy": 0.3818207785487175, "num_tokens": 301547520.0, "step": 8180 }, { "epoch": 2.1762208067940554, "grad_norm": 0.291015625, "learning_rate": 0.0002775460213513634, "loss": 3.4543, "mean_token_accuracy": 0.3837818011641502, "num_tokens": 302284800.0, "step": 8200 }, { "epoch": 2.1815286624203822, "grad_norm": 0.3125, "learning_rate": 0.0002774072580308419, "loss": 3.4952, "mean_token_accuracy": 0.3797455981373787, "num_tokens": 303022080.0, "step": 8220 }, { "epoch": 2.186836518046709, "grad_norm": 0.294921875, "learning_rate": 0.00027726810216952724, "loss": 3.4914, "mean_token_accuracy": 0.3807444557547569, "num_tokens": 303759360.0, "step": 8240 }, { "epoch": 2.192144373673036, "grad_norm": 0.330078125, "learning_rate": 0.0002771285541961576, "loss": 3.4671, "mean_token_accuracy": 0.38225836753845216, "num_tokens": 304496640.0, "step": 8260 }, { "epoch": 2.1974522292993632, "grad_norm": 0.3203125, "learning_rate": 0.0002769886145406793, "loss": 3.4674, "mean_token_accuracy": 0.38318792134523394, "num_tokens": 305233920.0, "step": 8280 }, { "epoch": 2.20276008492569, "grad_norm": 0.31640625, "learning_rate": 0.00027684828363424527, "loss": 3.4785, "mean_token_accuracy": 0.38178408294916155, "num_tokens": 305971200.0, "step": 8300 }, { "epoch": 2.208067940552017, "grad_norm": 0.3203125, "learning_rate": 0.00027670756190921393, "loss": 3.4937, "mean_token_accuracy": 0.3808259919285774, "num_tokens": 306708480.0, "step": 8320 }, { "epoch": 2.213375796178344, "grad_norm": 0.30078125, "learning_rate": 0.00027656644979914797, "loss": 3.4649, "mean_token_accuracy": 0.3829596102237701, "num_tokens": 307445760.0, "step": 8340 }, { "epoch": 2.218683651804671, "grad_norm": 0.296875, "learning_rate": 0.0002764249477388126, "loss": 3.4994, "mean_token_accuracy": 0.3797632619738579, "num_tokens": 308183040.0, "step": 8360 }, { "epoch": 2.223991507430998, "grad_norm": 0.3125, "learning_rate": 0.0002762830561641747, "loss": 3.4507, "mean_token_accuracy": 0.38439199030399324, "num_tokens": 308920320.0, "step": 8380 }, { "epoch": 2.229299363057325, "grad_norm": 0.33984375, "learning_rate": 0.000276140775512401, "loss": 3.4674, "mean_token_accuracy": 0.3835725158452988, "num_tokens": 309657600.0, "step": 8400 }, { "epoch": 2.2346072186836516, "grad_norm": 0.31640625, "learning_rate": 0.0002759981062218573, "loss": 3.4798, "mean_token_accuracy": 0.38212654888629916, "num_tokens": 310394880.0, "step": 8420 }, { "epoch": 2.239915074309979, "grad_norm": 0.326171875, "learning_rate": 0.0002758550487321064, "loss": 3.4873, "mean_token_accuracy": 0.38046586513519287, "num_tokens": 311132160.0, "step": 8440 }, { "epoch": 2.245222929936306, "grad_norm": 0.33984375, "learning_rate": 0.0002757116034839074, "loss": 3.4853, "mean_token_accuracy": 0.3816984683275223, "num_tokens": 311869440.0, "step": 8460 }, { "epoch": 2.2505307855626326, "grad_norm": 0.322265625, "learning_rate": 0.0002755677709192141, "loss": 3.5057, "mean_token_accuracy": 0.37867063134908674, "num_tokens": 312606720.0, "step": 8480 }, { "epoch": 2.2558386411889595, "grad_norm": 0.314453125, "learning_rate": 0.00027542355148117345, "loss": 3.5012, "mean_token_accuracy": 0.37995759695768355, "num_tokens": 313344000.0, "step": 8500 }, { "epoch": 2.261146496815287, "grad_norm": 0.326171875, "learning_rate": 0.0002752789456141245, "loss": 3.4789, "mean_token_accuracy": 0.381981135904789, "num_tokens": 314081280.0, "step": 8520 }, { "epoch": 2.2664543524416136, "grad_norm": 0.330078125, "learning_rate": 0.0002751339537635968, "loss": 3.4851, "mean_token_accuracy": 0.3812119483947754, "num_tokens": 314818560.0, "step": 8540 }, { "epoch": 2.2717622080679405, "grad_norm": 0.392578125, "learning_rate": 0.0002749885763763091, "loss": 3.4714, "mean_token_accuracy": 0.38266063332557676, "num_tokens": 315555840.0, "step": 8560 }, { "epoch": 2.2770700636942673, "grad_norm": 0.30859375, "learning_rate": 0.0002748428139001681, "loss": 3.4707, "mean_token_accuracy": 0.38282371312379837, "num_tokens": 316293120.0, "step": 8580 }, { "epoch": 2.2823779193205946, "grad_norm": 0.296875, "learning_rate": 0.0002746966667842668, "loss": 3.4601, "mean_token_accuracy": 0.38432675451040266, "num_tokens": 317030400.0, "step": 8600 }, { "epoch": 2.2876857749469215, "grad_norm": 0.306640625, "learning_rate": 0.0002745501354788834, "loss": 3.5004, "mean_token_accuracy": 0.3788962259888649, "num_tokens": 317767680.0, "step": 8620 }, { "epoch": 2.2929936305732483, "grad_norm": 0.287109375, "learning_rate": 0.00027440322043547976, "loss": 3.4938, "mean_token_accuracy": 0.3792971268296242, "num_tokens": 318504960.0, "step": 8640 }, { "epoch": 2.298301486199575, "grad_norm": 0.302734375, "learning_rate": 0.00027425592210669997, "loss": 3.4449, "mean_token_accuracy": 0.385890956223011, "num_tokens": 319242240.0, "step": 8660 }, { "epoch": 2.3036093418259025, "grad_norm": 0.306640625, "learning_rate": 0.0002741082409463691, "loss": 3.4576, "mean_token_accuracy": 0.38578902930021286, "num_tokens": 319979520.0, "step": 8680 }, { "epoch": 2.3089171974522293, "grad_norm": 0.3125, "learning_rate": 0.0002739601774094916, "loss": 3.4877, "mean_token_accuracy": 0.3800404995679855, "num_tokens": 320716800.0, "step": 8700 }, { "epoch": 2.314225053078556, "grad_norm": 0.3046875, "learning_rate": 0.00027381173195225016, "loss": 3.4684, "mean_token_accuracy": 0.383776368200779, "num_tokens": 321454080.0, "step": 8720 }, { "epoch": 2.319532908704883, "grad_norm": 0.3046875, "learning_rate": 0.0002736629050320042, "loss": 3.4728, "mean_token_accuracy": 0.3816522642970085, "num_tokens": 322191360.0, "step": 8740 }, { "epoch": 2.3248407643312103, "grad_norm": 0.296875, "learning_rate": 0.0002735136971072881, "loss": 3.4686, "mean_token_accuracy": 0.38411067575216296, "num_tokens": 322928640.0, "step": 8760 }, { "epoch": 2.330148619957537, "grad_norm": 0.291015625, "learning_rate": 0.00027336410863781056, "loss": 3.4983, "mean_token_accuracy": 0.3792074382305145, "num_tokens": 323665920.0, "step": 8780 }, { "epoch": 2.335456475583864, "grad_norm": 0.322265625, "learning_rate": 0.00027321414008445237, "loss": 3.4743, "mean_token_accuracy": 0.3822203204035759, "num_tokens": 324403200.0, "step": 8800 }, { "epoch": 2.340764331210191, "grad_norm": 0.3046875, "learning_rate": 0.00027306379190926563, "loss": 3.4944, "mean_token_accuracy": 0.3795988276600838, "num_tokens": 325140480.0, "step": 8820 }, { "epoch": 2.346072186836518, "grad_norm": 0.3203125, "learning_rate": 0.0002729130645754718, "loss": 3.4722, "mean_token_accuracy": 0.3828699171543121, "num_tokens": 325877760.0, "step": 8840 }, { "epoch": 2.351380042462845, "grad_norm": 0.296875, "learning_rate": 0.0002727619585474608, "loss": 3.4919, "mean_token_accuracy": 0.38106925040483475, "num_tokens": 326615040.0, "step": 8860 }, { "epoch": 2.356687898089172, "grad_norm": 0.318359375, "learning_rate": 0.000272610474290789, "loss": 3.4866, "mean_token_accuracy": 0.38153131157159803, "num_tokens": 327352320.0, "step": 8880 }, { "epoch": 2.3619957537154987, "grad_norm": 0.30859375, "learning_rate": 0.00027245861227217827, "loss": 3.4717, "mean_token_accuracy": 0.38235350102186205, "num_tokens": 328089600.0, "step": 8900 }, { "epoch": 2.367303609341826, "grad_norm": 0.326171875, "learning_rate": 0.0002723063729595144, "loss": 3.4734, "mean_token_accuracy": 0.3814633637666702, "num_tokens": 328826880.0, "step": 8920 }, { "epoch": 2.372611464968153, "grad_norm": 0.34375, "learning_rate": 0.0002721537568218455, "loss": 3.4815, "mean_token_accuracy": 0.38092656135559083, "num_tokens": 329564160.0, "step": 8940 }, { "epoch": 2.3779193205944797, "grad_norm": 0.314453125, "learning_rate": 0.00027200076432938076, "loss": 3.4872, "mean_token_accuracy": 0.38171613216400146, "num_tokens": 330301440.0, "step": 8960 }, { "epoch": 2.3832271762208066, "grad_norm": 0.30859375, "learning_rate": 0.0002718473959534889, "loss": 3.4833, "mean_token_accuracy": 0.3810393542051315, "num_tokens": 331038720.0, "step": 8980 }, { "epoch": 2.388535031847134, "grad_norm": 0.30078125, "learning_rate": 0.00027169365216669667, "loss": 3.4678, "mean_token_accuracy": 0.383356437087059, "num_tokens": 331776000.0, "step": 9000 }, { "epoch": 2.3938428874734607, "grad_norm": 0.326171875, "learning_rate": 0.00027153953344268754, "loss": 3.4893, "mean_token_accuracy": 0.3806615546345711, "num_tokens": 332513280.0, "step": 9020 }, { "epoch": 2.3991507430997876, "grad_norm": 0.322265625, "learning_rate": 0.0002713850402563001, "loss": 3.4768, "mean_token_accuracy": 0.38181533962488173, "num_tokens": 333250560.0, "step": 9040 }, { "epoch": 2.404458598726115, "grad_norm": 0.3125, "learning_rate": 0.0002712301730835268, "loss": 3.4706, "mean_token_accuracy": 0.38218498677015306, "num_tokens": 333987840.0, "step": 9060 }, { "epoch": 2.4097664543524417, "grad_norm": 0.314453125, "learning_rate": 0.000271074932401512, "loss": 3.5019, "mean_token_accuracy": 0.37945341467857363, "num_tokens": 334725120.0, "step": 9080 }, { "epoch": 2.4150743099787686, "grad_norm": 0.33203125, "learning_rate": 0.00027091931868855124, "loss": 3.4926, "mean_token_accuracy": 0.3808395892381668, "num_tokens": 335462400.0, "step": 9100 }, { "epoch": 2.4203821656050954, "grad_norm": 0.326171875, "learning_rate": 0.00027076333242408916, "loss": 3.4886, "mean_token_accuracy": 0.3811181768774986, "num_tokens": 336199680.0, "step": 9120 }, { "epoch": 2.4256900212314223, "grad_norm": 0.330078125, "learning_rate": 0.0002706069740887182, "loss": 3.4715, "mean_token_accuracy": 0.38150005489587785, "num_tokens": 336936960.0, "step": 9140 }, { "epoch": 2.4309978768577496, "grad_norm": 0.298828125, "learning_rate": 0.00027045024416417725, "loss": 3.4557, "mean_token_accuracy": 0.3845863237977028, "num_tokens": 337674240.0, "step": 9160 }, { "epoch": 2.4363057324840764, "grad_norm": 0.298828125, "learning_rate": 0.00027029314313334996, "loss": 3.4762, "mean_token_accuracy": 0.38209121525287626, "num_tokens": 338411520.0, "step": 9180 }, { "epoch": 2.4416135881104033, "grad_norm": 0.310546875, "learning_rate": 0.0002701356714802635, "loss": 3.4903, "mean_token_accuracy": 0.3801723226904869, "num_tokens": 339148800.0, "step": 9200 }, { "epoch": 2.4469214437367306, "grad_norm": 0.2890625, "learning_rate": 0.0002699778296900867, "loss": 3.452, "mean_token_accuracy": 0.38562730699777603, "num_tokens": 339886080.0, "step": 9220 }, { "epoch": 2.4522292993630574, "grad_norm": 0.388671875, "learning_rate": 0.00026981961824912906, "loss": 3.5013, "mean_token_accuracy": 0.378928841650486, "num_tokens": 340623360.0, "step": 9240 }, { "epoch": 2.4575371549893843, "grad_norm": 0.310546875, "learning_rate": 0.0002696610376448388, "loss": 3.4673, "mean_token_accuracy": 0.38399787694215776, "num_tokens": 341360640.0, "step": 9260 }, { "epoch": 2.462845010615711, "grad_norm": 0.30859375, "learning_rate": 0.0002695020883658014, "loss": 3.4535, "mean_token_accuracy": 0.38459583669900893, "num_tokens": 342097920.0, "step": 9280 }, { "epoch": 2.468152866242038, "grad_norm": 0.3125, "learning_rate": 0.0002693427709017386, "loss": 3.4814, "mean_token_accuracy": 0.38230050206184385, "num_tokens": 342835200.0, "step": 9300 }, { "epoch": 2.4734607218683653, "grad_norm": 0.3125, "learning_rate": 0.00026918308574350614, "loss": 3.4702, "mean_token_accuracy": 0.38295553624629974, "num_tokens": 343572480.0, "step": 9320 }, { "epoch": 2.478768577494692, "grad_norm": 0.330078125, "learning_rate": 0.0002690230333830929, "loss": 3.5054, "mean_token_accuracy": 0.3783064246177673, "num_tokens": 344309760.0, "step": 9340 }, { "epoch": 2.484076433121019, "grad_norm": 0.30078125, "learning_rate": 0.0002688626143136189, "loss": 3.4633, "mean_token_accuracy": 0.3837342351675034, "num_tokens": 345047040.0, "step": 9360 }, { "epoch": 2.4893842887473463, "grad_norm": 0.298828125, "learning_rate": 0.000268701829029334, "loss": 3.4875, "mean_token_accuracy": 0.38162100315093994, "num_tokens": 345784320.0, "step": 9380 }, { "epoch": 2.494692144373673, "grad_norm": 0.287109375, "learning_rate": 0.0002685406780256166, "loss": 3.4839, "mean_token_accuracy": 0.3816780850291252, "num_tokens": 346521600.0, "step": 9400 }, { "epoch": 2.5, "grad_norm": 0.28125, "learning_rate": 0.0002683791617989717, "loss": 3.4878, "mean_token_accuracy": 0.38169303089380263, "num_tokens": 347258880.0, "step": 9420 }, { "epoch": 2.505307855626327, "grad_norm": 0.30078125, "learning_rate": 0.0002682172808470294, "loss": 3.4631, "mean_token_accuracy": 0.38243368417024615, "num_tokens": 347996160.0, "step": 9440 }, { "epoch": 2.5106157112526537, "grad_norm": 0.3359375, "learning_rate": 0.00026805503566854386, "loss": 3.498, "mean_token_accuracy": 0.3794547751545906, "num_tokens": 348733440.0, "step": 9460 }, { "epoch": 2.515923566878981, "grad_norm": 0.3046875, "learning_rate": 0.00026789242676339116, "loss": 3.4957, "mean_token_accuracy": 0.3799467280507088, "num_tokens": 349470720.0, "step": 9480 }, { "epoch": 2.521231422505308, "grad_norm": 0.3046875, "learning_rate": 0.0002677294546325681, "loss": 3.4762, "mean_token_accuracy": 0.38238339871168137, "num_tokens": 350208000.0, "step": 9500 }, { "epoch": 2.5265392781316347, "grad_norm": 0.33203125, "learning_rate": 0.0002675661197781907, "loss": 3.4751, "mean_token_accuracy": 0.381023046374321, "num_tokens": 350945280.0, "step": 9520 }, { "epoch": 2.531847133757962, "grad_norm": 0.318359375, "learning_rate": 0.0002674024227034922, "loss": 3.4784, "mean_token_accuracy": 0.3817609786987305, "num_tokens": 351682560.0, "step": 9540 }, { "epoch": 2.537154989384289, "grad_norm": 0.28125, "learning_rate": 0.0002672383639128221, "loss": 3.4592, "mean_token_accuracy": 0.3835806712508202, "num_tokens": 352419840.0, "step": 9560 }, { "epoch": 2.5424628450106157, "grad_norm": 0.29296875, "learning_rate": 0.0002670739439116444, "loss": 3.4876, "mean_token_accuracy": 0.38086947947740557, "num_tokens": 353157120.0, "step": 9580 }, { "epoch": 2.5477707006369426, "grad_norm": 0.30859375, "learning_rate": 0.00026690916320653576, "loss": 3.5002, "mean_token_accuracy": 0.37965318709611895, "num_tokens": 353894400.0, "step": 9600 }, { "epoch": 2.5530785562632694, "grad_norm": 0.310546875, "learning_rate": 0.00026674402230518436, "loss": 3.4676, "mean_token_accuracy": 0.3835032105445862, "num_tokens": 354631680.0, "step": 9620 }, { "epoch": 2.5583864118895967, "grad_norm": 0.35546875, "learning_rate": 0.000266578521716388, "loss": 3.4782, "mean_token_accuracy": 0.38364318311214446, "num_tokens": 355368960.0, "step": 9640 }, { "epoch": 2.5636942675159236, "grad_norm": 0.296875, "learning_rate": 0.0002664126619500527, "loss": 3.4758, "mean_token_accuracy": 0.3825573518872261, "num_tokens": 356106240.0, "step": 9660 }, { "epoch": 2.5690021231422504, "grad_norm": 0.30078125, "learning_rate": 0.0002662464435171912, "loss": 3.4653, "mean_token_accuracy": 0.38293786495924, "num_tokens": 356843520.0, "step": 9680 }, { "epoch": 2.5743099787685777, "grad_norm": 0.3203125, "learning_rate": 0.0002660798669299213, "loss": 3.4633, "mean_token_accuracy": 0.3835942566394806, "num_tokens": 357580800.0, "step": 9700 }, { "epoch": 2.5796178343949046, "grad_norm": 0.30859375, "learning_rate": 0.00026591293270146406, "loss": 3.4659, "mean_token_accuracy": 0.38267422169446946, "num_tokens": 358318080.0, "step": 9720 }, { "epoch": 2.5849256900212314, "grad_norm": 0.294921875, "learning_rate": 0.00026574564134614265, "loss": 3.4904, "mean_token_accuracy": 0.3797401636838913, "num_tokens": 359055360.0, "step": 9740 }, { "epoch": 2.5902335456475583, "grad_norm": 0.298828125, "learning_rate": 0.00026557799337938046, "loss": 3.4599, "mean_token_accuracy": 0.38395439386367797, "num_tokens": 359792640.0, "step": 9760 }, { "epoch": 2.595541401273885, "grad_norm": 0.30078125, "learning_rate": 0.0002654099893176995, "loss": 3.4842, "mean_token_accuracy": 0.3823969841003418, "num_tokens": 360529920.0, "step": 9780 }, { "epoch": 2.6008492569002124, "grad_norm": 0.306640625, "learning_rate": 0.0002652416296787192, "loss": 3.4909, "mean_token_accuracy": 0.3812826186418533, "num_tokens": 361267200.0, "step": 9800 }, { "epoch": 2.6061571125265393, "grad_norm": 0.318359375, "learning_rate": 0.00026507291498115414, "loss": 3.4707, "mean_token_accuracy": 0.3815027728676796, "num_tokens": 362004480.0, "step": 9820 }, { "epoch": 2.611464968152866, "grad_norm": 0.31640625, "learning_rate": 0.00026490384574481314, "loss": 3.4542, "mean_token_accuracy": 0.3842397779226303, "num_tokens": 362741760.0, "step": 9840 }, { "epoch": 2.6167728237791934, "grad_norm": 0.32421875, "learning_rate": 0.00026473442249059717, "loss": 3.4594, "mean_token_accuracy": 0.38363638818264006, "num_tokens": 363479040.0, "step": 9860 }, { "epoch": 2.6220806794055203, "grad_norm": 0.306640625, "learning_rate": 0.000264564645740498, "loss": 3.4659, "mean_token_accuracy": 0.3836622089147568, "num_tokens": 364216320.0, "step": 9880 }, { "epoch": 2.627388535031847, "grad_norm": 0.302734375, "learning_rate": 0.00026439451601759653, "loss": 3.4863, "mean_token_accuracy": 0.3813451290130615, "num_tokens": 364953600.0, "step": 9900 }, { "epoch": 2.632696390658174, "grad_norm": 0.3203125, "learning_rate": 0.0002642240338460611, "loss": 3.4719, "mean_token_accuracy": 0.38228555023670197, "num_tokens": 365690880.0, "step": 9920 }, { "epoch": 2.638004246284501, "grad_norm": 0.3125, "learning_rate": 0.000264053199751146, "loss": 3.4841, "mean_token_accuracy": 0.38040063083171843, "num_tokens": 366428160.0, "step": 9940 }, { "epoch": 2.643312101910828, "grad_norm": 0.30078125, "learning_rate": 0.0002638820142591898, "loss": 3.4524, "mean_token_accuracy": 0.38496819883584976, "num_tokens": 367165440.0, "step": 9960 }, { "epoch": 2.648619957537155, "grad_norm": 0.306640625, "learning_rate": 0.0002637104778976137, "loss": 3.4896, "mean_token_accuracy": 0.38127446323633196, "num_tokens": 367902720.0, "step": 9980 }, { "epoch": 2.653927813163482, "grad_norm": 0.296875, "learning_rate": 0.0002635385911949199, "loss": 3.486, "mean_token_accuracy": 0.38092112392187116, "num_tokens": 368640000.0, "step": 10000 }, { "epoch": 2.659235668789809, "grad_norm": 0.310546875, "learning_rate": 0.00026336635468069, "loss": 3.4874, "mean_token_accuracy": 0.38107333034276963, "num_tokens": 369377280.0, "step": 10020 }, { "epoch": 2.664543524416136, "grad_norm": 0.30078125, "learning_rate": 0.0002631937688855835, "loss": 3.4679, "mean_token_accuracy": 0.3826755791902542, "num_tokens": 370114560.0, "step": 10040 }, { "epoch": 2.669851380042463, "grad_norm": 0.33984375, "learning_rate": 0.00026302083434133575, "loss": 3.4553, "mean_token_accuracy": 0.3844436302781105, "num_tokens": 370851840.0, "step": 10060 }, { "epoch": 2.6751592356687897, "grad_norm": 0.328125, "learning_rate": 0.0002628475515807569, "loss": 3.4886, "mean_token_accuracy": 0.3813967704772949, "num_tokens": 371589120.0, "step": 10080 }, { "epoch": 2.6804670912951165, "grad_norm": 0.328125, "learning_rate": 0.00026267392113772984, "loss": 3.4516, "mean_token_accuracy": 0.3859303638339043, "num_tokens": 372326400.0, "step": 10100 }, { "epoch": 2.685774946921444, "grad_norm": 0.310546875, "learning_rate": 0.00026249994354720854, "loss": 3.4815, "mean_token_accuracy": 0.38162644058465955, "num_tokens": 373063680.0, "step": 10120 }, { "epoch": 2.6910828025477707, "grad_norm": 0.310546875, "learning_rate": 0.00026232561934521665, "loss": 3.4604, "mean_token_accuracy": 0.383478744328022, "num_tokens": 373800960.0, "step": 10140 }, { "epoch": 2.6963906581740975, "grad_norm": 0.30859375, "learning_rate": 0.0002621509490688458, "loss": 3.4557, "mean_token_accuracy": 0.3843321919441223, "num_tokens": 374538240.0, "step": 10160 }, { "epoch": 2.701698513800425, "grad_norm": 0.306640625, "learning_rate": 0.0002619759332562537, "loss": 3.4884, "mean_token_accuracy": 0.38158566802740096, "num_tokens": 375275520.0, "step": 10180 }, { "epoch": 2.7070063694267517, "grad_norm": 0.33203125, "learning_rate": 0.00026180057244666274, "loss": 3.4904, "mean_token_accuracy": 0.3816875919699669, "num_tokens": 376012800.0, "step": 10200 }, { "epoch": 2.7123142250530785, "grad_norm": 0.30859375, "learning_rate": 0.00026162486718035824, "loss": 3.4688, "mean_token_accuracy": 0.38144161850214003, "num_tokens": 376750080.0, "step": 10220 }, { "epoch": 2.7176220806794054, "grad_norm": 0.3125, "learning_rate": 0.00026144881799868686, "loss": 3.4518, "mean_token_accuracy": 0.38462981283664704, "num_tokens": 377487360.0, "step": 10240 }, { "epoch": 2.722929936305732, "grad_norm": 0.302734375, "learning_rate": 0.0002612724254440547, "loss": 3.4814, "mean_token_accuracy": 0.37970075011253357, "num_tokens": 378224640.0, "step": 10260 }, { "epoch": 2.7282377919320595, "grad_norm": 0.3125, "learning_rate": 0.00026109569005992596, "loss": 3.4563, "mean_token_accuracy": 0.38405903428792953, "num_tokens": 378961920.0, "step": 10280 }, { "epoch": 2.7335456475583864, "grad_norm": 0.306640625, "learning_rate": 0.0002609186123908209, "loss": 3.4633, "mean_token_accuracy": 0.3840848580002785, "num_tokens": 379699200.0, "step": 10300 }, { "epoch": 2.738853503184713, "grad_norm": 0.30859375, "learning_rate": 0.0002607411929823146, "loss": 3.4696, "mean_token_accuracy": 0.38274760991334916, "num_tokens": 380436480.0, "step": 10320 }, { "epoch": 2.7441613588110405, "grad_norm": 0.30859375, "learning_rate": 0.00026056343238103484, "loss": 3.4891, "mean_token_accuracy": 0.3819458022713661, "num_tokens": 381173760.0, "step": 10340 }, { "epoch": 2.7494692144373674, "grad_norm": 0.318359375, "learning_rate": 0.00026038533113466073, "loss": 3.4698, "mean_token_accuracy": 0.38108827620744706, "num_tokens": 381911040.0, "step": 10360 }, { "epoch": 2.754777070063694, "grad_norm": 0.306640625, "learning_rate": 0.0002602068897919208, "loss": 3.4525, "mean_token_accuracy": 0.384018263220787, "num_tokens": 382648320.0, "step": 10380 }, { "epoch": 2.7600849256900215, "grad_norm": 0.357421875, "learning_rate": 0.00026002810890259146, "loss": 3.4754, "mean_token_accuracy": 0.3815734401345253, "num_tokens": 383385600.0, "step": 10400 }, { "epoch": 2.7653927813163484, "grad_norm": 0.306640625, "learning_rate": 0.00025984898901749537, "loss": 3.4677, "mean_token_accuracy": 0.3829827144742012, "num_tokens": 384122880.0, "step": 10420 }, { "epoch": 2.770700636942675, "grad_norm": 0.3046875, "learning_rate": 0.00025966953068849935, "loss": 3.4604, "mean_token_accuracy": 0.38251793533563616, "num_tokens": 384860160.0, "step": 10440 }, { "epoch": 2.776008492569002, "grad_norm": 0.3046875, "learning_rate": 0.00025948973446851336, "loss": 3.4516, "mean_token_accuracy": 0.3846529141068459, "num_tokens": 385597440.0, "step": 10460 }, { "epoch": 2.781316348195329, "grad_norm": 0.3046875, "learning_rate": 0.00025930960091148804, "loss": 3.4736, "mean_token_accuracy": 0.38258588761091233, "num_tokens": 386334720.0, "step": 10480 }, { "epoch": 2.786624203821656, "grad_norm": 0.302734375, "learning_rate": 0.00025912913057241365, "loss": 3.4556, "mean_token_accuracy": 0.38406854718923567, "num_tokens": 387072000.0, "step": 10500 }, { "epoch": 2.791932059447983, "grad_norm": 0.3203125, "learning_rate": 0.0002589483240073177, "loss": 3.4678, "mean_token_accuracy": 0.38241465389728546, "num_tokens": 387809280.0, "step": 10520 }, { "epoch": 2.79723991507431, "grad_norm": 0.302734375, "learning_rate": 0.00025876718177326405, "loss": 3.4439, "mean_token_accuracy": 0.3862904980778694, "num_tokens": 388546560.0, "step": 10540 }, { "epoch": 2.802547770700637, "grad_norm": 0.3515625, "learning_rate": 0.0002585857044283505, "loss": 3.4616, "mean_token_accuracy": 0.3846393197774887, "num_tokens": 389283840.0, "step": 10560 }, { "epoch": 2.807855626326964, "grad_norm": 0.322265625, "learning_rate": 0.0002584038925317073, "loss": 3.4317, "mean_token_accuracy": 0.38654870688915255, "num_tokens": 390021120.0, "step": 10580 }, { "epoch": 2.813163481953291, "grad_norm": 0.30078125, "learning_rate": 0.0002582217466434956, "loss": 3.4665, "mean_token_accuracy": 0.38321238458156587, "num_tokens": 390758400.0, "step": 10600 }, { "epoch": 2.8184713375796178, "grad_norm": 0.28125, "learning_rate": 0.0002580392673249055, "loss": 3.4562, "mean_token_accuracy": 0.3846909672021866, "num_tokens": 391495680.0, "step": 10620 }, { "epoch": 2.8237791932059446, "grad_norm": 0.314453125, "learning_rate": 0.0002578564551381543, "loss": 3.4869, "mean_token_accuracy": 0.3801777556538582, "num_tokens": 392232960.0, "step": 10640 }, { "epoch": 2.829087048832272, "grad_norm": 0.314453125, "learning_rate": 0.000257673310646485, "loss": 3.4632, "mean_token_accuracy": 0.38494101762771604, "num_tokens": 392970240.0, "step": 10660 }, { "epoch": 2.8343949044585988, "grad_norm": 0.318359375, "learning_rate": 0.00025748983441416446, "loss": 3.462, "mean_token_accuracy": 0.38452380895614624, "num_tokens": 393707520.0, "step": 10680 }, { "epoch": 2.8397027600849256, "grad_norm": 0.29296875, "learning_rate": 0.0002573060270064814, "loss": 3.4589, "mean_token_accuracy": 0.38467737585306166, "num_tokens": 394444800.0, "step": 10700 }, { "epoch": 2.845010615711253, "grad_norm": 0.296875, "learning_rate": 0.0002571218889897453, "loss": 3.4515, "mean_token_accuracy": 0.3853922039270401, "num_tokens": 395182080.0, "step": 10720 }, { "epoch": 2.8503184713375798, "grad_norm": 0.353515625, "learning_rate": 0.0002569374209312837, "loss": 3.4586, "mean_token_accuracy": 0.38473852574825285, "num_tokens": 395919360.0, "step": 10740 }, { "epoch": 2.8556263269639066, "grad_norm": 0.3203125, "learning_rate": 0.00025675262339944145, "loss": 3.4723, "mean_token_accuracy": 0.383885084092617, "num_tokens": 396656640.0, "step": 10760 }, { "epoch": 2.8609341825902335, "grad_norm": 0.310546875, "learning_rate": 0.00025656749696357833, "loss": 3.4641, "mean_token_accuracy": 0.38314307332038877, "num_tokens": 397393920.0, "step": 10780 }, { "epoch": 2.8662420382165603, "grad_norm": 0.33203125, "learning_rate": 0.00025638204219406745, "loss": 3.4674, "mean_token_accuracy": 0.38317297399044037, "num_tokens": 398131200.0, "step": 10800 }, { "epoch": 2.8715498938428876, "grad_norm": 0.333984375, "learning_rate": 0.00025619625966229355, "loss": 3.4569, "mean_token_accuracy": 0.38424249440431596, "num_tokens": 398868480.0, "step": 10820 }, { "epoch": 2.8768577494692145, "grad_norm": 0.310546875, "learning_rate": 0.0002560101499406512, "loss": 3.4745, "mean_token_accuracy": 0.38287671506404874, "num_tokens": 399605760.0, "step": 10840 }, { "epoch": 2.8821656050955413, "grad_norm": 0.306640625, "learning_rate": 0.00025582371360254307, "loss": 3.4701, "mean_token_accuracy": 0.3820721864700317, "num_tokens": 400343040.0, "step": 10860 }, { "epoch": 2.8874734607218686, "grad_norm": 0.3046875, "learning_rate": 0.000255636951222378, "loss": 3.4467, "mean_token_accuracy": 0.3853881284594536, "num_tokens": 401080320.0, "step": 10880 }, { "epoch": 2.8927813163481955, "grad_norm": 0.291015625, "learning_rate": 0.0002554498633755696, "loss": 3.4667, "mean_token_accuracy": 0.38228555470705033, "num_tokens": 401817600.0, "step": 10900 }, { "epoch": 2.8980891719745223, "grad_norm": 0.310546875, "learning_rate": 0.000255262450638534, "loss": 3.4346, "mean_token_accuracy": 0.3865962713956833, "num_tokens": 402554880.0, "step": 10920 }, { "epoch": 2.903397027600849, "grad_norm": 0.310546875, "learning_rate": 0.00025507471358868846, "loss": 3.4482, "mean_token_accuracy": 0.3856517717242241, "num_tokens": 403292160.0, "step": 10940 }, { "epoch": 2.908704883227176, "grad_norm": 0.32421875, "learning_rate": 0.00025488665280444934, "loss": 3.4564, "mean_token_accuracy": 0.3847195073962212, "num_tokens": 404029440.0, "step": 10960 }, { "epoch": 2.9140127388535033, "grad_norm": 0.322265625, "learning_rate": 0.0002546982688652306, "loss": 3.4619, "mean_token_accuracy": 0.38267829865217207, "num_tokens": 404766720.0, "step": 10980 }, { "epoch": 2.91932059447983, "grad_norm": 0.31640625, "learning_rate": 0.00025450956235144166, "loss": 3.4826, "mean_token_accuracy": 0.38089258670806886, "num_tokens": 405504000.0, "step": 11000 }, { "epoch": 2.924628450106157, "grad_norm": 0.318359375, "learning_rate": 0.00025432053384448586, "loss": 3.4408, "mean_token_accuracy": 0.38673080801963805, "num_tokens": 406241280.0, "step": 11020 }, { "epoch": 2.9299363057324843, "grad_norm": 0.3046875, "learning_rate": 0.0002541311839267586, "loss": 3.4635, "mean_token_accuracy": 0.3848363786935806, "num_tokens": 406978560.0, "step": 11040 }, { "epoch": 2.935244161358811, "grad_norm": 0.32421875, "learning_rate": 0.0002539415131816455, "loss": 3.4489, "mean_token_accuracy": 0.3865636557340622, "num_tokens": 407715840.0, "step": 11060 }, { "epoch": 2.940552016985138, "grad_norm": 0.345703125, "learning_rate": 0.0002537515221935207, "loss": 3.4203, "mean_token_accuracy": 0.38797564506530763, "num_tokens": 408453120.0, "step": 11080 }, { "epoch": 2.945859872611465, "grad_norm": 0.34375, "learning_rate": 0.00025356121154774505, "loss": 3.4483, "mean_token_accuracy": 0.3853324085474014, "num_tokens": 409190400.0, "step": 11100 }, { "epoch": 2.9511677282377917, "grad_norm": 0.326171875, "learning_rate": 0.00025337058183066416, "loss": 3.4546, "mean_token_accuracy": 0.3844762459397316, "num_tokens": 409927680.0, "step": 11120 }, { "epoch": 2.956475583864119, "grad_norm": 0.306640625, "learning_rate": 0.0002531796336296067, "loss": 3.4497, "mean_token_accuracy": 0.38540171682834623, "num_tokens": 410664960.0, "step": 11140 }, { "epoch": 2.961783439490446, "grad_norm": 0.30859375, "learning_rate": 0.00025298836753288265, "loss": 3.4254, "mean_token_accuracy": 0.38793487697839735, "num_tokens": 411402240.0, "step": 11160 }, { "epoch": 2.9670912951167727, "grad_norm": 0.3125, "learning_rate": 0.0002527967841297814, "loss": 3.4426, "mean_token_accuracy": 0.38606218844652174, "num_tokens": 412139520.0, "step": 11180 }, { "epoch": 2.9723991507431, "grad_norm": 0.30859375, "learning_rate": 0.00025260488401056996, "loss": 3.4446, "mean_token_accuracy": 0.38573467135429385, "num_tokens": 412876800.0, "step": 11200 }, { "epoch": 2.977707006369427, "grad_norm": 0.345703125, "learning_rate": 0.0002524126677664912, "loss": 3.4569, "mean_token_accuracy": 0.3837451085448265, "num_tokens": 413614080.0, "step": 11220 }, { "epoch": 2.9830148619957537, "grad_norm": 0.310546875, "learning_rate": 0.0002522201359897618, "loss": 3.4294, "mean_token_accuracy": 0.3876861840486526, "num_tokens": 414351360.0, "step": 11240 }, { "epoch": 2.9883227176220806, "grad_norm": 0.31640625, "learning_rate": 0.0002520272892735708, "loss": 3.4262, "mean_token_accuracy": 0.38786420971155167, "num_tokens": 415088640.0, "step": 11260 }, { "epoch": 2.9936305732484074, "grad_norm": 0.3125, "learning_rate": 0.0002518341282120774, "loss": 3.47, "mean_token_accuracy": 0.3832110270857811, "num_tokens": 415825920.0, "step": 11280 }, { "epoch": 2.9989384288747347, "grad_norm": 0.302734375, "learning_rate": 0.0002516406534004093, "loss": 3.4385, "mean_token_accuracy": 0.3857523396611214, "num_tokens": 416563200.0, "step": 11300 }, { "epoch": 3.0042462845010616, "grad_norm": 0.314453125, "learning_rate": 0.0002514468654346611, "loss": 3.3304, "mean_token_accuracy": 0.3982781574130058, "num_tokens": 417300480.0, "step": 11320 }, { "epoch": 3.0095541401273884, "grad_norm": 0.3125, "learning_rate": 0.00025125276491189195, "loss": 3.3131, "mean_token_accuracy": 0.4006061121821404, "num_tokens": 418037760.0, "step": 11340 }, { "epoch": 3.0148619957537157, "grad_norm": 0.298828125, "learning_rate": 0.000251058352430124, "loss": 3.3483, "mean_token_accuracy": 0.3957912057638168, "num_tokens": 418775040.0, "step": 11360 }, { "epoch": 3.0201698513800426, "grad_norm": 0.34375, "learning_rate": 0.00025086362858834076, "loss": 3.3149, "mean_token_accuracy": 0.3996140450239182, "num_tokens": 419512320.0, "step": 11380 }, { "epoch": 3.0254777070063694, "grad_norm": 0.31640625, "learning_rate": 0.00025066859398648484, "loss": 3.3212, "mean_token_accuracy": 0.39937622398138045, "num_tokens": 420249600.0, "step": 11400 }, { "epoch": 3.0307855626326963, "grad_norm": 0.298828125, "learning_rate": 0.0002504732492254564, "loss": 3.3446, "mean_token_accuracy": 0.39523265659809115, "num_tokens": 420986880.0, "step": 11420 }, { "epoch": 3.0360934182590236, "grad_norm": 0.298828125, "learning_rate": 0.0002502775949071111, "loss": 3.3319, "mean_token_accuracy": 0.3982305943965912, "num_tokens": 421724160.0, "step": 11440 }, { "epoch": 3.0414012738853504, "grad_norm": 0.30859375, "learning_rate": 0.00025008163163425855, "loss": 3.3392, "mean_token_accuracy": 0.39687567949295044, "num_tokens": 422461440.0, "step": 11460 }, { "epoch": 3.0467091295116773, "grad_norm": 0.314453125, "learning_rate": 0.00024988536001065994, "loss": 3.3472, "mean_token_accuracy": 0.3958360508084297, "num_tokens": 423198720.0, "step": 11480 }, { "epoch": 3.052016985138004, "grad_norm": 0.32421875, "learning_rate": 0.00024968878064102673, "loss": 3.3262, "mean_token_accuracy": 0.3989685237407684, "num_tokens": 423936000.0, "step": 11500 }, { "epoch": 3.0573248407643314, "grad_norm": 0.328125, "learning_rate": 0.0002494918941310184, "loss": 3.3379, "mean_token_accuracy": 0.3963334411382675, "num_tokens": 424673280.0, "step": 11520 }, { "epoch": 3.0626326963906583, "grad_norm": 0.3125, "learning_rate": 0.00024929470108724094, "loss": 3.3557, "mean_token_accuracy": 0.3951375290751457, "num_tokens": 425410560.0, "step": 11540 }, { "epoch": 3.067940552016985, "grad_norm": 0.302734375, "learning_rate": 0.0002490972021172445, "loss": 3.3485, "mean_token_accuracy": 0.3966106787323952, "num_tokens": 426147840.0, "step": 11560 }, { "epoch": 3.073248407643312, "grad_norm": 0.3359375, "learning_rate": 0.00024889939782952184, "loss": 3.3481, "mean_token_accuracy": 0.3965848535299301, "num_tokens": 426885120.0, "step": 11580 }, { "epoch": 3.0785562632696393, "grad_norm": 0.30078125, "learning_rate": 0.00024870128883350663, "loss": 3.3537, "mean_token_accuracy": 0.3954446628689766, "num_tokens": 427622400.0, "step": 11600 }, { "epoch": 3.083864118895966, "grad_norm": 0.326171875, "learning_rate": 0.0002485028757395711, "loss": 3.3421, "mean_token_accuracy": 0.3979778215289116, "num_tokens": 428359680.0, "step": 11620 }, { "epoch": 3.089171974522293, "grad_norm": 0.33984375, "learning_rate": 0.0002483041591590244, "loss": 3.3292, "mean_token_accuracy": 0.39825777262449263, "num_tokens": 429096960.0, "step": 11640 }, { "epoch": 3.09447983014862, "grad_norm": 0.314453125, "learning_rate": 0.00024810513970411095, "loss": 3.3541, "mean_token_accuracy": 0.39483447670936583, "num_tokens": 429834240.0, "step": 11660 }, { "epoch": 3.099787685774947, "grad_norm": 0.310546875, "learning_rate": 0.0002479058179880081, "loss": 3.3377, "mean_token_accuracy": 0.3968947038054466, "num_tokens": 430571520.0, "step": 11680 }, { "epoch": 3.105095541401274, "grad_norm": 0.34375, "learning_rate": 0.0002477061946248245, "loss": 3.3399, "mean_token_accuracy": 0.3966663971543312, "num_tokens": 431308800.0, "step": 11700 }, { "epoch": 3.110403397027601, "grad_norm": 0.31640625, "learning_rate": 0.0002475062702295984, "loss": 3.3365, "mean_token_accuracy": 0.39750081300735474, "num_tokens": 432046080.0, "step": 11720 }, { "epoch": 3.1157112526539277, "grad_norm": 0.375, "learning_rate": 0.0002473060454182951, "loss": 3.3449, "mean_token_accuracy": 0.3957884877920151, "num_tokens": 432783360.0, "step": 11740 }, { "epoch": 3.121019108280255, "grad_norm": 0.306640625, "learning_rate": 0.00024710552080780586, "loss": 3.3572, "mean_token_accuracy": 0.3938451290130615, "num_tokens": 433520640.0, "step": 11760 }, { "epoch": 3.126326963906582, "grad_norm": 0.310546875, "learning_rate": 0.00024690469701594544, "loss": 3.3267, "mean_token_accuracy": 0.39845075011253356, "num_tokens": 434257920.0, "step": 11780 }, { "epoch": 3.1316348195329087, "grad_norm": 0.294921875, "learning_rate": 0.00024670357466145045, "loss": 3.3498, "mean_token_accuracy": 0.3977168947458267, "num_tokens": 434995200.0, "step": 11800 }, { "epoch": 3.1369426751592355, "grad_norm": 0.33984375, "learning_rate": 0.0002465021543639773, "loss": 3.3523, "mean_token_accuracy": 0.3953128352761269, "num_tokens": 435732480.0, "step": 11820 }, { "epoch": 3.142250530785563, "grad_norm": 0.306640625, "learning_rate": 0.0002463004367441004, "loss": 3.3302, "mean_token_accuracy": 0.3987402155995369, "num_tokens": 436469760.0, "step": 11840 }, { "epoch": 3.1475583864118897, "grad_norm": 0.314453125, "learning_rate": 0.00024609842242331024, "loss": 3.3246, "mean_token_accuracy": 0.39993069171905515, "num_tokens": 437207040.0, "step": 11860 }, { "epoch": 3.1528662420382165, "grad_norm": 0.32421875, "learning_rate": 0.00024589611202401133, "loss": 3.3644, "mean_token_accuracy": 0.3956389963626862, "num_tokens": 437944320.0, "step": 11880 }, { "epoch": 3.1581740976645434, "grad_norm": 0.3359375, "learning_rate": 0.00024569350616952053, "loss": 3.357, "mean_token_accuracy": 0.3964326500892639, "num_tokens": 438681600.0, "step": 11900 }, { "epoch": 3.1634819532908707, "grad_norm": 0.61328125, "learning_rate": 0.0002454906054840649, "loss": 3.411, "mean_token_accuracy": 0.3906515002250671, "num_tokens": 439418880.0, "step": 11920 }, { "epoch": 3.1687898089171975, "grad_norm": 0.3828125, "learning_rate": 0.00024528741059278, "loss": 3.3981, "mean_token_accuracy": 0.3928652971982956, "num_tokens": 440156160.0, "step": 11940 }, { "epoch": 3.1740976645435244, "grad_norm": 0.373046875, "learning_rate": 0.00024508392212170766, "loss": 3.435, "mean_token_accuracy": 0.3890152722597122, "num_tokens": 440893440.0, "step": 11960 }, { "epoch": 3.1794055201698512, "grad_norm": 0.37109375, "learning_rate": 0.0002448801406977943, "loss": 3.4418, "mean_token_accuracy": 0.3898401826620102, "num_tokens": 441630720.0, "step": 11980 }, { "epoch": 3.1847133757961785, "grad_norm": 0.3828125, "learning_rate": 0.00024467606694888896, "loss": 3.4579, "mean_token_accuracy": 0.38716568797826767, "num_tokens": 442368000.0, "step": 12000 }, { "epoch": 3.1900212314225054, "grad_norm": 0.39453125, "learning_rate": 0.00024447170150374134, "loss": 3.486, "mean_token_accuracy": 0.383769565820694, "num_tokens": 443105280.0, "step": 12020 }, { "epoch": 3.1953290870488322, "grad_norm": 0.380859375, "learning_rate": 0.0002442670449919998, "loss": 3.4891, "mean_token_accuracy": 0.3839965209364891, "num_tokens": 443842560.0, "step": 12040 }, { "epoch": 3.200636942675159, "grad_norm": 0.421875, "learning_rate": 0.00024406209804420947, "loss": 3.5074, "mean_token_accuracy": 0.382421450316906, "num_tokens": 444579840.0, "step": 12060 }, { "epoch": 3.2059447983014864, "grad_norm": 0.40625, "learning_rate": 0.0002438568612918104, "loss": 3.5119, "mean_token_accuracy": 0.38161013275384903, "num_tokens": 445317120.0, "step": 12080 }, { "epoch": 3.2112526539278132, "grad_norm": 0.421875, "learning_rate": 0.0002436513353671354, "loss": 3.5407, "mean_token_accuracy": 0.3778973713517189, "num_tokens": 446054400.0, "step": 12100 }, { "epoch": 3.21656050955414, "grad_norm": 0.412109375, "learning_rate": 0.0002434455209034083, "loss": 3.5146, "mean_token_accuracy": 0.3814171522855759, "num_tokens": 446791680.0, "step": 12120 }, { "epoch": 3.221868365180467, "grad_norm": 0.482421875, "learning_rate": 0.00024323941853474196, "loss": 3.5168, "mean_token_accuracy": 0.3813478484749794, "num_tokens": 447528960.0, "step": 12140 }, { "epoch": 3.2271762208067942, "grad_norm": 0.392578125, "learning_rate": 0.00024303302889613608, "loss": 3.542, "mean_token_accuracy": 0.37812432199716567, "num_tokens": 448266240.0, "step": 12160 }, { "epoch": 3.232484076433121, "grad_norm": 0.447265625, "learning_rate": 0.00024282635262347566, "loss": 3.5357, "mean_token_accuracy": 0.37768944054841996, "num_tokens": 449003520.0, "step": 12180 }, { "epoch": 3.237791932059448, "grad_norm": 0.4453125, "learning_rate": 0.00024261939035352873, "loss": 3.5167, "mean_token_accuracy": 0.3802130863070488, "num_tokens": 449740800.0, "step": 12200 }, { "epoch": 3.243099787685775, "grad_norm": 0.419921875, "learning_rate": 0.0002424121427239444, "loss": 3.5141, "mean_token_accuracy": 0.3805772975087166, "num_tokens": 450478080.0, "step": 12220 }, { "epoch": 3.248407643312102, "grad_norm": 0.451171875, "learning_rate": 0.00024220461037325114, "loss": 3.5113, "mean_token_accuracy": 0.37985160052776334, "num_tokens": 451215360.0, "step": 12240 }, { "epoch": 3.253715498938429, "grad_norm": 0.41015625, "learning_rate": 0.00024199679394085449, "loss": 3.532, "mean_token_accuracy": 0.37826701402664187, "num_tokens": 451952640.0, "step": 12260 }, { "epoch": 3.259023354564756, "grad_norm": 0.455078125, "learning_rate": 0.00024178869406703526, "loss": 3.5611, "mean_token_accuracy": 0.3744183495640755, "num_tokens": 452689920.0, "step": 12280 }, { "epoch": 3.2643312101910826, "grad_norm": 0.416015625, "learning_rate": 0.00024158031139294764, "loss": 3.5234, "mean_token_accuracy": 0.37891933023929597, "num_tokens": 453427200.0, "step": 12300 }, { "epoch": 3.26963906581741, "grad_norm": 0.458984375, "learning_rate": 0.00024137164656061702, "loss": 3.5434, "mean_token_accuracy": 0.376880843937397, "num_tokens": 454164480.0, "step": 12320 }, { "epoch": 3.274946921443737, "grad_norm": 0.412109375, "learning_rate": 0.00024116270021293823, "loss": 3.5189, "mean_token_accuracy": 0.3790538713335991, "num_tokens": 454901760.0, "step": 12340 }, { "epoch": 3.2802547770700636, "grad_norm": 0.443359375, "learning_rate": 0.00024095347299367328, "loss": 3.5272, "mean_token_accuracy": 0.37857142835855484, "num_tokens": 455639040.0, "step": 12360 }, { "epoch": 3.2855626326963905, "grad_norm": 0.439453125, "learning_rate": 0.00024074396554744971, "loss": 3.53, "mean_token_accuracy": 0.37823847830295565, "num_tokens": 456376320.0, "step": 12380 }, { "epoch": 3.290870488322718, "grad_norm": 0.421875, "learning_rate": 0.0002405341785197584, "loss": 3.5457, "mean_token_accuracy": 0.3772654354572296, "num_tokens": 457113600.0, "step": 12400 }, { "epoch": 3.2961783439490446, "grad_norm": 0.392578125, "learning_rate": 0.00024032411255695148, "loss": 3.5351, "mean_token_accuracy": 0.3768414333462715, "num_tokens": 457850880.0, "step": 12420 }, { "epoch": 3.3014861995753715, "grad_norm": 0.408203125, "learning_rate": 0.00024011376830624062, "loss": 3.5403, "mean_token_accuracy": 0.37725320607423785, "num_tokens": 458588160.0, "step": 12440 }, { "epoch": 3.3067940552016983, "grad_norm": 0.451171875, "learning_rate": 0.0002399031464156949, "loss": 3.5417, "mean_token_accuracy": 0.3770711049437523, "num_tokens": 459325440.0, "step": 12460 }, { "epoch": 3.3121019108280256, "grad_norm": 0.427734375, "learning_rate": 0.0002396922475342388, "loss": 3.5512, "mean_token_accuracy": 0.37664845436811445, "num_tokens": 460062720.0, "step": 12480 }, { "epoch": 3.3174097664543525, "grad_norm": 0.396484375, "learning_rate": 0.00023948107231165007, "loss": 3.5471, "mean_token_accuracy": 0.37671097069978715, "num_tokens": 460800000.0, "step": 12500 }, { "epoch": 3.3227176220806793, "grad_norm": 0.43359375, "learning_rate": 0.000239269621398558, "loss": 3.5253, "mean_token_accuracy": 0.37745433300733566, "num_tokens": 461537280.0, "step": 12520 }, { "epoch": 3.328025477707006, "grad_norm": 0.44921875, "learning_rate": 0.00023905789544644132, "loss": 3.5403, "mean_token_accuracy": 0.3763943284749985, "num_tokens": 462274560.0, "step": 12540 }, { "epoch": 3.3333333333333335, "grad_norm": 0.396484375, "learning_rate": 0.00023884589510762594, "loss": 3.5433, "mean_token_accuracy": 0.37716622948646544, "num_tokens": 463011840.0, "step": 12560 }, { "epoch": 3.3386411889596603, "grad_norm": 0.4453125, "learning_rate": 0.00023863362103528337, "loss": 3.5432, "mean_token_accuracy": 0.3766416609287262, "num_tokens": 463749120.0, "step": 12580 }, { "epoch": 3.343949044585987, "grad_norm": 0.462890625, "learning_rate": 0.00023842107388342843, "loss": 3.5618, "mean_token_accuracy": 0.37418052852153777, "num_tokens": 464486400.0, "step": 12600 }, { "epoch": 3.349256900212314, "grad_norm": 0.423828125, "learning_rate": 0.00023820825430691725, "loss": 3.5407, "mean_token_accuracy": 0.3768998712301254, "num_tokens": 465223680.0, "step": 12620 }, { "epoch": 3.3545647558386413, "grad_norm": 0.421875, "learning_rate": 0.0002379951629614453, "loss": 3.5583, "mean_token_accuracy": 0.3759105235338211, "num_tokens": 465960960.0, "step": 12640 }, { "epoch": 3.359872611464968, "grad_norm": 0.40625, "learning_rate": 0.00023778180050354543, "loss": 3.5543, "mean_token_accuracy": 0.375577574968338, "num_tokens": 466698240.0, "step": 12660 }, { "epoch": 3.365180467091295, "grad_norm": 0.443359375, "learning_rate": 0.00023756816759058576, "loss": 3.5496, "mean_token_accuracy": 0.3751413345336914, "num_tokens": 467435520.0, "step": 12680 }, { "epoch": 3.370488322717622, "grad_norm": 0.400390625, "learning_rate": 0.00023735426488076766, "loss": 3.541, "mean_token_accuracy": 0.3748070240020752, "num_tokens": 468172800.0, "step": 12700 }, { "epoch": 3.375796178343949, "grad_norm": 0.439453125, "learning_rate": 0.00023714009303312374, "loss": 3.5488, "mean_token_accuracy": 0.37656827867031095, "num_tokens": 468910080.0, "step": 12720 }, { "epoch": 3.381104033970276, "grad_norm": 0.412109375, "learning_rate": 0.00023692565270751586, "loss": 3.5491, "mean_token_accuracy": 0.37519161850214006, "num_tokens": 469647360.0, "step": 12740 }, { "epoch": 3.386411889596603, "grad_norm": 0.41015625, "learning_rate": 0.000236710944564633, "loss": 3.522, "mean_token_accuracy": 0.3793610021471977, "num_tokens": 470384640.0, "step": 12760 }, { "epoch": 3.3917197452229297, "grad_norm": 0.474609375, "learning_rate": 0.0002364959692659893, "loss": 3.5656, "mean_token_accuracy": 0.37412752360105517, "num_tokens": 471121920.0, "step": 12780 }, { "epoch": 3.397027600849257, "grad_norm": 0.4765625, "learning_rate": 0.0002362807274739221, "loss": 3.5649, "mean_token_accuracy": 0.3737266272306442, "num_tokens": 471859200.0, "step": 12800 }, { "epoch": 3.402335456475584, "grad_norm": 0.431640625, "learning_rate": 0.00023606521985158966, "loss": 3.5531, "mean_token_accuracy": 0.3761198088526726, "num_tokens": 472596480.0, "step": 12820 }, { "epoch": 3.4076433121019107, "grad_norm": 0.4375, "learning_rate": 0.00023584944706296935, "loss": 3.5377, "mean_token_accuracy": 0.37878071665763857, "num_tokens": 473333760.0, "step": 12840 }, { "epoch": 3.412951167728238, "grad_norm": 0.41015625, "learning_rate": 0.00023563340977285547, "loss": 3.5511, "mean_token_accuracy": 0.3764147087931633, "num_tokens": 474071040.0, "step": 12860 }, { "epoch": 3.418259023354565, "grad_norm": 0.41015625, "learning_rate": 0.0002354171086468573, "loss": 3.5439, "mean_token_accuracy": 0.3756618306040764, "num_tokens": 474808320.0, "step": 12880 }, { "epoch": 3.4235668789808917, "grad_norm": 0.435546875, "learning_rate": 0.00023520054435139694, "loss": 3.5709, "mean_token_accuracy": 0.37361111044883727, "num_tokens": 475545600.0, "step": 12900 }, { "epoch": 3.4288747346072186, "grad_norm": 0.423828125, "learning_rate": 0.00023498371755370734, "loss": 3.5374, "mean_token_accuracy": 0.37763780206441877, "num_tokens": 476282880.0, "step": 12920 }, { "epoch": 3.4341825902335454, "grad_norm": 0.447265625, "learning_rate": 0.0002347666289218302, "loss": 3.5362, "mean_token_accuracy": 0.3771621540188789, "num_tokens": 477020160.0, "step": 12940 }, { "epoch": 3.4394904458598727, "grad_norm": 0.416015625, "learning_rate": 0.00023454927912461393, "loss": 3.5566, "mean_token_accuracy": 0.37479751110076903, "num_tokens": 477757440.0, "step": 12960 }, { "epoch": 3.4447983014861996, "grad_norm": 0.400390625, "learning_rate": 0.0002343316688317117, "loss": 3.5515, "mean_token_accuracy": 0.3749388426542282, "num_tokens": 478494720.0, "step": 12980 }, { "epoch": 3.4501061571125264, "grad_norm": 0.43359375, "learning_rate": 0.00023411379871357907, "loss": 3.5309, "mean_token_accuracy": 0.37839068472385406, "num_tokens": 479232000.0, "step": 13000 }, { "epoch": 3.4554140127388537, "grad_norm": 0.41015625, "learning_rate": 0.00023389566944147222, "loss": 3.5716, "mean_token_accuracy": 0.37214883863925935, "num_tokens": 479969280.0, "step": 13020 }, { "epoch": 3.4607218683651806, "grad_norm": 0.421875, "learning_rate": 0.00023367728168744584, "loss": 3.5556, "mean_token_accuracy": 0.37429060488939286, "num_tokens": 480706560.0, "step": 13040 }, { "epoch": 3.4660297239915074, "grad_norm": 0.44921875, "learning_rate": 0.0002334586361243509, "loss": 3.5182, "mean_token_accuracy": 0.3791245386004448, "num_tokens": 481443840.0, "step": 13060 }, { "epoch": 3.4713375796178343, "grad_norm": 0.41796875, "learning_rate": 0.00023323973342583273, "loss": 3.5557, "mean_token_accuracy": 0.37484235763549806, "num_tokens": 482181120.0, "step": 13080 }, { "epoch": 3.476645435244161, "grad_norm": 0.439453125, "learning_rate": 0.00023302057426632884, "loss": 3.5489, "mean_token_accuracy": 0.3754009008407593, "num_tokens": 482918400.0, "step": 13100 }, { "epoch": 3.4819532908704884, "grad_norm": 0.388671875, "learning_rate": 0.000232801159321067, "loss": 3.5571, "mean_token_accuracy": 0.37509377151727674, "num_tokens": 483655680.0, "step": 13120 }, { "epoch": 3.4872611464968153, "grad_norm": 0.447265625, "learning_rate": 0.00023258148926606295, "loss": 3.5683, "mean_token_accuracy": 0.3730389758944511, "num_tokens": 484392960.0, "step": 13140 }, { "epoch": 3.492569002123142, "grad_norm": 0.43359375, "learning_rate": 0.00023236156477811844, "loss": 3.5421, "mean_token_accuracy": 0.3776432380080223, "num_tokens": 485130240.0, "step": 13160 }, { "epoch": 3.4978768577494694, "grad_norm": 0.431640625, "learning_rate": 0.0002321413865348192, "loss": 3.5672, "mean_token_accuracy": 0.37287725508213043, "num_tokens": 485867520.0, "step": 13180 }, { "epoch": 3.5031847133757963, "grad_norm": 0.439453125, "learning_rate": 0.00023192095521453268, "loss": 3.5497, "mean_token_accuracy": 0.3762733727693558, "num_tokens": 486604800.0, "step": 13200 }, { "epoch": 3.508492569002123, "grad_norm": 0.40234375, "learning_rate": 0.00023170027149640613, "loss": 3.5633, "mean_token_accuracy": 0.37419004142284396, "num_tokens": 487342080.0, "step": 13220 }, { "epoch": 3.51380042462845, "grad_norm": 0.41796875, "learning_rate": 0.00023147933606036436, "loss": 3.5325, "mean_token_accuracy": 0.37767993062734606, "num_tokens": 488079360.0, "step": 13240 }, { "epoch": 3.519108280254777, "grad_norm": 0.443359375, "learning_rate": 0.0002312581495871078, "loss": 3.5431, "mean_token_accuracy": 0.37608311176300047, "num_tokens": 488816640.0, "step": 13260 }, { "epoch": 3.524416135881104, "grad_norm": 0.458984375, "learning_rate": 0.0002310367127581103, "loss": 3.5133, "mean_token_accuracy": 0.37851435244083403, "num_tokens": 489553920.0, "step": 13280 }, { "epoch": 3.529723991507431, "grad_norm": 0.453125, "learning_rate": 0.00023081502625561706, "loss": 3.5549, "mean_token_accuracy": 0.3750801786780357, "num_tokens": 490291200.0, "step": 13300 }, { "epoch": 3.535031847133758, "grad_norm": 0.45703125, "learning_rate": 0.00023059309076264246, "loss": 3.5394, "mean_token_accuracy": 0.37688492238521576, "num_tokens": 491028480.0, "step": 13320 }, { "epoch": 3.540339702760085, "grad_norm": 0.39453125, "learning_rate": 0.0002303709069629681, "loss": 3.5428, "mean_token_accuracy": 0.3759458586573601, "num_tokens": 491765760.0, "step": 13340 }, { "epoch": 3.545647558386412, "grad_norm": 0.43359375, "learning_rate": 0.00023014847554114054, "loss": 3.5643, "mean_token_accuracy": 0.37364508509635924, "num_tokens": 492503040.0, "step": 13360 }, { "epoch": 3.550955414012739, "grad_norm": 0.427734375, "learning_rate": 0.00022992579718246936, "loss": 3.5732, "mean_token_accuracy": 0.37307974249124526, "num_tokens": 493240320.0, "step": 13380 }, { "epoch": 3.5562632696390657, "grad_norm": 0.408203125, "learning_rate": 0.00022970287257302485, "loss": 3.5625, "mean_token_accuracy": 0.374391171336174, "num_tokens": 493977600.0, "step": 13400 }, { "epoch": 3.5615711252653925, "grad_norm": 0.41796875, "learning_rate": 0.000229479702399636, "loss": 3.5572, "mean_token_accuracy": 0.3746249213814735, "num_tokens": 494714880.0, "step": 13420 }, { "epoch": 3.56687898089172, "grad_norm": 0.3984375, "learning_rate": 0.00022925628734988857, "loss": 3.5407, "mean_token_accuracy": 0.37624347656965257, "num_tokens": 495452160.0, "step": 13440 }, { "epoch": 3.5721868365180467, "grad_norm": 0.412109375, "learning_rate": 0.00022903262811212243, "loss": 3.5489, "mean_token_accuracy": 0.37645547837018967, "num_tokens": 496189440.0, "step": 13460 }, { "epoch": 3.5774946921443735, "grad_norm": 0.431640625, "learning_rate": 0.00022880872537543007, "loss": 3.5265, "mean_token_accuracy": 0.3770547941327095, "num_tokens": 496926720.0, "step": 13480 }, { "epoch": 3.582802547770701, "grad_norm": 0.412109375, "learning_rate": 0.00022858457982965417, "loss": 3.5634, "mean_token_accuracy": 0.37307838946580885, "num_tokens": 497664000.0, "step": 13500 }, { "epoch": 3.5881104033970277, "grad_norm": 0.408203125, "learning_rate": 0.00022836019216538543, "loss": 3.5157, "mean_token_accuracy": 0.3793569251894951, "num_tokens": 498401280.0, "step": 13520 }, { "epoch": 3.5934182590233545, "grad_norm": 0.42578125, "learning_rate": 0.00022813556307396045, "loss": 3.5646, "mean_token_accuracy": 0.3745365798473358, "num_tokens": 499138560.0, "step": 13540 }, { "epoch": 3.5987261146496814, "grad_norm": 0.453125, "learning_rate": 0.0002279106932474599, "loss": 3.5592, "mean_token_accuracy": 0.37394949942827227, "num_tokens": 499875840.0, "step": 13560 }, { "epoch": 3.6040339702760082, "grad_norm": 0.4140625, "learning_rate": 0.00022768558337870587, "loss": 3.5534, "mean_token_accuracy": 0.37528946697711946, "num_tokens": 500613120.0, "step": 13580 }, { "epoch": 3.6093418259023355, "grad_norm": 0.400390625, "learning_rate": 0.00022746023416126023, "loss": 3.5393, "mean_token_accuracy": 0.37802375555038453, "num_tokens": 501350400.0, "step": 13600 }, { "epoch": 3.6146496815286624, "grad_norm": 0.404296875, "learning_rate": 0.0002272346462894222, "loss": 3.5568, "mean_token_accuracy": 0.37570396065711975, "num_tokens": 502087680.0, "step": 13620 }, { "epoch": 3.6199575371549892, "grad_norm": 0.40234375, "learning_rate": 0.0002270088204582263, "loss": 3.5523, "mean_token_accuracy": 0.3757746234536171, "num_tokens": 502824960.0, "step": 13640 }, { "epoch": 3.6252653927813165, "grad_norm": 0.423828125, "learning_rate": 0.00022678275736344014, "loss": 3.5735, "mean_token_accuracy": 0.3718321904540062, "num_tokens": 503562240.0, "step": 13660 }, { "epoch": 3.6305732484076434, "grad_norm": 0.419921875, "learning_rate": 0.00022655645770156243, "loss": 3.566, "mean_token_accuracy": 0.3746167615056038, "num_tokens": 504299520.0, "step": 13680 }, { "epoch": 3.6358811040339702, "grad_norm": 0.427734375, "learning_rate": 0.0002263299221698208, "loss": 3.5524, "mean_token_accuracy": 0.37536013424396514, "num_tokens": 505036800.0, "step": 13700 }, { "epoch": 3.641188959660297, "grad_norm": 0.486328125, "learning_rate": 0.00022610315146616935, "loss": 3.5504, "mean_token_accuracy": 0.3753574177622795, "num_tokens": 505774080.0, "step": 13720 }, { "epoch": 3.646496815286624, "grad_norm": 0.435546875, "learning_rate": 0.00022587614628928694, "loss": 3.5956, "mean_token_accuracy": 0.3704052478075027, "num_tokens": 506511360.0, "step": 13740 }, { "epoch": 3.6518046709129512, "grad_norm": 0.416015625, "learning_rate": 0.00022564890733857483, "loss": 3.5284, "mean_token_accuracy": 0.3776282876729965, "num_tokens": 507248640.0, "step": 13760 }, { "epoch": 3.657112526539278, "grad_norm": 0.43359375, "learning_rate": 0.0002254214353141545, "loss": 3.5323, "mean_token_accuracy": 0.37671912014484404, "num_tokens": 507985920.0, "step": 13780 }, { "epoch": 3.662420382165605, "grad_norm": 0.388671875, "learning_rate": 0.00022519373091686547, "loss": 3.5634, "mean_token_accuracy": 0.37327136248350146, "num_tokens": 508723200.0, "step": 13800 }, { "epoch": 3.6677282377919322, "grad_norm": 0.41015625, "learning_rate": 0.00022496579484826336, "loss": 3.5579, "mean_token_accuracy": 0.37490215301513674, "num_tokens": 509460480.0, "step": 13820 }, { "epoch": 3.673036093418259, "grad_norm": 0.4140625, "learning_rate": 0.00022473762781061738, "loss": 3.5696, "mean_token_accuracy": 0.37278892248868944, "num_tokens": 510197760.0, "step": 13840 }, { "epoch": 3.678343949044586, "grad_norm": 0.39453125, "learning_rate": 0.00022450923050690844, "loss": 3.5518, "mean_token_accuracy": 0.37455832958221436, "num_tokens": 510935040.0, "step": 13860 }, { "epoch": 3.683651804670913, "grad_norm": 0.404296875, "learning_rate": 0.00022428060364082693, "loss": 3.5597, "mean_token_accuracy": 0.37517802566289904, "num_tokens": 511672320.0, "step": 13880 }, { "epoch": 3.6889596602972397, "grad_norm": 0.412109375, "learning_rate": 0.00022405174791677047, "loss": 3.5679, "mean_token_accuracy": 0.37349695712327957, "num_tokens": 512409600.0, "step": 13900 }, { "epoch": 3.694267515923567, "grad_norm": 0.427734375, "learning_rate": 0.00022382266403984176, "loss": 3.5816, "mean_token_accuracy": 0.37140682637691497, "num_tokens": 513146880.0, "step": 13920 }, { "epoch": 3.699575371549894, "grad_norm": 0.408203125, "learning_rate": 0.0002235933527158466, "loss": 3.5721, "mean_token_accuracy": 0.37340046763420104, "num_tokens": 513884160.0, "step": 13940 }, { "epoch": 3.7048832271762207, "grad_norm": 0.419921875, "learning_rate": 0.0002233638146512913, "loss": 3.5875, "mean_token_accuracy": 0.37161067575216294, "num_tokens": 514621440.0, "step": 13960 }, { "epoch": 3.710191082802548, "grad_norm": 0.447265625, "learning_rate": 0.000223134050553381, "loss": 3.5404, "mean_token_accuracy": 0.37647450268268584, "num_tokens": 515358720.0, "step": 13980 }, { "epoch": 3.715498938428875, "grad_norm": 0.431640625, "learning_rate": 0.0002229040611300171, "loss": 3.559, "mean_token_accuracy": 0.3748491495847702, "num_tokens": 516096000.0, "step": 14000 }, { "epoch": 3.7208067940552016, "grad_norm": 0.431640625, "learning_rate": 0.00022267384708979523, "loss": 3.5212, "mean_token_accuracy": 0.3785306602716446, "num_tokens": 516833280.0, "step": 14020 }, { "epoch": 3.7261146496815285, "grad_norm": 0.419921875, "learning_rate": 0.00022244340914200314, "loss": 3.5619, "mean_token_accuracy": 0.37544167041778564, "num_tokens": 517570560.0, "step": 14040 }, { "epoch": 3.7314225053078554, "grad_norm": 0.4296875, "learning_rate": 0.00022221274799661835, "loss": 3.5438, "mean_token_accuracy": 0.37633180916309356, "num_tokens": 518307840.0, "step": 14060 }, { "epoch": 3.7367303609341826, "grad_norm": 0.439453125, "learning_rate": 0.00022198186436430615, "loss": 3.5657, "mean_token_accuracy": 0.3732727199792862, "num_tokens": 519045120.0, "step": 14080 }, { "epoch": 3.7420382165605095, "grad_norm": 0.453125, "learning_rate": 0.00022175075895641724, "loss": 3.5518, "mean_token_accuracy": 0.37569716274738313, "num_tokens": 519782400.0, "step": 14100 }, { "epoch": 3.7473460721868364, "grad_norm": 0.4453125, "learning_rate": 0.00022151943248498561, "loss": 3.5613, "mean_token_accuracy": 0.3737157553434372, "num_tokens": 520519680.0, "step": 14120 }, { "epoch": 3.7526539278131636, "grad_norm": 0.3828125, "learning_rate": 0.00022128788566272637, "loss": 3.546, "mean_token_accuracy": 0.3762978360056877, "num_tokens": 521256960.0, "step": 14140 }, { "epoch": 3.7579617834394905, "grad_norm": 0.392578125, "learning_rate": 0.0002210561192030335, "loss": 3.5766, "mean_token_accuracy": 0.3731898248195648, "num_tokens": 521994240.0, "step": 14160 }, { "epoch": 3.7632696390658174, "grad_norm": 0.458984375, "learning_rate": 0.0002208241338199777, "loss": 3.5245, "mean_token_accuracy": 0.3781039327383041, "num_tokens": 522731520.0, "step": 14180 }, { "epoch": 3.7685774946921446, "grad_norm": 0.44140625, "learning_rate": 0.0002205919302283042, "loss": 3.5717, "mean_token_accuracy": 0.3721420407295227, "num_tokens": 523468800.0, "step": 14200 }, { "epoch": 3.7738853503184715, "grad_norm": 0.42578125, "learning_rate": 0.0002203595091434305, "loss": 3.5403, "mean_token_accuracy": 0.37603419423103335, "num_tokens": 524206080.0, "step": 14220 }, { "epoch": 3.7791932059447984, "grad_norm": 0.388671875, "learning_rate": 0.00022012687128144414, "loss": 3.5527, "mean_token_accuracy": 0.37572026550769805, "num_tokens": 524943360.0, "step": 14240 }, { "epoch": 3.784501061571125, "grad_norm": 0.408203125, "learning_rate": 0.00021989401735910067, "loss": 3.5407, "mean_token_accuracy": 0.37597983330488205, "num_tokens": 525680640.0, "step": 14260 }, { "epoch": 3.789808917197452, "grad_norm": 0.443359375, "learning_rate": 0.00021966094809382122, "loss": 3.5396, "mean_token_accuracy": 0.37622853070497514, "num_tokens": 526417920.0, "step": 14280 }, { "epoch": 3.7951167728237793, "grad_norm": 0.4453125, "learning_rate": 0.00021942766420369043, "loss": 3.5486, "mean_token_accuracy": 0.3746222034096718, "num_tokens": 527155200.0, "step": 14300 }, { "epoch": 3.800424628450106, "grad_norm": 0.40625, "learning_rate": 0.00021919416640745417, "loss": 3.5201, "mean_token_accuracy": 0.3792441293597221, "num_tokens": 527892480.0, "step": 14320 }, { "epoch": 3.805732484076433, "grad_norm": 0.447265625, "learning_rate": 0.0002189604554245174, "loss": 3.5328, "mean_token_accuracy": 0.3770697444677353, "num_tokens": 528629760.0, "step": 14340 }, { "epoch": 3.8110403397027603, "grad_norm": 0.404296875, "learning_rate": 0.0002187265319749419, "loss": 3.5496, "mean_token_accuracy": 0.37602468132972716, "num_tokens": 529367040.0, "step": 14360 }, { "epoch": 3.816348195329087, "grad_norm": 0.416015625, "learning_rate": 0.00021849239677944403, "loss": 3.5535, "mean_token_accuracy": 0.37446727603673935, "num_tokens": 530104320.0, "step": 14380 }, { "epoch": 3.821656050955414, "grad_norm": 0.421875, "learning_rate": 0.0002182580505593925, "loss": 3.5593, "mean_token_accuracy": 0.37436942756175995, "num_tokens": 530841600.0, "step": 14400 }, { "epoch": 3.826963906581741, "grad_norm": 0.408203125, "learning_rate": 0.0002180234940368063, "loss": 3.5513, "mean_token_accuracy": 0.3769542306661606, "num_tokens": 531578880.0, "step": 14420 }, { "epoch": 3.8322717622080678, "grad_norm": 0.41015625, "learning_rate": 0.00021778872793435224, "loss": 3.5757, "mean_token_accuracy": 0.3723513275384903, "num_tokens": 532316160.0, "step": 14440 }, { "epoch": 3.837579617834395, "grad_norm": 0.4140625, "learning_rate": 0.0002175537529753429, "loss": 3.5528, "mean_token_accuracy": 0.37508018165826795, "num_tokens": 533053440.0, "step": 14460 }, { "epoch": 3.842887473460722, "grad_norm": 0.466796875, "learning_rate": 0.00021731856988373434, "loss": 3.5487, "mean_token_accuracy": 0.37640927731990814, "num_tokens": 533790720.0, "step": 14480 }, { "epoch": 3.8481953290870488, "grad_norm": 0.412109375, "learning_rate": 0.00021708317938412395, "loss": 3.5426, "mean_token_accuracy": 0.376823765039444, "num_tokens": 534528000.0, "step": 14500 }, { "epoch": 3.853503184713376, "grad_norm": 0.421875, "learning_rate": 0.00021684758220174796, "loss": 3.5422, "mean_token_accuracy": 0.3778321385383606, "num_tokens": 535265280.0, "step": 14520 }, { "epoch": 3.858811040339703, "grad_norm": 0.400390625, "learning_rate": 0.00021661177906247958, "loss": 3.5365, "mean_token_accuracy": 0.37749375104904176, "num_tokens": 536002560.0, "step": 14540 }, { "epoch": 3.8641188959660298, "grad_norm": 0.3984375, "learning_rate": 0.00021637577069282647, "loss": 3.5651, "mean_token_accuracy": 0.3747118949890137, "num_tokens": 536739840.0, "step": 14560 }, { "epoch": 3.8694267515923566, "grad_norm": 0.421875, "learning_rate": 0.00021613955781992855, "loss": 3.5377, "mean_token_accuracy": 0.37599477767944334, "num_tokens": 537477120.0, "step": 14580 }, { "epoch": 3.8747346072186835, "grad_norm": 0.40234375, "learning_rate": 0.00021590314117155604, "loss": 3.5697, "mean_token_accuracy": 0.3739005774259567, "num_tokens": 538214400.0, "step": 14600 }, { "epoch": 3.8800424628450108, "grad_norm": 0.41796875, "learning_rate": 0.00021566652147610667, "loss": 3.5498, "mean_token_accuracy": 0.37604642659425735, "num_tokens": 538951680.0, "step": 14620 }, { "epoch": 3.8853503184713376, "grad_norm": 0.43359375, "learning_rate": 0.000215429699462604, "loss": 3.5503, "mean_token_accuracy": 0.3760192424058914, "num_tokens": 539688960.0, "step": 14640 }, { "epoch": 3.8906581740976645, "grad_norm": 0.431640625, "learning_rate": 0.0002151926758606949, "loss": 3.5512, "mean_token_accuracy": 0.3762230917811394, "num_tokens": 540426240.0, "step": 14660 }, { "epoch": 3.8959660297239918, "grad_norm": 0.431640625, "learning_rate": 0.00021495545140064716, "loss": 3.545, "mean_token_accuracy": 0.3753927484154701, "num_tokens": 541163520.0, "step": 14680 }, { "epoch": 3.9012738853503186, "grad_norm": 0.443359375, "learning_rate": 0.00021471802681334763, "loss": 3.5645, "mean_token_accuracy": 0.3744332954287529, "num_tokens": 541900800.0, "step": 14700 }, { "epoch": 3.9065817409766455, "grad_norm": 0.416015625, "learning_rate": 0.00021448040283029965, "loss": 3.5276, "mean_token_accuracy": 0.3783349633216858, "num_tokens": 542638080.0, "step": 14720 }, { "epoch": 3.9118895966029723, "grad_norm": 0.421875, "learning_rate": 0.00021424258018362086, "loss": 3.5552, "mean_token_accuracy": 0.37394949644804, "num_tokens": 543375360.0, "step": 14740 }, { "epoch": 3.917197452229299, "grad_norm": 0.443359375, "learning_rate": 0.0002140045596060411, "loss": 3.5576, "mean_token_accuracy": 0.3751168757677078, "num_tokens": 544112640.0, "step": 14760 }, { "epoch": 3.9225053078556265, "grad_norm": 0.470703125, "learning_rate": 0.0002137663418308999, "loss": 3.5457, "mean_token_accuracy": 0.3754321590065956, "num_tokens": 544849920.0, "step": 14780 }, { "epoch": 3.9278131634819533, "grad_norm": 0.419921875, "learning_rate": 0.00021352792759214446, "loss": 3.5383, "mean_token_accuracy": 0.37565095722675323, "num_tokens": 545587200.0, "step": 14800 }, { "epoch": 3.93312101910828, "grad_norm": 0.435546875, "learning_rate": 0.00021328931762432717, "loss": 3.5315, "mean_token_accuracy": 0.3772966921329498, "num_tokens": 546324480.0, "step": 14820 }, { "epoch": 3.9384288747346075, "grad_norm": 0.404296875, "learning_rate": 0.00021305051266260356, "loss": 3.5713, "mean_token_accuracy": 0.3736885741353035, "num_tokens": 547061760.0, "step": 14840 }, { "epoch": 3.9437367303609343, "grad_norm": 0.400390625, "learning_rate": 0.00021281151344272993, "loss": 3.5539, "mean_token_accuracy": 0.3756441593170166, "num_tokens": 547799040.0, "step": 14860 }, { "epoch": 3.949044585987261, "grad_norm": 0.435546875, "learning_rate": 0.00021257232070106101, "loss": 3.5286, "mean_token_accuracy": 0.37825342267751694, "num_tokens": 548536320.0, "step": 14880 }, { "epoch": 3.954352441613588, "grad_norm": 0.42578125, "learning_rate": 0.00021233293517454788, "loss": 3.54, "mean_token_accuracy": 0.3768876358866692, "num_tokens": 549273600.0, "step": 14900 }, { "epoch": 3.959660297239915, "grad_norm": 0.419921875, "learning_rate": 0.00021209335760073546, "loss": 3.5337, "mean_token_accuracy": 0.377522286772728, "num_tokens": 550010880.0, "step": 14920 }, { "epoch": 3.964968152866242, "grad_norm": 0.423828125, "learning_rate": 0.00021185358871776046, "loss": 3.5477, "mean_token_accuracy": 0.37486138492822646, "num_tokens": 550748160.0, "step": 14940 }, { "epoch": 3.970276008492569, "grad_norm": 0.421875, "learning_rate": 0.00021161362926434897, "loss": 3.5522, "mean_token_accuracy": 0.3750475659966469, "num_tokens": 551485440.0, "step": 14960 }, { "epoch": 3.975583864118896, "grad_norm": 0.451171875, "learning_rate": 0.00021137347997981433, "loss": 3.5704, "mean_token_accuracy": 0.37339639067649844, "num_tokens": 552222720.0, "step": 14980 }, { "epoch": 3.980891719745223, "grad_norm": 0.41796875, "learning_rate": 0.00021113314160405456, "loss": 3.5386, "mean_token_accuracy": 0.377221953868866, "num_tokens": 552960000.0, "step": 15000 }, { "epoch": 3.98619957537155, "grad_norm": 0.453125, "learning_rate": 0.00021089261487755034, "loss": 3.5134, "mean_token_accuracy": 0.37978229075670245, "num_tokens": 553697280.0, "step": 15020 }, { "epoch": 3.991507430997877, "grad_norm": 0.41015625, "learning_rate": 0.0002106519005413628, "loss": 3.5515, "mean_token_accuracy": 0.3750421300530434, "num_tokens": 554434560.0, "step": 15040 }, { "epoch": 3.9968152866242037, "grad_norm": 0.439453125, "learning_rate": 0.00021041099933713083, "loss": 3.5284, "mean_token_accuracy": 0.3783336073160172, "num_tokens": 555171840.0, "step": 15060 }, { "epoch": 4.002123142250531, "grad_norm": 0.423828125, "learning_rate": 0.00021016991200706938, "loss": 3.4893, "mean_token_accuracy": 0.38321918100118635, "num_tokens": 555909120.0, "step": 15080 }, { "epoch": 4.007430997876858, "grad_norm": 0.40234375, "learning_rate": 0.0002099286392939666, "loss": 3.431, "mean_token_accuracy": 0.3881645455956459, "num_tokens": 556646400.0, "step": 15100 }, { "epoch": 4.012738853503185, "grad_norm": 0.447265625, "learning_rate": 0.00020968718194118183, "loss": 3.416, "mean_token_accuracy": 0.3915375605225563, "num_tokens": 557383680.0, "step": 15120 }, { "epoch": 4.018046709129512, "grad_norm": 0.4375, "learning_rate": 0.00020944554069264343, "loss": 3.4347, "mean_token_accuracy": 0.38841460198163985, "num_tokens": 558120960.0, "step": 15140 }, { "epoch": 4.023354564755839, "grad_norm": 0.427734375, "learning_rate": 0.00020920371629284625, "loss": 3.4646, "mean_token_accuracy": 0.3850986614823341, "num_tokens": 558858240.0, "step": 15160 }, { "epoch": 4.028662420382165, "grad_norm": 0.43359375, "learning_rate": 0.00020896170948684934, "loss": 3.4379, "mean_token_accuracy": 0.3879063382744789, "num_tokens": 559595520.0, "step": 15180 }, { "epoch": 4.033970276008493, "grad_norm": 0.412109375, "learning_rate": 0.00020871952102027386, "loss": 3.4067, "mean_token_accuracy": 0.39128206968307494, "num_tokens": 560332800.0, "step": 15200 }, { "epoch": 4.03927813163482, "grad_norm": 0.40625, "learning_rate": 0.00020847715163930066, "loss": 3.4194, "mean_token_accuracy": 0.3898293122649193, "num_tokens": 561070080.0, "step": 15220 }, { "epoch": 4.044585987261146, "grad_norm": 0.427734375, "learning_rate": 0.00020823460209066794, "loss": 3.4638, "mean_token_accuracy": 0.38508507162332534, "num_tokens": 561807360.0, "step": 15240 }, { "epoch": 4.049893842887474, "grad_norm": 0.400390625, "learning_rate": 0.00020799187312166902, "loss": 3.4261, "mean_token_accuracy": 0.3898863852024078, "num_tokens": 562544640.0, "step": 15260 }, { "epoch": 4.055201698513801, "grad_norm": 0.48828125, "learning_rate": 0.0002077489654801501, "loss": 3.4229, "mean_token_accuracy": 0.3898021295666695, "num_tokens": 563281920.0, "step": 15280 }, { "epoch": 4.060509554140127, "grad_norm": 0.447265625, "learning_rate": 0.0002075058799145076, "loss": 3.4586, "mean_token_accuracy": 0.3875978484749794, "num_tokens": 564019200.0, "step": 15300 }, { "epoch": 4.065817409766455, "grad_norm": 0.431640625, "learning_rate": 0.00020726261717368642, "loss": 3.4535, "mean_token_accuracy": 0.38507555723190307, "num_tokens": 564756480.0, "step": 15320 }, { "epoch": 4.071125265392781, "grad_norm": 0.443359375, "learning_rate": 0.00020701917800717716, "loss": 3.4664, "mean_token_accuracy": 0.38384023904800413, "num_tokens": 565493760.0, "step": 15340 }, { "epoch": 4.076433121019108, "grad_norm": 0.416015625, "learning_rate": 0.00020677556316501405, "loss": 3.4457, "mean_token_accuracy": 0.3866791695356369, "num_tokens": 566231040.0, "step": 15360 }, { "epoch": 4.081740976645436, "grad_norm": 0.439453125, "learning_rate": 0.00020653177339777264, "loss": 3.4603, "mean_token_accuracy": 0.3851774826645851, "num_tokens": 566968320.0, "step": 15380 }, { "epoch": 4.087048832271762, "grad_norm": 0.4375, "learning_rate": 0.00020628780945656727, "loss": 3.4607, "mean_token_accuracy": 0.3849179148674011, "num_tokens": 567705600.0, "step": 15400 }, { "epoch": 4.092356687898089, "grad_norm": 0.421875, "learning_rate": 0.00020604367209304899, "loss": 3.4716, "mean_token_accuracy": 0.3842098832130432, "num_tokens": 568442880.0, "step": 15420 }, { "epoch": 4.097664543524417, "grad_norm": 0.416015625, "learning_rate": 0.00020579936205940316, "loss": 3.4884, "mean_token_accuracy": 0.3825206562876701, "num_tokens": 569180160.0, "step": 15440 }, { "epoch": 4.102972399150743, "grad_norm": 0.390625, "learning_rate": 0.00020555488010834714, "loss": 3.4581, "mean_token_accuracy": 0.38622662574052813, "num_tokens": 569917440.0, "step": 15460 }, { "epoch": 4.10828025477707, "grad_norm": 0.458984375, "learning_rate": 0.000205310226993128, "loss": 3.4567, "mean_token_accuracy": 0.3850945845246315, "num_tokens": 570654720.0, "step": 15480 }, { "epoch": 4.113588110403397, "grad_norm": 0.40625, "learning_rate": 0.00020506540346752007, "loss": 3.4674, "mean_token_accuracy": 0.3833863317966461, "num_tokens": 571392000.0, "step": 15500 }, { "epoch": 4.118895966029724, "grad_norm": 0.435546875, "learning_rate": 0.00020482041028582285, "loss": 3.4692, "mean_token_accuracy": 0.38394895195961, "num_tokens": 572129280.0, "step": 15520 }, { "epoch": 4.124203821656051, "grad_norm": 0.40625, "learning_rate": 0.00020457524820285844, "loss": 3.4474, "mean_token_accuracy": 0.3862796247005463, "num_tokens": 572866560.0, "step": 15540 }, { "epoch": 4.129511677282378, "grad_norm": 0.44921875, "learning_rate": 0.0002043299179739693, "loss": 3.4643, "mean_token_accuracy": 0.38556479662656784, "num_tokens": 573603840.0, "step": 15560 }, { "epoch": 4.134819532908705, "grad_norm": 0.416015625, "learning_rate": 0.000204084420355016, "loss": 3.4858, "mean_token_accuracy": 0.3831036612391472, "num_tokens": 574341120.0, "step": 15580 }, { "epoch": 4.140127388535032, "grad_norm": 0.439453125, "learning_rate": 0.0002038387561023749, "loss": 3.4553, "mean_token_accuracy": 0.3854533568024635, "num_tokens": 575078400.0, "step": 15600 }, { "epoch": 4.145435244161359, "grad_norm": 0.44140625, "learning_rate": 0.0002035929259729357, "loss": 3.4687, "mean_token_accuracy": 0.3845373958349228, "num_tokens": 575815680.0, "step": 15620 }, { "epoch": 4.150743099787686, "grad_norm": 0.41015625, "learning_rate": 0.00020334693072409906, "loss": 3.4941, "mean_token_accuracy": 0.3811671048402786, "num_tokens": 576552960.0, "step": 15640 }, { "epoch": 4.156050955414012, "grad_norm": 0.435546875, "learning_rate": 0.00020310077111377456, "loss": 3.4745, "mean_token_accuracy": 0.3842887029051781, "num_tokens": 577290240.0, "step": 15660 }, { "epoch": 4.16135881104034, "grad_norm": 0.443359375, "learning_rate": 0.00020285444790037812, "loss": 3.4794, "mean_token_accuracy": 0.3830724060535431, "num_tokens": 578027520.0, "step": 15680 }, { "epoch": 4.166666666666667, "grad_norm": 0.42578125, "learning_rate": 0.00020260796184282958, "loss": 3.4759, "mean_token_accuracy": 0.38373423367738724, "num_tokens": 578764800.0, "step": 15700 }, { "epoch": 4.171974522292993, "grad_norm": 0.40625, "learning_rate": 0.00020236131370055067, "loss": 3.459, "mean_token_accuracy": 0.3844110146164894, "num_tokens": 579502080.0, "step": 15720 }, { "epoch": 4.177282377919321, "grad_norm": 0.466796875, "learning_rate": 0.00020211450423346242, "loss": 3.4699, "mean_token_accuracy": 0.3835643634200096, "num_tokens": 580239360.0, "step": 15740 }, { "epoch": 4.182590233545648, "grad_norm": 0.4453125, "learning_rate": 0.00020186753420198297, "loss": 3.4464, "mean_token_accuracy": 0.3866900384426117, "num_tokens": 580976640.0, "step": 15760 }, { "epoch": 4.187898089171974, "grad_norm": 0.431640625, "learning_rate": 0.00020162040436702504, "loss": 3.4756, "mean_token_accuracy": 0.38353853970766066, "num_tokens": 581713920.0, "step": 15780 }, { "epoch": 4.193205944798302, "grad_norm": 0.408203125, "learning_rate": 0.0002013731154899938, "loss": 3.466, "mean_token_accuracy": 0.384177266061306, "num_tokens": 582451200.0, "step": 15800 }, { "epoch": 4.198513800424628, "grad_norm": 0.443359375, "learning_rate": 0.00020112566833278437, "loss": 3.4705, "mean_token_accuracy": 0.3843851909041405, "num_tokens": 583188480.0, "step": 15820 }, { "epoch": 4.203821656050955, "grad_norm": 0.44921875, "learning_rate": 0.00020087806365777963, "loss": 3.4824, "mean_token_accuracy": 0.38345700353384016, "num_tokens": 583925760.0, "step": 15840 }, { "epoch": 4.209129511677283, "grad_norm": 0.416015625, "learning_rate": 0.00020063030222784766, "loss": 3.4817, "mean_token_accuracy": 0.38152043968439103, "num_tokens": 584663040.0, "step": 15860 }, { "epoch": 4.214437367303609, "grad_norm": 0.396484375, "learning_rate": 0.00020038238480633952, "loss": 3.4672, "mean_token_accuracy": 0.3842547282576561, "num_tokens": 585400320.0, "step": 15880 }, { "epoch": 4.219745222929936, "grad_norm": 0.4140625, "learning_rate": 0.00020013431215708697, "loss": 3.4604, "mean_token_accuracy": 0.3849817872047424, "num_tokens": 586137600.0, "step": 15900 }, { "epoch": 4.225053078556264, "grad_norm": 0.4296875, "learning_rate": 0.00019988608504439988, "loss": 3.4706, "mean_token_accuracy": 0.3845401152968407, "num_tokens": 586874880.0, "step": 15920 }, { "epoch": 4.23036093418259, "grad_norm": 0.44140625, "learning_rate": 0.00019963770423306416, "loss": 3.4918, "mean_token_accuracy": 0.3802837565541267, "num_tokens": 587612160.0, "step": 15940 }, { "epoch": 4.235668789808917, "grad_norm": 0.44140625, "learning_rate": 0.00019938917048833914, "loss": 3.4769, "mean_token_accuracy": 0.3829827129840851, "num_tokens": 588349440.0, "step": 15960 }, { "epoch": 4.240976645435244, "grad_norm": 0.416015625, "learning_rate": 0.00019914048457595547, "loss": 3.5114, "mean_token_accuracy": 0.37871683984994886, "num_tokens": 589086720.0, "step": 15980 }, { "epoch": 4.246284501061571, "grad_norm": 0.447265625, "learning_rate": 0.00019889164726211243, "loss": 3.4578, "mean_token_accuracy": 0.38584882766008377, "num_tokens": 589824000.0, "step": 16000 }, { "epoch": 4.251592356687898, "grad_norm": 0.390625, "learning_rate": 0.000198642659313476, "loss": 3.4835, "mean_token_accuracy": 0.3813668772578239, "num_tokens": 590561280.0, "step": 16020 }, { "epoch": 4.256900212314225, "grad_norm": 0.4140625, "learning_rate": 0.00019839352149717617, "loss": 3.4653, "mean_token_accuracy": 0.3846433997154236, "num_tokens": 591298560.0, "step": 16040 }, { "epoch": 4.262208067940552, "grad_norm": 0.4296875, "learning_rate": 0.00019814423458080458, "loss": 3.4847, "mean_token_accuracy": 0.38290525078773496, "num_tokens": 592035840.0, "step": 16060 }, { "epoch": 4.267515923566879, "grad_norm": 0.4140625, "learning_rate": 0.00019789479933241238, "loss": 3.4654, "mean_token_accuracy": 0.38467465788125993, "num_tokens": 592773120.0, "step": 16080 }, { "epoch": 4.272823779193206, "grad_norm": 0.44140625, "learning_rate": 0.00019764521652050764, "loss": 3.4868, "mean_token_accuracy": 0.38249075710773467, "num_tokens": 593510400.0, "step": 16100 }, { "epoch": 4.278131634819533, "grad_norm": 0.44140625, "learning_rate": 0.00019739548691405313, "loss": 3.4826, "mean_token_accuracy": 0.3829636886715889, "num_tokens": 594247680.0, "step": 16120 }, { "epoch": 4.2834394904458595, "grad_norm": 0.388671875, "learning_rate": 0.0001971456112824639, "loss": 3.4855, "mean_token_accuracy": 0.38228011429309844, "num_tokens": 594984960.0, "step": 16140 }, { "epoch": 4.288747346072187, "grad_norm": 0.43359375, "learning_rate": 0.0001968955903956048, "loss": 3.5043, "mean_token_accuracy": 0.3808722019195557, "num_tokens": 595722240.0, "step": 16160 }, { "epoch": 4.294055201698514, "grad_norm": 0.451171875, "learning_rate": 0.00019664542502378823, "loss": 3.491, "mean_token_accuracy": 0.38240514546632765, "num_tokens": 596459520.0, "step": 16180 }, { "epoch": 4.2993630573248405, "grad_norm": 0.4296875, "learning_rate": 0.00019639511593777195, "loss": 3.4913, "mean_token_accuracy": 0.381588388979435, "num_tokens": 597196800.0, "step": 16200 }, { "epoch": 4.304670912951168, "grad_norm": 0.404296875, "learning_rate": 0.00019614466390875627, "loss": 3.4617, "mean_token_accuracy": 0.38506876230239867, "num_tokens": 597934080.0, "step": 16220 }, { "epoch": 4.309978768577495, "grad_norm": 0.427734375, "learning_rate": 0.00019589406970838198, "loss": 3.4547, "mean_token_accuracy": 0.38539220839738847, "num_tokens": 598671360.0, "step": 16240 }, { "epoch": 4.3152866242038215, "grad_norm": 0.431640625, "learning_rate": 0.00019564333410872792, "loss": 3.4803, "mean_token_accuracy": 0.38305881768465044, "num_tokens": 599408640.0, "step": 16260 }, { "epoch": 4.320594479830149, "grad_norm": 0.41796875, "learning_rate": 0.00019539245788230855, "loss": 3.5064, "mean_token_accuracy": 0.38093063831329343, "num_tokens": 600145920.0, "step": 16280 }, { "epoch": 4.325902335456475, "grad_norm": 0.421875, "learning_rate": 0.00019514144180207167, "loss": 3.4875, "mean_token_accuracy": 0.38310094773769376, "num_tokens": 600883200.0, "step": 16300 }, { "epoch": 4.3312101910828025, "grad_norm": 0.4296875, "learning_rate": 0.0001948902866413959, "loss": 3.4668, "mean_token_accuracy": 0.3842316284775734, "num_tokens": 601620480.0, "step": 16320 }, { "epoch": 4.33651804670913, "grad_norm": 0.435546875, "learning_rate": 0.0001946389931740883, "loss": 3.4878, "mean_token_accuracy": 0.3822991386055946, "num_tokens": 602357760.0, "step": 16340 }, { "epoch": 4.341825902335456, "grad_norm": 0.4453125, "learning_rate": 0.0001943875621743822, "loss": 3.4602, "mean_token_accuracy": 0.3848621994256973, "num_tokens": 603095040.0, "step": 16360 }, { "epoch": 4.3471337579617835, "grad_norm": 0.408203125, "learning_rate": 0.0001941359944169346, "loss": 3.4983, "mean_token_accuracy": 0.38078114539384844, "num_tokens": 603832320.0, "step": 16380 }, { "epoch": 4.352441613588111, "grad_norm": 0.41015625, "learning_rate": 0.00019388429067682383, "loss": 3.4978, "mean_token_accuracy": 0.38153674602508547, "num_tokens": 604569600.0, "step": 16400 }, { "epoch": 4.357749469214437, "grad_norm": 0.400390625, "learning_rate": 0.0001936324517295472, "loss": 3.494, "mean_token_accuracy": 0.3812133088707924, "num_tokens": 605306880.0, "step": 16420 }, { "epoch": 4.3630573248407645, "grad_norm": 0.455078125, "learning_rate": 0.0001933804783510186, "loss": 3.5075, "mean_token_accuracy": 0.3791884079575539, "num_tokens": 606044160.0, "step": 16440 }, { "epoch": 4.368365180467091, "grad_norm": 0.41015625, "learning_rate": 0.00019312837131756603, "loss": 3.4996, "mean_token_accuracy": 0.3807172730565071, "num_tokens": 606781440.0, "step": 16460 }, { "epoch": 4.373673036093418, "grad_norm": 0.41796875, "learning_rate": 0.00019287613140592951, "loss": 3.4827, "mean_token_accuracy": 0.3823806792497635, "num_tokens": 607518720.0, "step": 16480 }, { "epoch": 4.3789808917197455, "grad_norm": 0.435546875, "learning_rate": 0.0001926237593932581, "loss": 3.4728, "mean_token_accuracy": 0.38421939611434935, "num_tokens": 608256000.0, "step": 16500 }, { "epoch": 4.384288747346072, "grad_norm": 0.4453125, "learning_rate": 0.00019237125605710818, "loss": 3.4854, "mean_token_accuracy": 0.3818166986107826, "num_tokens": 608993280.0, "step": 16520 }, { "epoch": 4.389596602972399, "grad_norm": 0.4140625, "learning_rate": 0.00019211862217544055, "loss": 3.4861, "mean_token_accuracy": 0.3816495418548584, "num_tokens": 609730560.0, "step": 16540 }, { "epoch": 4.3949044585987265, "grad_norm": 0.44140625, "learning_rate": 0.00019186585852661825, "loss": 3.518, "mean_token_accuracy": 0.3780060902237892, "num_tokens": 610467840.0, "step": 16560 }, { "epoch": 4.400212314225053, "grad_norm": 0.4140625, "learning_rate": 0.00019161296588940426, "loss": 3.4988, "mean_token_accuracy": 0.3817840814590454, "num_tokens": 611205120.0, "step": 16580 }, { "epoch": 4.40552016985138, "grad_norm": 0.470703125, "learning_rate": 0.00019135994504295876, "loss": 3.5105, "mean_token_accuracy": 0.38076483756303786, "num_tokens": 611942400.0, "step": 16600 }, { "epoch": 4.4108280254777075, "grad_norm": 0.44140625, "learning_rate": 0.00019110679676683713, "loss": 3.4797, "mean_token_accuracy": 0.3810977965593338, "num_tokens": 612679680.0, "step": 16620 }, { "epoch": 4.416135881104034, "grad_norm": 0.412109375, "learning_rate": 0.00019085352184098725, "loss": 3.4773, "mean_token_accuracy": 0.38288078755140303, "num_tokens": 613416960.0, "step": 16640 }, { "epoch": 4.421443736730361, "grad_norm": 0.431640625, "learning_rate": 0.00019060012104574724, "loss": 3.5075, "mean_token_accuracy": 0.37970210313797, "num_tokens": 614154240.0, "step": 16660 }, { "epoch": 4.426751592356688, "grad_norm": 0.421875, "learning_rate": 0.00019034659516184302, "loss": 3.5084, "mean_token_accuracy": 0.37966813147068024, "num_tokens": 614891520.0, "step": 16680 }, { "epoch": 4.432059447983015, "grad_norm": 0.447265625, "learning_rate": 0.00019009294497038595, "loss": 3.4976, "mean_token_accuracy": 0.3812894135713577, "num_tokens": 615628800.0, "step": 16700 }, { "epoch": 4.437367303609342, "grad_norm": 0.40234375, "learning_rate": 0.00018983917125287033, "loss": 3.4964, "mean_token_accuracy": 0.3812622308731079, "num_tokens": 616366080.0, "step": 16720 }, { "epoch": 4.442675159235669, "grad_norm": 0.4375, "learning_rate": 0.000189585274791171, "loss": 3.4938, "mean_token_accuracy": 0.38207762539386747, "num_tokens": 617103360.0, "step": 16740 }, { "epoch": 4.447983014861996, "grad_norm": 0.42578125, "learning_rate": 0.0001893312563675411, "loss": 3.522, "mean_token_accuracy": 0.37772477716207503, "num_tokens": 617840640.0, "step": 16760 }, { "epoch": 4.453290870488322, "grad_norm": 0.466796875, "learning_rate": 0.00018907711676460938, "loss": 3.4864, "mean_token_accuracy": 0.38133018314838407, "num_tokens": 618577920.0, "step": 16780 }, { "epoch": 4.45859872611465, "grad_norm": 0.38671875, "learning_rate": 0.0001888228567653781, "loss": 3.468, "mean_token_accuracy": 0.3842615231871605, "num_tokens": 619315200.0, "step": 16800 }, { "epoch": 4.463906581740977, "grad_norm": 0.427734375, "learning_rate": 0.00018856847715322032, "loss": 3.4908, "mean_token_accuracy": 0.38211295902729037, "num_tokens": 620052480.0, "step": 16820 }, { "epoch": 4.469214437367303, "grad_norm": 0.42578125, "learning_rate": 0.0001883139787118778, "loss": 3.4775, "mean_token_accuracy": 0.3831349194049835, "num_tokens": 620789760.0, "step": 16840 }, { "epoch": 4.474522292993631, "grad_norm": 0.455078125, "learning_rate": 0.00018805936222545815, "loss": 3.4717, "mean_token_accuracy": 0.38367715775966643, "num_tokens": 621527040.0, "step": 16860 }, { "epoch": 4.479830148619958, "grad_norm": 0.4140625, "learning_rate": 0.00018780462847843297, "loss": 3.4837, "mean_token_accuracy": 0.38377908021211626, "num_tokens": 622264320.0, "step": 16880 }, { "epoch": 4.485138004246284, "grad_norm": 0.443359375, "learning_rate": 0.00018754977825563492, "loss": 3.4938, "mean_token_accuracy": 0.38215644657611847, "num_tokens": 623001600.0, "step": 16900 }, { "epoch": 4.490445859872612, "grad_norm": 0.427734375, "learning_rate": 0.0001872948123422556, "loss": 3.4821, "mean_token_accuracy": 0.38251929730176926, "num_tokens": 623738880.0, "step": 16920 }, { "epoch": 4.495753715498939, "grad_norm": 0.43359375, "learning_rate": 0.00018703973152384306, "loss": 3.4976, "mean_token_accuracy": 0.38083414882421496, "num_tokens": 624476160.0, "step": 16940 }, { "epoch": 4.501061571125265, "grad_norm": 0.408203125, "learning_rate": 0.00018678453658629935, "loss": 3.4814, "mean_token_accuracy": 0.38357522934675214, "num_tokens": 625213440.0, "step": 16960 }, { "epoch": 4.506369426751593, "grad_norm": 0.40625, "learning_rate": 0.00018652922831587814, "loss": 3.4785, "mean_token_accuracy": 0.3836730822920799, "num_tokens": 625950720.0, "step": 16980 }, { "epoch": 4.511677282377919, "grad_norm": 0.439453125, "learning_rate": 0.0001862738074991823, "loss": 3.5015, "mean_token_accuracy": 0.38143754154443743, "num_tokens": 626688000.0, "step": 17000 }, { "epoch": 4.516985138004246, "grad_norm": 0.39453125, "learning_rate": 0.0001860182749231614, "loss": 3.4616, "mean_token_accuracy": 0.38407669961452484, "num_tokens": 627425280.0, "step": 17020 }, { "epoch": 4.522292993630574, "grad_norm": 0.40625, "learning_rate": 0.00018576263137510935, "loss": 3.5136, "mean_token_accuracy": 0.37969531267881396, "num_tokens": 628162560.0, "step": 17040 }, { "epoch": 4.5276008492569, "grad_norm": 0.412109375, "learning_rate": 0.000185506877642662, "loss": 3.4809, "mean_token_accuracy": 0.38406582921743393, "num_tokens": 628899840.0, "step": 17060 }, { "epoch": 4.532908704883227, "grad_norm": 0.4375, "learning_rate": 0.00018525101451379462, "loss": 3.4787, "mean_token_accuracy": 0.3832844078540802, "num_tokens": 629637120.0, "step": 17080 }, { "epoch": 4.538216560509554, "grad_norm": 0.416015625, "learning_rate": 0.00018499504277681966, "loss": 3.5096, "mean_token_accuracy": 0.37931615859270096, "num_tokens": 630374400.0, "step": 17100 }, { "epoch": 4.543524416135881, "grad_norm": 0.4140625, "learning_rate": 0.00018473896322038395, "loss": 3.4823, "mean_token_accuracy": 0.3824527055025101, "num_tokens": 631111680.0, "step": 17120 }, { "epoch": 4.548832271762208, "grad_norm": 0.41796875, "learning_rate": 0.00018448277663346677, "loss": 3.4839, "mean_token_accuracy": 0.383738312125206, "num_tokens": 631848960.0, "step": 17140 }, { "epoch": 4.554140127388535, "grad_norm": 0.408203125, "learning_rate": 0.00018422648380537698, "loss": 3.5036, "mean_token_accuracy": 0.3808912247419357, "num_tokens": 632586240.0, "step": 17160 }, { "epoch": 4.559447983014862, "grad_norm": 0.40234375, "learning_rate": 0.00018397008552575086, "loss": 3.4825, "mean_token_accuracy": 0.38228283524513246, "num_tokens": 633323520.0, "step": 17180 }, { "epoch": 4.564755838641189, "grad_norm": 0.458984375, "learning_rate": 0.0001837135825845496, "loss": 3.4842, "mean_token_accuracy": 0.3820749059319496, "num_tokens": 634060800.0, "step": 17200 }, { "epoch": 4.570063694267516, "grad_norm": 0.447265625, "learning_rate": 0.00018345697577205671, "loss": 3.516, "mean_token_accuracy": 0.37865840196609496, "num_tokens": 634798080.0, "step": 17220 }, { "epoch": 4.575371549893843, "grad_norm": 0.41015625, "learning_rate": 0.00018320026587887594, "loss": 3.4914, "mean_token_accuracy": 0.3823589369654655, "num_tokens": 635535360.0, "step": 17240 }, { "epoch": 4.58067940552017, "grad_norm": 0.400390625, "learning_rate": 0.00018294345369592846, "loss": 3.4701, "mean_token_accuracy": 0.3840060368180275, "num_tokens": 636272640.0, "step": 17260 }, { "epoch": 4.585987261146497, "grad_norm": 0.40234375, "learning_rate": 0.00018268654001445068, "loss": 3.4751, "mean_token_accuracy": 0.38362823575735094, "num_tokens": 637009920.0, "step": 17280 }, { "epoch": 4.591295116772824, "grad_norm": 0.43359375, "learning_rate": 0.00018242952562599166, "loss": 3.4978, "mean_token_accuracy": 0.3818520322442055, "num_tokens": 637747200.0, "step": 17300 }, { "epoch": 4.59660297239915, "grad_norm": 0.40234375, "learning_rate": 0.00018217241132241081, "loss": 3.4773, "mean_token_accuracy": 0.38367579579353334, "num_tokens": 638484480.0, "step": 17320 }, { "epoch": 4.601910828025478, "grad_norm": 0.39453125, "learning_rate": 0.00018191519789587533, "loss": 3.4911, "mean_token_accuracy": 0.38264296352863314, "num_tokens": 639221760.0, "step": 17340 }, { "epoch": 4.607218683651805, "grad_norm": 0.40625, "learning_rate": 0.0001816578861388578, "loss": 3.4614, "mean_token_accuracy": 0.38676750510931013, "num_tokens": 639959040.0, "step": 17360 }, { "epoch": 4.612526539278131, "grad_norm": 0.40625, "learning_rate": 0.00018140047684413383, "loss": 3.4649, "mean_token_accuracy": 0.38576456904411316, "num_tokens": 640696320.0, "step": 17380 }, { "epoch": 4.617834394904459, "grad_norm": 0.431640625, "learning_rate": 0.0001811429708047795, "loss": 3.4633, "mean_token_accuracy": 0.38581077456474305, "num_tokens": 641433600.0, "step": 17400 }, { "epoch": 4.623142250530785, "grad_norm": 0.42578125, "learning_rate": 0.00018088536881416894, "loss": 3.4689, "mean_token_accuracy": 0.38413105905056, "num_tokens": 642170880.0, "step": 17420 }, { "epoch": 4.628450106157112, "grad_norm": 0.392578125, "learning_rate": 0.00018062767166597194, "loss": 3.4613, "mean_token_accuracy": 0.3854125902056694, "num_tokens": 642908160.0, "step": 17440 }, { "epoch": 4.63375796178344, "grad_norm": 0.435546875, "learning_rate": 0.00018036988015415143, "loss": 3.4668, "mean_token_accuracy": 0.38515845835208895, "num_tokens": 643645440.0, "step": 17460 }, { "epoch": 4.639065817409766, "grad_norm": 0.4453125, "learning_rate": 0.00018011199507296117, "loss": 3.4863, "mean_token_accuracy": 0.38251794278621676, "num_tokens": 644382720.0, "step": 17480 }, { "epoch": 4.644373673036093, "grad_norm": 0.419921875, "learning_rate": 0.00017985401721694303, "loss": 3.4877, "mean_token_accuracy": 0.38327081948518754, "num_tokens": 645120000.0, "step": 17500 }, { "epoch": 4.649681528662421, "grad_norm": 0.40234375, "learning_rate": 0.00017959594738092485, "loss": 3.4504, "mean_token_accuracy": 0.38596570044755935, "num_tokens": 645857280.0, "step": 17520 }, { "epoch": 4.654989384288747, "grad_norm": 0.423828125, "learning_rate": 0.00017933778636001785, "loss": 3.4759, "mean_token_accuracy": 0.3848798617720604, "num_tokens": 646594560.0, "step": 17540 }, { "epoch": 4.660297239915074, "grad_norm": 0.40234375, "learning_rate": 0.0001790795349496142, "loss": 3.4882, "mean_token_accuracy": 0.3821061596274376, "num_tokens": 647331840.0, "step": 17560 }, { "epoch": 4.665605095541402, "grad_norm": 0.41796875, "learning_rate": 0.0001788211939453845, "loss": 3.4545, "mean_token_accuracy": 0.3864603728055954, "num_tokens": 648069120.0, "step": 17580 }, { "epoch": 4.670912951167728, "grad_norm": 0.41796875, "learning_rate": 0.00017856276414327543, "loss": 3.4568, "mean_token_accuracy": 0.38588551729917525, "num_tokens": 648806400.0, "step": 17600 }, { "epoch": 4.676220806794055, "grad_norm": 0.41796875, "learning_rate": 0.00017830424633950726, "loss": 3.4601, "mean_token_accuracy": 0.38639106303453447, "num_tokens": 649543680.0, "step": 17620 }, { "epoch": 4.681528662420382, "grad_norm": 0.421875, "learning_rate": 0.0001780456413305713, "loss": 3.4739, "mean_token_accuracy": 0.38336051553487777, "num_tokens": 650280960.0, "step": 17640 }, { "epoch": 4.686836518046709, "grad_norm": 0.421875, "learning_rate": 0.00017778694991322778, "loss": 3.4522, "mean_token_accuracy": 0.3864345520734787, "num_tokens": 651018240.0, "step": 17660 }, { "epoch": 4.692144373673036, "grad_norm": 0.431640625, "learning_rate": 0.0001775281728845028, "loss": 3.4385, "mean_token_accuracy": 0.38862116634845734, "num_tokens": 651755520.0, "step": 17680 }, { "epoch": 4.697452229299363, "grad_norm": 0.423828125, "learning_rate": 0.00017726931104168655, "loss": 3.4741, "mean_token_accuracy": 0.3849247172474861, "num_tokens": 652492800.0, "step": 17700 }, { "epoch": 4.70276008492569, "grad_norm": 0.431640625, "learning_rate": 0.00017701036518233033, "loss": 3.4699, "mean_token_accuracy": 0.38390818536281585, "num_tokens": 653230080.0, "step": 17720 }, { "epoch": 4.7080679405520165, "grad_norm": 0.427734375, "learning_rate": 0.00017675133610424435, "loss": 3.4609, "mean_token_accuracy": 0.3854139491915703, "num_tokens": 653967360.0, "step": 17740 }, { "epoch": 4.713375796178344, "grad_norm": 0.39453125, "learning_rate": 0.00017649222460549527, "loss": 3.4641, "mean_token_accuracy": 0.38550771623849867, "num_tokens": 654704640.0, "step": 17760 }, { "epoch": 4.718683651804671, "grad_norm": 0.435546875, "learning_rate": 0.00017623303148440366, "loss": 3.4687, "mean_token_accuracy": 0.384574094414711, "num_tokens": 655441920.0, "step": 17780 }, { "epoch": 4.7239915074309975, "grad_norm": 0.41015625, "learning_rate": 0.00017597375753954152, "loss": 3.437, "mean_token_accuracy": 0.3877853900194168, "num_tokens": 656179200.0, "step": 17800 }, { "epoch": 4.729299363057325, "grad_norm": 0.462890625, "learning_rate": 0.0001757144035697299, "loss": 3.4378, "mean_token_accuracy": 0.38779897540807723, "num_tokens": 656916480.0, "step": 17820 }, { "epoch": 4.734607218683652, "grad_norm": 0.41796875, "learning_rate": 0.00017545497037403646, "loss": 3.4798, "mean_token_accuracy": 0.3845985531806946, "num_tokens": 657653760.0, "step": 17840 }, { "epoch": 4.7399150743099785, "grad_norm": 0.484375, "learning_rate": 0.00017519545875177288, "loss": 3.4356, "mean_token_accuracy": 0.38808980137109755, "num_tokens": 658391040.0, "step": 17860 }, { "epoch": 4.745222929936306, "grad_norm": 0.416015625, "learning_rate": 0.00017493586950249247, "loss": 3.4558, "mean_token_accuracy": 0.3856775939464569, "num_tokens": 659128320.0, "step": 17880 }, { "epoch": 4.750530785562633, "grad_norm": 0.46484375, "learning_rate": 0.00017467620342598777, "loss": 3.4384, "mean_token_accuracy": 0.3883548051118851, "num_tokens": 659865600.0, "step": 17900 }, { "epoch": 4.7558386411889595, "grad_norm": 0.470703125, "learning_rate": 0.00017441646132228792, "loss": 3.4551, "mean_token_accuracy": 0.38619536757469175, "num_tokens": 660602880.0, "step": 17920 }, { "epoch": 4.761146496815287, "grad_norm": 0.416015625, "learning_rate": 0.00017415664399165643, "loss": 3.4784, "mean_token_accuracy": 0.3841392144560814, "num_tokens": 661340160.0, "step": 17940 }, { "epoch": 4.766454352441613, "grad_norm": 0.400390625, "learning_rate": 0.00017389675223458852, "loss": 3.4799, "mean_token_accuracy": 0.38443955183029177, "num_tokens": 662077440.0, "step": 17960 }, { "epoch": 4.7717622080679405, "grad_norm": 0.3984375, "learning_rate": 0.0001736367868518087, "loss": 3.4537, "mean_token_accuracy": 0.386271470785141, "num_tokens": 662814720.0, "step": 17980 }, { "epoch": 4.777070063694268, "grad_norm": 0.42578125, "learning_rate": 0.00017337674864426828, "loss": 3.4492, "mean_token_accuracy": 0.3870841473340988, "num_tokens": 663552000.0, "step": 18000 }, { "epoch": 4.782377919320594, "grad_norm": 0.396484375, "learning_rate": 0.00017311663841314308, "loss": 3.4494, "mean_token_accuracy": 0.387364099919796, "num_tokens": 664289280.0, "step": 18020 }, { "epoch": 4.7876857749469215, "grad_norm": 0.40625, "learning_rate": 0.0001728564569598308, "loss": 3.4637, "mean_token_accuracy": 0.3844327539205551, "num_tokens": 665026560.0, "step": 18040 }, { "epoch": 4.792993630573249, "grad_norm": 0.443359375, "learning_rate": 0.00017259620508594834, "loss": 3.4657, "mean_token_accuracy": 0.3850538179278374, "num_tokens": 665763840.0, "step": 18060 }, { "epoch": 4.798301486199575, "grad_norm": 0.443359375, "learning_rate": 0.00017233588359332993, "loss": 3.4577, "mean_token_accuracy": 0.385654491186142, "num_tokens": 666501120.0, "step": 18080 }, { "epoch": 4.8036093418259025, "grad_norm": 0.38671875, "learning_rate": 0.00017207549328402397, "loss": 3.4685, "mean_token_accuracy": 0.38422075659036636, "num_tokens": 667238400.0, "step": 18100 }, { "epoch": 4.80891719745223, "grad_norm": 0.404296875, "learning_rate": 0.00017181503496029109, "loss": 3.4589, "mean_token_accuracy": 0.38632583022117617, "num_tokens": 667975680.0, "step": 18120 }, { "epoch": 4.814225053078556, "grad_norm": 0.396484375, "learning_rate": 0.00017155450942460133, "loss": 3.4699, "mean_token_accuracy": 0.3844640150666237, "num_tokens": 668712960.0, "step": 18140 }, { "epoch": 4.8195329087048835, "grad_norm": 0.4453125, "learning_rate": 0.000171293917479632, "loss": 3.4472, "mean_token_accuracy": 0.38674575835466385, "num_tokens": 669450240.0, "step": 18160 }, { "epoch": 4.82484076433121, "grad_norm": 0.431640625, "learning_rate": 0.00017103325992826483, "loss": 3.4602, "mean_token_accuracy": 0.38557159155607224, "num_tokens": 670187520.0, "step": 18180 }, { "epoch": 4.830148619957537, "grad_norm": 0.45703125, "learning_rate": 0.0001707725375735838, "loss": 3.4728, "mean_token_accuracy": 0.3843457818031311, "num_tokens": 670924800.0, "step": 18200 }, { "epoch": 4.8354564755838645, "grad_norm": 0.431640625, "learning_rate": 0.00017051175121887243, "loss": 3.4387, "mean_token_accuracy": 0.38811290413141253, "num_tokens": 671662080.0, "step": 18220 }, { "epoch": 4.840764331210191, "grad_norm": 0.41796875, "learning_rate": 0.00017025090166761152, "loss": 3.4536, "mean_token_accuracy": 0.3859113365411758, "num_tokens": 672399360.0, "step": 18240 }, { "epoch": 4.846072186836518, "grad_norm": 0.419921875, "learning_rate": 0.00016998998972347665, "loss": 3.4562, "mean_token_accuracy": 0.38556343615055083, "num_tokens": 673136640.0, "step": 18260 }, { "epoch": 4.851380042462845, "grad_norm": 0.427734375, "learning_rate": 0.00016972901619033542, "loss": 3.451, "mean_token_accuracy": 0.38674712032079694, "num_tokens": 673873920.0, "step": 18280 }, { "epoch": 4.856687898089172, "grad_norm": 0.4296875, "learning_rate": 0.0001694679818722453, "loss": 3.4485, "mean_token_accuracy": 0.38768753707408904, "num_tokens": 674611200.0, "step": 18300 }, { "epoch": 4.861995753715499, "grad_norm": 0.412109375, "learning_rate": 0.0001692068875734511, "loss": 3.4946, "mean_token_accuracy": 0.38318112641572954, "num_tokens": 675348480.0, "step": 18320 }, { "epoch": 4.867303609341826, "grad_norm": 0.435546875, "learning_rate": 0.00016894573409838236, "loss": 3.4596, "mean_token_accuracy": 0.38521009832620623, "num_tokens": 676085760.0, "step": 18340 }, { "epoch": 4.872611464968153, "grad_norm": 0.439453125, "learning_rate": 0.00016868452225165091, "loss": 3.452, "mean_token_accuracy": 0.38609480261802676, "num_tokens": 676823040.0, "step": 18360 }, { "epoch": 4.87791932059448, "grad_norm": 0.4296875, "learning_rate": 0.00016842325283804847, "loss": 3.4701, "mean_token_accuracy": 0.38518971502780913, "num_tokens": 677560320.0, "step": 18380 }, { "epoch": 4.883227176220807, "grad_norm": 0.439453125, "learning_rate": 0.00016816192666254418, "loss": 3.4549, "mean_token_accuracy": 0.38550636172294617, "num_tokens": 678297600.0, "step": 18400 }, { "epoch": 4.888535031847134, "grad_norm": 0.427734375, "learning_rate": 0.00016790054453028194, "loss": 3.4569, "mean_token_accuracy": 0.38620895594358445, "num_tokens": 679034880.0, "step": 18420 }, { "epoch": 4.893842887473461, "grad_norm": 0.455078125, "learning_rate": 0.00016763910724657815, "loss": 3.4549, "mean_token_accuracy": 0.38618313819169997, "num_tokens": 679772160.0, "step": 18440 }, { "epoch": 4.899150743099788, "grad_norm": 0.423828125, "learning_rate": 0.00016737761561691906, "loss": 3.4465, "mean_token_accuracy": 0.387161611020565, "num_tokens": 680509440.0, "step": 18460 }, { "epoch": 4.904458598726115, "grad_norm": 0.423828125, "learning_rate": 0.0001671160704469584, "loss": 3.4589, "mean_token_accuracy": 0.3848785042762756, "num_tokens": 681246720.0, "step": 18480 }, { "epoch": 4.909766454352441, "grad_norm": 0.45703125, "learning_rate": 0.00016685447254251483, "loss": 3.4482, "mean_token_accuracy": 0.3865296795964241, "num_tokens": 681984000.0, "step": 18500 }, { "epoch": 4.915074309978769, "grad_norm": 0.400390625, "learning_rate": 0.00016659282270956954, "loss": 3.4597, "mean_token_accuracy": 0.3861151859164238, "num_tokens": 682721280.0, "step": 18520 }, { "epoch": 4.920382165605096, "grad_norm": 0.41015625, "learning_rate": 0.00016633112175426366, "loss": 3.4364, "mean_token_accuracy": 0.388728529214859, "num_tokens": 683458560.0, "step": 18540 }, { "epoch": 4.925690021231422, "grad_norm": 0.453125, "learning_rate": 0.00016606937048289588, "loss": 3.4521, "mean_token_accuracy": 0.3866098612546921, "num_tokens": 684195840.0, "step": 18560 }, { "epoch": 4.93099787685775, "grad_norm": 0.439453125, "learning_rate": 0.0001658075697019198, "loss": 3.4343, "mean_token_accuracy": 0.38814823925495145, "num_tokens": 684933120.0, "step": 18580 }, { "epoch": 4.936305732484076, "grad_norm": 0.435546875, "learning_rate": 0.00016554572021794167, "loss": 3.4612, "mean_token_accuracy": 0.38505924940109254, "num_tokens": 685670400.0, "step": 18600 }, { "epoch": 4.941613588110403, "grad_norm": 0.419921875, "learning_rate": 0.00016528382283771779, "loss": 3.4528, "mean_token_accuracy": 0.3864875555038452, "num_tokens": 686407680.0, "step": 18620 }, { "epoch": 4.946921443736731, "grad_norm": 0.431640625, "learning_rate": 0.00016502187836815198, "loss": 3.4594, "mean_token_accuracy": 0.38623613864183426, "num_tokens": 687144960.0, "step": 18640 }, { "epoch": 4.952229299363057, "grad_norm": 0.455078125, "learning_rate": 0.00016475988761629313, "loss": 3.4485, "mean_token_accuracy": 0.38618857115507127, "num_tokens": 687882240.0, "step": 18660 }, { "epoch": 4.957537154989384, "grad_norm": 0.419921875, "learning_rate": 0.00016449785138933278, "loss": 3.4684, "mean_token_accuracy": 0.3851394310593605, "num_tokens": 688619520.0, "step": 18680 }, { "epoch": 4.962845010615712, "grad_norm": 0.416015625, "learning_rate": 0.00016423577049460254, "loss": 3.4723, "mean_token_accuracy": 0.3841813415288925, "num_tokens": 689356800.0, "step": 18700 }, { "epoch": 4.968152866242038, "grad_norm": 0.439453125, "learning_rate": 0.0001639736457395717, "loss": 3.4484, "mean_token_accuracy": 0.3873342052102089, "num_tokens": 690094080.0, "step": 18720 }, { "epoch": 4.973460721868365, "grad_norm": 0.462890625, "learning_rate": 0.00016371147793184455, "loss": 3.4545, "mean_token_accuracy": 0.3872907146811485, "num_tokens": 690831360.0, "step": 18740 }, { "epoch": 4.978768577494693, "grad_norm": 0.421875, "learning_rate": 0.00016344926787915817, "loss": 3.4576, "mean_token_accuracy": 0.38582572489976885, "num_tokens": 691568640.0, "step": 18760 }, { "epoch": 4.984076433121019, "grad_norm": 0.458984375, "learning_rate": 0.00016318701638937973, "loss": 3.4399, "mean_token_accuracy": 0.38740622997283936, "num_tokens": 692305920.0, "step": 18780 }, { "epoch": 4.989384288747346, "grad_norm": 0.4140625, "learning_rate": 0.00016292472427050409, "loss": 3.4217, "mean_token_accuracy": 0.38990949243307116, "num_tokens": 693043200.0, "step": 18800 }, { "epoch": 4.994692144373673, "grad_norm": 0.431640625, "learning_rate": 0.0001626623923306513, "loss": 3.4713, "mean_token_accuracy": 0.38434713780879975, "num_tokens": 693780480.0, "step": 18820 }, { "epoch": 5.0, "grad_norm": 0.431640625, "learning_rate": 0.00016240002137806397, "loss": 3.4471, "mean_token_accuracy": 0.3872988685965538, "num_tokens": 694517760.0, "step": 18840 }, { "epoch": 5.005307855626327, "grad_norm": 0.4140625, "learning_rate": 0.00016213761222110513, "loss": 3.2994, "mean_token_accuracy": 0.40410143733024595, "num_tokens": 695255040.0, "step": 18860 }, { "epoch": 5.010615711252654, "grad_norm": 0.474609375, "learning_rate": 0.00016187516566825528, "loss": 3.2989, "mean_token_accuracy": 0.40344096422195436, "num_tokens": 695992320.0, "step": 18880 }, { "epoch": 5.015923566878981, "grad_norm": 0.416015625, "learning_rate": 0.00016161268252811036, "loss": 3.3015, "mean_token_accuracy": 0.4042332574725151, "num_tokens": 696729600.0, "step": 18900 }, { "epoch": 5.021231422505308, "grad_norm": 0.470703125, "learning_rate": 0.0001613501636093788, "loss": 3.3091, "mean_token_accuracy": 0.40281854569911957, "num_tokens": 697466880.0, "step": 18920 }, { "epoch": 5.026539278131635, "grad_norm": 0.4609375, "learning_rate": 0.00016108760972087952, "loss": 3.3343, "mean_token_accuracy": 0.39958143085241316, "num_tokens": 698204160.0, "step": 18940 }, { "epoch": 5.031847133757962, "grad_norm": 0.4296875, "learning_rate": 0.000160825021671539, "loss": 3.3152, "mean_token_accuracy": 0.40200586915016173, "num_tokens": 698941440.0, "step": 18960 }, { "epoch": 5.037154989384288, "grad_norm": 0.453125, "learning_rate": 0.00016056240027038904, "loss": 3.3387, "mean_token_accuracy": 0.39941563457250595, "num_tokens": 699678720.0, "step": 18980 }, { "epoch": 5.042462845010616, "grad_norm": 0.431640625, "learning_rate": 0.00016029974632656417, "loss": 3.331, "mean_token_accuracy": 0.40101380944252013, "num_tokens": 700416000.0, "step": 19000 }, { "epoch": 5.047770700636943, "grad_norm": 0.435546875, "learning_rate": 0.00016003706064929917, "loss": 3.3487, "mean_token_accuracy": 0.3991438329219818, "num_tokens": 701153280.0, "step": 19020 }, { "epoch": 5.053078556263269, "grad_norm": 0.4375, "learning_rate": 0.00015977434404792668, "loss": 3.3186, "mean_token_accuracy": 0.4016933023929596, "num_tokens": 701890560.0, "step": 19040 }, { "epoch": 5.058386411889597, "grad_norm": 0.439453125, "learning_rate": 0.00015951159733187449, "loss": 3.3073, "mean_token_accuracy": 0.4030074492096901, "num_tokens": 702627840.0, "step": 19060 }, { "epoch": 5.063694267515924, "grad_norm": 0.46484375, "learning_rate": 0.00015924882131066324, "loss": 3.3444, "mean_token_accuracy": 0.3989141687750816, "num_tokens": 703365120.0, "step": 19080 }, { "epoch": 5.06900212314225, "grad_norm": 0.455078125, "learning_rate": 0.00015898601679390384, "loss": 3.3404, "mean_token_accuracy": 0.39859887808561323, "num_tokens": 704102400.0, "step": 19100 }, { "epoch": 5.074309978768578, "grad_norm": 0.455078125, "learning_rate": 0.00015872318459129507, "loss": 3.3283, "mean_token_accuracy": 0.40088470131158827, "num_tokens": 704839680.0, "step": 19120 }, { "epoch": 5.079617834394904, "grad_norm": 0.470703125, "learning_rate": 0.00015846032551262087, "loss": 3.3661, "mean_token_accuracy": 0.3950695812702179, "num_tokens": 705576960.0, "step": 19140 }, { "epoch": 5.084925690021231, "grad_norm": 0.421875, "learning_rate": 0.00015819744036774813, "loss": 3.3446, "mean_token_accuracy": 0.39824554324150085, "num_tokens": 706314240.0, "step": 19160 }, { "epoch": 5.090233545647559, "grad_norm": 0.447265625, "learning_rate": 0.00015793452996662395, "loss": 3.3536, "mean_token_accuracy": 0.3969708070158958, "num_tokens": 707051520.0, "step": 19180 }, { "epoch": 5.095541401273885, "grad_norm": 0.44921875, "learning_rate": 0.00015767159511927332, "loss": 3.3413, "mean_token_accuracy": 0.3992498368024826, "num_tokens": 707788800.0, "step": 19200 }, { "epoch": 5.100849256900212, "grad_norm": 0.421875, "learning_rate": 0.0001574086366357964, "loss": 3.3426, "mean_token_accuracy": 0.3986165463924408, "num_tokens": 708526080.0, "step": 19220 }, { "epoch": 5.10615711252654, "grad_norm": 0.4296875, "learning_rate": 0.00015714565532636641, "loss": 3.36, "mean_token_accuracy": 0.3967397794127464, "num_tokens": 709263360.0, "step": 19240 }, { "epoch": 5.111464968152866, "grad_norm": 0.4375, "learning_rate": 0.00015688265200122667, "loss": 3.3745, "mean_token_accuracy": 0.39570558816194534, "num_tokens": 710000640.0, "step": 19260 }, { "epoch": 5.116772823779193, "grad_norm": 0.451171875, "learning_rate": 0.00015661962747068846, "loss": 3.3548, "mean_token_accuracy": 0.39748451113700867, "num_tokens": 710737920.0, "step": 19280 }, { "epoch": 5.12208067940552, "grad_norm": 0.45703125, "learning_rate": 0.00015635658254512834, "loss": 3.3416, "mean_token_accuracy": 0.39921450316905976, "num_tokens": 711475200.0, "step": 19300 }, { "epoch": 5.127388535031847, "grad_norm": 0.462890625, "learning_rate": 0.0001560935180349857, "loss": 3.3568, "mean_token_accuracy": 0.397405681014061, "num_tokens": 712212480.0, "step": 19320 }, { "epoch": 5.132696390658174, "grad_norm": 0.44921875, "learning_rate": 0.00015583043475076038, "loss": 3.3301, "mean_token_accuracy": 0.40037915855646133, "num_tokens": 712949760.0, "step": 19340 }, { "epoch": 5.138004246284501, "grad_norm": 0.431640625, "learning_rate": 0.00015556733350300989, "loss": 3.3506, "mean_token_accuracy": 0.39772776812314986, "num_tokens": 713687040.0, "step": 19360 }, { "epoch": 5.143312101910828, "grad_norm": 0.478515625, "learning_rate": 0.00015530421510234727, "loss": 3.3746, "mean_token_accuracy": 0.3940503358840942, "num_tokens": 714424320.0, "step": 19380 }, { "epoch": 5.148619957537155, "grad_norm": 0.451171875, "learning_rate": 0.0001550410803594382, "loss": 3.3573, "mean_token_accuracy": 0.39747091978788374, "num_tokens": 715161600.0, "step": 19400 }, { "epoch": 5.153927813163482, "grad_norm": 0.4296875, "learning_rate": 0.00015477793008499897, "loss": 3.3771, "mean_token_accuracy": 0.39386143535375595, "num_tokens": 715898880.0, "step": 19420 }, { "epoch": 5.159235668789809, "grad_norm": 0.4453125, "learning_rate": 0.0001545147650897934, "loss": 3.3714, "mean_token_accuracy": 0.3958768218755722, "num_tokens": 716636160.0, "step": 19440 }, { "epoch": 5.1645435244161355, "grad_norm": 0.4140625, "learning_rate": 0.00015425158618463092, "loss": 3.368, "mean_token_accuracy": 0.3949826031923294, "num_tokens": 717373440.0, "step": 19460 }, { "epoch": 5.169851380042463, "grad_norm": 0.43359375, "learning_rate": 0.00015398839418036375, "loss": 3.3651, "mean_token_accuracy": 0.3955357149243355, "num_tokens": 718110720.0, "step": 19480 }, { "epoch": 5.17515923566879, "grad_norm": 0.419921875, "learning_rate": 0.00015372518988788445, "loss": 3.3809, "mean_token_accuracy": 0.39498667865991594, "num_tokens": 718848000.0, "step": 19500 }, { "epoch": 5.1804670912951165, "grad_norm": 0.458984375, "learning_rate": 0.0001534619741181234, "loss": 3.3505, "mean_token_accuracy": 0.39851462543010713, "num_tokens": 719585280.0, "step": 19520 }, { "epoch": 5.185774946921444, "grad_norm": 0.451171875, "learning_rate": 0.00015319874768204646, "loss": 3.3727, "mean_token_accuracy": 0.39443628937006, "num_tokens": 720322560.0, "step": 19540 }, { "epoch": 5.191082802547771, "grad_norm": 0.4609375, "learning_rate": 0.00015293551139065222, "loss": 3.3672, "mean_token_accuracy": 0.39669493287801744, "num_tokens": 721059840.0, "step": 19560 }, { "epoch": 5.1963906581740975, "grad_norm": 0.443359375, "learning_rate": 0.00015267226605496969, "loss": 3.3763, "mean_token_accuracy": 0.3956811264157295, "num_tokens": 721797120.0, "step": 19580 }, { "epoch": 5.201698513800425, "grad_norm": 0.439453125, "learning_rate": 0.00015240901248605583, "loss": 3.3808, "mean_token_accuracy": 0.3943465992808342, "num_tokens": 722534400.0, "step": 19600 }, { "epoch": 5.207006369426751, "grad_norm": 0.4765625, "learning_rate": 0.0001521457514949928, "loss": 3.3505, "mean_token_accuracy": 0.3979587942361832, "num_tokens": 723271680.0, "step": 19620 }, { "epoch": 5.2123142250530785, "grad_norm": 0.447265625, "learning_rate": 0.0001518824838928857, "loss": 3.4014, "mean_token_accuracy": 0.39230403304100037, "num_tokens": 724008960.0, "step": 19640 }, { "epoch": 5.217622080679406, "grad_norm": 0.439453125, "learning_rate": 0.00015161921049086007, "loss": 3.3665, "mean_token_accuracy": 0.3965576782822609, "num_tokens": 724746240.0, "step": 19660 }, { "epoch": 5.222929936305732, "grad_norm": 0.41015625, "learning_rate": 0.0001513559321000592, "loss": 3.3749, "mean_token_accuracy": 0.3951008379459381, "num_tokens": 725483520.0, "step": 19680 }, { "epoch": 5.2282377919320595, "grad_norm": 0.423828125, "learning_rate": 0.00015109264953164185, "loss": 3.3764, "mean_token_accuracy": 0.39565394520759584, "num_tokens": 726220800.0, "step": 19700 }, { "epoch": 5.233545647558387, "grad_norm": 0.451171875, "learning_rate": 0.0001508293635967796, "loss": 3.3776, "mean_token_accuracy": 0.3950954005122185, "num_tokens": 726958080.0, "step": 19720 }, { "epoch": 5.238853503184713, "grad_norm": 0.439453125, "learning_rate": 0.00015056607510665444, "loss": 3.3801, "mean_token_accuracy": 0.3945300579071045, "num_tokens": 727695360.0, "step": 19740 }, { "epoch": 5.2441613588110405, "grad_norm": 0.4375, "learning_rate": 0.00015030278487245612, "loss": 3.3705, "mean_token_accuracy": 0.39524353444576266, "num_tokens": 728432640.0, "step": 19760 }, { "epoch": 5.249469214437367, "grad_norm": 0.41015625, "learning_rate": 0.00015003949370537994, "loss": 3.3618, "mean_token_accuracy": 0.3956607446074486, "num_tokens": 729169920.0, "step": 19780 }, { "epoch": 5.254777070063694, "grad_norm": 0.43359375, "learning_rate": 0.00014977620241662397, "loss": 3.3915, "mean_token_accuracy": 0.39299847930669785, "num_tokens": 729907200.0, "step": 19800 }, { "epoch": 5.2600849256900215, "grad_norm": 0.421875, "learning_rate": 0.0001495129118173867, "loss": 3.3804, "mean_token_accuracy": 0.3947841927409172, "num_tokens": 730644480.0, "step": 19820 }, { "epoch": 5.265392781316348, "grad_norm": 0.447265625, "learning_rate": 0.00014924962271886433, "loss": 3.3843, "mean_token_accuracy": 0.3941305175423622, "num_tokens": 731381760.0, "step": 19840 }, { "epoch": 5.270700636942675, "grad_norm": 0.427734375, "learning_rate": 0.00014898633593224872, "loss": 3.3746, "mean_token_accuracy": 0.3953046858310699, "num_tokens": 732119040.0, "step": 19860 }, { "epoch": 5.2760084925690025, "grad_norm": 0.47265625, "learning_rate": 0.0001487230522687244, "loss": 3.3779, "mean_token_accuracy": 0.3954786375164986, "num_tokens": 732856320.0, "step": 19880 }, { "epoch": 5.281316348195329, "grad_norm": 0.453125, "learning_rate": 0.00014845977253946625, "loss": 3.3674, "mean_token_accuracy": 0.3960059225559235, "num_tokens": 733593600.0, "step": 19900 }, { "epoch": 5.286624203821656, "grad_norm": 0.412109375, "learning_rate": 0.00014819649755563728, "loss": 3.3873, "mean_token_accuracy": 0.39440503269433974, "num_tokens": 734330880.0, "step": 19920 }, { "epoch": 5.291932059447983, "grad_norm": 0.4375, "learning_rate": 0.00014793322812838553, "loss": 3.3943, "mean_token_accuracy": 0.3934863582253456, "num_tokens": 735068160.0, "step": 19940 }, { "epoch": 5.29723991507431, "grad_norm": 0.421875, "learning_rate": 0.0001476699650688423, "loss": 3.363, "mean_token_accuracy": 0.3960616454482079, "num_tokens": 735805440.0, "step": 19960 }, { "epoch": 5.302547770700637, "grad_norm": 0.4140625, "learning_rate": 0.00014740670918811892, "loss": 3.3912, "mean_token_accuracy": 0.3923950865864754, "num_tokens": 736542720.0, "step": 19980 }, { "epoch": 5.307855626326964, "grad_norm": 0.419921875, "learning_rate": 0.00014714346129730482, "loss": 3.379, "mean_token_accuracy": 0.39665824323892596, "num_tokens": 737280000.0, "step": 20000 }, { "epoch": 5.313163481953291, "grad_norm": 0.43359375, "learning_rate": 0.00014688022220746473, "loss": 3.3838, "mean_token_accuracy": 0.3947488576173782, "num_tokens": 738017280.0, "step": 20020 }, { "epoch": 5.318471337579618, "grad_norm": 0.453125, "learning_rate": 0.00014661699272963634, "loss": 3.3489, "mean_token_accuracy": 0.3994550406932831, "num_tokens": 738754560.0, "step": 20040 }, { "epoch": 5.323779193205945, "grad_norm": 0.431640625, "learning_rate": 0.00014635377367482752, "loss": 3.3735, "mean_token_accuracy": 0.3949907585978508, "num_tokens": 739491840.0, "step": 20060 }, { "epoch": 5.329087048832272, "grad_norm": 0.455078125, "learning_rate": 0.0001460905658540144, "loss": 3.4024, "mean_token_accuracy": 0.39215182662010195, "num_tokens": 740229120.0, "step": 20080 }, { "epoch": 5.334394904458598, "grad_norm": 0.447265625, "learning_rate": 0.00014582737007813807, "loss": 3.3759, "mean_token_accuracy": 0.39500570893287656, "num_tokens": 740966400.0, "step": 20100 }, { "epoch": 5.339702760084926, "grad_norm": 0.451171875, "learning_rate": 0.00014556418715810286, "loss": 3.3679, "mean_token_accuracy": 0.3969979897141457, "num_tokens": 741703680.0, "step": 20120 }, { "epoch": 5.345010615711253, "grad_norm": 0.435546875, "learning_rate": 0.0001453010179047732, "loss": 3.3781, "mean_token_accuracy": 0.39398646354675293, "num_tokens": 742440960.0, "step": 20140 }, { "epoch": 5.350318471337579, "grad_norm": 0.427734375, "learning_rate": 0.00014503786312897174, "loss": 3.3963, "mean_token_accuracy": 0.39267096370458604, "num_tokens": 743178240.0, "step": 20160 }, { "epoch": 5.355626326963907, "grad_norm": 0.443359375, "learning_rate": 0.0001447747236414762, "loss": 3.3649, "mean_token_accuracy": 0.3959610819816589, "num_tokens": 743915520.0, "step": 20180 }, { "epoch": 5.360934182590234, "grad_norm": 0.416015625, "learning_rate": 0.0001445116002530174, "loss": 3.3756, "mean_token_accuracy": 0.3958047941327095, "num_tokens": 744652800.0, "step": 20200 }, { "epoch": 5.36624203821656, "grad_norm": 0.392578125, "learning_rate": 0.00014424849377427644, "loss": 3.3599, "mean_token_accuracy": 0.3972303748130798, "num_tokens": 745390080.0, "step": 20220 }, { "epoch": 5.371549893842888, "grad_norm": 0.44140625, "learning_rate": 0.00014398540501588247, "loss": 3.3784, "mean_token_accuracy": 0.3943629041314125, "num_tokens": 746127360.0, "step": 20240 }, { "epoch": 5.376857749469215, "grad_norm": 0.431640625, "learning_rate": 0.00014372233478840993, "loss": 3.3658, "mean_token_accuracy": 0.39552212357521055, "num_tokens": 746864640.0, "step": 20260 }, { "epoch": 5.382165605095541, "grad_norm": 0.455078125, "learning_rate": 0.0001434592839023761, "loss": 3.3836, "mean_token_accuracy": 0.3942256450653076, "num_tokens": 747601920.0, "step": 20280 }, { "epoch": 5.387473460721869, "grad_norm": 0.462890625, "learning_rate": 0.00014319625316823887, "loss": 3.3988, "mean_token_accuracy": 0.39367661327123643, "num_tokens": 748339200.0, "step": 20300 }, { "epoch": 5.392781316348195, "grad_norm": 0.43359375, "learning_rate": 0.00014293324339639387, "loss": 3.3689, "mean_token_accuracy": 0.3966432884335518, "num_tokens": 749076480.0, "step": 20320 }, { "epoch": 5.398089171974522, "grad_norm": 0.43359375, "learning_rate": 0.00014267025539717226, "loss": 3.3804, "mean_token_accuracy": 0.39448929131031035, "num_tokens": 749813760.0, "step": 20340 }, { "epoch": 5.40339702760085, "grad_norm": 0.423828125, "learning_rate": 0.000142407289980838, "loss": 3.3672, "mean_token_accuracy": 0.39593526124954226, "num_tokens": 750551040.0, "step": 20360 }, { "epoch": 5.408704883227176, "grad_norm": 0.419921875, "learning_rate": 0.00014214434795758557, "loss": 3.3569, "mean_token_accuracy": 0.39864644706249236, "num_tokens": 751288320.0, "step": 20380 }, { "epoch": 5.414012738853503, "grad_norm": 0.443359375, "learning_rate": 0.00014188143013753736, "loss": 3.3762, "mean_token_accuracy": 0.3954541727900505, "num_tokens": 752025600.0, "step": 20400 }, { "epoch": 5.41932059447983, "grad_norm": 0.46875, "learning_rate": 0.00014161853733074117, "loss": 3.3717, "mean_token_accuracy": 0.3958346888422966, "num_tokens": 752762880.0, "step": 20420 }, { "epoch": 5.424628450106157, "grad_norm": 0.46484375, "learning_rate": 0.00014135567034716766, "loss": 3.3687, "mean_token_accuracy": 0.3959760308265686, "num_tokens": 753500160.0, "step": 20440 }, { "epoch": 5.429936305732484, "grad_norm": 0.435546875, "learning_rate": 0.00014109282999670816, "loss": 3.3783, "mean_token_accuracy": 0.39461159855127337, "num_tokens": 754237440.0, "step": 20460 }, { "epoch": 5.435244161358811, "grad_norm": 0.419921875, "learning_rate": 0.00014083001708917162, "loss": 3.3833, "mean_token_accuracy": 0.39439416378736497, "num_tokens": 754974720.0, "step": 20480 }, { "epoch": 5.440552016985138, "grad_norm": 0.443359375, "learning_rate": 0.00014056723243428278, "loss": 3.4008, "mean_token_accuracy": 0.392479345202446, "num_tokens": 755712000.0, "step": 20500 }, { "epoch": 5.445859872611465, "grad_norm": 0.451171875, "learning_rate": 0.00014030447684167902, "loss": 3.3895, "mean_token_accuracy": 0.39312622249126433, "num_tokens": 756449280.0, "step": 20520 }, { "epoch": 5.451167728237792, "grad_norm": 0.4140625, "learning_rate": 0.0001400417511209084, "loss": 3.3785, "mean_token_accuracy": 0.39461839646101, "num_tokens": 757186560.0, "step": 20540 }, { "epoch": 5.456475583864119, "grad_norm": 0.45703125, "learning_rate": 0.00013977905608142686, "loss": 3.3661, "mean_token_accuracy": 0.3955887138843536, "num_tokens": 757923840.0, "step": 20560 }, { "epoch": 5.461783439490446, "grad_norm": 0.494140625, "learning_rate": 0.00013951639253259582, "loss": 3.3843, "mean_token_accuracy": 0.39391307830810546, "num_tokens": 758661120.0, "step": 20580 }, { "epoch": 5.467091295116773, "grad_norm": 0.4296875, "learning_rate": 0.0001392537612836796, "loss": 3.3727, "mean_token_accuracy": 0.39660388231277466, "num_tokens": 759398400.0, "step": 20600 }, { "epoch": 5.4723991507431, "grad_norm": 0.4609375, "learning_rate": 0.00013899116314384323, "loss": 3.3732, "mean_token_accuracy": 0.3939511299133301, "num_tokens": 760135680.0, "step": 20620 }, { "epoch": 5.477707006369426, "grad_norm": 0.474609375, "learning_rate": 0.0001387285989221495, "loss": 3.438, "mean_token_accuracy": 0.38866873234510424, "num_tokens": 760872960.0, "step": 20640 }, { "epoch": 5.483014861995754, "grad_norm": 0.46875, "learning_rate": 0.00013846606942755668, "loss": 3.3769, "mean_token_accuracy": 0.39444444328546524, "num_tokens": 761610240.0, "step": 20660 }, { "epoch": 5.488322717622081, "grad_norm": 0.4296875, "learning_rate": 0.00013820357546891633, "loss": 3.372, "mean_token_accuracy": 0.3952435329556465, "num_tokens": 762347520.0, "step": 20680 }, { "epoch": 5.493630573248407, "grad_norm": 0.447265625, "learning_rate": 0.00013794111785497018, "loss": 3.4016, "mean_token_accuracy": 0.3923352897167206, "num_tokens": 763084800.0, "step": 20700 }, { "epoch": 5.498938428874735, "grad_norm": 0.451171875, "learning_rate": 0.0001376786973943483, "loss": 3.4123, "mean_token_accuracy": 0.3907316789031029, "num_tokens": 763822080.0, "step": 20720 }, { "epoch": 5.504246284501061, "grad_norm": 0.419921875, "learning_rate": 0.00013741631489556596, "loss": 3.3681, "mean_token_accuracy": 0.39635518193244934, "num_tokens": 764559360.0, "step": 20740 }, { "epoch": 5.509554140127388, "grad_norm": 0.4609375, "learning_rate": 0.00013715397116702182, "loss": 3.3767, "mean_token_accuracy": 0.39521091133356095, "num_tokens": 765296640.0, "step": 20760 }, { "epoch": 5.514861995753716, "grad_norm": 0.4140625, "learning_rate": 0.00013689166701699477, "loss": 3.3902, "mean_token_accuracy": 0.39313709288835524, "num_tokens": 766033920.0, "step": 20780 }, { "epoch": 5.520169851380042, "grad_norm": 0.4296875, "learning_rate": 0.000136629403253642, "loss": 3.4015, "mean_token_accuracy": 0.3930732235312462, "num_tokens": 766771200.0, "step": 20800 }, { "epoch": 5.525477707006369, "grad_norm": 0.478515625, "learning_rate": 0.00013636718068499608, "loss": 3.3798, "mean_token_accuracy": 0.3945857807993889, "num_tokens": 767508480.0, "step": 20820 }, { "epoch": 5.530785562632697, "grad_norm": 0.4296875, "learning_rate": 0.00013610500011896286, "loss": 3.3832, "mean_token_accuracy": 0.3946482926607132, "num_tokens": 768245760.0, "step": 20840 }, { "epoch": 5.536093418259023, "grad_norm": 0.4296875, "learning_rate": 0.00013584286236331854, "loss": 3.3698, "mean_token_accuracy": 0.3954759180545807, "num_tokens": 768983040.0, "step": 20860 }, { "epoch": 5.54140127388535, "grad_norm": 0.41796875, "learning_rate": 0.00013558076822570768, "loss": 3.3872, "mean_token_accuracy": 0.3939742371439934, "num_tokens": 769720320.0, "step": 20880 }, { "epoch": 5.546709129511678, "grad_norm": 0.453125, "learning_rate": 0.0001353187185136402, "loss": 3.3746, "mean_token_accuracy": 0.39536040276288986, "num_tokens": 770457600.0, "step": 20900 }, { "epoch": 5.552016985138004, "grad_norm": 0.42578125, "learning_rate": 0.0001350567140344894, "loss": 3.4038, "mean_token_accuracy": 0.3911393776535988, "num_tokens": 771194880.0, "step": 20920 }, { "epoch": 5.557324840764331, "grad_norm": 0.458984375, "learning_rate": 0.000134794755595489, "loss": 3.3786, "mean_token_accuracy": 0.3939443349838257, "num_tokens": 771932160.0, "step": 20940 }, { "epoch": 5.562632696390658, "grad_norm": 0.439453125, "learning_rate": 0.000134532844003731, "loss": 3.3838, "mean_token_accuracy": 0.394221568107605, "num_tokens": 772669440.0, "step": 20960 }, { "epoch": 5.567940552016985, "grad_norm": 0.443359375, "learning_rate": 0.00013427098006616294, "loss": 3.3876, "mean_token_accuracy": 0.39428815990686417, "num_tokens": 773406720.0, "step": 20980 }, { "epoch": 5.573248407643312, "grad_norm": 0.45703125, "learning_rate": 0.00013400916458958576, "loss": 3.3573, "mean_token_accuracy": 0.3980444103479385, "num_tokens": 774144000.0, "step": 21000 }, { "epoch": 5.578556263269639, "grad_norm": 0.478515625, "learning_rate": 0.00013374739838065082, "loss": 3.3959, "mean_token_accuracy": 0.39286121875047686, "num_tokens": 774881280.0, "step": 21020 }, { "epoch": 5.583864118895966, "grad_norm": 0.43359375, "learning_rate": 0.00013348568224585793, "loss": 3.4093, "mean_token_accuracy": 0.3921885207295418, "num_tokens": 775618560.0, "step": 21040 }, { "epoch": 5.5891719745222925, "grad_norm": 0.43359375, "learning_rate": 0.00013322401699155244, "loss": 3.3806, "mean_token_accuracy": 0.3939769506454468, "num_tokens": 776355840.0, "step": 21060 }, { "epoch": 5.59447983014862, "grad_norm": 0.419921875, "learning_rate": 0.000132962403423923, "loss": 3.3976, "mean_token_accuracy": 0.39207708090543747, "num_tokens": 777093120.0, "step": 21080 }, { "epoch": 5.599787685774947, "grad_norm": 0.435546875, "learning_rate": 0.0001327008423489991, "loss": 3.3884, "mean_token_accuracy": 0.3927266806364059, "num_tokens": 777830400.0, "step": 21100 }, { "epoch": 5.6050955414012735, "grad_norm": 0.455078125, "learning_rate": 0.00013243933457264827, "loss": 3.3729, "mean_token_accuracy": 0.3953087627887726, "num_tokens": 778567680.0, "step": 21120 }, { "epoch": 5.610403397027601, "grad_norm": 0.423828125, "learning_rate": 0.0001321778809005742, "loss": 3.3739, "mean_token_accuracy": 0.3948983460664749, "num_tokens": 779304960.0, "step": 21140 }, { "epoch": 5.615711252653928, "grad_norm": 0.435546875, "learning_rate": 0.00013191648213831347, "loss": 3.3675, "mean_token_accuracy": 0.39575994610786436, "num_tokens": 780042240.0, "step": 21160 }, { "epoch": 5.6210191082802545, "grad_norm": 0.443359375, "learning_rate": 0.0001316551390912338, "loss": 3.3848, "mean_token_accuracy": 0.39362225532531736, "num_tokens": 780779520.0, "step": 21180 }, { "epoch": 5.626326963906582, "grad_norm": 0.44140625, "learning_rate": 0.00013139385256453104, "loss": 3.3788, "mean_token_accuracy": 0.39530604630708693, "num_tokens": 781516800.0, "step": 21200 }, { "epoch": 5.631634819532909, "grad_norm": 0.431640625, "learning_rate": 0.0001311326233632271, "loss": 3.3905, "mean_token_accuracy": 0.3929495558142662, "num_tokens": 782254080.0, "step": 21220 }, { "epoch": 5.6369426751592355, "grad_norm": 0.427734375, "learning_rate": 0.000130871452292167, "loss": 3.3903, "mean_token_accuracy": 0.3930392488837242, "num_tokens": 782991360.0, "step": 21240 }, { "epoch": 5.642250530785563, "grad_norm": 0.439453125, "learning_rate": 0.00013061034015601704, "loss": 3.3751, "mean_token_accuracy": 0.3956036642193794, "num_tokens": 783728640.0, "step": 21260 }, { "epoch": 5.647558386411889, "grad_norm": 0.453125, "learning_rate": 0.0001303492877592615, "loss": 3.3981, "mean_token_accuracy": 0.3927796810865402, "num_tokens": 784465920.0, "step": 21280 }, { "epoch": 5.6528662420382165, "grad_norm": 0.447265625, "learning_rate": 0.00013008829590620104, "loss": 3.4011, "mean_token_accuracy": 0.3923203408718109, "num_tokens": 785203200.0, "step": 21300 }, { "epoch": 5.658174097664544, "grad_norm": 0.4765625, "learning_rate": 0.00012982736540094944, "loss": 3.3783, "mean_token_accuracy": 0.3955329954624176, "num_tokens": 785940480.0, "step": 21320 }, { "epoch": 5.66348195329087, "grad_norm": 0.443359375, "learning_rate": 0.0001295664970474316, "loss": 3.3972, "mean_token_accuracy": 0.3916123077273369, "num_tokens": 786677760.0, "step": 21340 }, { "epoch": 5.6687898089171975, "grad_norm": 0.423828125, "learning_rate": 0.000129305691649381, "loss": 3.3652, "mean_token_accuracy": 0.39509811997413635, "num_tokens": 787415040.0, "step": 21360 }, { "epoch": 5.674097664543524, "grad_norm": 0.455078125, "learning_rate": 0.0001290449500103371, "loss": 3.4031, "mean_token_accuracy": 0.3907806038856506, "num_tokens": 788152320.0, "step": 21380 }, { "epoch": 5.679405520169851, "grad_norm": 0.453125, "learning_rate": 0.00012878427293364276, "loss": 3.3912, "mean_token_accuracy": 0.3937880516052246, "num_tokens": 788889600.0, "step": 21400 }, { "epoch": 5.6847133757961785, "grad_norm": 0.419921875, "learning_rate": 0.00012852366122244227, "loss": 3.3922, "mean_token_accuracy": 0.3931044787168503, "num_tokens": 789626880.0, "step": 21420 }, { "epoch": 5.690021231422505, "grad_norm": 0.421875, "learning_rate": 0.00012826311567967825, "loss": 3.3778, "mean_token_accuracy": 0.39554522931575775, "num_tokens": 790364160.0, "step": 21440 }, { "epoch": 5.695329087048832, "grad_norm": 0.431640625, "learning_rate": 0.00012800263710808946, "loss": 3.4088, "mean_token_accuracy": 0.3915742516517639, "num_tokens": 791101440.0, "step": 21460 }, { "epoch": 5.7006369426751595, "grad_norm": 0.4453125, "learning_rate": 0.00012774222631020847, "loss": 3.371, "mean_token_accuracy": 0.3951660692691803, "num_tokens": 791838720.0, "step": 21480 }, { "epoch": 5.705944798301486, "grad_norm": 0.416015625, "learning_rate": 0.00012748188408835895, "loss": 3.395, "mean_token_accuracy": 0.3933341488242149, "num_tokens": 792576000.0, "step": 21500 }, { "epoch": 5.711252653927813, "grad_norm": 0.4296875, "learning_rate": 0.00012722161124465337, "loss": 3.4061, "mean_token_accuracy": 0.39135409742593763, "num_tokens": 793313280.0, "step": 21520 }, { "epoch": 5.7165605095541405, "grad_norm": 0.427734375, "learning_rate": 0.00012696140858099028, "loss": 3.431, "mean_token_accuracy": 0.38928843587636947, "num_tokens": 794050560.0, "step": 21540 }, { "epoch": 5.721868365180467, "grad_norm": 0.490234375, "learning_rate": 0.0001267012768990522, "loss": 3.3953, "mean_token_accuracy": 0.39261660277843474, "num_tokens": 794787840.0, "step": 21560 }, { "epoch": 5.727176220806794, "grad_norm": 0.431640625, "learning_rate": 0.00012644121700030271, "loss": 3.4059, "mean_token_accuracy": 0.39121548682451246, "num_tokens": 795525120.0, "step": 21580 }, { "epoch": 5.732484076433121, "grad_norm": 0.451171875, "learning_rate": 0.0001261812296859846, "loss": 3.408, "mean_token_accuracy": 0.3909069925546646, "num_tokens": 796262400.0, "step": 21600 }, { "epoch": 5.737791932059448, "grad_norm": 0.427734375, "learning_rate": 0.00012592131575711664, "loss": 3.4096, "mean_token_accuracy": 0.39150222539901736, "num_tokens": 796999680.0, "step": 21620 }, { "epoch": 5.743099787685775, "grad_norm": 0.4453125, "learning_rate": 0.00012566147601449177, "loss": 3.3812, "mean_token_accuracy": 0.39388318210840223, "num_tokens": 797736960.0, "step": 21640 }, { "epoch": 5.748407643312102, "grad_norm": 0.42578125, "learning_rate": 0.00012540171125867414, "loss": 3.4109, "mean_token_accuracy": 0.39168704897165296, "num_tokens": 798474240.0, "step": 21660 }, { "epoch": 5.753715498938429, "grad_norm": 0.44140625, "learning_rate": 0.00012514202228999715, "loss": 3.373, "mean_token_accuracy": 0.39493775814771653, "num_tokens": 799211520.0, "step": 21680 }, { "epoch": 5.759023354564756, "grad_norm": 0.421875, "learning_rate": 0.00012488240990856042, "loss": 3.3925, "mean_token_accuracy": 0.39244537204504015, "num_tokens": 799948800.0, "step": 21700 }, { "epoch": 5.764331210191083, "grad_norm": 0.439453125, "learning_rate": 0.00012462287491422772, "loss": 3.4172, "mean_token_accuracy": 0.39136768877506256, "num_tokens": 800686080.0, "step": 21720 }, { "epoch": 5.76963906581741, "grad_norm": 0.458984375, "learning_rate": 0.00012436341810662447, "loss": 3.3862, "mean_token_accuracy": 0.39374728351831434, "num_tokens": 801423360.0, "step": 21740 }, { "epoch": 5.774946921443737, "grad_norm": 0.46875, "learning_rate": 0.0001241040402851351, "loss": 3.3731, "mean_token_accuracy": 0.39589449018239975, "num_tokens": 802160640.0, "step": 21760 }, { "epoch": 5.780254777070064, "grad_norm": 0.4296875, "learning_rate": 0.00012384474224890062, "loss": 3.3929, "mean_token_accuracy": 0.39307730197906493, "num_tokens": 802897920.0, "step": 21780 }, { "epoch": 5.785562632696391, "grad_norm": 0.443359375, "learning_rate": 0.00012358552479681644, "loss": 3.3695, "mean_token_accuracy": 0.3972072720527649, "num_tokens": 803635200.0, "step": 21800 }, { "epoch": 5.790870488322717, "grad_norm": 0.462890625, "learning_rate": 0.00012332638872752943, "loss": 3.3989, "mean_token_accuracy": 0.3931370973587036, "num_tokens": 804372480.0, "step": 21820 }, { "epoch": 5.796178343949045, "grad_norm": 0.4375, "learning_rate": 0.00012306733483943597, "loss": 3.3744, "mean_token_accuracy": 0.3938315391540527, "num_tokens": 805109760.0, "step": 21840 }, { "epoch": 5.801486199575372, "grad_norm": 0.439453125, "learning_rate": 0.00012280836393067907, "loss": 3.3598, "mean_token_accuracy": 0.39693819284439086, "num_tokens": 805847040.0, "step": 21860 }, { "epoch": 5.806794055201698, "grad_norm": 0.427734375, "learning_rate": 0.00012254947679914605, "loss": 3.369, "mean_token_accuracy": 0.39580071717500687, "num_tokens": 806584320.0, "step": 21880 }, { "epoch": 5.812101910828026, "grad_norm": 0.451171875, "learning_rate": 0.0001222906742424663, "loss": 3.3657, "mean_token_accuracy": 0.3966514468193054, "num_tokens": 807321600.0, "step": 21900 }, { "epoch": 5.817409766454352, "grad_norm": 0.515625, "learning_rate": 0.00012203195705800848, "loss": 3.401, "mean_token_accuracy": 0.3950247302651405, "num_tokens": 808058880.0, "step": 21920 }, { "epoch": 5.822717622080679, "grad_norm": 0.53515625, "learning_rate": 0.0001217733260428783, "loss": 3.4042, "mean_token_accuracy": 0.3941808000206947, "num_tokens": 808796160.0, "step": 21940 }, { "epoch": 5.828025477707007, "grad_norm": 0.498046875, "learning_rate": 0.00012151478199391591, "loss": 3.4361, "mean_token_accuracy": 0.3916503593325615, "num_tokens": 809533440.0, "step": 21960 }, { "epoch": 5.833333333333333, "grad_norm": 0.486328125, "learning_rate": 0.00012125632570769362, "loss": 3.4101, "mean_token_accuracy": 0.39459936916828153, "num_tokens": 810270720.0, "step": 21980 }, { "epoch": 5.83864118895966, "grad_norm": 0.52734375, "learning_rate": 0.00012099795798051319, "loss": 3.3989, "mean_token_accuracy": 0.396724833548069, "num_tokens": 811008000.0, "step": 22000 }, { "epoch": 5.843949044585988, "grad_norm": 0.51953125, "learning_rate": 0.00012073967960840373, "loss": 3.3857, "mean_token_accuracy": 0.39950668811798096, "num_tokens": 811745280.0, "step": 22020 }, { "epoch": 5.849256900212314, "grad_norm": 0.48828125, "learning_rate": 0.00012048149138711881, "loss": 3.3672, "mean_token_accuracy": 0.4007542356848717, "num_tokens": 812482560.0, "step": 22040 }, { "epoch": 5.854564755838641, "grad_norm": 0.498046875, "learning_rate": 0.00012022339411213453, "loss": 3.3528, "mean_token_accuracy": 0.40057757347822187, "num_tokens": 813219840.0, "step": 22060 }, { "epoch": 5.859872611464969, "grad_norm": 0.474609375, "learning_rate": 0.0001199653885786465, "loss": 3.374, "mean_token_accuracy": 0.4003995418548584, "num_tokens": 813957120.0, "step": 22080 }, { "epoch": 5.865180467091295, "grad_norm": 0.51171875, "learning_rate": 0.0001197074755815679, "loss": 3.3503, "mean_token_accuracy": 0.4019175335764885, "num_tokens": 814694400.0, "step": 22100 }, { "epoch": 5.870488322717622, "grad_norm": 0.5, "learning_rate": 0.00011944965591552666, "loss": 3.3612, "mean_token_accuracy": 0.40145955681800843, "num_tokens": 815431680.0, "step": 22120 }, { "epoch": 5.875796178343949, "grad_norm": 0.498046875, "learning_rate": 0.00011919193037486324, "loss": 3.3533, "mean_token_accuracy": 0.40123396664857863, "num_tokens": 816168960.0, "step": 22140 }, { "epoch": 5.881104033970276, "grad_norm": 0.47265625, "learning_rate": 0.000118934299753628, "loss": 3.3643, "mean_token_accuracy": 0.4011293202638626, "num_tokens": 816906240.0, "step": 22160 }, { "epoch": 5.886411889596603, "grad_norm": 0.486328125, "learning_rate": 0.00011867676484557906, "loss": 3.3502, "mean_token_accuracy": 0.4021173074841499, "num_tokens": 817643520.0, "step": 22180 }, { "epoch": 5.89171974522293, "grad_norm": 0.51171875, "learning_rate": 0.00011841932644417935, "loss": 3.362, "mean_token_accuracy": 0.4009377032518387, "num_tokens": 818380800.0, "step": 22200 }, { "epoch": 5.897027600849257, "grad_norm": 0.50390625, "learning_rate": 0.00011816198534259477, "loss": 3.3539, "mean_token_accuracy": 0.4018726900219917, "num_tokens": 819118080.0, "step": 22220 }, { "epoch": 5.902335456475583, "grad_norm": 0.462890625, "learning_rate": 0.00011790474233369125, "loss": 3.3504, "mean_token_accuracy": 0.4015125557780266, "num_tokens": 819855360.0, "step": 22240 }, { "epoch": 5.907643312101911, "grad_norm": 0.52734375, "learning_rate": 0.00011764759821003246, "loss": 3.3402, "mean_token_accuracy": 0.4037358656525612, "num_tokens": 820592640.0, "step": 22260 }, { "epoch": 5.912951167728238, "grad_norm": 0.48046875, "learning_rate": 0.00011739055376387763, "loss": 3.3424, "mean_token_accuracy": 0.40280903279781344, "num_tokens": 821329920.0, "step": 22280 }, { "epoch": 5.918259023354564, "grad_norm": 0.490234375, "learning_rate": 0.00011713360978717863, "loss": 3.3585, "mean_token_accuracy": 0.4023537740111351, "num_tokens": 822067200.0, "step": 22300 }, { "epoch": 5.923566878980892, "grad_norm": 0.49609375, "learning_rate": 0.00011687676707157797, "loss": 3.3554, "mean_token_accuracy": 0.4012461930513382, "num_tokens": 822804480.0, "step": 22320 }, { "epoch": 5.928874734607219, "grad_norm": 0.482421875, "learning_rate": 0.00011662002640840605, "loss": 3.3431, "mean_token_accuracy": 0.4031868323683739, "num_tokens": 823541760.0, "step": 22340 }, { "epoch": 5.934182590233545, "grad_norm": 0.5078125, "learning_rate": 0.00011636338858867897, "loss": 3.3418, "mean_token_accuracy": 0.40321129411458967, "num_tokens": 824279040.0, "step": 22360 }, { "epoch": 5.939490445859873, "grad_norm": 0.498046875, "learning_rate": 0.00011610685440309579, "loss": 3.34, "mean_token_accuracy": 0.40219341069459913, "num_tokens": 825016320.0, "step": 22380 }, { "epoch": 5.9447983014862, "grad_norm": 0.46875, "learning_rate": 0.00011585042464203652, "loss": 3.3693, "mean_token_accuracy": 0.3992226555943489, "num_tokens": 825753600.0, "step": 22400 }, { "epoch": 5.950106157112526, "grad_norm": 0.490234375, "learning_rate": 0.00011559410009555917, "loss": 3.3164, "mean_token_accuracy": 0.40619156062602996, "num_tokens": 826490880.0, "step": 22420 }, { "epoch": 5.955414012738854, "grad_norm": 0.515625, "learning_rate": 0.00011533788155339785, "loss": 3.3501, "mean_token_accuracy": 0.4029163971543312, "num_tokens": 827228160.0, "step": 22440 }, { "epoch": 5.96072186836518, "grad_norm": 0.490234375, "learning_rate": 0.00011508176980495983, "loss": 3.3428, "mean_token_accuracy": 0.4032303214073181, "num_tokens": 827965440.0, "step": 22460 }, { "epoch": 5.966029723991507, "grad_norm": 0.52734375, "learning_rate": 0.00011482576563932355, "loss": 3.3331, "mean_token_accuracy": 0.4047782108187675, "num_tokens": 828702720.0, "step": 22480 }, { "epoch": 5.971337579617835, "grad_norm": 0.49609375, "learning_rate": 0.00011456986984523582, "loss": 3.3406, "mean_token_accuracy": 0.4030672416090965, "num_tokens": 829440000.0, "step": 22500 }, { "epoch": 5.976645435244161, "grad_norm": 0.51953125, "learning_rate": 0.00011431408321110969, "loss": 3.3499, "mean_token_accuracy": 0.40232115387916567, "num_tokens": 830177280.0, "step": 22520 }, { "epoch": 5.981953290870488, "grad_norm": 0.48828125, "learning_rate": 0.00011405840652502177, "loss": 3.3318, "mean_token_accuracy": 0.4042617931962013, "num_tokens": 830914560.0, "step": 22540 }, { "epoch": 5.987261146496815, "grad_norm": 0.458984375, "learning_rate": 0.00011380284057471008, "loss": 3.3387, "mean_token_accuracy": 0.40357822477817534, "num_tokens": 831651840.0, "step": 22560 }, { "epoch": 5.992569002123142, "grad_norm": 0.52734375, "learning_rate": 0.00011354738614757125, "loss": 3.3354, "mean_token_accuracy": 0.4039900004863739, "num_tokens": 832389120.0, "step": 22580 }, { "epoch": 5.997876857749469, "grad_norm": 0.47265625, "learning_rate": 0.00011329204403065857, "loss": 3.3399, "mean_token_accuracy": 0.4032044991850853, "num_tokens": 833126400.0, "step": 22600 }, { "epoch": 6.003184713375796, "grad_norm": 0.50390625, "learning_rate": 0.00011303681501067901, "loss": 3.2695, "mean_token_accuracy": 0.4103106647729874, "num_tokens": 833863680.0, "step": 22620 }, { "epoch": 6.008492569002123, "grad_norm": 0.48046875, "learning_rate": 0.00011278169987399139, "loss": 3.2137, "mean_token_accuracy": 0.4183803513646126, "num_tokens": 834600960.0, "step": 22640 }, { "epoch": 6.01380042462845, "grad_norm": 0.515625, "learning_rate": 0.00011252669940660345, "loss": 3.1938, "mean_token_accuracy": 0.4192677766084671, "num_tokens": 835338240.0, "step": 22660 }, { "epoch": 6.019108280254777, "grad_norm": 0.4921875, "learning_rate": 0.00011227181439416957, "loss": 3.1914, "mean_token_accuracy": 0.4213497504591942, "num_tokens": 836075520.0, "step": 22680 }, { "epoch": 6.024416135881104, "grad_norm": 0.515625, "learning_rate": 0.00011201704562198873, "loss": 3.2063, "mean_token_accuracy": 0.4188206672668457, "num_tokens": 836812800.0, "step": 22700 }, { "epoch": 6.029723991507431, "grad_norm": 0.4921875, "learning_rate": 0.00011176239387500143, "loss": 3.2019, "mean_token_accuracy": 0.4193044677376747, "num_tokens": 837550080.0, "step": 22720 }, { "epoch": 6.035031847133758, "grad_norm": 0.5, "learning_rate": 0.00011150785993778788, "loss": 3.1912, "mean_token_accuracy": 0.42125461995601654, "num_tokens": 838287360.0, "step": 22740 }, { "epoch": 6.040339702760085, "grad_norm": 0.51953125, "learning_rate": 0.00011125344459456508, "loss": 3.2206, "mean_token_accuracy": 0.41758942008018496, "num_tokens": 839024640.0, "step": 22760 }, { "epoch": 6.0456475583864115, "grad_norm": 0.50390625, "learning_rate": 0.00011099914862918487, "loss": 3.1962, "mean_token_accuracy": 0.4217832684516907, "num_tokens": 839761920.0, "step": 22780 }, { "epoch": 6.050955414012739, "grad_norm": 0.51171875, "learning_rate": 0.00011074497282513112, "loss": 3.2066, "mean_token_accuracy": 0.4188301801681519, "num_tokens": 840499200.0, "step": 22800 }, { "epoch": 6.056263269639066, "grad_norm": 0.54296875, "learning_rate": 0.00011049091796551763, "loss": 3.2039, "mean_token_accuracy": 0.418923956155777, "num_tokens": 841236480.0, "step": 22820 }, { "epoch": 6.0615711252653925, "grad_norm": 0.4921875, "learning_rate": 0.00011023698483308538, "loss": 3.2314, "mean_token_accuracy": 0.4159300968050957, "num_tokens": 841973760.0, "step": 22840 }, { "epoch": 6.06687898089172, "grad_norm": 0.51953125, "learning_rate": 0.00010998317421020055, "loss": 3.2047, "mean_token_accuracy": 0.4194988027215004, "num_tokens": 842711040.0, "step": 22860 }, { "epoch": 6.072186836518047, "grad_norm": 0.51171875, "learning_rate": 0.00010972948687885163, "loss": 3.229, "mean_token_accuracy": 0.4174195468425751, "num_tokens": 843448320.0, "step": 22880 }, { "epoch": 6.0774946921443735, "grad_norm": 0.50390625, "learning_rate": 0.00010947592362064742, "loss": 3.2303, "mean_token_accuracy": 0.41578604131937025, "num_tokens": 844185600.0, "step": 22900 }, { "epoch": 6.082802547770701, "grad_norm": 0.51171875, "learning_rate": 0.00010922248521681432, "loss": 3.2047, "mean_token_accuracy": 0.41897966861724856, "num_tokens": 844922880.0, "step": 22920 }, { "epoch": 6.088110403397027, "grad_norm": 0.50390625, "learning_rate": 0.00010896917244819425, "loss": 3.2112, "mean_token_accuracy": 0.4191672086715698, "num_tokens": 845660160.0, "step": 22940 }, { "epoch": 6.0934182590233545, "grad_norm": 0.53125, "learning_rate": 0.00010871598609524178, "loss": 3.2217, "mean_token_accuracy": 0.4175962179899216, "num_tokens": 846397440.0, "step": 22960 }, { "epoch": 6.098726114649682, "grad_norm": 0.5234375, "learning_rate": 0.00010846292693802225, "loss": 3.1977, "mean_token_accuracy": 0.4209542840719223, "num_tokens": 847134720.0, "step": 22980 }, { "epoch": 6.104033970276008, "grad_norm": 0.546875, "learning_rate": 0.0001082099957562089, "loss": 3.2079, "mean_token_accuracy": 0.4184143304824829, "num_tokens": 847872000.0, "step": 23000 }, { "epoch": 6.1093418259023355, "grad_norm": 0.51171875, "learning_rate": 0.00010795719332908088, "loss": 3.212, "mean_token_accuracy": 0.4182403787970543, "num_tokens": 848609280.0, "step": 23020 }, { "epoch": 6.114649681528663, "grad_norm": 0.5390625, "learning_rate": 0.00010770452043552052, "loss": 3.2349, "mean_token_accuracy": 0.41664900034666064, "num_tokens": 849346560.0, "step": 23040 }, { "epoch": 6.119957537154989, "grad_norm": 0.53125, "learning_rate": 0.000107451977854011, "loss": 3.2089, "mean_token_accuracy": 0.41843743324279786, "num_tokens": 850083840.0, "step": 23060 }, { "epoch": 6.1252653927813165, "grad_norm": 0.486328125, "learning_rate": 0.00010719956636263423, "loss": 3.2211, "mean_token_accuracy": 0.416915363073349, "num_tokens": 850821120.0, "step": 23080 }, { "epoch": 6.130573248407643, "grad_norm": 0.5234375, "learning_rate": 0.00010694728673906796, "loss": 3.2318, "mean_token_accuracy": 0.4165864855051041, "num_tokens": 851558400.0, "step": 23100 }, { "epoch": 6.13588110403397, "grad_norm": 0.5078125, "learning_rate": 0.0001066951397605839, "loss": 3.2379, "mean_token_accuracy": 0.415302237868309, "num_tokens": 852295680.0, "step": 23120 }, { "epoch": 6.1411889596602975, "grad_norm": 0.50390625, "learning_rate": 0.00010644312620404487, "loss": 3.2036, "mean_token_accuracy": 0.41928272545337675, "num_tokens": 853032960.0, "step": 23140 }, { "epoch": 6.146496815286624, "grad_norm": 0.54296875, "learning_rate": 0.00010619124684590281, "loss": 3.239, "mean_token_accuracy": 0.41411991566419604, "num_tokens": 853770240.0, "step": 23160 }, { "epoch": 6.151804670912951, "grad_norm": 0.54296875, "learning_rate": 0.00010593950246219603, "loss": 3.2149, "mean_token_accuracy": 0.41819281280040743, "num_tokens": 854507520.0, "step": 23180 }, { "epoch": 6.1571125265392785, "grad_norm": 0.578125, "learning_rate": 0.00010568789382854717, "loss": 3.2128, "mean_token_accuracy": 0.41786937564611437, "num_tokens": 855244800.0, "step": 23200 }, { "epoch": 6.162420382165605, "grad_norm": 0.52734375, "learning_rate": 0.00010543642172016036, "loss": 3.2312, "mean_token_accuracy": 0.41561072915792463, "num_tokens": 855982080.0, "step": 23220 }, { "epoch": 6.167728237791932, "grad_norm": 0.5234375, "learning_rate": 0.00010518508691181943, "loss": 3.1985, "mean_token_accuracy": 0.4203916624188423, "num_tokens": 856719360.0, "step": 23240 }, { "epoch": 6.173036093418259, "grad_norm": 0.53515625, "learning_rate": 0.00010493389017788492, "loss": 3.2231, "mean_token_accuracy": 0.41589068025350573, "num_tokens": 857456640.0, "step": 23260 }, { "epoch": 6.178343949044586, "grad_norm": 0.53125, "learning_rate": 0.00010468283229229205, "loss": 3.1945, "mean_token_accuracy": 0.4196061655879021, "num_tokens": 858193920.0, "step": 23280 }, { "epoch": 6.183651804670913, "grad_norm": 0.51171875, "learning_rate": 0.00010443191402854827, "loss": 3.2146, "mean_token_accuracy": 0.418321917951107, "num_tokens": 858931200.0, "step": 23300 }, { "epoch": 6.18895966029724, "grad_norm": 0.5703125, "learning_rate": 0.00010418113615973087, "loss": 3.2213, "mean_token_accuracy": 0.41626168340444564, "num_tokens": 859668480.0, "step": 23320 }, { "epoch": 6.194267515923567, "grad_norm": 0.5390625, "learning_rate": 0.00010393049945848447, "loss": 3.2255, "mean_token_accuracy": 0.41734752207994463, "num_tokens": 860405760.0, "step": 23340 }, { "epoch": 6.199575371549894, "grad_norm": 0.52734375, "learning_rate": 0.00010368000469701899, "loss": 3.2241, "mean_token_accuracy": 0.4172890827059746, "num_tokens": 861143040.0, "step": 23360 }, { "epoch": 6.204883227176221, "grad_norm": 0.54296875, "learning_rate": 0.0001034296526471067, "loss": 3.2305, "mean_token_accuracy": 0.4155156001448631, "num_tokens": 861880320.0, "step": 23380 }, { "epoch": 6.210191082802548, "grad_norm": 0.5078125, "learning_rate": 0.00010317944408008055, "loss": 3.2209, "mean_token_accuracy": 0.41705941408872604, "num_tokens": 862617600.0, "step": 23400 }, { "epoch": 6.215498938428874, "grad_norm": 0.51953125, "learning_rate": 0.00010292937976683112, "loss": 3.2128, "mean_token_accuracy": 0.4163935109972954, "num_tokens": 863354880.0, "step": 23420 }, { "epoch": 6.220806794055202, "grad_norm": 0.53125, "learning_rate": 0.00010267946047780463, "loss": 3.2164, "mean_token_accuracy": 0.41712192744016646, "num_tokens": 864092160.0, "step": 23440 }, { "epoch": 6.226114649681529, "grad_norm": 0.54296875, "learning_rate": 0.00010242968698300063, "loss": 3.1901, "mean_token_accuracy": 0.4204718440771103, "num_tokens": 864829440.0, "step": 23460 }, { "epoch": 6.231422505307855, "grad_norm": 0.51953125, "learning_rate": 0.00010218006005196924, "loss": 3.2174, "mean_token_accuracy": 0.4171015426516533, "num_tokens": 865566720.0, "step": 23480 }, { "epoch": 6.236730360934183, "grad_norm": 0.5234375, "learning_rate": 0.00010193058045380921, "loss": 3.2236, "mean_token_accuracy": 0.4160360962152481, "num_tokens": 866304000.0, "step": 23500 }, { "epoch": 6.24203821656051, "grad_norm": 0.51171875, "learning_rate": 0.0001016812489571652, "loss": 3.201, "mean_token_accuracy": 0.4187771797180176, "num_tokens": 867041280.0, "step": 23520 }, { "epoch": 6.247346072186836, "grad_norm": 0.6015625, "learning_rate": 0.00010143206633022579, "loss": 3.2188, "mean_token_accuracy": 0.4174603193998337, "num_tokens": 867778560.0, "step": 23540 }, { "epoch": 6.252653927813164, "grad_norm": 0.53515625, "learning_rate": 0.00010118303334072062, "loss": 3.237, "mean_token_accuracy": 0.4151744946837425, "num_tokens": 868515840.0, "step": 23560 }, { "epoch": 6.25796178343949, "grad_norm": 0.51953125, "learning_rate": 0.00010093415075591854, "loss": 3.2148, "mean_token_accuracy": 0.417611163854599, "num_tokens": 869253120.0, "step": 23580 }, { "epoch": 6.263269639065817, "grad_norm": 0.50390625, "learning_rate": 0.00010068541934262478, "loss": 3.2125, "mean_token_accuracy": 0.4187296152114868, "num_tokens": 869990400.0, "step": 23600 }, { "epoch": 6.268577494692145, "grad_norm": 0.546875, "learning_rate": 0.00010043683986717911, "loss": 3.1965, "mean_token_accuracy": 0.4207259684801102, "num_tokens": 870727680.0, "step": 23620 }, { "epoch": 6.273885350318471, "grad_norm": 0.51953125, "learning_rate": 0.00010018841309545286, "loss": 3.2379, "mean_token_accuracy": 0.41425581723451615, "num_tokens": 871464960.0, "step": 23640 }, { "epoch": 6.279193205944798, "grad_norm": 0.5234375, "learning_rate": 9.994013979284708e-05, "loss": 3.2365, "mean_token_accuracy": 0.4144528701901436, "num_tokens": 872202240.0, "step": 23660 }, { "epoch": 6.284501061571126, "grad_norm": 0.515625, "learning_rate": 9.96920207242899e-05, "loss": 3.2421, "mean_token_accuracy": 0.41495298147201537, "num_tokens": 872939520.0, "step": 23680 }, { "epoch": 6.289808917197452, "grad_norm": 0.51953125, "learning_rate": 9.944405665423436e-05, "loss": 3.2057, "mean_token_accuracy": 0.4181085556745529, "num_tokens": 873676800.0, "step": 23700 }, { "epoch": 6.295116772823779, "grad_norm": 0.498046875, "learning_rate": 9.919624834665574e-05, "loss": 3.197, "mean_token_accuracy": 0.42021363228559494, "num_tokens": 874414080.0, "step": 23720 }, { "epoch": 6.300424628450106, "grad_norm": 0.5234375, "learning_rate": 9.894859656504967e-05, "loss": 3.2115, "mean_token_accuracy": 0.41692487597465516, "num_tokens": 875151360.0, "step": 23740 }, { "epoch": 6.305732484076433, "grad_norm": 0.546875, "learning_rate": 9.870110207242933e-05, "loss": 3.2057, "mean_token_accuracy": 0.41906528621912004, "num_tokens": 875888640.0, "step": 23760 }, { "epoch": 6.31104033970276, "grad_norm": 0.5234375, "learning_rate": 9.845376563132344e-05, "loss": 3.2177, "mean_token_accuracy": 0.4164546638727188, "num_tokens": 876625920.0, "step": 23780 }, { "epoch": 6.316348195329087, "grad_norm": 0.52734375, "learning_rate": 9.820658800377359e-05, "loss": 3.2072, "mean_token_accuracy": 0.41866438537836076, "num_tokens": 877363200.0, "step": 23800 }, { "epoch": 6.321656050955414, "grad_norm": 0.5234375, "learning_rate": 9.795956995133232e-05, "loss": 3.2165, "mean_token_accuracy": 0.4167984887957573, "num_tokens": 878100480.0, "step": 23820 }, { "epoch": 6.326963906581741, "grad_norm": 0.546875, "learning_rate": 9.771271223506027e-05, "loss": 3.224, "mean_token_accuracy": 0.4172455981373787, "num_tokens": 878837760.0, "step": 23840 }, { "epoch": 6.332271762208068, "grad_norm": 0.57421875, "learning_rate": 9.746601561552419e-05, "loss": 3.2238, "mean_token_accuracy": 0.4172809287905693, "num_tokens": 879575040.0, "step": 23860 }, { "epoch": 6.337579617834395, "grad_norm": 0.5234375, "learning_rate": 9.721948085279449e-05, "loss": 3.2287, "mean_token_accuracy": 0.41562432050704956, "num_tokens": 880312320.0, "step": 23880 }, { "epoch": 6.3428874734607215, "grad_norm": 0.5390625, "learning_rate": 9.697310870644292e-05, "loss": 3.2499, "mean_token_accuracy": 0.41356137543916704, "num_tokens": 881049600.0, "step": 23900 }, { "epoch": 6.348195329087049, "grad_norm": 0.5390625, "learning_rate": 9.672689993554023e-05, "loss": 3.2188, "mean_token_accuracy": 0.4164628192782402, "num_tokens": 881786880.0, "step": 23920 }, { "epoch": 6.353503184713376, "grad_norm": 0.51171875, "learning_rate": 9.648085529865362e-05, "loss": 3.2306, "mean_token_accuracy": 0.41617063581943514, "num_tokens": 882524160.0, "step": 23940 }, { "epoch": 6.3588110403397025, "grad_norm": 0.53515625, "learning_rate": 9.623497555384493e-05, "loss": 3.2282, "mean_token_accuracy": 0.41554413735866547, "num_tokens": 883261440.0, "step": 23960 }, { "epoch": 6.36411889596603, "grad_norm": 0.53515625, "learning_rate": 9.598926145866766e-05, "loss": 3.231, "mean_token_accuracy": 0.41470020562410354, "num_tokens": 883998720.0, "step": 23980 }, { "epoch": 6.369426751592357, "grad_norm": 0.51171875, "learning_rate": 9.574371377016515e-05, "loss": 3.2092, "mean_token_accuracy": 0.41913323700428007, "num_tokens": 884736000.0, "step": 24000 }, { "epoch": 6.3747346072186835, "grad_norm": 0.53125, "learning_rate": 9.549833324486788e-05, "loss": 3.2461, "mean_token_accuracy": 0.41323792934417725, "num_tokens": 885473280.0, "step": 24020 }, { "epoch": 6.380042462845011, "grad_norm": 0.5, "learning_rate": 9.525312063879145e-05, "loss": 3.2229, "mean_token_accuracy": 0.415742552280426, "num_tokens": 886210560.0, "step": 24040 }, { "epoch": 6.385350318471337, "grad_norm": 0.5546875, "learning_rate": 9.500807670743399e-05, "loss": 3.2509, "mean_token_accuracy": 0.4131101846694946, "num_tokens": 886947840.0, "step": 24060 }, { "epoch": 6.3906581740976645, "grad_norm": 0.5, "learning_rate": 9.476320220577406e-05, "loss": 3.2402, "mean_token_accuracy": 0.41402615010738375, "num_tokens": 887685120.0, "step": 24080 }, { "epoch": 6.395966029723992, "grad_norm": 0.54296875, "learning_rate": 9.451849788826801e-05, "loss": 3.2127, "mean_token_accuracy": 0.4171531856060028, "num_tokens": 888422400.0, "step": 24100 }, { "epoch": 6.401273885350318, "grad_norm": 0.5546875, "learning_rate": 9.427396450884811e-05, "loss": 3.2409, "mean_token_accuracy": 0.41404109746217727, "num_tokens": 889159680.0, "step": 24120 }, { "epoch": 6.4065817409766455, "grad_norm": 0.52734375, "learning_rate": 9.402960282091973e-05, "loss": 3.2231, "mean_token_accuracy": 0.41719667464494703, "num_tokens": 889896960.0, "step": 24140 }, { "epoch": 6.411889596602973, "grad_norm": 0.5546875, "learning_rate": 9.378541357735952e-05, "loss": 3.2423, "mean_token_accuracy": 0.41457789540290835, "num_tokens": 890634240.0, "step": 24160 }, { "epoch": 6.417197452229299, "grad_norm": 0.5234375, "learning_rate": 9.35413975305125e-05, "loss": 3.2296, "mean_token_accuracy": 0.4144216150045395, "num_tokens": 891371520.0, "step": 24180 }, { "epoch": 6.4225053078556265, "grad_norm": 0.5234375, "learning_rate": 9.329755543219043e-05, "loss": 3.241, "mean_token_accuracy": 0.41499238908290864, "num_tokens": 892108800.0, "step": 24200 }, { "epoch": 6.427813163481954, "grad_norm": 0.53515625, "learning_rate": 9.305388803366885e-05, "loss": 3.2469, "mean_token_accuracy": 0.4142857179045677, "num_tokens": 892846080.0, "step": 24220 }, { "epoch": 6.43312101910828, "grad_norm": 0.5, "learning_rate": 9.281039608568518e-05, "loss": 3.2325, "mean_token_accuracy": 0.4159029141068459, "num_tokens": 893583360.0, "step": 24240 }, { "epoch": 6.4384288747346075, "grad_norm": 0.5078125, "learning_rate": 9.256708033843627e-05, "loss": 3.2319, "mean_token_accuracy": 0.4146594375371933, "num_tokens": 894320640.0, "step": 24260 }, { "epoch": 6.443736730360934, "grad_norm": 0.52734375, "learning_rate": 9.232394154157611e-05, "loss": 3.2289, "mean_token_accuracy": 0.41556316167116164, "num_tokens": 895057920.0, "step": 24280 }, { "epoch": 6.449044585987261, "grad_norm": 0.546875, "learning_rate": 9.208098044421346e-05, "loss": 3.2201, "mean_token_accuracy": 0.41675363928079606, "num_tokens": 895795200.0, "step": 24300 }, { "epoch": 6.4543524416135885, "grad_norm": 0.51953125, "learning_rate": 9.183819779490957e-05, "loss": 3.2455, "mean_token_accuracy": 0.41287100613117217, "num_tokens": 896532480.0, "step": 24320 }, { "epoch": 6.459660297239915, "grad_norm": 0.5390625, "learning_rate": 9.159559434167603e-05, "loss": 3.2287, "mean_token_accuracy": 0.41583360731601715, "num_tokens": 897269760.0, "step": 24340 }, { "epoch": 6.464968152866242, "grad_norm": 0.5234375, "learning_rate": 9.135317083197215e-05, "loss": 3.2335, "mean_token_accuracy": 0.41495162099599836, "num_tokens": 898007040.0, "step": 24360 }, { "epoch": 6.470276008492569, "grad_norm": 0.51171875, "learning_rate": 9.1110928012703e-05, "loss": 3.2448, "mean_token_accuracy": 0.413580396771431, "num_tokens": 898744320.0, "step": 24380 }, { "epoch": 6.475583864118896, "grad_norm": 0.53515625, "learning_rate": 9.086886663021675e-05, "loss": 3.2051, "mean_token_accuracy": 0.41860458701848985, "num_tokens": 899481600.0, "step": 24400 }, { "epoch": 6.480891719745223, "grad_norm": 0.5078125, "learning_rate": 9.062698743030283e-05, "loss": 3.2161, "mean_token_accuracy": 0.41587029695510863, "num_tokens": 900218880.0, "step": 24420 }, { "epoch": 6.48619957537155, "grad_norm": 0.51953125, "learning_rate": 9.03852911581891e-05, "loss": 3.2021, "mean_token_accuracy": 0.4186141014099121, "num_tokens": 900956160.0, "step": 24440 }, { "epoch": 6.491507430997877, "grad_norm": 0.53515625, "learning_rate": 9.014377855853999e-05, "loss": 3.2377, "mean_token_accuracy": 0.41606599390506743, "num_tokens": 901693440.0, "step": 24460 }, { "epoch": 6.496815286624204, "grad_norm": 0.51953125, "learning_rate": 8.990245037545393e-05, "loss": 3.2208, "mean_token_accuracy": 0.41715454310178757, "num_tokens": 902430720.0, "step": 24480 }, { "epoch": 6.502123142250531, "grad_norm": 0.50390625, "learning_rate": 8.966130735246128e-05, "loss": 3.2335, "mean_token_accuracy": 0.4158961161971092, "num_tokens": 903168000.0, "step": 24500 }, { "epoch": 6.507430997876858, "grad_norm": 0.5234375, "learning_rate": 8.942035023252178e-05, "loss": 3.1908, "mean_token_accuracy": 0.421935473382473, "num_tokens": 903905280.0, "step": 24520 }, { "epoch": 6.512738853503185, "grad_norm": 0.54296875, "learning_rate": 8.917957975802262e-05, "loss": 3.2393, "mean_token_accuracy": 0.41537154763936995, "num_tokens": 904642560.0, "step": 24540 }, { "epoch": 6.518046709129512, "grad_norm": 0.515625, "learning_rate": 8.893899667077562e-05, "loss": 3.2257, "mean_token_accuracy": 0.4169153615832329, "num_tokens": 905379840.0, "step": 24560 }, { "epoch": 6.523354564755839, "grad_norm": 0.5234375, "learning_rate": 8.869860171201562e-05, "loss": 3.2328, "mean_token_accuracy": 0.41652940809726713, "num_tokens": 906117120.0, "step": 24580 }, { "epoch": 6.528662420382165, "grad_norm": 0.515625, "learning_rate": 8.845839562239754e-05, "loss": 3.2276, "mean_token_accuracy": 0.4173611119389534, "num_tokens": 906854400.0, "step": 24600 }, { "epoch": 6.533970276008493, "grad_norm": 0.5390625, "learning_rate": 8.821837914199458e-05, "loss": 3.2276, "mean_token_accuracy": 0.41661094725131986, "num_tokens": 907591680.0, "step": 24620 }, { "epoch": 6.53927813163482, "grad_norm": 0.5390625, "learning_rate": 8.797855301029564e-05, "loss": 3.2368, "mean_token_accuracy": 0.4158200159668922, "num_tokens": 908328960.0, "step": 24640 }, { "epoch": 6.544585987261146, "grad_norm": 0.5078125, "learning_rate": 8.773891796620322e-05, "loss": 3.2074, "mean_token_accuracy": 0.41983991414308547, "num_tokens": 909066240.0, "step": 24660 }, { "epoch": 6.549893842887474, "grad_norm": 0.52734375, "learning_rate": 8.74994747480311e-05, "loss": 3.2004, "mean_token_accuracy": 0.4203074038028717, "num_tokens": 909803520.0, "step": 24680 }, { "epoch": 6.5552016985138, "grad_norm": 0.54296875, "learning_rate": 8.726022409350198e-05, "loss": 3.2121, "mean_token_accuracy": 0.41855023205280306, "num_tokens": 910540800.0, "step": 24700 }, { "epoch": 6.560509554140127, "grad_norm": 0.515625, "learning_rate": 8.702116673974533e-05, "loss": 3.2179, "mean_token_accuracy": 0.4167753875255585, "num_tokens": 911278080.0, "step": 24720 }, { "epoch": 6.565817409766455, "grad_norm": 0.51171875, "learning_rate": 8.678230342329503e-05, "loss": 3.2167, "mean_token_accuracy": 0.417825885117054, "num_tokens": 912015360.0, "step": 24740 }, { "epoch": 6.571125265392781, "grad_norm": 0.515625, "learning_rate": 8.654363488008721e-05, "loss": 3.2262, "mean_token_accuracy": 0.4165824085474014, "num_tokens": 912752640.0, "step": 24760 }, { "epoch": 6.576433121019108, "grad_norm": 0.494140625, "learning_rate": 8.630516184545764e-05, "loss": 3.2261, "mean_token_accuracy": 0.41712600737810135, "num_tokens": 913489920.0, "step": 24780 }, { "epoch": 6.581740976645436, "grad_norm": 0.52734375, "learning_rate": 8.606688505414019e-05, "loss": 3.2246, "mean_token_accuracy": 0.4164614602923393, "num_tokens": 914227200.0, "step": 24800 }, { "epoch": 6.587048832271762, "grad_norm": 0.5078125, "learning_rate": 8.582880524026357e-05, "loss": 3.2117, "mean_token_accuracy": 0.41775929629802705, "num_tokens": 914964480.0, "step": 24820 }, { "epoch": 6.592356687898089, "grad_norm": 0.515625, "learning_rate": 8.559092313735016e-05, "loss": 3.2111, "mean_token_accuracy": 0.418010713160038, "num_tokens": 915701760.0, "step": 24840 }, { "epoch": 6.597664543524417, "grad_norm": 0.5234375, "learning_rate": 8.535323947831262e-05, "loss": 3.2105, "mean_token_accuracy": 0.418657585978508, "num_tokens": 916439040.0, "step": 24860 }, { "epoch": 6.602972399150743, "grad_norm": 0.50390625, "learning_rate": 8.511575499545277e-05, "loss": 3.1969, "mean_token_accuracy": 0.4200464755296707, "num_tokens": 917176320.0, "step": 24880 }, { "epoch": 6.60828025477707, "grad_norm": 0.51171875, "learning_rate": 8.487847042045831e-05, "loss": 3.2235, "mean_token_accuracy": 0.41712328791618347, "num_tokens": 917913600.0, "step": 24900 }, { "epoch": 6.613588110403397, "grad_norm": 0.52734375, "learning_rate": 8.464138648440125e-05, "loss": 3.2095, "mean_token_accuracy": 0.41760708540678027, "num_tokens": 918650880.0, "step": 24920 }, { "epoch": 6.618895966029724, "grad_norm": 0.51171875, "learning_rate": 8.440450391773542e-05, "loss": 3.2101, "mean_token_accuracy": 0.4186942785978317, "num_tokens": 919388160.0, "step": 24940 }, { "epoch": 6.624203821656051, "grad_norm": 0.5390625, "learning_rate": 8.416782345029421e-05, "loss": 3.2079, "mean_token_accuracy": 0.4179495558142662, "num_tokens": 920125440.0, "step": 24960 }, { "epoch": 6.629511677282378, "grad_norm": 0.54296875, "learning_rate": 8.393134581128832e-05, "loss": 3.1958, "mean_token_accuracy": 0.42062404900789263, "num_tokens": 920862720.0, "step": 24980 }, { "epoch": 6.634819532908705, "grad_norm": 0.50390625, "learning_rate": 8.369507172930356e-05, "loss": 3.2218, "mean_token_accuracy": 0.41744265109300616, "num_tokens": 921600000.0, "step": 25000 }, { "epoch": 6.640127388535031, "grad_norm": 0.515625, "learning_rate": 8.345900193229864e-05, "loss": 3.2375, "mean_token_accuracy": 0.4162508130073547, "num_tokens": 922337280.0, "step": 25020 }, { "epoch": 6.645435244161359, "grad_norm": 0.52734375, "learning_rate": 8.322313714760265e-05, "loss": 3.2204, "mean_token_accuracy": 0.41778239905834197, "num_tokens": 923074560.0, "step": 25040 }, { "epoch": 6.650743099787686, "grad_norm": 0.5234375, "learning_rate": 8.298747810191342e-05, "loss": 3.2007, "mean_token_accuracy": 0.4189008489251137, "num_tokens": 923811840.0, "step": 25060 }, { "epoch": 6.656050955414012, "grad_norm": 0.52734375, "learning_rate": 8.275202552129445e-05, "loss": 3.2188, "mean_token_accuracy": 0.4174086764454842, "num_tokens": 924549120.0, "step": 25080 }, { "epoch": 6.66135881104034, "grad_norm": 0.52734375, "learning_rate": 8.251678013117356e-05, "loss": 3.1979, "mean_token_accuracy": 0.41940639168024063, "num_tokens": 925286400.0, "step": 25100 }, { "epoch": 6.666666666666667, "grad_norm": 0.5546875, "learning_rate": 8.228174265633986e-05, "loss": 3.2073, "mean_token_accuracy": 0.4187133088707924, "num_tokens": 926023680.0, "step": 25120 }, { "epoch": 6.671974522292993, "grad_norm": 0.53125, "learning_rate": 8.204691382094207e-05, "loss": 3.2335, "mean_token_accuracy": 0.415718087553978, "num_tokens": 926760960.0, "step": 25140 }, { "epoch": 6.677282377919321, "grad_norm": 0.54296875, "learning_rate": 8.181229434848604e-05, "loss": 3.1971, "mean_token_accuracy": 0.42101680040359496, "num_tokens": 927498240.0, "step": 25160 }, { "epoch": 6.682590233545648, "grad_norm": 0.5234375, "learning_rate": 8.15778849618326e-05, "loss": 3.2011, "mean_token_accuracy": 0.4194118306040764, "num_tokens": 928235520.0, "step": 25180 }, { "epoch": 6.687898089171974, "grad_norm": 0.55859375, "learning_rate": 8.134368638319527e-05, "loss": 3.2395, "mean_token_accuracy": 0.4134186789393425, "num_tokens": 928972800.0, "step": 25200 }, { "epoch": 6.693205944798302, "grad_norm": 0.52734375, "learning_rate": 8.110969933413811e-05, "loss": 3.2324, "mean_token_accuracy": 0.4163839936256409, "num_tokens": 929710080.0, "step": 25220 }, { "epoch": 6.698513800424628, "grad_norm": 0.53515625, "learning_rate": 8.087592453557341e-05, "loss": 3.2275, "mean_token_accuracy": 0.4163486659526825, "num_tokens": 930447360.0, "step": 25240 }, { "epoch": 6.703821656050955, "grad_norm": 0.51171875, "learning_rate": 8.064236270775958e-05, "loss": 3.2346, "mean_token_accuracy": 0.4163812786340714, "num_tokens": 931184640.0, "step": 25260 }, { "epoch": 6.709129511677283, "grad_norm": 0.51953125, "learning_rate": 8.040901457029879e-05, "loss": 3.1961, "mean_token_accuracy": 0.4196224719285965, "num_tokens": 931921920.0, "step": 25280 }, { "epoch": 6.714437367303609, "grad_norm": 0.5234375, "learning_rate": 8.017588084213492e-05, "loss": 3.2183, "mean_token_accuracy": 0.41769270598888397, "num_tokens": 932659200.0, "step": 25300 }, { "epoch": 6.719745222929936, "grad_norm": 0.52734375, "learning_rate": 7.994296224155118e-05, "loss": 3.2055, "mean_token_accuracy": 0.4184374317526817, "num_tokens": 933396480.0, "step": 25320 }, { "epoch": 6.725053078556263, "grad_norm": 0.52734375, "learning_rate": 7.971025948616809e-05, "loss": 3.2106, "mean_token_accuracy": 0.41833142936229706, "num_tokens": 934133760.0, "step": 25340 }, { "epoch": 6.73036093418259, "grad_norm": 0.51171875, "learning_rate": 7.94777732929409e-05, "loss": 3.2134, "mean_token_accuracy": 0.4174874976277351, "num_tokens": 934871040.0, "step": 25360 }, { "epoch": 6.735668789808917, "grad_norm": 0.53515625, "learning_rate": 7.924550437815802e-05, "loss": 3.1996, "mean_token_accuracy": 0.4196781888604164, "num_tokens": 935608320.0, "step": 25380 }, { "epoch": 6.740976645435244, "grad_norm": 0.5546875, "learning_rate": 7.901345345743803e-05, "loss": 3.2044, "mean_token_accuracy": 0.4185678943991661, "num_tokens": 936345600.0, "step": 25400 }, { "epoch": 6.746284501061571, "grad_norm": 0.55078125, "learning_rate": 7.878162124572812e-05, "loss": 3.2314, "mean_token_accuracy": 0.41486871987581253, "num_tokens": 937082880.0, "step": 25420 }, { "epoch": 6.751592356687898, "grad_norm": 0.52734375, "learning_rate": 7.855000845730158e-05, "loss": 3.1968, "mean_token_accuracy": 0.41947569996118544, "num_tokens": 937820160.0, "step": 25440 }, { "epoch": 6.756900212314225, "grad_norm": 0.5, "learning_rate": 7.831861580575564e-05, "loss": 3.2098, "mean_token_accuracy": 0.4171980291604996, "num_tokens": 938557440.0, "step": 25460 }, { "epoch": 6.762208067940552, "grad_norm": 0.53125, "learning_rate": 7.808744400400929e-05, "loss": 3.2216, "mean_token_accuracy": 0.4162888675928116, "num_tokens": 939294720.0, "step": 25480 }, { "epoch": 6.767515923566879, "grad_norm": 0.5234375, "learning_rate": 7.785649376430111e-05, "loss": 3.2007, "mean_token_accuracy": 0.4191522628068924, "num_tokens": 940032000.0, "step": 25500 }, { "epoch": 6.772823779193206, "grad_norm": 0.53515625, "learning_rate": 7.76257657981871e-05, "loss": 3.2361, "mean_token_accuracy": 0.4153430059552193, "num_tokens": 940769280.0, "step": 25520 }, { "epoch": 6.778131634819533, "grad_norm": 0.51171875, "learning_rate": 7.739526081653817e-05, "loss": 3.2205, "mean_token_accuracy": 0.41684333235025406, "num_tokens": 941506560.0, "step": 25540 }, { "epoch": 6.7834394904458595, "grad_norm": 0.51171875, "learning_rate": 7.716497952953863e-05, "loss": 3.196, "mean_token_accuracy": 0.4206525832414627, "num_tokens": 942243840.0, "step": 25560 }, { "epoch": 6.788747346072187, "grad_norm": 0.51953125, "learning_rate": 7.69349226466832e-05, "loss": 3.2061, "mean_token_accuracy": 0.4173461616039276, "num_tokens": 942981120.0, "step": 25580 }, { "epoch": 6.794055201698514, "grad_norm": 0.515625, "learning_rate": 7.670509087677558e-05, "loss": 3.2144, "mean_token_accuracy": 0.4169289544224739, "num_tokens": 943718400.0, "step": 25600 }, { "epoch": 6.7993630573248405, "grad_norm": 0.5625, "learning_rate": 7.647548492792548e-05, "loss": 3.2104, "mean_token_accuracy": 0.4183056116104126, "num_tokens": 944455680.0, "step": 25620 }, { "epoch": 6.804670912951168, "grad_norm": 0.515625, "learning_rate": 7.62461055075473e-05, "loss": 3.2403, "mean_token_accuracy": 0.41490541249513624, "num_tokens": 945192960.0, "step": 25640 }, { "epoch": 6.809978768577495, "grad_norm": 0.53125, "learning_rate": 7.601695332235714e-05, "loss": 3.2229, "mean_token_accuracy": 0.4161081209778786, "num_tokens": 945930240.0, "step": 25660 }, { "epoch": 6.8152866242038215, "grad_norm": 0.5390625, "learning_rate": 7.578802907837121e-05, "loss": 3.2046, "mean_token_accuracy": 0.4197393462061882, "num_tokens": 946667520.0, "step": 25680 }, { "epoch": 6.820594479830149, "grad_norm": 0.51171875, "learning_rate": 7.555933348090332e-05, "loss": 3.2169, "mean_token_accuracy": 0.4170200049877167, "num_tokens": 947404800.0, "step": 25700 }, { "epoch": 6.825902335456476, "grad_norm": 0.51953125, "learning_rate": 7.53308672345629e-05, "loss": 3.2343, "mean_token_accuracy": 0.41603065878152845, "num_tokens": 948142080.0, "step": 25720 }, { "epoch": 6.8312101910828025, "grad_norm": 0.53125, "learning_rate": 7.510263104325271e-05, "loss": 3.1761, "mean_token_accuracy": 0.4231028497219086, "num_tokens": 948879360.0, "step": 25740 }, { "epoch": 6.83651804670913, "grad_norm": 0.51953125, "learning_rate": 7.48746256101667e-05, "loss": 3.2169, "mean_token_accuracy": 0.41643699556589125, "num_tokens": 949616640.0, "step": 25760 }, { "epoch": 6.841825902335456, "grad_norm": 0.5078125, "learning_rate": 7.464685163778791e-05, "loss": 3.2286, "mean_token_accuracy": 0.41638671457767484, "num_tokens": 950353920.0, "step": 25780 }, { "epoch": 6.8471337579617835, "grad_norm": 0.5390625, "learning_rate": 7.441930982788621e-05, "loss": 3.2156, "mean_token_accuracy": 0.4182838648557663, "num_tokens": 951091200.0, "step": 25800 }, { "epoch": 6.852441613588111, "grad_norm": 0.51953125, "learning_rate": 7.419200088151622e-05, "loss": 3.2239, "mean_token_accuracy": 0.41721706092357635, "num_tokens": 951828480.0, "step": 25820 }, { "epoch": 6.857749469214437, "grad_norm": 0.546875, "learning_rate": 7.396492549901491e-05, "loss": 3.2044, "mean_token_accuracy": 0.4181221455335617, "num_tokens": 952565760.0, "step": 25840 }, { "epoch": 6.8630573248407645, "grad_norm": 0.51171875, "learning_rate": 7.373808438000006e-05, "loss": 3.2276, "mean_token_accuracy": 0.4158023461699486, "num_tokens": 953303040.0, "step": 25860 }, { "epoch": 6.868365180467091, "grad_norm": 0.5078125, "learning_rate": 7.351147822336723e-05, "loss": 3.2251, "mean_token_accuracy": 0.4161271467804909, "num_tokens": 954040320.0, "step": 25880 }, { "epoch": 6.873673036093418, "grad_norm": 0.52734375, "learning_rate": 7.328510772728834e-05, "loss": 3.1968, "mean_token_accuracy": 0.4196822628378868, "num_tokens": 954777600.0, "step": 25900 }, { "epoch": 6.8789808917197455, "grad_norm": 0.51953125, "learning_rate": 7.305897358920916e-05, "loss": 3.2301, "mean_token_accuracy": 0.41639350950717924, "num_tokens": 955514880.0, "step": 25920 }, { "epoch": 6.884288747346072, "grad_norm": 0.51953125, "learning_rate": 7.283307650584726e-05, "loss": 3.2174, "mean_token_accuracy": 0.41875815093517305, "num_tokens": 956252160.0, "step": 25940 }, { "epoch": 6.889596602972399, "grad_norm": 0.54296875, "learning_rate": 7.260741717318984e-05, "loss": 3.2241, "mean_token_accuracy": 0.4157343998551369, "num_tokens": 956989440.0, "step": 25960 }, { "epoch": 6.8949044585987265, "grad_norm": 0.5, "learning_rate": 7.238199628649155e-05, "loss": 3.2287, "mean_token_accuracy": 0.41718715876340867, "num_tokens": 957726720.0, "step": 25980 }, { "epoch": 6.900212314225053, "grad_norm": 0.5234375, "learning_rate": 7.215681454027246e-05, "loss": 3.1879, "mean_token_accuracy": 0.421465265750885, "num_tokens": 958464000.0, "step": 26000 }, { "epoch": 6.90552016985138, "grad_norm": 0.54296875, "learning_rate": 7.193187262831582e-05, "loss": 3.2147, "mean_token_accuracy": 0.41816156059503556, "num_tokens": 959201280.0, "step": 26020 }, { "epoch": 6.9108280254777075, "grad_norm": 0.5390625, "learning_rate": 7.170717124366595e-05, "loss": 3.2033, "mean_token_accuracy": 0.41960888504981997, "num_tokens": 959938560.0, "step": 26040 }, { "epoch": 6.916135881104034, "grad_norm": 0.4921875, "learning_rate": 7.14827110786261e-05, "loss": 3.2074, "mean_token_accuracy": 0.42048543095588686, "num_tokens": 960675840.0, "step": 26060 }, { "epoch": 6.921443736730361, "grad_norm": 0.53125, "learning_rate": 7.125849282475632e-05, "loss": 3.2198, "mean_token_accuracy": 0.4176614448428154, "num_tokens": 961413120.0, "step": 26080 }, { "epoch": 6.926751592356688, "grad_norm": 0.515625, "learning_rate": 7.103451717287141e-05, "loss": 3.2001, "mean_token_accuracy": 0.41974478363990786, "num_tokens": 962150400.0, "step": 26100 }, { "epoch": 6.932059447983015, "grad_norm": 0.51171875, "learning_rate": 7.081078481303849e-05, "loss": 3.2011, "mean_token_accuracy": 0.42007366120815276, "num_tokens": 962887680.0, "step": 26120 }, { "epoch": 6.937367303609342, "grad_norm": 0.51171875, "learning_rate": 7.058729643457546e-05, "loss": 3.2026, "mean_token_accuracy": 0.41908295452594757, "num_tokens": 963624960.0, "step": 26140 }, { "epoch": 6.942675159235669, "grad_norm": 0.55078125, "learning_rate": 7.036405272604813e-05, "loss": 3.2134, "mean_token_accuracy": 0.4175595223903656, "num_tokens": 964362240.0, "step": 26160 }, { "epoch": 6.947983014861996, "grad_norm": 0.5390625, "learning_rate": 7.014105437526887e-05, "loss": 3.2379, "mean_token_accuracy": 0.41605240255594256, "num_tokens": 965099520.0, "step": 26180 }, { "epoch": 6.953290870488322, "grad_norm": 0.5390625, "learning_rate": 6.991830206929374e-05, "loss": 3.2229, "mean_token_accuracy": 0.41678897440433504, "num_tokens": 965836800.0, "step": 26200 }, { "epoch": 6.95859872611465, "grad_norm": 0.53515625, "learning_rate": 6.969579649442093e-05, "loss": 3.1972, "mean_token_accuracy": 0.4199065029621124, "num_tokens": 966574080.0, "step": 26220 }, { "epoch": 6.963906581740977, "grad_norm": 0.5, "learning_rate": 6.947353833618847e-05, "loss": 3.2124, "mean_token_accuracy": 0.4174263432621956, "num_tokens": 967311360.0, "step": 26240 }, { "epoch": 6.969214437367303, "grad_norm": 0.5078125, "learning_rate": 6.925152827937202e-05, "loss": 3.2151, "mean_token_accuracy": 0.41755680590867994, "num_tokens": 968048640.0, "step": 26260 }, { "epoch": 6.974522292993631, "grad_norm": 0.5, "learning_rate": 6.902976700798289e-05, "loss": 3.1915, "mean_token_accuracy": 0.4195123925805092, "num_tokens": 968785920.0, "step": 26280 }, { "epoch": 6.979830148619958, "grad_norm": 0.53125, "learning_rate": 6.880825520526586e-05, "loss": 3.2383, "mean_token_accuracy": 0.4146580770611763, "num_tokens": 969523200.0, "step": 26300 }, { "epoch": 6.985138004246284, "grad_norm": 0.54296875, "learning_rate": 6.858699355369718e-05, "loss": 3.2059, "mean_token_accuracy": 0.4189388960599899, "num_tokens": 970260480.0, "step": 26320 }, { "epoch": 6.990445859872612, "grad_norm": 0.50390625, "learning_rate": 6.836598273498216e-05, "loss": 3.2019, "mean_token_accuracy": 0.41997717022895814, "num_tokens": 970997760.0, "step": 26340 }, { "epoch": 6.995753715498939, "grad_norm": 0.5234375, "learning_rate": 6.814522343005364e-05, "loss": 3.1912, "mean_token_accuracy": 0.42053163796663284, "num_tokens": 971735040.0, "step": 26360 }, { "epoch": 7.001061571125265, "grad_norm": 0.4921875, "learning_rate": 6.792471631906917e-05, "loss": 3.2094, "mean_token_accuracy": 0.41812486201524734, "num_tokens": 972472320.0, "step": 26380 }, { "epoch": 7.006369426751593, "grad_norm": 0.51953125, "learning_rate": 6.770446208140968e-05, "loss": 3.1173, "mean_token_accuracy": 0.43134512603282926, "num_tokens": 973209600.0, "step": 26400 }, { "epoch": 7.011677282377919, "grad_norm": 0.5234375, "learning_rate": 6.74844613956766e-05, "loss": 3.0775, "mean_token_accuracy": 0.4356803119182587, "num_tokens": 973946880.0, "step": 26420 }, { "epoch": 7.016985138004246, "grad_norm": 0.546875, "learning_rate": 6.726471493969058e-05, "loss": 3.1351, "mean_token_accuracy": 0.42938546240329745, "num_tokens": 974684160.0, "step": 26440 }, { "epoch": 7.022292993630574, "grad_norm": 0.52734375, "learning_rate": 6.70452233904886e-05, "loss": 3.0982, "mean_token_accuracy": 0.4341392174363136, "num_tokens": 975421440.0, "step": 26460 }, { "epoch": 7.0276008492569, "grad_norm": 0.609375, "learning_rate": 6.682598742432252e-05, "loss": 3.0944, "mean_token_accuracy": 0.43294737935066224, "num_tokens": 976158720.0, "step": 26480 }, { "epoch": 7.032908704883227, "grad_norm": 0.56640625, "learning_rate": 6.66070077166567e-05, "loss": 3.1255, "mean_token_accuracy": 0.4291530787944794, "num_tokens": 976896000.0, "step": 26500 }, { "epoch": 7.038216560509555, "grad_norm": 0.56640625, "learning_rate": 6.638828494216595e-05, "loss": 3.1172, "mean_token_accuracy": 0.43106517642736436, "num_tokens": 977633280.0, "step": 26520 }, { "epoch": 7.043524416135881, "grad_norm": 0.5390625, "learning_rate": 6.616981977473344e-05, "loss": 3.1043, "mean_token_accuracy": 0.43127989768981934, "num_tokens": 978370560.0, "step": 26540 }, { "epoch": 7.048832271762208, "grad_norm": 0.53515625, "learning_rate": 6.595161288744874e-05, "loss": 3.1175, "mean_token_accuracy": 0.4300595223903656, "num_tokens": 979107840.0, "step": 26560 }, { "epoch": 7.054140127388535, "grad_norm": 0.5390625, "learning_rate": 6.573366495260558e-05, "loss": 3.1255, "mean_token_accuracy": 0.429422153532505, "num_tokens": 979845120.0, "step": 26580 }, { "epoch": 7.059447983014862, "grad_norm": 0.578125, "learning_rate": 6.55159766416999e-05, "loss": 3.1272, "mean_token_accuracy": 0.4306085601449013, "num_tokens": 980582400.0, "step": 26600 }, { "epoch": 7.064755838641189, "grad_norm": 0.5390625, "learning_rate": 6.529854862542777e-05, "loss": 3.0887, "mean_token_accuracy": 0.4350388661026955, "num_tokens": 981319680.0, "step": 26620 }, { "epoch": 7.070063694267516, "grad_norm": 0.53125, "learning_rate": 6.508138157368311e-05, "loss": 3.1458, "mean_token_accuracy": 0.4276378020644188, "num_tokens": 982056960.0, "step": 26640 }, { "epoch": 7.075371549893843, "grad_norm": 0.5390625, "learning_rate": 6.486447615555617e-05, "loss": 3.1181, "mean_token_accuracy": 0.4314225912094116, "num_tokens": 982794240.0, "step": 26660 }, { "epoch": 7.08067940552017, "grad_norm": 0.5390625, "learning_rate": 6.464783303933073e-05, "loss": 3.1025, "mean_token_accuracy": 0.43205044120550157, "num_tokens": 983531520.0, "step": 26680 }, { "epoch": 7.085987261146497, "grad_norm": 0.546875, "learning_rate": 6.443145289248264e-05, "loss": 3.1119, "mean_token_accuracy": 0.4309700489044189, "num_tokens": 984268800.0, "step": 26700 }, { "epoch": 7.091295116772824, "grad_norm": 0.5234375, "learning_rate": 6.421533638167746e-05, "loss": 3.1134, "mean_token_accuracy": 0.4302810400724411, "num_tokens": 985006080.0, "step": 26720 }, { "epoch": 7.09660297239915, "grad_norm": 0.53515625, "learning_rate": 6.399948417276855e-05, "loss": 3.1574, "mean_token_accuracy": 0.4262489080429077, "num_tokens": 985743360.0, "step": 26740 }, { "epoch": 7.101910828025478, "grad_norm": 0.55859375, "learning_rate": 6.37838969307949e-05, "loss": 3.1102, "mean_token_accuracy": 0.4315163642168045, "num_tokens": 986480640.0, "step": 26760 }, { "epoch": 7.107218683651805, "grad_norm": 0.55078125, "learning_rate": 6.356857531997918e-05, "loss": 3.1157, "mean_token_accuracy": 0.4308572545647621, "num_tokens": 987217920.0, "step": 26780 }, { "epoch": 7.112526539278131, "grad_norm": 0.55859375, "learning_rate": 6.335352000372564e-05, "loss": 3.1057, "mean_token_accuracy": 0.43248396515846255, "num_tokens": 987955200.0, "step": 26800 }, { "epoch": 7.117834394904459, "grad_norm": 0.546875, "learning_rate": 6.313873164461806e-05, "loss": 3.1215, "mean_token_accuracy": 0.42917889952659605, "num_tokens": 988692480.0, "step": 26820 }, { "epoch": 7.123142250530786, "grad_norm": 0.56640625, "learning_rate": 6.292421090441775e-05, "loss": 3.123, "mean_token_accuracy": 0.4296925961971283, "num_tokens": 989429760.0, "step": 26840 }, { "epoch": 7.128450106157112, "grad_norm": 0.59765625, "learning_rate": 6.270995844406153e-05, "loss": 3.1413, "mean_token_accuracy": 0.42689171582460406, "num_tokens": 990167040.0, "step": 26860 }, { "epoch": 7.13375796178344, "grad_norm": 0.5625, "learning_rate": 6.249597492365943e-05, "loss": 3.1259, "mean_token_accuracy": 0.42952679842710495, "num_tokens": 990904320.0, "step": 26880 }, { "epoch": 7.139065817409766, "grad_norm": 0.56640625, "learning_rate": 6.228226100249328e-05, "loss": 3.1256, "mean_token_accuracy": 0.4298339322209358, "num_tokens": 991641600.0, "step": 26900 }, { "epoch": 7.144373673036093, "grad_norm": 0.55859375, "learning_rate": 6.206881733901381e-05, "loss": 3.1221, "mean_token_accuracy": 0.4291612297296524, "num_tokens": 992378880.0, "step": 26920 }, { "epoch": 7.149681528662421, "grad_norm": 0.5390625, "learning_rate": 6.185564459083956e-05, "loss": 3.128, "mean_token_accuracy": 0.42946700304746627, "num_tokens": 993116160.0, "step": 26940 }, { "epoch": 7.154989384288747, "grad_norm": 0.5625, "learning_rate": 6.164274341475391e-05, "loss": 3.1518, "mean_token_accuracy": 0.42688356190919874, "num_tokens": 993853440.0, "step": 26960 }, { "epoch": 7.160297239915074, "grad_norm": 0.54296875, "learning_rate": 6.143011446670402e-05, "loss": 3.1287, "mean_token_accuracy": 0.42900358885526657, "num_tokens": 994590720.0, "step": 26980 }, { "epoch": 7.165605095541402, "grad_norm": 0.5390625, "learning_rate": 6.121775840179786e-05, "loss": 3.1382, "mean_token_accuracy": 0.4274774357676506, "num_tokens": 995328000.0, "step": 27000 }, { "epoch": 7.170912951167728, "grad_norm": 0.55078125, "learning_rate": 6.100567587430291e-05, "loss": 3.126, "mean_token_accuracy": 0.4306697085499763, "num_tokens": 996065280.0, "step": 27020 }, { "epoch": 7.176220806794055, "grad_norm": 0.54296875, "learning_rate": 6.0793867537643854e-05, "loss": 3.1474, "mean_token_accuracy": 0.4276948794722557, "num_tokens": 996802560.0, "step": 27040 }, { "epoch": 7.181528662420382, "grad_norm": 0.52734375, "learning_rate": 6.0582334044400536e-05, "loss": 3.1093, "mean_token_accuracy": 0.43137638866901395, "num_tokens": 997539840.0, "step": 27060 }, { "epoch": 7.186836518046709, "grad_norm": 0.51953125, "learning_rate": 6.03710760463061e-05, "loss": 3.1083, "mean_token_accuracy": 0.43297048211097716, "num_tokens": 998277120.0, "step": 27080 }, { "epoch": 7.192144373673036, "grad_norm": 0.5234375, "learning_rate": 6.016009419424467e-05, "loss": 3.0826, "mean_token_accuracy": 0.4351706922054291, "num_tokens": 999014400.0, "step": 27100 }, { "epoch": 7.197452229299363, "grad_norm": 0.546875, "learning_rate": 5.9949389138249955e-05, "loss": 3.1363, "mean_token_accuracy": 0.42887040972709656, "num_tokens": 999751680.0, "step": 27120 }, { "epoch": 7.20276008492569, "grad_norm": 0.5234375, "learning_rate": 5.97389615275024e-05, "loss": 3.1214, "mean_token_accuracy": 0.43145248740911485, "num_tokens": 1000488960.0, "step": 27140 }, { "epoch": 7.208067940552017, "grad_norm": 0.53125, "learning_rate": 5.9528812010328094e-05, "loss": 3.1236, "mean_token_accuracy": 0.4297945186495781, "num_tokens": 1001226240.0, "step": 27160 }, { "epoch": 7.213375796178344, "grad_norm": 0.51953125, "learning_rate": 5.931894123419589e-05, "loss": 3.092, "mean_token_accuracy": 0.43471270650625227, "num_tokens": 1001963520.0, "step": 27180 }, { "epoch": 7.218683651804671, "grad_norm": 0.5234375, "learning_rate": 5.9109349845716286e-05, "loss": 3.1342, "mean_token_accuracy": 0.4296205684542656, "num_tokens": 1002700800.0, "step": 27200 }, { "epoch": 7.2239915074309975, "grad_norm": 0.5390625, "learning_rate": 5.890003849063855e-05, "loss": 3.1221, "mean_token_accuracy": 0.4307879418134689, "num_tokens": 1003438080.0, "step": 27220 }, { "epoch": 7.229299363057325, "grad_norm": 0.53515625, "learning_rate": 5.8691007813849466e-05, "loss": 3.1307, "mean_token_accuracy": 0.43032181113958357, "num_tokens": 1004175360.0, "step": 27240 }, { "epoch": 7.234607218683652, "grad_norm": 0.53125, "learning_rate": 5.848225845937094e-05, "loss": 3.0998, "mean_token_accuracy": 0.43258452862501146, "num_tokens": 1004912640.0, "step": 27260 }, { "epoch": 7.2399150743099785, "grad_norm": 0.5234375, "learning_rate": 5.827379107035816e-05, "loss": 3.1112, "mean_token_accuracy": 0.4324934735894203, "num_tokens": 1005649920.0, "step": 27280 }, { "epoch": 7.245222929936306, "grad_norm": 0.51171875, "learning_rate": 5.8065606289097554e-05, "loss": 3.1182, "mean_token_accuracy": 0.43106653541326523, "num_tokens": 1006387200.0, "step": 27300 }, { "epoch": 7.250530785562633, "grad_norm": 0.51953125, "learning_rate": 5.7857704757004855e-05, "loss": 3.104, "mean_token_accuracy": 0.43185067772865293, "num_tokens": 1007124480.0, "step": 27320 }, { "epoch": 7.2558386411889595, "grad_norm": 0.50390625, "learning_rate": 5.7650087114623104e-05, "loss": 3.1367, "mean_token_accuracy": 0.42982985228300097, "num_tokens": 1007861760.0, "step": 27340 }, { "epoch": 7.261146496815287, "grad_norm": 0.53515625, "learning_rate": 5.7442754001620665e-05, "loss": 3.1369, "mean_token_accuracy": 0.4306384548544884, "num_tokens": 1008599040.0, "step": 27360 }, { "epoch": 7.266454352441613, "grad_norm": 0.53515625, "learning_rate": 5.723570605678932e-05, "loss": 3.0821, "mean_token_accuracy": 0.43584339171648023, "num_tokens": 1009336320.0, "step": 27380 }, { "epoch": 7.2717622080679405, "grad_norm": 0.51171875, "learning_rate": 5.7028943918042175e-05, "loss": 3.126, "mean_token_accuracy": 0.43101081550121306, "num_tokens": 1010073600.0, "step": 27400 }, { "epoch": 7.277070063694268, "grad_norm": 0.53125, "learning_rate": 5.682246822241187e-05, "loss": 3.11, "mean_token_accuracy": 0.43198385238647463, "num_tokens": 1010810880.0, "step": 27420 }, { "epoch": 7.282377919320594, "grad_norm": 0.51953125, "learning_rate": 5.6616279606048345e-05, "loss": 3.1169, "mean_token_accuracy": 0.43114264160394666, "num_tokens": 1011548160.0, "step": 27440 }, { "epoch": 7.2876857749469215, "grad_norm": 0.51953125, "learning_rate": 5.641037870421721e-05, "loss": 3.11, "mean_token_accuracy": 0.43299358934164045, "num_tokens": 1012285440.0, "step": 27460 }, { "epoch": 7.292993630573249, "grad_norm": 0.515625, "learning_rate": 5.6204766151297596e-05, "loss": 3.1099, "mean_token_accuracy": 0.4319892913103104, "num_tokens": 1013022720.0, "step": 27480 }, { "epoch": 7.298301486199575, "grad_norm": 0.51953125, "learning_rate": 5.599944258078021e-05, "loss": 3.0974, "mean_token_accuracy": 0.43382664769887924, "num_tokens": 1013760000.0, "step": 27500 }, { "epoch": 7.3036093418259025, "grad_norm": 0.52734375, "learning_rate": 5.5794408625265394e-05, "loss": 3.0952, "mean_token_accuracy": 0.43487306833267214, "num_tokens": 1014497280.0, "step": 27520 }, { "epoch": 7.308917197452229, "grad_norm": 0.51953125, "learning_rate": 5.558966491646122e-05, "loss": 3.1174, "mean_token_accuracy": 0.43038839995861056, "num_tokens": 1015234560.0, "step": 27540 }, { "epoch": 7.314225053078556, "grad_norm": 0.53125, "learning_rate": 5.538521208518149e-05, "loss": 3.1089, "mean_token_accuracy": 0.4338497489690781, "num_tokens": 1015971840.0, "step": 27560 }, { "epoch": 7.3195329087048835, "grad_norm": 0.53515625, "learning_rate": 5.518105076134383e-05, "loss": 3.1178, "mean_token_accuracy": 0.4307974562048912, "num_tokens": 1016709120.0, "step": 27580 }, { "epoch": 7.32484076433121, "grad_norm": 0.54296875, "learning_rate": 5.497718157396773e-05, "loss": 3.1092, "mean_token_accuracy": 0.43338633328676224, "num_tokens": 1017446400.0, "step": 27600 }, { "epoch": 7.330148619957537, "grad_norm": 0.51171875, "learning_rate": 5.4773605151172586e-05, "loss": 3.1177, "mean_token_accuracy": 0.43133018016815183, "num_tokens": 1018183680.0, "step": 27620 }, { "epoch": 7.3354564755838645, "grad_norm": 0.51953125, "learning_rate": 5.457032212017583e-05, "loss": 3.1055, "mean_token_accuracy": 0.43249891102314, "num_tokens": 1018920960.0, "step": 27640 }, { "epoch": 7.340764331210191, "grad_norm": 0.515625, "learning_rate": 5.436733310729097e-05, "loss": 3.1358, "mean_token_accuracy": 0.4297754973173141, "num_tokens": 1019658240.0, "step": 27660 }, { "epoch": 7.346072186836518, "grad_norm": 0.515625, "learning_rate": 5.416463873792547e-05, "loss": 3.114, "mean_token_accuracy": 0.4319512367248535, "num_tokens": 1020395520.0, "step": 27680 }, { "epoch": 7.351380042462845, "grad_norm": 0.53125, "learning_rate": 5.396223963657934e-05, "loss": 3.1152, "mean_token_accuracy": 0.432531526684761, "num_tokens": 1021132800.0, "step": 27700 }, { "epoch": 7.356687898089172, "grad_norm": 0.51953125, "learning_rate": 5.376013642684246e-05, "loss": 3.13, "mean_token_accuracy": 0.43040878474712374, "num_tokens": 1021870080.0, "step": 27720 }, { "epoch": 7.361995753715499, "grad_norm": 0.5234375, "learning_rate": 5.35583297313935e-05, "loss": 3.1068, "mean_token_accuracy": 0.4329568952322006, "num_tokens": 1022607360.0, "step": 27740 }, { "epoch": 7.367303609341826, "grad_norm": 0.5234375, "learning_rate": 5.335682017199714e-05, "loss": 3.1173, "mean_token_accuracy": 0.4322352707386017, "num_tokens": 1023344640.0, "step": 27760 }, { "epoch": 7.372611464968153, "grad_norm": 0.51953125, "learning_rate": 5.315560836950301e-05, "loss": 3.1118, "mean_token_accuracy": 0.4313859030604362, "num_tokens": 1024081920.0, "step": 27780 }, { "epoch": 7.37791932059448, "grad_norm": 0.5390625, "learning_rate": 5.295469494384299e-05, "loss": 3.1112, "mean_token_accuracy": 0.43340535908937455, "num_tokens": 1024819200.0, "step": 27800 }, { "epoch": 7.383227176220807, "grad_norm": 0.55078125, "learning_rate": 5.275408051402987e-05, "loss": 3.0901, "mean_token_accuracy": 0.43458904325962067, "num_tokens": 1025556480.0, "step": 27820 }, { "epoch": 7.388535031847134, "grad_norm": 0.515625, "learning_rate": 5.255376569815517e-05, "loss": 3.1326, "mean_token_accuracy": 0.42870053350925447, "num_tokens": 1026293760.0, "step": 27840 }, { "epoch": 7.393842887473461, "grad_norm": 0.53515625, "learning_rate": 5.235375111338733e-05, "loss": 3.0952, "mean_token_accuracy": 0.4343783989548683, "num_tokens": 1027031040.0, "step": 27860 }, { "epoch": 7.399150743099788, "grad_norm": 0.5234375, "learning_rate": 5.215403737596981e-05, "loss": 3.1144, "mean_token_accuracy": 0.4320450112223625, "num_tokens": 1027768320.0, "step": 27880 }, { "epoch": 7.404458598726115, "grad_norm": 0.5234375, "learning_rate": 5.195462510121897e-05, "loss": 3.1309, "mean_token_accuracy": 0.4302226036787033, "num_tokens": 1028505600.0, "step": 27900 }, { "epoch": 7.409766454352441, "grad_norm": 0.5234375, "learning_rate": 5.175551490352268e-05, "loss": 3.1343, "mean_token_accuracy": 0.4284531980752945, "num_tokens": 1029242880.0, "step": 27920 }, { "epoch": 7.415074309978769, "grad_norm": 0.52734375, "learning_rate": 5.1556707396337795e-05, "loss": 3.099, "mean_token_accuracy": 0.43345564752817156, "num_tokens": 1029980160.0, "step": 27940 }, { "epoch": 7.420382165605096, "grad_norm": 0.51953125, "learning_rate": 5.1358203192188896e-05, "loss": 3.1076, "mean_token_accuracy": 0.4330805644392967, "num_tokens": 1030717440.0, "step": 27960 }, { "epoch": 7.425690021231422, "grad_norm": 0.51171875, "learning_rate": 5.11600029026658e-05, "loss": 3.1188, "mean_token_accuracy": 0.4322583690285683, "num_tokens": 1031454720.0, "step": 27980 }, { "epoch": 7.43099787685775, "grad_norm": 0.53125, "learning_rate": 5.096210713842213e-05, "loss": 3.1477, "mean_token_accuracy": 0.42786611169576644, "num_tokens": 1032192000.0, "step": 28000 }, { "epoch": 7.436305732484076, "grad_norm": 0.53125, "learning_rate": 5.076451650917321e-05, "loss": 3.1022, "mean_token_accuracy": 0.4345672965049744, "num_tokens": 1032929280.0, "step": 28020 }, { "epoch": 7.441613588110403, "grad_norm": 0.51171875, "learning_rate": 5.056723162369431e-05, "loss": 3.1308, "mean_token_accuracy": 0.4307920202612877, "num_tokens": 1033666560.0, "step": 28040 }, { "epoch": 7.446921443736731, "grad_norm": 0.56640625, "learning_rate": 5.037025308981863e-05, "loss": 3.127, "mean_token_accuracy": 0.43095373958349226, "num_tokens": 1034403840.0, "step": 28060 }, { "epoch": 7.452229299363057, "grad_norm": 0.53125, "learning_rate": 5.017358151443556e-05, "loss": 3.1027, "mean_token_accuracy": 0.43357387334108355, "num_tokens": 1035141120.0, "step": 28080 }, { "epoch": 7.457537154989384, "grad_norm": 0.51953125, "learning_rate": 4.99772175034887e-05, "loss": 3.1276, "mean_token_accuracy": 0.43118884563446047, "num_tokens": 1035878400.0, "step": 28100 }, { "epoch": 7.462845010615712, "grad_norm": 0.546875, "learning_rate": 4.978116166197409e-05, "loss": 3.1136, "mean_token_accuracy": 0.43120243549346926, "num_tokens": 1036615680.0, "step": 28120 }, { "epoch": 7.468152866242038, "grad_norm": 0.515625, "learning_rate": 4.9585414593938276e-05, "loss": 3.1233, "mean_token_accuracy": 0.43120243698358535, "num_tokens": 1037352960.0, "step": 28140 }, { "epoch": 7.473460721868365, "grad_norm": 0.5390625, "learning_rate": 4.9389976902476506e-05, "loss": 3.1067, "mean_token_accuracy": 0.43232904076576234, "num_tokens": 1038090240.0, "step": 28160 }, { "epoch": 7.478768577494693, "grad_norm": 0.54296875, "learning_rate": 4.919484918973085e-05, "loss": 3.1285, "mean_token_accuracy": 0.4302715271711349, "num_tokens": 1038827520.0, "step": 28180 }, { "epoch": 7.484076433121019, "grad_norm": 0.5234375, "learning_rate": 4.900003205688822e-05, "loss": 3.1164, "mean_token_accuracy": 0.4320558786392212, "num_tokens": 1039564800.0, "step": 28200 }, { "epoch": 7.489384288747346, "grad_norm": 0.5234375, "learning_rate": 4.8805526104178774e-05, "loss": 3.1091, "mean_token_accuracy": 0.432740817964077, "num_tokens": 1040302080.0, "step": 28220 }, { "epoch": 7.494692144373673, "grad_norm": 0.546875, "learning_rate": 4.8611331930873866e-05, "loss": 3.0887, "mean_token_accuracy": 0.435272616147995, "num_tokens": 1041039360.0, "step": 28240 }, { "epoch": 7.5, "grad_norm": 0.53125, "learning_rate": 4.841745013528427e-05, "loss": 3.1071, "mean_token_accuracy": 0.4324132949113846, "num_tokens": 1041776640.0, "step": 28260 }, { "epoch": 7.505307855626327, "grad_norm": 0.5078125, "learning_rate": 4.82238813147583e-05, "loss": 3.1265, "mean_token_accuracy": 0.42957979887723924, "num_tokens": 1042513920.0, "step": 28280 }, { "epoch": 7.510615711252654, "grad_norm": 0.51953125, "learning_rate": 4.8030626065680065e-05, "loss": 3.1215, "mean_token_accuracy": 0.4310706153512001, "num_tokens": 1043251200.0, "step": 28300 }, { "epoch": 7.515923566878981, "grad_norm": 0.52734375, "learning_rate": 4.78376849834675e-05, "loss": 3.1202, "mean_token_accuracy": 0.43079609870910646, "num_tokens": 1043988480.0, "step": 28320 }, { "epoch": 7.521231422505307, "grad_norm": 0.515625, "learning_rate": 4.76450586625706e-05, "loss": 3.1175, "mean_token_accuracy": 0.43043324500322344, "num_tokens": 1044725760.0, "step": 28340 }, { "epoch": 7.526539278131635, "grad_norm": 0.5390625, "learning_rate": 4.7452747696469584e-05, "loss": 3.1312, "mean_token_accuracy": 0.43019542545080186, "num_tokens": 1045463040.0, "step": 28360 }, { "epoch": 7.531847133757962, "grad_norm": 0.53125, "learning_rate": 4.726075267767311e-05, "loss": 3.0965, "mean_token_accuracy": 0.43414057195186617, "num_tokens": 1046200320.0, "step": 28380 }, { "epoch": 7.537154989384288, "grad_norm": 0.515625, "learning_rate": 4.7069074197716337e-05, "loss": 3.11, "mean_token_accuracy": 0.4327870190143585, "num_tokens": 1046937600.0, "step": 28400 }, { "epoch": 7.542462845010616, "grad_norm": 0.53125, "learning_rate": 4.687771284715929e-05, "loss": 3.1295, "mean_token_accuracy": 0.4299671098589897, "num_tokens": 1047674880.0, "step": 28420 }, { "epoch": 7.547770700636943, "grad_norm": 0.5390625, "learning_rate": 4.6686669215584674e-05, "loss": 3.1154, "mean_token_accuracy": 0.43205452263355254, "num_tokens": 1048412160.0, "step": 28440 }, { "epoch": 7.553078556263269, "grad_norm": 0.52734375, "learning_rate": 4.649594389159664e-05, "loss": 3.1043, "mean_token_accuracy": 0.43399380147457123, "num_tokens": 1049149440.0, "step": 28460 }, { "epoch": 7.558386411889597, "grad_norm": 0.53125, "learning_rate": 4.6305537462818274e-05, "loss": 3.1141, "mean_token_accuracy": 0.43188057094812393, "num_tokens": 1049886720.0, "step": 28480 }, { "epoch": 7.563694267515924, "grad_norm": 0.53515625, "learning_rate": 4.611545051589056e-05, "loss": 3.0977, "mean_token_accuracy": 0.43408485800027846, "num_tokens": 1050624000.0, "step": 28500 }, { "epoch": 7.56900212314225, "grad_norm": 0.53125, "learning_rate": 4.5925683636469704e-05, "loss": 3.1333, "mean_token_accuracy": 0.42929984629154205, "num_tokens": 1051361280.0, "step": 28520 }, { "epoch": 7.574309978768578, "grad_norm": 0.53125, "learning_rate": 4.573623740922622e-05, "loss": 3.105, "mean_token_accuracy": 0.433645898103714, "num_tokens": 1052098560.0, "step": 28540 }, { "epoch": 7.579617834394904, "grad_norm": 0.5234375, "learning_rate": 4.554711241784236e-05, "loss": 3.1196, "mean_token_accuracy": 0.43104479312896726, "num_tokens": 1052835840.0, "step": 28560 }, { "epoch": 7.584925690021231, "grad_norm": 0.5078125, "learning_rate": 4.535830924501083e-05, "loss": 3.1063, "mean_token_accuracy": 0.43181262165308, "num_tokens": 1053573120.0, "step": 28580 }, { "epoch": 7.590233545647559, "grad_norm": 0.53125, "learning_rate": 4.516982847243277e-05, "loss": 3.1294, "mean_token_accuracy": 0.4305392473936081, "num_tokens": 1054310400.0, "step": 28600 }, { "epoch": 7.595541401273885, "grad_norm": 0.515625, "learning_rate": 4.4981670680816014e-05, "loss": 3.1174, "mean_token_accuracy": 0.4315815955400467, "num_tokens": 1055047680.0, "step": 28620 }, { "epoch": 7.600849256900212, "grad_norm": 0.50390625, "learning_rate": 4.479383644987331e-05, "loss": 3.1312, "mean_token_accuracy": 0.42999972850084306, "num_tokens": 1055784960.0, "step": 28640 }, { "epoch": 7.606157112526539, "grad_norm": 0.5390625, "learning_rate": 4.460632635832037e-05, "loss": 3.1083, "mean_token_accuracy": 0.43263481110334395, "num_tokens": 1056522240.0, "step": 28660 }, { "epoch": 7.611464968152866, "grad_norm": 0.51953125, "learning_rate": 4.4419140983874554e-05, "loss": 3.1128, "mean_token_accuracy": 0.4320069566369057, "num_tokens": 1057259520.0, "step": 28680 }, { "epoch": 7.616772823779193, "grad_norm": 0.5234375, "learning_rate": 4.423228090325236e-05, "loss": 3.1244, "mean_token_accuracy": 0.43123776465654373, "num_tokens": 1057996800.0, "step": 28700 }, { "epoch": 7.62208067940552, "grad_norm": 0.5078125, "learning_rate": 4.404574669216851e-05, "loss": 3.0709, "mean_token_accuracy": 0.4368368163704872, "num_tokens": 1058734080.0, "step": 28720 }, { "epoch": 7.627388535031847, "grad_norm": 0.53125, "learning_rate": 4.385953892533326e-05, "loss": 3.122, "mean_token_accuracy": 0.43028783351182937, "num_tokens": 1059471360.0, "step": 28740 }, { "epoch": 7.632696390658174, "grad_norm": 0.52734375, "learning_rate": 4.3673658176451546e-05, "loss": 3.1205, "mean_token_accuracy": 0.4294004112482071, "num_tokens": 1060208640.0, "step": 28760 }, { "epoch": 7.638004246284501, "grad_norm": 0.52734375, "learning_rate": 4.348810501822038e-05, "loss": 3.1006, "mean_token_accuracy": 0.43382664769887924, "num_tokens": 1060945920.0, "step": 28780 }, { "epoch": 7.643312101910828, "grad_norm": 0.53515625, "learning_rate": 4.330288002232768e-05, "loss": 3.1217, "mean_token_accuracy": 0.4311657458543777, "num_tokens": 1061683200.0, "step": 28800 }, { "epoch": 7.648619957537155, "grad_norm": 0.53125, "learning_rate": 4.3117983759450245e-05, "loss": 3.1307, "mean_token_accuracy": 0.430170963704586, "num_tokens": 1062420480.0, "step": 28820 }, { "epoch": 7.653927813163482, "grad_norm": 0.515625, "learning_rate": 4.2933416799252064e-05, "loss": 3.1374, "mean_token_accuracy": 0.4288269177079201, "num_tokens": 1063157760.0, "step": 28840 }, { "epoch": 7.659235668789809, "grad_norm": 0.53515625, "learning_rate": 4.2749179710382535e-05, "loss": 3.1099, "mean_token_accuracy": 0.43225701302289965, "num_tokens": 1063895040.0, "step": 28860 }, { "epoch": 7.6645435244161355, "grad_norm": 0.5234375, "learning_rate": 4.2565273060474734e-05, "loss": 3.1136, "mean_token_accuracy": 0.4327571228146553, "num_tokens": 1064632320.0, "step": 28880 }, { "epoch": 7.669851380042463, "grad_norm": 0.52734375, "learning_rate": 4.238169741614365e-05, "loss": 3.1161, "mean_token_accuracy": 0.43164002597332, "num_tokens": 1065369600.0, "step": 28900 }, { "epoch": 7.67515923566879, "grad_norm": 0.53515625, "learning_rate": 4.2198453342984434e-05, "loss": 3.103, "mean_token_accuracy": 0.432827790081501, "num_tokens": 1066106880.0, "step": 28920 }, { "epoch": 7.6804670912951165, "grad_norm": 0.51953125, "learning_rate": 4.201554140557069e-05, "loss": 3.1204, "mean_token_accuracy": 0.43150549232959745, "num_tokens": 1066844160.0, "step": 28940 }, { "epoch": 7.685774946921444, "grad_norm": 0.51953125, "learning_rate": 4.183296216745271e-05, "loss": 3.1418, "mean_token_accuracy": 0.4289927169680595, "num_tokens": 1067581440.0, "step": 28960 }, { "epoch": 7.69108280254777, "grad_norm": 0.515625, "learning_rate": 4.165071619115577e-05, "loss": 3.1086, "mean_token_accuracy": 0.43389324098825455, "num_tokens": 1068318720.0, "step": 28980 }, { "epoch": 7.6963906581740975, "grad_norm": 0.53515625, "learning_rate": 4.1468804038178265e-05, "loss": 3.1223, "mean_token_accuracy": 0.4307702749967575, "num_tokens": 1069056000.0, "step": 29000 }, { "epoch": 7.701698513800425, "grad_norm": 0.52734375, "learning_rate": 4.1287226268990184e-05, "loss": 3.128, "mean_token_accuracy": 0.43051206320524216, "num_tokens": 1069793280.0, "step": 29020 }, { "epoch": 7.707006369426751, "grad_norm": 0.52734375, "learning_rate": 4.1105983443031275e-05, "loss": 3.1328, "mean_token_accuracy": 0.4291585132479668, "num_tokens": 1070530560.0, "step": 29040 }, { "epoch": 7.7123142250530785, "grad_norm": 0.5234375, "learning_rate": 4.092507611870929e-05, "loss": 3.0799, "mean_token_accuracy": 0.4351272016763687, "num_tokens": 1071267840.0, "step": 29060 }, { "epoch": 7.717622080679406, "grad_norm": 0.5390625, "learning_rate": 4.074450485339832e-05, "loss": 3.1353, "mean_token_accuracy": 0.42902125865221025, "num_tokens": 1072005120.0, "step": 29080 }, { "epoch": 7.722929936305732, "grad_norm": 0.5234375, "learning_rate": 4.056427020343706e-05, "loss": 3.1168, "mean_token_accuracy": 0.43121330738067626, "num_tokens": 1072742400.0, "step": 29100 }, { "epoch": 7.7282377919320595, "grad_norm": 0.50390625, "learning_rate": 4.03843727241271e-05, "loss": 3.1143, "mean_token_accuracy": 0.43222847282886506, "num_tokens": 1073479680.0, "step": 29120 }, { "epoch": 7.733545647558387, "grad_norm": 0.5234375, "learning_rate": 4.020481296973119e-05, "loss": 3.1167, "mean_token_accuracy": 0.43183300495147703, "num_tokens": 1074216960.0, "step": 29140 }, { "epoch": 7.738853503184713, "grad_norm": 0.5234375, "learning_rate": 4.0025591493471604e-05, "loss": 3.1191, "mean_token_accuracy": 0.43192406445741655, "num_tokens": 1074954240.0, "step": 29160 }, { "epoch": 7.7441613588110405, "grad_norm": 0.52734375, "learning_rate": 3.9846708847528354e-05, "loss": 3.1188, "mean_token_accuracy": 0.4322542935609818, "num_tokens": 1075691520.0, "step": 29180 }, { "epoch": 7.749469214437367, "grad_norm": 0.53125, "learning_rate": 3.9668165583037394e-05, "loss": 3.1315, "mean_token_accuracy": 0.4301220402121544, "num_tokens": 1076428800.0, "step": 29200 }, { "epoch": 7.754777070063694, "grad_norm": 0.498046875, "learning_rate": 3.948996225008934e-05, "loss": 3.1143, "mean_token_accuracy": 0.43169439136981963, "num_tokens": 1077166080.0, "step": 29220 }, { "epoch": 7.7600849256900215, "grad_norm": 0.53515625, "learning_rate": 3.931209939772714e-05, "loss": 3.1293, "mean_token_accuracy": 0.43014242351055143, "num_tokens": 1077903360.0, "step": 29240 }, { "epoch": 7.765392781316348, "grad_norm": 0.5390625, "learning_rate": 3.9134577573945066e-05, "loss": 3.1197, "mean_token_accuracy": 0.43117389678955076, "num_tokens": 1078640640.0, "step": 29260 }, { "epoch": 7.770700636942675, "grad_norm": 0.5390625, "learning_rate": 3.89573973256863e-05, "loss": 3.1371, "mean_token_accuracy": 0.42919384986162185, "num_tokens": 1079377920.0, "step": 29280 }, { "epoch": 7.7760084925690025, "grad_norm": 0.5234375, "learning_rate": 3.8780559198842056e-05, "loss": 3.0799, "mean_token_accuracy": 0.4360567510128021, "num_tokens": 1080115200.0, "step": 29300 }, { "epoch": 7.781316348195329, "grad_norm": 0.515625, "learning_rate": 3.86040637382491e-05, "loss": 3.1175, "mean_token_accuracy": 0.4321197524666786, "num_tokens": 1080852480.0, "step": 29320 }, { "epoch": 7.786624203821656, "grad_norm": 0.5234375, "learning_rate": 3.8427911487688654e-05, "loss": 3.1117, "mean_token_accuracy": 0.4317609816789627, "num_tokens": 1081589760.0, "step": 29340 }, { "epoch": 7.7919320594479835, "grad_norm": 0.52734375, "learning_rate": 3.825210298988445e-05, "loss": 3.1128, "mean_token_accuracy": 0.4311168178915977, "num_tokens": 1082327040.0, "step": 29360 }, { "epoch": 7.79723991507431, "grad_norm": 0.54296875, "learning_rate": 3.8076638786501165e-05, "loss": 3.1049, "mean_token_accuracy": 0.43285496681928637, "num_tokens": 1083064320.0, "step": 29380 }, { "epoch": 7.802547770700637, "grad_norm": 0.51171875, "learning_rate": 3.790151941814269e-05, "loss": 3.1002, "mean_token_accuracy": 0.43299630433321, "num_tokens": 1083801600.0, "step": 29400 }, { "epoch": 7.807855626326964, "grad_norm": 0.51953125, "learning_rate": 3.7726745424350356e-05, "loss": 3.148, "mean_token_accuracy": 0.4274176448583603, "num_tokens": 1084538880.0, "step": 29420 }, { "epoch": 7.813163481953291, "grad_norm": 0.52734375, "learning_rate": 3.755231734360167e-05, "loss": 3.0935, "mean_token_accuracy": 0.43495596945285797, "num_tokens": 1085276160.0, "step": 29440 }, { "epoch": 7.818471337579618, "grad_norm": 0.53125, "learning_rate": 3.737823571330804e-05, "loss": 3.1282, "mean_token_accuracy": 0.43050799071788787, "num_tokens": 1086013440.0, "step": 29460 }, { "epoch": 7.823779193205945, "grad_norm": 0.5234375, "learning_rate": 3.720450106981385e-05, "loss": 3.0956, "mean_token_accuracy": 0.43426152169704435, "num_tokens": 1086750720.0, "step": 29480 }, { "epoch": 7.829087048832272, "grad_norm": 0.53125, "learning_rate": 3.7031113948394006e-05, "loss": 3.1201, "mean_token_accuracy": 0.43180854618549347, "num_tokens": 1087488000.0, "step": 29500 }, { "epoch": 7.834394904458598, "grad_norm": 0.5, "learning_rate": 3.68580748832531e-05, "loss": 3.1055, "mean_token_accuracy": 0.43347194939851763, "num_tokens": 1088225280.0, "step": 29520 }, { "epoch": 7.839702760084926, "grad_norm": 0.51953125, "learning_rate": 3.668538440752305e-05, "loss": 3.1185, "mean_token_accuracy": 0.43114127963781357, "num_tokens": 1088962560.0, "step": 29540 }, { "epoch": 7.845010615711253, "grad_norm": 0.5234375, "learning_rate": 3.6513043053261945e-05, "loss": 3.1337, "mean_token_accuracy": 0.4290280506014824, "num_tokens": 1089699840.0, "step": 29560 }, { "epoch": 7.850318471337579, "grad_norm": 0.52734375, "learning_rate": 3.634105135145219e-05, "loss": 3.0888, "mean_token_accuracy": 0.435665363073349, "num_tokens": 1090437120.0, "step": 29580 }, { "epoch": 7.855626326963907, "grad_norm": 0.53125, "learning_rate": 3.6169409831998876e-05, "loss": 3.1231, "mean_token_accuracy": 0.43175282776355745, "num_tokens": 1091174400.0, "step": 29600 }, { "epoch": 7.860934182590234, "grad_norm": 0.5078125, "learning_rate": 3.5998119023728255e-05, "loss": 3.1006, "mean_token_accuracy": 0.43458768129348757, "num_tokens": 1091911680.0, "step": 29620 }, { "epoch": 7.86624203821656, "grad_norm": 0.5234375, "learning_rate": 3.5827179454386e-05, "loss": 3.0807, "mean_token_accuracy": 0.4365690886974335, "num_tokens": 1092648960.0, "step": 29640 }, { "epoch": 7.871549893842888, "grad_norm": 0.5390625, "learning_rate": 3.565659165063561e-05, "loss": 3.0989, "mean_token_accuracy": 0.43321102410554885, "num_tokens": 1093386240.0, "step": 29660 }, { "epoch": 7.876857749469215, "grad_norm": 0.51171875, "learning_rate": 3.548635613805682e-05, "loss": 3.1484, "mean_token_accuracy": 0.4289804846048355, "num_tokens": 1094123520.0, "step": 29680 }, { "epoch": 7.882165605095541, "grad_norm": 0.53515625, "learning_rate": 3.531647344114395e-05, "loss": 3.1225, "mean_token_accuracy": 0.43258316814899445, "num_tokens": 1094860800.0, "step": 29700 }, { "epoch": 7.887473460721869, "grad_norm": 0.515625, "learning_rate": 3.5146944083304295e-05, "loss": 3.1218, "mean_token_accuracy": 0.43016280680894853, "num_tokens": 1095598080.0, "step": 29720 }, { "epoch": 7.892781316348195, "grad_norm": 0.52734375, "learning_rate": 3.497776858685651e-05, "loss": 3.1147, "mean_token_accuracy": 0.43113448172807695, "num_tokens": 1096335360.0, "step": 29740 }, { "epoch": 7.898089171974522, "grad_norm": 0.5234375, "learning_rate": 3.480894747302908e-05, "loss": 3.1048, "mean_token_accuracy": 0.4321360632777214, "num_tokens": 1097072640.0, "step": 29760 }, { "epoch": 7.90339702760085, "grad_norm": 0.54296875, "learning_rate": 3.464048126195846e-05, "loss": 3.1156, "mean_token_accuracy": 0.43192269951105117, "num_tokens": 1097809920.0, "step": 29780 }, { "epoch": 7.908704883227176, "grad_norm": 0.53125, "learning_rate": 3.4472370472687824e-05, "loss": 3.1199, "mean_token_accuracy": 0.4307526096701622, "num_tokens": 1098547200.0, "step": 29800 }, { "epoch": 7.914012738853503, "grad_norm": 0.52734375, "learning_rate": 3.430461562316522e-05, "loss": 3.095, "mean_token_accuracy": 0.43481463342905047, "num_tokens": 1099284480.0, "step": 29820 }, { "epoch": 7.91932059447983, "grad_norm": 0.53515625, "learning_rate": 3.413721723024209e-05, "loss": 3.0959, "mean_token_accuracy": 0.4340699061751366, "num_tokens": 1100021760.0, "step": 29840 }, { "epoch": 7.924628450106157, "grad_norm": 0.51171875, "learning_rate": 3.397017580967159e-05, "loss": 3.1123, "mean_token_accuracy": 0.43290660679340365, "num_tokens": 1100759040.0, "step": 29860 }, { "epoch": 7.929936305732484, "grad_norm": 0.53125, "learning_rate": 3.3803491876107066e-05, "loss": 3.1245, "mean_token_accuracy": 0.43108556419610977, "num_tokens": 1101496320.0, "step": 29880 }, { "epoch": 7.935244161358811, "grad_norm": 0.5390625, "learning_rate": 3.3637165943100464e-05, "loss": 3.1158, "mean_token_accuracy": 0.43222711235284805, "num_tokens": 1102233600.0, "step": 29900 }, { "epoch": 7.940552016985138, "grad_norm": 0.51953125, "learning_rate": 3.347119852310069e-05, "loss": 3.1131, "mean_token_accuracy": 0.43230185806751253, "num_tokens": 1102970880.0, "step": 29920 }, { "epoch": 7.945859872611465, "grad_norm": 0.51953125, "learning_rate": 3.3305590127452134e-05, "loss": 3.1102, "mean_token_accuracy": 0.4318044677376747, "num_tokens": 1103708160.0, "step": 29940 }, { "epoch": 7.951167728237792, "grad_norm": 0.5234375, "learning_rate": 3.314034126639299e-05, "loss": 3.1111, "mean_token_accuracy": 0.4332259729504585, "num_tokens": 1104445440.0, "step": 29960 }, { "epoch": 7.956475583864119, "grad_norm": 0.515625, "learning_rate": 3.2975452449053783e-05, "loss": 3.113, "mean_token_accuracy": 0.4313954100012779, "num_tokens": 1105182720.0, "step": 29980 }, { "epoch": 7.961783439490446, "grad_norm": 0.51953125, "learning_rate": 3.281092418345559e-05, "loss": 3.0921, "mean_token_accuracy": 0.4351204067468643, "num_tokens": 1105920000.0, "step": 30000 }, { "epoch": 7.967091295116773, "grad_norm": 0.5078125, "learning_rate": 3.26467569765089e-05, "loss": 3.1182, "mean_token_accuracy": 0.430518864095211, "num_tokens": 1106657280.0, "step": 30020 }, { "epoch": 7.9723991507431, "grad_norm": 0.51171875, "learning_rate": 3.2482951334011484e-05, "loss": 3.0888, "mean_token_accuracy": 0.4351136103272438, "num_tokens": 1107394560.0, "step": 30040 }, { "epoch": 7.977707006369426, "grad_norm": 0.515625, "learning_rate": 3.2319507760647405e-05, "loss": 3.1105, "mean_token_accuracy": 0.4316930338740349, "num_tokens": 1108131840.0, "step": 30060 }, { "epoch": 7.983014861995754, "grad_norm": 0.53125, "learning_rate": 3.2156426759984944e-05, "loss": 3.1131, "mean_token_accuracy": 0.43254647254943845, "num_tokens": 1108869120.0, "step": 30080 }, { "epoch": 7.988322717622081, "grad_norm": 0.53515625, "learning_rate": 3.1993708834475576e-05, "loss": 3.1018, "mean_token_accuracy": 0.4330656126141548, "num_tokens": 1109606400.0, "step": 30100 }, { "epoch": 7.993630573248407, "grad_norm": 0.53125, "learning_rate": 3.183135448545187e-05, "loss": 3.1083, "mean_token_accuracy": 0.4327503263950348, "num_tokens": 1110343680.0, "step": 30120 }, { "epoch": 7.998938428874735, "grad_norm": 0.5390625, "learning_rate": 3.166936421312637e-05, "loss": 3.114, "mean_token_accuracy": 0.4327299416065216, "num_tokens": 1111080960.0, "step": 30140 }, { "epoch": 8.004246284501061, "grad_norm": 0.5390625, "learning_rate": 3.1507738516589884e-05, "loss": 3.0439, "mean_token_accuracy": 0.4424317806959152, "num_tokens": 1111818240.0, "step": 30160 }, { "epoch": 8.009554140127388, "grad_norm": 0.515625, "learning_rate": 3.1346477893809955e-05, "loss": 3.0727, "mean_token_accuracy": 0.4389473259449005, "num_tokens": 1112555520.0, "step": 30180 }, { "epoch": 8.014861995753716, "grad_norm": 0.51953125, "learning_rate": 3.1185582841629384e-05, "loss": 3.0453, "mean_token_accuracy": 0.44230947047472, "num_tokens": 1113292800.0, "step": 30200 }, { "epoch": 8.020169851380043, "grad_norm": 0.52734375, "learning_rate": 3.102505385576446e-05, "loss": 3.0371, "mean_token_accuracy": 0.4422754943370819, "num_tokens": 1114030080.0, "step": 30220 }, { "epoch": 8.02547770700637, "grad_norm": 0.53125, "learning_rate": 3.0864891430803944e-05, "loss": 3.0619, "mean_token_accuracy": 0.44077244848012925, "num_tokens": 1114767360.0, "step": 30240 }, { "epoch": 8.030785562632696, "grad_norm": 0.53125, "learning_rate": 3.0705096060206886e-05, "loss": 3.0757, "mean_token_accuracy": 0.4379987522959709, "num_tokens": 1115504640.0, "step": 30260 }, { "epoch": 8.036093418259023, "grad_norm": 0.52734375, "learning_rate": 3.0545668236301776e-05, "loss": 3.0513, "mean_token_accuracy": 0.4417713075876236, "num_tokens": 1116241920.0, "step": 30280 }, { "epoch": 8.04140127388535, "grad_norm": 0.51953125, "learning_rate": 3.0386608450284344e-05, "loss": 3.0199, "mean_token_accuracy": 0.4446007266640663, "num_tokens": 1116979200.0, "step": 30300 }, { "epoch": 8.046709129511678, "grad_norm": 0.53515625, "learning_rate": 3.0227917192216755e-05, "loss": 3.0477, "mean_token_accuracy": 0.44074255377054217, "num_tokens": 1117716480.0, "step": 30320 }, { "epoch": 8.052016985138005, "grad_norm": 0.515625, "learning_rate": 3.0069594951025457e-05, "loss": 3.0378, "mean_token_accuracy": 0.44205805510282514, "num_tokens": 1118453760.0, "step": 30340 }, { "epoch": 8.05732484076433, "grad_norm": 0.515625, "learning_rate": 2.9911642214500104e-05, "loss": 3.0335, "mean_token_accuracy": 0.4426098093390465, "num_tokens": 1119191040.0, "step": 30360 }, { "epoch": 8.062632696390658, "grad_norm": 0.515625, "learning_rate": 2.975405946929187e-05, "loss": 3.0509, "mean_token_accuracy": 0.44109589159488677, "num_tokens": 1119928320.0, "step": 30380 }, { "epoch": 8.067940552016985, "grad_norm": 0.515625, "learning_rate": 2.9596847200912e-05, "loss": 3.0158, "mean_token_accuracy": 0.4454134076833725, "num_tokens": 1120665600.0, "step": 30400 }, { "epoch": 8.073248407643312, "grad_norm": 0.54296875, "learning_rate": 2.944000589373029e-05, "loss": 3.0521, "mean_token_accuracy": 0.4404666796326637, "num_tokens": 1121402880.0, "step": 30420 }, { "epoch": 8.07855626326964, "grad_norm": 0.515625, "learning_rate": 2.928353603097363e-05, "loss": 3.0148, "mean_token_accuracy": 0.44638915956020353, "num_tokens": 1122140160.0, "step": 30440 }, { "epoch": 8.083864118895965, "grad_norm": 0.52734375, "learning_rate": 2.9127438094724483e-05, "loss": 3.0391, "mean_token_accuracy": 0.44218715876340864, "num_tokens": 1122877440.0, "step": 30460 }, { "epoch": 8.089171974522293, "grad_norm": 0.51953125, "learning_rate": 2.8971712565919363e-05, "loss": 3.0238, "mean_token_accuracy": 0.4462573379278183, "num_tokens": 1123614720.0, "step": 30480 }, { "epoch": 8.09447983014862, "grad_norm": 0.53125, "learning_rate": 2.8816359924347487e-05, "loss": 3.0245, "mean_token_accuracy": 0.4441522628068924, "num_tokens": 1124352000.0, "step": 30500 }, { "epoch": 8.099787685774947, "grad_norm": 0.5234375, "learning_rate": 2.8661380648649162e-05, "loss": 3.0508, "mean_token_accuracy": 0.44161502420902254, "num_tokens": 1125089280.0, "step": 30520 }, { "epoch": 8.105095541401274, "grad_norm": 0.52734375, "learning_rate": 2.850677521631427e-05, "loss": 3.0307, "mean_token_accuracy": 0.4439606428146362, "num_tokens": 1125826560.0, "step": 30540 }, { "epoch": 8.110403397027602, "grad_norm": 0.52734375, "learning_rate": 2.835254410368113e-05, "loss": 3.032, "mean_token_accuracy": 0.44320911914110184, "num_tokens": 1126563840.0, "step": 30560 }, { "epoch": 8.115711252653927, "grad_norm": 0.515625, "learning_rate": 2.819868778593448e-05, "loss": 3.0122, "mean_token_accuracy": 0.44591487348079684, "num_tokens": 1127301120.0, "step": 30580 }, { "epoch": 8.121019108280255, "grad_norm": 0.5234375, "learning_rate": 2.8045206737104524e-05, "loss": 3.0236, "mean_token_accuracy": 0.44325532764196396, "num_tokens": 1128038400.0, "step": 30600 }, { "epoch": 8.126326963906582, "grad_norm": 0.51171875, "learning_rate": 2.7892101430065217e-05, "loss": 3.0436, "mean_token_accuracy": 0.44251739531755446, "num_tokens": 1128775680.0, "step": 30620 }, { "epoch": 8.13163481953291, "grad_norm": 0.51953125, "learning_rate": 2.773937233653282e-05, "loss": 3.0456, "mean_token_accuracy": 0.4424793466925621, "num_tokens": 1129512960.0, "step": 30640 }, { "epoch": 8.136942675159236, "grad_norm": 0.5546875, "learning_rate": 2.7587019927064534e-05, "loss": 3.0346, "mean_token_accuracy": 0.4439660802483559, "num_tokens": 1130250240.0, "step": 30660 }, { "epoch": 8.142250530785562, "grad_norm": 0.51953125, "learning_rate": 2.743504467105696e-05, "loss": 3.0543, "mean_token_accuracy": 0.4415715381503105, "num_tokens": 1130987520.0, "step": 30680 }, { "epoch": 8.14755838641189, "grad_norm": 0.5234375, "learning_rate": 2.7283447036744698e-05, "loss": 3.0397, "mean_token_accuracy": 0.4427701637148857, "num_tokens": 1131724800.0, "step": 30700 }, { "epoch": 8.152866242038217, "grad_norm": 0.5234375, "learning_rate": 2.713222749119893e-05, "loss": 3.0442, "mean_token_accuracy": 0.4420200064778328, "num_tokens": 1132462080.0, "step": 30720 }, { "epoch": 8.158174097664544, "grad_norm": 0.54296875, "learning_rate": 2.698138650032594e-05, "loss": 3.0478, "mean_token_accuracy": 0.44175499975681304, "num_tokens": 1133199360.0, "step": 30740 }, { "epoch": 8.163481953290871, "grad_norm": 0.51171875, "learning_rate": 2.6830924528865573e-05, "loss": 3.0526, "mean_token_accuracy": 0.44029816091060636, "num_tokens": 1133936640.0, "step": 30760 }, { "epoch": 8.168789808917197, "grad_norm": 0.51953125, "learning_rate": 2.6680842040390172e-05, "loss": 3.0111, "mean_token_accuracy": 0.44637013524770736, "num_tokens": 1134673920.0, "step": 30780 }, { "epoch": 8.174097664543524, "grad_norm": 0.52734375, "learning_rate": 2.653113949730257e-05, "loss": 3.0419, "mean_token_accuracy": 0.44378533214330673, "num_tokens": 1135411200.0, "step": 30800 }, { "epoch": 8.179405520169851, "grad_norm": 0.51953125, "learning_rate": 2.638181736083536e-05, "loss": 3.0556, "mean_token_accuracy": 0.44185692369937896, "num_tokens": 1136148480.0, "step": 30820 }, { "epoch": 8.184713375796179, "grad_norm": 0.51171875, "learning_rate": 2.6232876091048712e-05, "loss": 3.0145, "mean_token_accuracy": 0.4443058282136917, "num_tokens": 1136885760.0, "step": 30840 }, { "epoch": 8.190021231422506, "grad_norm": 0.5234375, "learning_rate": 2.6084316146829713e-05, "loss": 3.0225, "mean_token_accuracy": 0.44618803113698957, "num_tokens": 1137623040.0, "step": 30860 }, { "epoch": 8.195329087048833, "grad_norm": 0.53515625, "learning_rate": 2.5936137985890326e-05, "loss": 3.0467, "mean_token_accuracy": 0.4414682537317276, "num_tokens": 1138360320.0, "step": 30880 }, { "epoch": 8.200636942675159, "grad_norm": 0.5078125, "learning_rate": 2.5788342064766338e-05, "loss": 3.0397, "mean_token_accuracy": 0.4425024434924126, "num_tokens": 1139097600.0, "step": 30900 }, { "epoch": 8.205944798301486, "grad_norm": 0.53125, "learning_rate": 2.5640928838815898e-05, "loss": 3.0342, "mean_token_accuracy": 0.4437622338533401, "num_tokens": 1139834880.0, "step": 30920 }, { "epoch": 8.211252653927813, "grad_norm": 0.53125, "learning_rate": 2.5493898762218008e-05, "loss": 3.0266, "mean_token_accuracy": 0.4457042276859283, "num_tokens": 1140572160.0, "step": 30940 }, { "epoch": 8.21656050955414, "grad_norm": 0.52734375, "learning_rate": 2.5347252287971265e-05, "loss": 3.0316, "mean_token_accuracy": 0.4445205479860306, "num_tokens": 1141309440.0, "step": 30960 }, { "epoch": 8.221868365180468, "grad_norm": 0.515625, "learning_rate": 2.5200989867892223e-05, "loss": 3.0378, "mean_token_accuracy": 0.4425432190299034, "num_tokens": 1142046720.0, "step": 30980 }, { "epoch": 8.227176220806793, "grad_norm": 0.5234375, "learning_rate": 2.5055111952614432e-05, "loss": 3.0686, "mean_token_accuracy": 0.4386279657483101, "num_tokens": 1142784000.0, "step": 31000 }, { "epoch": 8.23248407643312, "grad_norm": 0.51953125, "learning_rate": 2.49096189915865e-05, "loss": 3.048, "mean_token_accuracy": 0.4402003139257431, "num_tokens": 1143521280.0, "step": 31020 }, { "epoch": 8.237791932059448, "grad_norm": 0.5390625, "learning_rate": 2.4764511433071273e-05, "loss": 3.0241, "mean_token_accuracy": 0.444570828974247, "num_tokens": 1144258560.0, "step": 31040 }, { "epoch": 8.243099787685775, "grad_norm": 0.5078125, "learning_rate": 2.461978972414389e-05, "loss": 3.033, "mean_token_accuracy": 0.4428856834769249, "num_tokens": 1144995840.0, "step": 31060 }, { "epoch": 8.248407643312103, "grad_norm": 0.52734375, "learning_rate": 2.4475454310691e-05, "loss": 3.0343, "mean_token_accuracy": 0.4431493282318115, "num_tokens": 1145733120.0, "step": 31080 }, { "epoch": 8.253715498938428, "grad_norm": 0.51953125, "learning_rate": 2.4331505637408782e-05, "loss": 3.0261, "mean_token_accuracy": 0.4445504456758499, "num_tokens": 1146470400.0, "step": 31100 }, { "epoch": 8.259023354564755, "grad_norm": 0.52734375, "learning_rate": 2.418794414780205e-05, "loss": 3.0358, "mean_token_accuracy": 0.4432362988591194, "num_tokens": 1147207680.0, "step": 31120 }, { "epoch": 8.264331210191083, "grad_norm": 0.51171875, "learning_rate": 2.4044770284182663e-05, "loss": 3.0423, "mean_token_accuracy": 0.4426750376820564, "num_tokens": 1147944960.0, "step": 31140 }, { "epoch": 8.26963906581741, "grad_norm": 0.5234375, "learning_rate": 2.390198448766819e-05, "loss": 3.0494, "mean_token_accuracy": 0.44255001097917557, "num_tokens": 1148682240.0, "step": 31160 }, { "epoch": 8.274946921443737, "grad_norm": 0.54296875, "learning_rate": 2.3759587198180603e-05, "loss": 3.0301, "mean_token_accuracy": 0.4435393542051315, "num_tokens": 1149419520.0, "step": 31180 }, { "epoch": 8.280254777070065, "grad_norm": 0.53125, "learning_rate": 2.3617578854444867e-05, "loss": 3.0383, "mean_token_accuracy": 0.44319417178630827, "num_tokens": 1150156800.0, "step": 31200 }, { "epoch": 8.28556263269639, "grad_norm": 0.53125, "learning_rate": 2.347595989398759e-05, "loss": 3.043, "mean_token_accuracy": 0.44220890253782275, "num_tokens": 1150894080.0, "step": 31220 }, { "epoch": 8.290870488322717, "grad_norm": 0.53125, "learning_rate": 2.3334730753135737e-05, "loss": 3.0314, "mean_token_accuracy": 0.44293188899755476, "num_tokens": 1151631360.0, "step": 31240 }, { "epoch": 8.296178343949045, "grad_norm": 0.51171875, "learning_rate": 2.3193891867015217e-05, "loss": 3.0328, "mean_token_accuracy": 0.4426465004682541, "num_tokens": 1152368640.0, "step": 31260 }, { "epoch": 8.301486199575372, "grad_norm": 0.515625, "learning_rate": 2.3053443669549542e-05, "loss": 3.0498, "mean_token_accuracy": 0.4422170609235764, "num_tokens": 1153105920.0, "step": 31280 }, { "epoch": 8.3067940552017, "grad_norm": 0.53125, "learning_rate": 2.2913386593458556e-05, "loss": 3.0335, "mean_token_accuracy": 0.4436086669564247, "num_tokens": 1153843200.0, "step": 31300 }, { "epoch": 8.312101910828025, "grad_norm": 0.53125, "learning_rate": 2.277372107025708e-05, "loss": 3.0268, "mean_token_accuracy": 0.44341432899236677, "num_tokens": 1154580480.0, "step": 31320 }, { "epoch": 8.317409766454352, "grad_norm": 0.5234375, "learning_rate": 2.263444753025342e-05, "loss": 3.0441, "mean_token_accuracy": 0.4423067539930344, "num_tokens": 1155317760.0, "step": 31340 }, { "epoch": 8.32271762208068, "grad_norm": 0.5234375, "learning_rate": 2.249556640254845e-05, "loss": 3.061, "mean_token_accuracy": 0.44002228826284406, "num_tokens": 1156055040.0, "step": 31360 }, { "epoch": 8.328025477707007, "grad_norm": 0.5234375, "learning_rate": 2.2357078115033738e-05, "loss": 3.0365, "mean_token_accuracy": 0.4448236033320427, "num_tokens": 1156792320.0, "step": 31380 }, { "epoch": 8.333333333333334, "grad_norm": 0.54296875, "learning_rate": 2.2218983094390697e-05, "loss": 3.0298, "mean_token_accuracy": 0.4433518186211586, "num_tokens": 1157529600.0, "step": 31400 }, { "epoch": 8.33864118895966, "grad_norm": 0.51953125, "learning_rate": 2.2081281766089015e-05, "loss": 3.0304, "mean_token_accuracy": 0.44322271049022677, "num_tokens": 1158266880.0, "step": 31420 }, { "epoch": 8.343949044585987, "grad_norm": 0.5078125, "learning_rate": 2.1943974554385474e-05, "loss": 3.0573, "mean_token_accuracy": 0.441566102206707, "num_tokens": 1159004160.0, "step": 31440 }, { "epoch": 8.349256900212314, "grad_norm": 0.5390625, "learning_rate": 2.1807061882322525e-05, "loss": 3.0248, "mean_token_accuracy": 0.44576402455568315, "num_tokens": 1159741440.0, "step": 31460 }, { "epoch": 8.354564755838641, "grad_norm": 0.5234375, "learning_rate": 2.1670544171727056e-05, "loss": 3.036, "mean_token_accuracy": 0.44409382343292236, "num_tokens": 1160478720.0, "step": 31480 }, { "epoch": 8.359872611464969, "grad_norm": 0.51171875, "learning_rate": 2.1534421843209165e-05, "loss": 3.0337, "mean_token_accuracy": 0.44428272545337677, "num_tokens": 1161216000.0, "step": 31500 }, { "epoch": 8.365180467091296, "grad_norm": 0.53125, "learning_rate": 2.1398695316160576e-05, "loss": 3.034, "mean_token_accuracy": 0.44349586963653564, "num_tokens": 1161953280.0, "step": 31520 }, { "epoch": 8.370488322717621, "grad_norm": 0.53125, "learning_rate": 2.126336500875383e-05, "loss": 3.0315, "mean_token_accuracy": 0.4432825043797493, "num_tokens": 1162690560.0, "step": 31540 }, { "epoch": 8.375796178343949, "grad_norm": 0.5390625, "learning_rate": 2.1128431337940454e-05, "loss": 3.0666, "mean_token_accuracy": 0.44078060537576674, "num_tokens": 1163427840.0, "step": 31560 }, { "epoch": 8.381104033970276, "grad_norm": 0.515625, "learning_rate": 2.0993894719450198e-05, "loss": 3.0466, "mean_token_accuracy": 0.44165443629026413, "num_tokens": 1164165120.0, "step": 31580 }, { "epoch": 8.386411889596603, "grad_norm": 0.51953125, "learning_rate": 2.085975556778921e-05, "loss": 3.067, "mean_token_accuracy": 0.43758833408355713, "num_tokens": 1164902400.0, "step": 31600 }, { "epoch": 8.39171974522293, "grad_norm": 0.53515625, "learning_rate": 2.072601429623933e-05, "loss": 3.0424, "mean_token_accuracy": 0.44372417628765104, "num_tokens": 1165639680.0, "step": 31620 }, { "epoch": 8.397027600849256, "grad_norm": 0.5234375, "learning_rate": 2.0592671316856314e-05, "loss": 3.0315, "mean_token_accuracy": 0.4430596336722374, "num_tokens": 1166376960.0, "step": 31640 }, { "epoch": 8.402335456475583, "grad_norm": 0.515625, "learning_rate": 2.0459727040468883e-05, "loss": 3.0462, "mean_token_accuracy": 0.44181615561246873, "num_tokens": 1167114240.0, "step": 31660 }, { "epoch": 8.40764331210191, "grad_norm": 0.498046875, "learning_rate": 2.032718187667734e-05, "loss": 3.0662, "mean_token_accuracy": 0.44017585515975954, "num_tokens": 1167851520.0, "step": 31680 }, { "epoch": 8.412951167728238, "grad_norm": 0.5234375, "learning_rate": 2.0195036233852357e-05, "loss": 3.033, "mean_token_accuracy": 0.44319961220026016, "num_tokens": 1168588800.0, "step": 31700 }, { "epoch": 8.418259023354565, "grad_norm": 0.5078125, "learning_rate": 2.0063290519133623e-05, "loss": 3.0395, "mean_token_accuracy": 0.4428870439529419, "num_tokens": 1169326080.0, "step": 31720 }, { "epoch": 8.423566878980893, "grad_norm": 0.5234375, "learning_rate": 1.9931945138428695e-05, "loss": 3.0296, "mean_token_accuracy": 0.44371194690465926, "num_tokens": 1170063360.0, "step": 31740 }, { "epoch": 8.428874734607218, "grad_norm": 0.51953125, "learning_rate": 1.9801000496411735e-05, "loss": 3.0391, "mean_token_accuracy": 0.44295906871557233, "num_tokens": 1170800640.0, "step": 31760 }, { "epoch": 8.434182590233545, "grad_norm": 0.515625, "learning_rate": 1.9670456996522072e-05, "loss": 3.0374, "mean_token_accuracy": 0.4435964360833168, "num_tokens": 1171537920.0, "step": 31780 }, { "epoch": 8.439490445859873, "grad_norm": 0.5234375, "learning_rate": 1.9540315040963378e-05, "loss": 3.0396, "mean_token_accuracy": 0.44247526675462723, "num_tokens": 1172275200.0, "step": 31800 }, { "epoch": 8.4447983014862, "grad_norm": 0.51953125, "learning_rate": 1.9410575030701897e-05, "loss": 3.0248, "mean_token_accuracy": 0.4452788636088371, "num_tokens": 1173012480.0, "step": 31820 }, { "epoch": 8.450106157112527, "grad_norm": 0.5234375, "learning_rate": 1.928123736546573e-05, "loss": 3.0387, "mean_token_accuracy": 0.4432471752166748, "num_tokens": 1173749760.0, "step": 31840 }, { "epoch": 8.455414012738853, "grad_norm": 0.5078125, "learning_rate": 1.9152302443743168e-05, "loss": 3.0473, "mean_token_accuracy": 0.4427266761660576, "num_tokens": 1174487040.0, "step": 31860 }, { "epoch": 8.46072186836518, "grad_norm": 0.52734375, "learning_rate": 1.9023770662781747e-05, "loss": 3.0571, "mean_token_accuracy": 0.44038920998573305, "num_tokens": 1175224320.0, "step": 31880 }, { "epoch": 8.466029723991507, "grad_norm": 0.51171875, "learning_rate": 1.889564241858689e-05, "loss": 3.0229, "mean_token_accuracy": 0.44396335929632186, "num_tokens": 1175961600.0, "step": 31900 }, { "epoch": 8.471337579617835, "grad_norm": 0.51953125, "learning_rate": 1.8767918105920765e-05, "loss": 3.0433, "mean_token_accuracy": 0.4419343903660774, "num_tokens": 1176698880.0, "step": 31920 }, { "epoch": 8.476645435244162, "grad_norm": 0.51953125, "learning_rate": 1.8640598118301e-05, "loss": 3.0412, "mean_token_accuracy": 0.44268318712711335, "num_tokens": 1177436160.0, "step": 31940 }, { "epoch": 8.481953290870488, "grad_norm": 0.51171875, "learning_rate": 1.8513682847999523e-05, "loss": 3.0228, "mean_token_accuracy": 0.44567432850599287, "num_tokens": 1178173440.0, "step": 31960 }, { "epoch": 8.487261146496815, "grad_norm": 0.53125, "learning_rate": 1.8387172686041284e-05, "loss": 3.0579, "mean_token_accuracy": 0.4393495887517929, "num_tokens": 1178910720.0, "step": 31980 }, { "epoch": 8.492569002123142, "grad_norm": 0.51953125, "learning_rate": 1.8261068022203145e-05, "loss": 3.0341, "mean_token_accuracy": 0.4434945106506348, "num_tokens": 1179648000.0, "step": 32000 }, { "epoch": 8.49787685774947, "grad_norm": 0.53125, "learning_rate": 1.8135369245012633e-05, "loss": 3.0367, "mean_token_accuracy": 0.4424032434821129, "num_tokens": 1180385280.0, "step": 32020 }, { "epoch": 8.503184713375797, "grad_norm": 0.5234375, "learning_rate": 1.8010076741746704e-05, "loss": 3.041, "mean_token_accuracy": 0.44355294704437254, "num_tokens": 1181122560.0, "step": 32040 }, { "epoch": 8.508492569002122, "grad_norm": 0.51953125, "learning_rate": 1.7885190898430624e-05, "loss": 3.0543, "mean_token_accuracy": 0.44071944802999496, "num_tokens": 1181859840.0, "step": 32060 }, { "epoch": 8.51380042462845, "grad_norm": 0.5234375, "learning_rate": 1.7760712099836738e-05, "loss": 3.0434, "mean_token_accuracy": 0.4418854683637619, "num_tokens": 1182597120.0, "step": 32080 }, { "epoch": 8.519108280254777, "grad_norm": 0.5234375, "learning_rate": 1.7636640729483198e-05, "loss": 3.0256, "mean_token_accuracy": 0.44331512451171873, "num_tokens": 1183334400.0, "step": 32100 }, { "epoch": 8.524416135881104, "grad_norm": 0.53125, "learning_rate": 1.7512977169633076e-05, "loss": 3.0432, "mean_token_accuracy": 0.4425568044185638, "num_tokens": 1184071680.0, "step": 32120 }, { "epoch": 8.529723991507431, "grad_norm": 0.51953125, "learning_rate": 1.738972180129273e-05, "loss": 3.0344, "mean_token_accuracy": 0.44282724559307096, "num_tokens": 1184808960.0, "step": 32140 }, { "epoch": 8.535031847133759, "grad_norm": 0.52734375, "learning_rate": 1.726687500421114e-05, "loss": 3.0397, "mean_token_accuracy": 0.442525552213192, "num_tokens": 1185546240.0, "step": 32160 }, { "epoch": 8.540339702760084, "grad_norm": 0.5390625, "learning_rate": 1.7144437156878267e-05, "loss": 3.0477, "mean_token_accuracy": 0.44227957129478457, "num_tokens": 1186283520.0, "step": 32180 }, { "epoch": 8.545647558386412, "grad_norm": 0.53515625, "learning_rate": 1.7022408636524203e-05, "loss": 3.0602, "mean_token_accuracy": 0.4394460737705231, "num_tokens": 1187020800.0, "step": 32200 }, { "epoch": 8.550955414012739, "grad_norm": 0.51953125, "learning_rate": 1.6900789819117912e-05, "loss": 3.0175, "mean_token_accuracy": 0.4453576818108559, "num_tokens": 1187758080.0, "step": 32220 }, { "epoch": 8.556263269639066, "grad_norm": 0.515625, "learning_rate": 1.6779581079366017e-05, "loss": 3.0508, "mean_token_accuracy": 0.4418718755245209, "num_tokens": 1188495360.0, "step": 32240 }, { "epoch": 8.561571125265393, "grad_norm": 0.5390625, "learning_rate": 1.665878279071175e-05, "loss": 3.0181, "mean_token_accuracy": 0.4453454494476318, "num_tokens": 1189232640.0, "step": 32260 }, { "epoch": 8.566878980891719, "grad_norm": 0.5234375, "learning_rate": 1.6538395325333714e-05, "loss": 3.0304, "mean_token_accuracy": 0.4425554484128952, "num_tokens": 1189969920.0, "step": 32280 }, { "epoch": 8.572186836518046, "grad_norm": 0.5234375, "learning_rate": 1.641841905414481e-05, "loss": 3.051, "mean_token_accuracy": 0.4409776538610458, "num_tokens": 1190707200.0, "step": 32300 }, { "epoch": 8.577494692144374, "grad_norm": 0.515625, "learning_rate": 1.6298854346790903e-05, "loss": 3.0331, "mean_token_accuracy": 0.44316155910491944, "num_tokens": 1191444480.0, "step": 32320 }, { "epoch": 8.5828025477707, "grad_norm": 0.5390625, "learning_rate": 1.6179701571650084e-05, "loss": 3.0516, "mean_token_accuracy": 0.4405047282576561, "num_tokens": 1192181760.0, "step": 32340 }, { "epoch": 8.588110403397028, "grad_norm": 0.5, "learning_rate": 1.6060961095831015e-05, "loss": 3.0182, "mean_token_accuracy": 0.4461540564894676, "num_tokens": 1192919040.0, "step": 32360 }, { "epoch": 8.593418259023355, "grad_norm": 0.51953125, "learning_rate": 1.594263328517234e-05, "loss": 3.0375, "mean_token_accuracy": 0.4425404965877533, "num_tokens": 1193656320.0, "step": 32380 }, { "epoch": 8.598726114649681, "grad_norm": 0.52734375, "learning_rate": 1.5824718504241018e-05, "loss": 3.0613, "mean_token_accuracy": 0.43913758248090745, "num_tokens": 1194393600.0, "step": 32400 }, { "epoch": 8.604033970276008, "grad_norm": 0.52734375, "learning_rate": 1.5707217116331696e-05, "loss": 3.0217, "mean_token_accuracy": 0.44489834606647494, "num_tokens": 1195130880.0, "step": 32420 }, { "epoch": 8.609341825902336, "grad_norm": 0.515625, "learning_rate": 1.559012948346518e-05, "loss": 3.0333, "mean_token_accuracy": 0.4435543030500412, "num_tokens": 1195868160.0, "step": 32440 }, { "epoch": 8.614649681528663, "grad_norm": 0.51953125, "learning_rate": 1.547345596638761e-05, "loss": 3.0325, "mean_token_accuracy": 0.44381794780492784, "num_tokens": 1196605440.0, "step": 32460 }, { "epoch": 8.61995753715499, "grad_norm": 0.51953125, "learning_rate": 1.5357196924569203e-05, "loss": 3.0375, "mean_token_accuracy": 0.4440000519156456, "num_tokens": 1197342720.0, "step": 32480 }, { "epoch": 8.625265392781316, "grad_norm": 0.5234375, "learning_rate": 1.524135271620317e-05, "loss": 3.0434, "mean_token_accuracy": 0.4430039137601852, "num_tokens": 1198080000.0, "step": 32500 }, { "epoch": 8.630573248407643, "grad_norm": 0.51953125, "learning_rate": 1.5125923698204656e-05, "loss": 3.0532, "mean_token_accuracy": 0.4410333767533302, "num_tokens": 1198817280.0, "step": 32520 }, { "epoch": 8.63588110403397, "grad_norm": 0.5234375, "learning_rate": 1.5010910226209565e-05, "loss": 3.072, "mean_token_accuracy": 0.43983610570430753, "num_tokens": 1199554560.0, "step": 32540 }, { "epoch": 8.641188959660298, "grad_norm": 0.51953125, "learning_rate": 1.4896312654573561e-05, "loss": 3.0627, "mean_token_accuracy": 0.44137448370456694, "num_tokens": 1200291840.0, "step": 32560 }, { "epoch": 8.646496815286625, "grad_norm": 0.51171875, "learning_rate": 1.4782131336370829e-05, "loss": 3.0657, "mean_token_accuracy": 0.43982659131288526, "num_tokens": 1201029120.0, "step": 32580 }, { "epoch": 8.65180467091295, "grad_norm": 0.5234375, "learning_rate": 1.4668366623393246e-05, "loss": 3.0667, "mean_token_accuracy": 0.4401595503091812, "num_tokens": 1201766400.0, "step": 32600 }, { "epoch": 8.657112526539278, "grad_norm": 0.498046875, "learning_rate": 1.4555018866148905e-05, "loss": 3.0267, "mean_token_accuracy": 0.44404082745313644, "num_tokens": 1202503680.0, "step": 32620 }, { "epoch": 8.662420382165605, "grad_norm": 0.53125, "learning_rate": 1.4442088413861486e-05, "loss": 3.0752, "mean_token_accuracy": 0.43762502670288084, "num_tokens": 1203240960.0, "step": 32640 }, { "epoch": 8.667728237791932, "grad_norm": 0.5234375, "learning_rate": 1.4329575614468802e-05, "loss": 3.0541, "mean_token_accuracy": 0.4408009871840477, "num_tokens": 1203978240.0, "step": 32660 }, { "epoch": 8.67303609341826, "grad_norm": 0.53125, "learning_rate": 1.4217480814621929e-05, "loss": 3.0574, "mean_token_accuracy": 0.4411937355995178, "num_tokens": 1204715520.0, "step": 32680 }, { "epoch": 8.678343949044585, "grad_norm": 0.51953125, "learning_rate": 1.410580435968412e-05, "loss": 3.0455, "mean_token_accuracy": 0.44275114089250567, "num_tokens": 1205452800.0, "step": 32700 }, { "epoch": 8.683651804670912, "grad_norm": 0.53125, "learning_rate": 1.399454659372966e-05, "loss": 3.055, "mean_token_accuracy": 0.4398700788617134, "num_tokens": 1206190080.0, "step": 32720 }, { "epoch": 8.68895966029724, "grad_norm": 0.52734375, "learning_rate": 1.3883707859542887e-05, "loss": 3.0625, "mean_token_accuracy": 0.44120324999094007, "num_tokens": 1206927360.0, "step": 32740 }, { "epoch": 8.694267515923567, "grad_norm": 0.515625, "learning_rate": 1.3773288498617113e-05, "loss": 3.0493, "mean_token_accuracy": 0.4423013165593147, "num_tokens": 1207664640.0, "step": 32760 }, { "epoch": 8.699575371549894, "grad_norm": 0.515625, "learning_rate": 1.3663288851153526e-05, "loss": 3.0409, "mean_token_accuracy": 0.4422591894865036, "num_tokens": 1208401920.0, "step": 32780 }, { "epoch": 8.704883227176222, "grad_norm": 0.5234375, "learning_rate": 1.3553709256060225e-05, "loss": 3.0451, "mean_token_accuracy": 0.44175771474838255, "num_tokens": 1209139200.0, "step": 32800 }, { "epoch": 8.710191082802547, "grad_norm": 0.5, "learning_rate": 1.3444550050951097e-05, "loss": 3.0411, "mean_token_accuracy": 0.4431167095899582, "num_tokens": 1209876480.0, "step": 32820 }, { "epoch": 8.715498938428874, "grad_norm": 0.515625, "learning_rate": 1.333581157214485e-05, "loss": 3.0215, "mean_token_accuracy": 0.44449608623981474, "num_tokens": 1210613760.0, "step": 32840 }, { "epoch": 8.720806794055202, "grad_norm": 0.5078125, "learning_rate": 1.3227494154663825e-05, "loss": 3.0128, "mean_token_accuracy": 0.44654680639505384, "num_tokens": 1211351040.0, "step": 32860 }, { "epoch": 8.726114649681529, "grad_norm": 0.51953125, "learning_rate": 1.3119598132233288e-05, "loss": 3.0055, "mean_token_accuracy": 0.4465019553899765, "num_tokens": 1212088320.0, "step": 32880 }, { "epoch": 8.731422505307856, "grad_norm": 0.51953125, "learning_rate": 1.3012123837279925e-05, "loss": 3.0355, "mean_token_accuracy": 0.44270493388175963, "num_tokens": 1212825600.0, "step": 32900 }, { "epoch": 8.736730360934182, "grad_norm": 0.5234375, "learning_rate": 1.290507160093136e-05, "loss": 3.0346, "mean_token_accuracy": 0.4435665398836136, "num_tokens": 1213562880.0, "step": 32920 }, { "epoch": 8.742038216560509, "grad_norm": 0.51171875, "learning_rate": 1.2798441753014626e-05, "loss": 3.0447, "mean_token_accuracy": 0.44206349104642867, "num_tokens": 1214300160.0, "step": 32940 }, { "epoch": 8.747346072186836, "grad_norm": 0.53515625, "learning_rate": 1.2692234622055497e-05, "loss": 3.0659, "mean_token_accuracy": 0.44040552228689195, "num_tokens": 1215037440.0, "step": 32960 }, { "epoch": 8.752653927813164, "grad_norm": 0.515625, "learning_rate": 1.2586450535277354e-05, "loss": 3.049, "mean_token_accuracy": 0.44186643958091737, "num_tokens": 1215774720.0, "step": 32980 }, { "epoch": 8.757961783439491, "grad_norm": 0.50390625, "learning_rate": 1.2481089818600121e-05, "loss": 3.0397, "mean_token_accuracy": 0.4426655247807503, "num_tokens": 1216512000.0, "step": 33000 }, { "epoch": 8.763269639065818, "grad_norm": 0.5234375, "learning_rate": 1.2376152796639415e-05, "loss": 3.0387, "mean_token_accuracy": 0.44353936016559603, "num_tokens": 1217249280.0, "step": 33020 }, { "epoch": 8.768577494692144, "grad_norm": 0.51171875, "learning_rate": 1.2271639792705368e-05, "loss": 3.0737, "mean_token_accuracy": 0.43900304287672043, "num_tokens": 1217986560.0, "step": 33040 }, { "epoch": 8.773885350318471, "grad_norm": 0.51171875, "learning_rate": 1.21675511288018e-05, "loss": 3.0471, "mean_token_accuracy": 0.4434265598654747, "num_tokens": 1218723840.0, "step": 33060 }, { "epoch": 8.779193205944798, "grad_norm": 0.51171875, "learning_rate": 1.2063887125624999e-05, "loss": 3.0411, "mean_token_accuracy": 0.44133235663175585, "num_tokens": 1219461120.0, "step": 33080 }, { "epoch": 8.784501061571126, "grad_norm": 0.5234375, "learning_rate": 1.1960648102563081e-05, "loss": 3.0558, "mean_token_accuracy": 0.44175092726945875, "num_tokens": 1220198400.0, "step": 33100 }, { "epoch": 8.789808917197453, "grad_norm": 0.51953125, "learning_rate": 1.1857834377694575e-05, "loss": 3.0272, "mean_token_accuracy": 0.44299440234899523, "num_tokens": 1220935680.0, "step": 33120 }, { "epoch": 8.795116772823778, "grad_norm": 0.51953125, "learning_rate": 1.175544626778791e-05, "loss": 3.038, "mean_token_accuracy": 0.4431819453835487, "num_tokens": 1221672960.0, "step": 33140 }, { "epoch": 8.800424628450106, "grad_norm": 0.53125, "learning_rate": 1.1653484088299963e-05, "loss": 3.0286, "mean_token_accuracy": 0.44414682537317274, "num_tokens": 1222410240.0, "step": 33160 }, { "epoch": 8.805732484076433, "grad_norm": 0.515625, "learning_rate": 1.1551948153375529e-05, "loss": 3.0486, "mean_token_accuracy": 0.44163269102573394, "num_tokens": 1223147520.0, "step": 33180 }, { "epoch": 8.81104033970276, "grad_norm": 0.53125, "learning_rate": 1.1450838775845984e-05, "loss": 3.0336, "mean_token_accuracy": 0.443615460395813, "num_tokens": 1223884800.0, "step": 33200 }, { "epoch": 8.816348195329088, "grad_norm": 0.5078125, "learning_rate": 1.1350156267228571e-05, "loss": 3.0466, "mean_token_accuracy": 0.4418012037873268, "num_tokens": 1224622080.0, "step": 33220 }, { "epoch": 8.821656050955415, "grad_norm": 0.52734375, "learning_rate": 1.1249900937725326e-05, "loss": 3.0461, "mean_token_accuracy": 0.4424412935972214, "num_tokens": 1225359360.0, "step": 33240 }, { "epoch": 8.82696390658174, "grad_norm": 0.51953125, "learning_rate": 1.1150073096222162e-05, "loss": 3.0762, "mean_token_accuracy": 0.4398592084646225, "num_tokens": 1226096640.0, "step": 33260 }, { "epoch": 8.832271762208068, "grad_norm": 0.53125, "learning_rate": 1.1050673050287918e-05, "loss": 3.0628, "mean_token_accuracy": 0.4410551220178604, "num_tokens": 1226833920.0, "step": 33280 }, { "epoch": 8.837579617834395, "grad_norm": 0.5234375, "learning_rate": 1.0951701106173366e-05, "loss": 3.049, "mean_token_accuracy": 0.44209338873624804, "num_tokens": 1227571200.0, "step": 33300 }, { "epoch": 8.842887473460722, "grad_norm": 0.53515625, "learning_rate": 1.0853157568810289e-05, "loss": 3.0575, "mean_token_accuracy": 0.43956430852413175, "num_tokens": 1228308480.0, "step": 33320 }, { "epoch": 8.84819532908705, "grad_norm": 0.51953125, "learning_rate": 1.0755042741810621e-05, "loss": 3.0359, "mean_token_accuracy": 0.44249565005302427, "num_tokens": 1229045760.0, "step": 33340 }, { "epoch": 8.853503184713375, "grad_norm": 0.51953125, "learning_rate": 1.0657356927465389e-05, "loss": 3.0341, "mean_token_accuracy": 0.4446754723787308, "num_tokens": 1229783040.0, "step": 33360 }, { "epoch": 8.858811040339702, "grad_norm": 0.52734375, "learning_rate": 1.056010042674379e-05, "loss": 3.0255, "mean_token_accuracy": 0.44473798871040343, "num_tokens": 1230520320.0, "step": 33380 }, { "epoch": 8.86411889596603, "grad_norm": 0.5078125, "learning_rate": 1.046327353929245e-05, "loss": 3.0538, "mean_token_accuracy": 0.4412290707230568, "num_tokens": 1231257600.0, "step": 33400 }, { "epoch": 8.869426751592357, "grad_norm": 0.52734375, "learning_rate": 1.03668765634342e-05, "loss": 3.0451, "mean_token_accuracy": 0.4428856819868088, "num_tokens": 1231994880.0, "step": 33420 }, { "epoch": 8.874734607218684, "grad_norm": 0.515625, "learning_rate": 1.0270909796167421e-05, "loss": 3.0572, "mean_token_accuracy": 0.44158104956150057, "num_tokens": 1232732160.0, "step": 33440 }, { "epoch": 8.88004246284501, "grad_norm": 0.51953125, "learning_rate": 1.0175373533164994e-05, "loss": 3.0382, "mean_token_accuracy": 0.44212192893028257, "num_tokens": 1233469440.0, "step": 33460 }, { "epoch": 8.885350318471337, "grad_norm": 0.51171875, "learning_rate": 1.00802680687734e-05, "loss": 3.0617, "mean_token_accuracy": 0.44071129709482193, "num_tokens": 1234206720.0, "step": 33480 }, { "epoch": 8.890658174097664, "grad_norm": 0.51953125, "learning_rate": 9.985593696011868e-06, "loss": 3.0366, "mean_token_accuracy": 0.442959064245224, "num_tokens": 1234944000.0, "step": 33500 }, { "epoch": 8.895966029723992, "grad_norm": 0.5234375, "learning_rate": 9.891350706571416e-06, "loss": 3.0609, "mean_token_accuracy": 0.44066916704177855, "num_tokens": 1235681280.0, "step": 33520 }, { "epoch": 8.901273885350319, "grad_norm": 0.5234375, "learning_rate": 9.797539390813996e-06, "loss": 3.0559, "mean_token_accuracy": 0.4405346244573593, "num_tokens": 1236418560.0, "step": 33540 }, { "epoch": 8.906581740976645, "grad_norm": 0.52734375, "learning_rate": 9.704160037771525e-06, "loss": 3.0485, "mean_token_accuracy": 0.4419534161686897, "num_tokens": 1237155840.0, "step": 33560 }, { "epoch": 8.911889596602972, "grad_norm": 0.51953125, "learning_rate": 9.611212935145095e-06, "loss": 3.0325, "mean_token_accuracy": 0.4433001711964607, "num_tokens": 1237893120.0, "step": 33580 }, { "epoch": 8.9171974522293, "grad_norm": 0.51171875, "learning_rate": 9.51869836930404e-06, "loss": 3.0424, "mean_token_accuracy": 0.4419846698641777, "num_tokens": 1238630400.0, "step": 33600 }, { "epoch": 8.922505307855626, "grad_norm": 0.515625, "learning_rate": 9.426616625285e-06, "loss": 3.0543, "mean_token_accuracy": 0.44127663820981977, "num_tokens": 1239367680.0, "step": 33620 }, { "epoch": 8.927813163481954, "grad_norm": 0.53515625, "learning_rate": 9.334967986791186e-06, "loss": 3.0612, "mean_token_accuracy": 0.4414628207683563, "num_tokens": 1240104960.0, "step": 33640 }, { "epoch": 8.933121019108281, "grad_norm": 0.51171875, "learning_rate": 9.243752736191246e-06, "loss": 3.0355, "mean_token_accuracy": 0.44378397464752195, "num_tokens": 1240842240.0, "step": 33660 }, { "epoch": 8.938428874734607, "grad_norm": 0.5234375, "learning_rate": 9.152971154518807e-06, "loss": 3.0569, "mean_token_accuracy": 0.44049250036478044, "num_tokens": 1241579520.0, "step": 33680 }, { "epoch": 8.943736730360934, "grad_norm": 0.515625, "learning_rate": 9.06262352147112e-06, "loss": 3.0283, "mean_token_accuracy": 0.44455859810113907, "num_tokens": 1242316800.0, "step": 33700 }, { "epoch": 8.949044585987261, "grad_norm": 0.51171875, "learning_rate": 8.97271011540865e-06, "loss": 3.0759, "mean_token_accuracy": 0.43836296200752256, "num_tokens": 1243054080.0, "step": 33720 }, { "epoch": 8.954352441613588, "grad_norm": 0.51953125, "learning_rate": 8.883231213353853e-06, "loss": 3.0514, "mean_token_accuracy": 0.44123042970895765, "num_tokens": 1243791360.0, "step": 33740 }, { "epoch": 8.959660297239916, "grad_norm": 0.51171875, "learning_rate": 8.794187090990557e-06, "loss": 3.0498, "mean_token_accuracy": 0.4421192079782486, "num_tokens": 1244528640.0, "step": 33760 }, { "epoch": 8.964968152866241, "grad_norm": 0.5234375, "learning_rate": 8.70557802266304e-06, "loss": 3.0434, "mean_token_accuracy": 0.44232034385204316, "num_tokens": 1245265920.0, "step": 33780 }, { "epoch": 8.970276008492569, "grad_norm": 0.5078125, "learning_rate": 8.61740428137514e-06, "loss": 3.0336, "mean_token_accuracy": 0.44355702251195905, "num_tokens": 1246003200.0, "step": 33800 }, { "epoch": 8.975583864118896, "grad_norm": 0.5234375, "learning_rate": 8.529666138789531e-06, "loss": 3.0334, "mean_token_accuracy": 0.44320912212133406, "num_tokens": 1246740480.0, "step": 33820 }, { "epoch": 8.980891719745223, "grad_norm": 0.53125, "learning_rate": 8.442363865226715e-06, "loss": 3.0305, "mean_token_accuracy": 0.4440720796585083, "num_tokens": 1247477760.0, "step": 33840 }, { "epoch": 8.98619957537155, "grad_norm": 0.51171875, "learning_rate": 8.35549772966438e-06, "loss": 3.0452, "mean_token_accuracy": 0.4425799086689949, "num_tokens": 1248215040.0, "step": 33860 }, { "epoch": 8.991507430997878, "grad_norm": 0.51953125, "learning_rate": 8.269067999736372e-06, "loss": 3.0287, "mean_token_accuracy": 0.4444988042116165, "num_tokens": 1248952320.0, "step": 33880 }, { "epoch": 8.996815286624203, "grad_norm": 0.51171875, "learning_rate": 8.183074941732088e-06, "loss": 3.0375, "mean_token_accuracy": 0.44381115436553953, "num_tokens": 1249689600.0, "step": 33900 }, { "epoch": 9.00212314225053, "grad_norm": 0.5234375, "learning_rate": 8.097518820595439e-06, "loss": 3.0218, "mean_token_accuracy": 0.44414274841547013, "num_tokens": 1250426880.0, "step": 33920 }, { "epoch": 9.007430997876858, "grad_norm": 0.5234375, "learning_rate": 8.01239989992422e-06, "loss": 3.0494, "mean_token_accuracy": 0.4417509213089943, "num_tokens": 1251164160.0, "step": 33940 }, { "epoch": 9.012738853503185, "grad_norm": 0.5234375, "learning_rate": 7.927718441969172e-06, "loss": 3.0177, "mean_token_accuracy": 0.44589448869228365, "num_tokens": 1251901440.0, "step": 33960 }, { "epoch": 9.018046709129512, "grad_norm": 0.5078125, "learning_rate": 7.84347470763319e-06, "loss": 2.997, "mean_token_accuracy": 0.4493517592549324, "num_tokens": 1252638720.0, "step": 33980 }, { "epoch": 9.023354564755838, "grad_norm": 0.515625, "learning_rate": 7.759668956470606e-06, "loss": 3.0092, "mean_token_accuracy": 0.44694634675979616, "num_tokens": 1253376000.0, "step": 34000 }, { "epoch": 9.028662420382165, "grad_norm": 0.53125, "learning_rate": 7.676301446686307e-06, "loss": 3.0388, "mean_token_accuracy": 0.4432580456137657, "num_tokens": 1254113280.0, "step": 34020 }, { "epoch": 9.033970276008493, "grad_norm": 0.50390625, "learning_rate": 7.593372435134948e-06, "loss": 3.0093, "mean_token_accuracy": 0.4470781683921814, "num_tokens": 1254850560.0, "step": 34040 }, { "epoch": 9.03927813163482, "grad_norm": 0.51171875, "learning_rate": 7.510882177320182e-06, "loss": 3.0115, "mean_token_accuracy": 0.44583605527877807, "num_tokens": 1255587840.0, "step": 34060 }, { "epoch": 9.044585987261147, "grad_norm": 0.51171875, "learning_rate": 7.428830927393875e-06, "loss": 3.0183, "mean_token_accuracy": 0.4447108045220375, "num_tokens": 1256325120.0, "step": 34080 }, { "epoch": 9.049893842887473, "grad_norm": 0.51953125, "learning_rate": 7.347218938155291e-06, "loss": 3.0155, "mean_token_accuracy": 0.4459515631198883, "num_tokens": 1257062400.0, "step": 34100 }, { "epoch": 9.0552016985138, "grad_norm": 0.515625, "learning_rate": 7.266046461050345e-06, "loss": 3.0465, "mean_token_accuracy": 0.44282724261283873, "num_tokens": 1257799680.0, "step": 34120 }, { "epoch": 9.060509554140127, "grad_norm": 0.51171875, "learning_rate": 7.185313746170851e-06, "loss": 3.0341, "mean_token_accuracy": 0.4437894120812416, "num_tokens": 1258536960.0, "step": 34140 }, { "epoch": 9.065817409766455, "grad_norm": 0.52734375, "learning_rate": 7.1050210422536724e-06, "loss": 3.008, "mean_token_accuracy": 0.4476788446307182, "num_tokens": 1259274240.0, "step": 34160 }, { "epoch": 9.071125265392782, "grad_norm": 0.52734375, "learning_rate": 7.025168596679992e-06, "loss": 3.0409, "mean_token_accuracy": 0.44247798025608065, "num_tokens": 1260011520.0, "step": 34180 }, { "epoch": 9.07643312101911, "grad_norm": 0.5078125, "learning_rate": 6.945756655474627e-06, "loss": 3.0678, "mean_token_accuracy": 0.4403620347380638, "num_tokens": 1260748800.0, "step": 34200 }, { "epoch": 9.081740976645435, "grad_norm": 0.515625, "learning_rate": 6.8667854633051465e-06, "loss": 3.0058, "mean_token_accuracy": 0.4478881284594536, "num_tokens": 1261486080.0, "step": 34220 }, { "epoch": 9.087048832271762, "grad_norm": 0.5234375, "learning_rate": 6.78825526348119e-06, "loss": 3.0548, "mean_token_accuracy": 0.4418868228793144, "num_tokens": 1262223360.0, "step": 34240 }, { "epoch": 9.09235668789809, "grad_norm": 0.51953125, "learning_rate": 6.710166297953717e-06, "loss": 3.0387, "mean_token_accuracy": 0.44418487548828123, "num_tokens": 1262960640.0, "step": 34260 }, { "epoch": 9.097664543524417, "grad_norm": 0.5078125, "learning_rate": 6.632518807314224e-06, "loss": 3.0218, "mean_token_accuracy": 0.44539030343294145, "num_tokens": 1263697920.0, "step": 34280 }, { "epoch": 9.102972399150744, "grad_norm": 0.51171875, "learning_rate": 6.555313030794046e-06, "loss": 3.0396, "mean_token_accuracy": 0.44385327845811845, "num_tokens": 1264435200.0, "step": 34300 }, { "epoch": 9.10828025477707, "grad_norm": 0.5078125, "learning_rate": 6.478549206263606e-06, "loss": 3.0073, "mean_token_accuracy": 0.4470931217074394, "num_tokens": 1265172480.0, "step": 34320 }, { "epoch": 9.113588110403397, "grad_norm": 0.5078125, "learning_rate": 6.402227570231633e-06, "loss": 3.0128, "mean_token_accuracy": 0.44709583669900893, "num_tokens": 1265909760.0, "step": 34340 }, { "epoch": 9.118895966029724, "grad_norm": 0.5234375, "learning_rate": 6.326348357844546e-06, "loss": 3.0387, "mean_token_accuracy": 0.4422591865062714, "num_tokens": 1266647040.0, "step": 34360 }, { "epoch": 9.124203821656051, "grad_norm": 0.5, "learning_rate": 6.250911802885589e-06, "loss": 3.0165, "mean_token_accuracy": 0.4454990208148956, "num_tokens": 1267384320.0, "step": 34380 }, { "epoch": 9.129511677282379, "grad_norm": 0.5234375, "learning_rate": 6.175918137774244e-06, "loss": 3.0371, "mean_token_accuracy": 0.44354887008666993, "num_tokens": 1268121600.0, "step": 34400 }, { "epoch": 9.134819532908704, "grad_norm": 0.50390625, "learning_rate": 6.10136759356537e-06, "loss": 3.0133, "mean_token_accuracy": 0.44830125868320464, "num_tokens": 1268858880.0, "step": 34420 }, { "epoch": 9.140127388535031, "grad_norm": 0.51953125, "learning_rate": 6.027260399948686e-06, "loss": 2.995, "mean_token_accuracy": 0.44861654490232467, "num_tokens": 1269596160.0, "step": 34440 }, { "epoch": 9.145435244161359, "grad_norm": 0.5234375, "learning_rate": 5.953596785247849e-06, "loss": 3.0136, "mean_token_accuracy": 0.4463551864027977, "num_tokens": 1270333440.0, "step": 34460 }, { "epoch": 9.150743099787686, "grad_norm": 0.50390625, "learning_rate": 5.880376976419948e-06, "loss": 3.0351, "mean_token_accuracy": 0.44369699656963346, "num_tokens": 1271070720.0, "step": 34480 }, { "epoch": 9.156050955414013, "grad_norm": 0.51171875, "learning_rate": 5.807601199054579e-06, "loss": 3.0383, "mean_token_accuracy": 0.44268047362565993, "num_tokens": 1271808000.0, "step": 34500 }, { "epoch": 9.16135881104034, "grad_norm": 0.51171875, "learning_rate": 5.73526967737345e-06, "loss": 3.0138, "mean_token_accuracy": 0.4466487288475037, "num_tokens": 1272545280.0, "step": 34520 }, { "epoch": 9.166666666666666, "grad_norm": 0.5234375, "learning_rate": 5.663382634229363e-06, "loss": 3.0253, "mean_token_accuracy": 0.44473798424005506, "num_tokens": 1273282560.0, "step": 34540 }, { "epoch": 9.171974522292993, "grad_norm": 0.52734375, "learning_rate": 5.591940291105751e-06, "loss": 3.0254, "mean_token_accuracy": 0.44481273293495177, "num_tokens": 1274019840.0, "step": 34560 }, { "epoch": 9.17728237791932, "grad_norm": 0.51171875, "learning_rate": 5.520942868115974e-06, "loss": 3.0113, "mean_token_accuracy": 0.4465726241469383, "num_tokens": 1274757120.0, "step": 34580 }, { "epoch": 9.182590233545648, "grad_norm": 0.51953125, "learning_rate": 5.450390584002523e-06, "loss": 3.0368, "mean_token_accuracy": 0.4441359519958496, "num_tokens": 1275494400.0, "step": 34600 }, { "epoch": 9.187898089171975, "grad_norm": 0.52734375, "learning_rate": 5.38028365613647e-06, "loss": 3.017, "mean_token_accuracy": 0.44736084043979646, "num_tokens": 1276231680.0, "step": 34620 }, { "epoch": 9.1932059447983, "grad_norm": 0.51953125, "learning_rate": 5.310622300516665e-06, "loss": 3.0153, "mean_token_accuracy": 0.44620162099599836, "num_tokens": 1276968960.0, "step": 34640 }, { "epoch": 9.198513800424628, "grad_norm": 0.515625, "learning_rate": 5.2414067317693095e-06, "loss": 3.0132, "mean_token_accuracy": 0.44735268503427505, "num_tokens": 1277706240.0, "step": 34660 }, { "epoch": 9.203821656050955, "grad_norm": 0.515625, "learning_rate": 5.172637163146953e-06, "loss": 3.0253, "mean_token_accuracy": 0.44500298947095873, "num_tokens": 1278443520.0, "step": 34680 }, { "epoch": 9.209129511677283, "grad_norm": 0.51171875, "learning_rate": 5.104313806528193e-06, "loss": 3.0288, "mean_token_accuracy": 0.4446958601474762, "num_tokens": 1279180800.0, "step": 34700 }, { "epoch": 9.21443736730361, "grad_norm": 0.51953125, "learning_rate": 5.036436872416693e-06, "loss": 3.0361, "mean_token_accuracy": 0.4435407131910324, "num_tokens": 1279918080.0, "step": 34720 }, { "epoch": 9.219745222929935, "grad_norm": 0.5078125, "learning_rate": 4.9690065699408535e-06, "loss": 3.0307, "mean_token_accuracy": 0.44562948346138, "num_tokens": 1280655360.0, "step": 34740 }, { "epoch": 9.225053078556263, "grad_norm": 0.51171875, "learning_rate": 4.902023106852853e-06, "loss": 3.0216, "mean_token_accuracy": 0.44676288068294523, "num_tokens": 1281392640.0, "step": 34760 }, { "epoch": 9.23036093418259, "grad_norm": 0.53125, "learning_rate": 4.835486689528278e-06, "loss": 3.0134, "mean_token_accuracy": 0.445933897793293, "num_tokens": 1282129920.0, "step": 34780 }, { "epoch": 9.235668789808917, "grad_norm": 0.51953125, "learning_rate": 4.769397522965296e-06, "loss": 3.0223, "mean_token_accuracy": 0.4450736567378044, "num_tokens": 1282867200.0, "step": 34800 }, { "epoch": 9.240976645435245, "grad_norm": 0.51171875, "learning_rate": 4.7037558107841435e-06, "loss": 3.0186, "mean_token_accuracy": 0.44538487046957015, "num_tokens": 1283604480.0, "step": 34820 }, { "epoch": 9.246284501061572, "grad_norm": 0.53125, "learning_rate": 4.638561755226427e-06, "loss": 3.0151, "mean_token_accuracy": 0.44605485200881956, "num_tokens": 1284341760.0, "step": 34840 }, { "epoch": 9.251592356687897, "grad_norm": 0.5078125, "learning_rate": 4.573815557154559e-06, "loss": 3.023, "mean_token_accuracy": 0.4458156660199165, "num_tokens": 1285079040.0, "step": 34860 }, { "epoch": 9.256900212314225, "grad_norm": 0.51171875, "learning_rate": 4.509517416051051e-06, "loss": 3.0222, "mean_token_accuracy": 0.4462192848324776, "num_tokens": 1285816320.0, "step": 34880 }, { "epoch": 9.262208067940552, "grad_norm": 0.50390625, "learning_rate": 4.445667530017988e-06, "loss": 2.9932, "mean_token_accuracy": 0.44946455806493757, "num_tokens": 1286553600.0, "step": 34900 }, { "epoch": 9.26751592356688, "grad_norm": 0.53125, "learning_rate": 4.38226609577641e-06, "loss": 3.0377, "mean_token_accuracy": 0.44340345561504363, "num_tokens": 1287290880.0, "step": 34920 }, { "epoch": 9.272823779193207, "grad_norm": 0.51171875, "learning_rate": 4.319313308665645e-06, "loss": 3.0157, "mean_token_accuracy": 0.4466324210166931, "num_tokens": 1288028160.0, "step": 34940 }, { "epoch": 9.278131634819532, "grad_norm": 0.5390625, "learning_rate": 4.256809362642743e-06, "loss": 3.0055, "mean_token_accuracy": 0.4479452043771744, "num_tokens": 1288765440.0, "step": 34960 }, { "epoch": 9.28343949044586, "grad_norm": 0.5234375, "learning_rate": 4.194754450281879e-06, "loss": 2.998, "mean_token_accuracy": 0.449383020401001, "num_tokens": 1289502720.0, "step": 34980 }, { "epoch": 9.288747346072187, "grad_norm": 0.5234375, "learning_rate": 4.133148762773781e-06, "loss": 3.031, "mean_token_accuracy": 0.4442338034510612, "num_tokens": 1290240000.0, "step": 35000 }, { "epoch": 9.294055201698514, "grad_norm": 0.53515625, "learning_rate": 4.071992489925103e-06, "loss": 3.0101, "mean_token_accuracy": 0.44740297049283984, "num_tokens": 1290977280.0, "step": 35020 }, { "epoch": 9.299363057324841, "grad_norm": 0.52734375, "learning_rate": 4.0112858201578554e-06, "loss": 2.9925, "mean_token_accuracy": 0.44860703945159913, "num_tokens": 1291714560.0, "step": 35040 }, { "epoch": 9.304670912951167, "grad_norm": 0.50390625, "learning_rate": 3.9510289405088105e-06, "loss": 3.0164, "mean_token_accuracy": 0.44582653790712357, "num_tokens": 1292451840.0, "step": 35060 }, { "epoch": 9.309978768577494, "grad_norm": 0.515625, "learning_rate": 3.891222036628977e-06, "loss": 3.0242, "mean_token_accuracy": 0.44578033238649367, "num_tokens": 1293189120.0, "step": 35080 }, { "epoch": 9.315286624203821, "grad_norm": 0.51953125, "learning_rate": 3.8318652927829596e-06, "loss": 3.0129, "mean_token_accuracy": 0.4468620911240578, "num_tokens": 1293926400.0, "step": 35100 }, { "epoch": 9.320594479830149, "grad_norm": 0.515625, "learning_rate": 3.7729588918484375e-06, "loss": 3.0105, "mean_token_accuracy": 0.4468430638313293, "num_tokens": 1294663680.0, "step": 35120 }, { "epoch": 9.325902335456476, "grad_norm": 0.52734375, "learning_rate": 3.714503015315584e-06, "loss": 3.0226, "mean_token_accuracy": 0.4456729725003242, "num_tokens": 1295400960.0, "step": 35140 }, { "epoch": 9.331210191082803, "grad_norm": 0.515625, "learning_rate": 3.656497843286532e-06, "loss": 3.0326, "mean_token_accuracy": 0.44349043518304826, "num_tokens": 1296138240.0, "step": 35160 }, { "epoch": 9.336518046709129, "grad_norm": 0.515625, "learning_rate": 3.5989435544747257e-06, "loss": 3.0492, "mean_token_accuracy": 0.4428014263510704, "num_tokens": 1296875520.0, "step": 35180 }, { "epoch": 9.341825902335456, "grad_norm": 0.51171875, "learning_rate": 3.541840326204537e-06, "loss": 3.0228, "mean_token_accuracy": 0.4437948510050774, "num_tokens": 1297612800.0, "step": 35200 }, { "epoch": 9.347133757961783, "grad_norm": 0.51953125, "learning_rate": 3.4851883344105327e-06, "loss": 3.0255, "mean_token_accuracy": 0.4445966511964798, "num_tokens": 1298350080.0, "step": 35220 }, { "epoch": 9.35244161358811, "grad_norm": 0.5078125, "learning_rate": 3.428987753637141e-06, "loss": 3.0245, "mean_token_accuracy": 0.444788271188736, "num_tokens": 1299087360.0, "step": 35240 }, { "epoch": 9.357749469214438, "grad_norm": 0.51171875, "learning_rate": 3.3732387570378383e-06, "loss": 3.002, "mean_token_accuracy": 0.4484928801655769, "num_tokens": 1299824640.0, "step": 35260 }, { "epoch": 9.363057324840764, "grad_norm": 0.51171875, "learning_rate": 3.317941516374928e-06, "loss": 3.012, "mean_token_accuracy": 0.44659029245376586, "num_tokens": 1300561920.0, "step": 35280 }, { "epoch": 9.36836518046709, "grad_norm": 0.515625, "learning_rate": 3.263096202018761e-06, "loss": 3.0284, "mean_token_accuracy": 0.44356110095977785, "num_tokens": 1301299200.0, "step": 35300 }, { "epoch": 9.373673036093418, "grad_norm": 0.51171875, "learning_rate": 3.20870298294732e-06, "loss": 3.0129, "mean_token_accuracy": 0.4453685566782951, "num_tokens": 1302036480.0, "step": 35320 }, { "epoch": 9.378980891719745, "grad_norm": 0.51953125, "learning_rate": 3.1547620267457163e-06, "loss": 3.0391, "mean_token_accuracy": 0.44333958327770234, "num_tokens": 1302773760.0, "step": 35340 }, { "epoch": 9.384288747346073, "grad_norm": 0.49609375, "learning_rate": 3.101273499605611e-06, "loss": 3.034, "mean_token_accuracy": 0.44326076060533526, "num_tokens": 1303511040.0, "step": 35360 }, { "epoch": 9.3895966029724, "grad_norm": 0.51171875, "learning_rate": 3.048237566324796e-06, "loss": 2.9902, "mean_token_accuracy": 0.44961404651403425, "num_tokens": 1304248320.0, "step": 35380 }, { "epoch": 9.394904458598726, "grad_norm": 0.51171875, "learning_rate": 2.9956543903064954e-06, "loss": 3.0213, "mean_token_accuracy": 0.4450491935014725, "num_tokens": 1304985600.0, "step": 35400 }, { "epoch": 9.400212314225053, "grad_norm": 0.515625, "learning_rate": 2.9435241335591663e-06, "loss": 2.9984, "mean_token_accuracy": 0.4484235718846321, "num_tokens": 1305722880.0, "step": 35420 }, { "epoch": 9.40552016985138, "grad_norm": 0.5078125, "learning_rate": 2.8918469566956483e-06, "loss": 3.017, "mean_token_accuracy": 0.44621520936489106, "num_tokens": 1306460160.0, "step": 35440 }, { "epoch": 9.410828025477707, "grad_norm": 0.515625, "learning_rate": 2.840623018932997e-06, "loss": 3.0282, "mean_token_accuracy": 0.4437282562255859, "num_tokens": 1307197440.0, "step": 35460 }, { "epoch": 9.416135881104035, "grad_norm": 0.5234375, "learning_rate": 2.7898524780917366e-06, "loss": 3.0128, "mean_token_accuracy": 0.4456892803311348, "num_tokens": 1307934720.0, "step": 35480 }, { "epoch": 9.42144373673036, "grad_norm": 0.51171875, "learning_rate": 2.739535490595557e-06, "loss": 3.0439, "mean_token_accuracy": 0.4433817133307457, "num_tokens": 1308672000.0, "step": 35500 }, { "epoch": 9.426751592356688, "grad_norm": 0.52734375, "learning_rate": 2.689672211470667e-06, "loss": 3.0247, "mean_token_accuracy": 0.44590943604707717, "num_tokens": 1309409280.0, "step": 35520 }, { "epoch": 9.432059447983015, "grad_norm": 0.51171875, "learning_rate": 2.64026279434546e-06, "loss": 3.0332, "mean_token_accuracy": 0.44379076808691026, "num_tokens": 1310146560.0, "step": 35540 }, { "epoch": 9.437367303609342, "grad_norm": 0.515625, "learning_rate": 2.5913073914499984e-06, "loss": 3.0225, "mean_token_accuracy": 0.44647070318460463, "num_tokens": 1310883840.0, "step": 35560 }, { "epoch": 9.44267515923567, "grad_norm": 0.53515625, "learning_rate": 2.5428061536154954e-06, "loss": 2.9955, "mean_token_accuracy": 0.44880544394254684, "num_tokens": 1311621120.0, "step": 35580 }, { "epoch": 9.447983014861995, "grad_norm": 0.51953125, "learning_rate": 2.494759230273885e-06, "loss": 3.0128, "mean_token_accuracy": 0.44704555422067643, "num_tokens": 1312358400.0, "step": 35600 }, { "epoch": 9.453290870488322, "grad_norm": 0.52734375, "learning_rate": 2.4471667694573873e-06, "loss": 3.0033, "mean_token_accuracy": 0.4485404431819916, "num_tokens": 1313095680.0, "step": 35620 }, { "epoch": 9.45859872611465, "grad_norm": 0.515625, "learning_rate": 2.400028917798008e-06, "loss": 3.0139, "mean_token_accuracy": 0.4468308359384537, "num_tokens": 1313832960.0, "step": 35640 }, { "epoch": 9.463906581740977, "grad_norm": 0.515625, "learning_rate": 2.353345820527092e-06, "loss": 3.0283, "mean_token_accuracy": 0.4447311908006668, "num_tokens": 1314570240.0, "step": 35660 }, { "epoch": 9.469214437367304, "grad_norm": 0.515625, "learning_rate": 2.3071176214749354e-06, "loss": 3.0084, "mean_token_accuracy": 0.446465265750885, "num_tokens": 1315307520.0, "step": 35680 }, { "epoch": 9.47452229299363, "grad_norm": 0.52734375, "learning_rate": 2.2613444630702583e-06, "loss": 3.0365, "mean_token_accuracy": 0.4439470529556274, "num_tokens": 1316044800.0, "step": 35700 }, { "epoch": 9.479830148619957, "grad_norm": 0.52734375, "learning_rate": 2.216026486339817e-06, "loss": 3.0327, "mean_token_accuracy": 0.4441821604967117, "num_tokens": 1316782080.0, "step": 35720 }, { "epoch": 9.485138004246284, "grad_norm": 0.53515625, "learning_rate": 2.171163830907924e-06, "loss": 3.0304, "mean_token_accuracy": 0.44383425563573836, "num_tokens": 1317519360.0, "step": 35740 }, { "epoch": 9.490445859872612, "grad_norm": 0.5078125, "learning_rate": 2.126756634996113e-06, "loss": 3.0, "mean_token_accuracy": 0.44769106805324554, "num_tokens": 1318256640.0, "step": 35760 }, { "epoch": 9.495753715498939, "grad_norm": 0.53125, "learning_rate": 2.082805035422608e-06, "loss": 3.0443, "mean_token_accuracy": 0.44358556270599364, "num_tokens": 1318993920.0, "step": 35780 }, { "epoch": 9.501061571125266, "grad_norm": 0.51953125, "learning_rate": 2.0393091676019546e-06, "loss": 3.017, "mean_token_accuracy": 0.4467112377285957, "num_tokens": 1319731200.0, "step": 35800 }, { "epoch": 9.506369426751592, "grad_norm": 0.50390625, "learning_rate": 1.996269165544573e-06, "loss": 3.024, "mean_token_accuracy": 0.4447053700685501, "num_tokens": 1320468480.0, "step": 35820 }, { "epoch": 9.511677282377919, "grad_norm": 0.515625, "learning_rate": 1.9536851618564053e-06, "loss": 3.0365, "mean_token_accuracy": 0.44342656135559083, "num_tokens": 1321205760.0, "step": 35840 }, { "epoch": 9.516985138004246, "grad_norm": 0.51171875, "learning_rate": 1.9115572877384355e-06, "loss": 3.0394, "mean_token_accuracy": 0.44255136996507644, "num_tokens": 1321943040.0, "step": 35860 }, { "epoch": 9.522292993630574, "grad_norm": 0.515625, "learning_rate": 1.869885672986321e-06, "loss": 3.0391, "mean_token_accuracy": 0.44434659779071806, "num_tokens": 1322680320.0, "step": 35880 }, { "epoch": 9.5276008492569, "grad_norm": 0.53515625, "learning_rate": 1.8286704459899937e-06, "loss": 3.0326, "mean_token_accuracy": 0.4440584912896156, "num_tokens": 1323417600.0, "step": 35900 }, { "epoch": 9.532908704883226, "grad_norm": 0.515625, "learning_rate": 1.7879117337332438e-06, "loss": 3.0135, "mean_token_accuracy": 0.44773999601602554, "num_tokens": 1324154880.0, "step": 35920 }, { "epoch": 9.538216560509554, "grad_norm": 0.515625, "learning_rate": 1.7476096617933698e-06, "loss": 3.014, "mean_token_accuracy": 0.44647749364376066, "num_tokens": 1324892160.0, "step": 35940 }, { "epoch": 9.543524416135881, "grad_norm": 0.5078125, "learning_rate": 1.707764354340746e-06, "loss": 3.0184, "mean_token_accuracy": 0.4465304955840111, "num_tokens": 1325629440.0, "step": 35960 }, { "epoch": 9.548832271762208, "grad_norm": 0.515625, "learning_rate": 1.6683759341384384e-06, "loss": 3.006, "mean_token_accuracy": 0.4462437480688095, "num_tokens": 1326366720.0, "step": 35980 }, { "epoch": 9.554140127388536, "grad_norm": 0.53125, "learning_rate": 1.6294445225419227e-06, "loss": 3.0291, "mean_token_accuracy": 0.4452095553278923, "num_tokens": 1327104000.0, "step": 36000 }, { "epoch": 9.559447983014863, "grad_norm": 0.51171875, "learning_rate": 1.5909702394985014e-06, "loss": 3.0359, "mean_token_accuracy": 0.44373641312122347, "num_tokens": 1327841280.0, "step": 36020 }, { "epoch": 9.564755838641188, "grad_norm": 0.53125, "learning_rate": 1.5529532035472358e-06, "loss": 3.03, "mean_token_accuracy": 0.4449200913310051, "num_tokens": 1328578560.0, "step": 36040 }, { "epoch": 9.570063694267516, "grad_norm": 0.51953125, "learning_rate": 1.5153935318182487e-06, "loss": 3.0203, "mean_token_accuracy": 0.4460004895925522, "num_tokens": 1329315840.0, "step": 36060 }, { "epoch": 9.575371549893843, "grad_norm": 0.5078125, "learning_rate": 1.4782913400326557e-06, "loss": 3.003, "mean_token_accuracy": 0.4475510954856873, "num_tokens": 1330053120.0, "step": 36080 }, { "epoch": 9.58067940552017, "grad_norm": 0.5234375, "learning_rate": 1.4416467425020172e-06, "loss": 3.0223, "mean_token_accuracy": 0.44520547688007356, "num_tokens": 1330790400.0, "step": 36100 }, { "epoch": 9.585987261146498, "grad_norm": 0.5078125, "learning_rate": 1.4054598521280381e-06, "loss": 3.0179, "mean_token_accuracy": 0.4449472725391388, "num_tokens": 1331527680.0, "step": 36120 }, { "epoch": 9.591295116772823, "grad_norm": 0.5234375, "learning_rate": 1.3697307804022672e-06, "loss": 3.0322, "mean_token_accuracy": 0.4438763871788979, "num_tokens": 1332264960.0, "step": 36140 }, { "epoch": 9.59660297239915, "grad_norm": 0.5234375, "learning_rate": 1.3344596374056992e-06, "loss": 3.0417, "mean_token_accuracy": 0.4439049229025841, "num_tokens": 1333002240.0, "step": 36160 }, { "epoch": 9.601910828025478, "grad_norm": 0.515625, "learning_rate": 1.2996465318084736e-06, "loss": 3.0368, "mean_token_accuracy": 0.44469585716724397, "num_tokens": 1333739520.0, "step": 36180 }, { "epoch": 9.607218683651805, "grad_norm": 0.5234375, "learning_rate": 1.2652915708694756e-06, "loss": 3.0339, "mean_token_accuracy": 0.44298760443925855, "num_tokens": 1334476800.0, "step": 36200 }, { "epoch": 9.612526539278132, "grad_norm": 0.51953125, "learning_rate": 1.2313948604361023e-06, "loss": 3.0327, "mean_token_accuracy": 0.443703792989254, "num_tokens": 1335214080.0, "step": 36220 }, { "epoch": 9.617834394904458, "grad_norm": 0.52734375, "learning_rate": 1.1979565049438477e-06, "loss": 3.0355, "mean_token_accuracy": 0.44427320957183836, "num_tokens": 1335951360.0, "step": 36240 }, { "epoch": 9.623142250530785, "grad_norm": 0.52734375, "learning_rate": 1.1649766074160349e-06, "loss": 3.0299, "mean_token_accuracy": 0.4428911179304123, "num_tokens": 1336688640.0, "step": 36260 }, { "epoch": 9.628450106157112, "grad_norm": 0.53125, "learning_rate": 1.132455269463467e-06, "loss": 3.0265, "mean_token_accuracy": 0.4453821495175362, "num_tokens": 1337425920.0, "step": 36280 }, { "epoch": 9.63375796178344, "grad_norm": 0.5, "learning_rate": 1.1003925912841772e-06, "loss": 3.0155, "mean_token_accuracy": 0.4468226820230484, "num_tokens": 1338163200.0, "step": 36300 }, { "epoch": 9.639065817409767, "grad_norm": 0.51171875, "learning_rate": 1.068788671662979e-06, "loss": 3.0197, "mean_token_accuracy": 0.4458156704902649, "num_tokens": 1338900480.0, "step": 36320 }, { "epoch": 9.644373673036092, "grad_norm": 0.5078125, "learning_rate": 1.0376436079713502e-06, "loss": 3.0274, "mean_token_accuracy": 0.4454297125339508, "num_tokens": 1339637760.0, "step": 36340 }, { "epoch": 9.64968152866242, "grad_norm": 0.5234375, "learning_rate": 1.0069574961669657e-06, "loss": 3.0221, "mean_token_accuracy": 0.44504919797182085, "num_tokens": 1340375040.0, "step": 36360 }, { "epoch": 9.654989384288747, "grad_norm": 0.51953125, "learning_rate": 9.767304307935152e-07, "loss": 3.0362, "mean_token_accuracy": 0.4431547597050667, "num_tokens": 1341112320.0, "step": 36380 }, { "epoch": 9.660297239915074, "grad_norm": 0.515625, "learning_rate": 9.469625049803697e-07, "loss": 3.0234, "mean_token_accuracy": 0.4458047911524773, "num_tokens": 1341849600.0, "step": 36400 }, { "epoch": 9.665605095541402, "grad_norm": 0.51953125, "learning_rate": 9.176538104422649e-07, "loss": 3.0183, "mean_token_accuracy": 0.4465386524796486, "num_tokens": 1342586880.0, "step": 36420 }, { "epoch": 9.670912951167729, "grad_norm": 0.51171875, "learning_rate": 8.888044374790848e-07, "loss": 3.0195, "mean_token_accuracy": 0.44564715027809143, "num_tokens": 1343324160.0, "step": 36440 }, { "epoch": 9.676220806794054, "grad_norm": 0.53515625, "learning_rate": 8.60414474975496e-07, "loss": 3.0296, "mean_token_accuracy": 0.444697217643261, "num_tokens": 1344061440.0, "step": 36460 }, { "epoch": 9.681528662420382, "grad_norm": 0.53515625, "learning_rate": 8.324840104007802e-07, "loss": 3.0406, "mean_token_accuracy": 0.44256224036216735, "num_tokens": 1344798720.0, "step": 36480 }, { "epoch": 9.686836518046709, "grad_norm": 0.51171875, "learning_rate": 8.050131298085016e-07, "loss": 3.0135, "mean_token_accuracy": 0.44600728452205657, "num_tokens": 1345536000.0, "step": 36500 }, { "epoch": 9.692144373673036, "grad_norm": 0.5234375, "learning_rate": 7.780019178362073e-07, "loss": 3.0311, "mean_token_accuracy": 0.4442310810089111, "num_tokens": 1346273280.0, "step": 36520 }, { "epoch": 9.697452229299364, "grad_norm": 0.51953125, "learning_rate": 7.514504577052271e-07, "loss": 3.0477, "mean_token_accuracy": 0.44237741976976397, "num_tokens": 1347010560.0, "step": 36540 }, { "epoch": 9.70276008492569, "grad_norm": 0.51171875, "learning_rate": 7.253588312204406e-07, "loss": 3.0185, "mean_token_accuracy": 0.4448059380054474, "num_tokens": 1347747840.0, "step": 36560 }, { "epoch": 9.708067940552016, "grad_norm": 0.515625, "learning_rate": 6.997271187699272e-07, "loss": 3.0433, "mean_token_accuracy": 0.44270493388175963, "num_tokens": 1348485120.0, "step": 36580 }, { "epoch": 9.713375796178344, "grad_norm": 0.51953125, "learning_rate": 6.745553993247998e-07, "loss": 3.0242, "mean_token_accuracy": 0.44543786644935607, "num_tokens": 1349222400.0, "step": 36600 }, { "epoch": 9.718683651804671, "grad_norm": 0.5234375, "learning_rate": 6.498437504389553e-07, "loss": 3.0246, "mean_token_accuracy": 0.445136171579361, "num_tokens": 1349959680.0, "step": 36620 }, { "epoch": 9.723991507430998, "grad_norm": 0.515625, "learning_rate": 6.25592248248774e-07, "loss": 2.9935, "mean_token_accuracy": 0.4499103054404259, "num_tokens": 1350696960.0, "step": 36640 }, { "epoch": 9.729299363057326, "grad_norm": 0.5078125, "learning_rate": 6.01800967472954e-07, "loss": 3.0295, "mean_token_accuracy": 0.4452313005924225, "num_tokens": 1351434240.0, "step": 36660 }, { "epoch": 9.734607218683651, "grad_norm": 0.51171875, "learning_rate": 5.784699814122607e-07, "loss": 2.9966, "mean_token_accuracy": 0.4489726036787033, "num_tokens": 1352171520.0, "step": 36680 }, { "epoch": 9.739915074309978, "grad_norm": 0.51953125, "learning_rate": 5.555993619492604e-07, "loss": 3.0245, "mean_token_accuracy": 0.44577625691890715, "num_tokens": 1352908800.0, "step": 36700 }, { "epoch": 9.745222929936306, "grad_norm": 0.50390625, "learning_rate": 5.331891795482047e-07, "loss": 3.0284, "mean_token_accuracy": 0.44341704696416856, "num_tokens": 1353646080.0, "step": 36720 }, { "epoch": 9.750530785562633, "grad_norm": 0.5078125, "learning_rate": 5.11239503254629e-07, "loss": 3.0314, "mean_token_accuracy": 0.4449513480067253, "num_tokens": 1354383360.0, "step": 36740 }, { "epoch": 9.75583864118896, "grad_norm": 0.51953125, "learning_rate": 4.897504006953878e-07, "loss": 3.0295, "mean_token_accuracy": 0.4459801077842712, "num_tokens": 1355120640.0, "step": 36760 }, { "epoch": 9.761146496815286, "grad_norm": 0.5234375, "learning_rate": 4.6872193807818704e-07, "loss": 3.0298, "mean_token_accuracy": 0.445079094171524, "num_tokens": 1355857920.0, "step": 36780 }, { "epoch": 9.766454352441613, "grad_norm": 0.5234375, "learning_rate": 4.481541801916011e-07, "loss": 3.052, "mean_token_accuracy": 0.44204854369163515, "num_tokens": 1356595200.0, "step": 36800 }, { "epoch": 9.77176220806794, "grad_norm": 0.52734375, "learning_rate": 4.2804719040469005e-07, "loss": 3.0396, "mean_token_accuracy": 0.4436711773276329, "num_tokens": 1357332480.0, "step": 36820 }, { "epoch": 9.777070063694268, "grad_norm": 0.53125, "learning_rate": 4.084010306669827e-07, "loss": 3.0364, "mean_token_accuracy": 0.4447121635079384, "num_tokens": 1358069760.0, "step": 36840 }, { "epoch": 9.782377919320595, "grad_norm": 0.51171875, "learning_rate": 3.8921576150812703e-07, "loss": 3.004, "mean_token_accuracy": 0.44696673154830935, "num_tokens": 1358807040.0, "step": 36860 }, { "epoch": 9.787685774946922, "grad_norm": 0.53515625, "learning_rate": 3.704914420378069e-07, "loss": 3.01, "mean_token_accuracy": 0.44669221341609955, "num_tokens": 1359544320.0, "step": 36880 }, { "epoch": 9.792993630573248, "grad_norm": 0.515625, "learning_rate": 3.5222812994547545e-07, "loss": 3.0128, "mean_token_accuracy": 0.44755109697580336, "num_tokens": 1360281600.0, "step": 36900 }, { "epoch": 9.798301486199575, "grad_norm": 0.5234375, "learning_rate": 3.3442588150028873e-07, "loss": 3.0228, "mean_token_accuracy": 0.44499211609363554, "num_tokens": 1361018880.0, "step": 36920 }, { "epoch": 9.803609341825902, "grad_norm": 0.53515625, "learning_rate": 3.1708475155082236e-07, "loss": 3.0292, "mean_token_accuracy": 0.4448698118329048, "num_tokens": 1361756160.0, "step": 36940 }, { "epoch": 9.80891719745223, "grad_norm": 0.51171875, "learning_rate": 3.0020479352497183e-07, "loss": 3.0217, "mean_token_accuracy": 0.4458265364170074, "num_tokens": 1362493440.0, "step": 36960 }, { "epoch": 9.814225053078557, "grad_norm": 0.51953125, "learning_rate": 2.837860594297525e-07, "loss": 3.03, "mean_token_accuracy": 0.4449064999818802, "num_tokens": 1363230720.0, "step": 36980 }, { "epoch": 9.819532908704883, "grad_norm": 0.5078125, "learning_rate": 2.6782859985116645e-07, "loss": 3.0195, "mean_token_accuracy": 0.445417483150959, "num_tokens": 1363968000.0, "step": 37000 }, { "epoch": 9.82484076433121, "grad_norm": 0.52734375, "learning_rate": 2.5233246395403584e-07, "loss": 3.0206, "mean_token_accuracy": 0.44588633328676225, "num_tokens": 1364705280.0, "step": 37020 }, { "epoch": 9.830148619957537, "grad_norm": 0.53125, "learning_rate": 2.3729769948181987e-07, "loss": 3.0227, "mean_token_accuracy": 0.4456892773509026, "num_tokens": 1365442560.0, "step": 37040 }, { "epoch": 9.835456475583864, "grad_norm": 0.51953125, "learning_rate": 2.2272435275653143e-07, "loss": 3.0369, "mean_token_accuracy": 0.4431044816970825, "num_tokens": 1366179840.0, "step": 37060 }, { "epoch": 9.840764331210192, "grad_norm": 0.5234375, "learning_rate": 2.08612468678504e-07, "loss": 3.0244, "mean_token_accuracy": 0.44477196484804155, "num_tokens": 1366917120.0, "step": 37080 }, { "epoch": 9.846072186836517, "grad_norm": 0.515625, "learning_rate": 1.9496209072639158e-07, "loss": 3.0382, "mean_token_accuracy": 0.44450424164533614, "num_tokens": 1367654400.0, "step": 37100 }, { "epoch": 9.851380042462845, "grad_norm": 0.5078125, "learning_rate": 1.8177326095686894e-07, "loss": 3.0434, "mean_token_accuracy": 0.4425445795059204, "num_tokens": 1368391680.0, "step": 37120 }, { "epoch": 9.856687898089172, "grad_norm": 0.5234375, "learning_rate": 1.690460200046484e-07, "loss": 3.0272, "mean_token_accuracy": 0.444089749455452, "num_tokens": 1369128960.0, "step": 37140 }, { "epoch": 9.8619957537155, "grad_norm": 0.52734375, "learning_rate": 1.5678040708224647e-07, "loss": 3.0371, "mean_token_accuracy": 0.4438641518354416, "num_tokens": 1369866240.0, "step": 37160 }, { "epoch": 9.867303609341826, "grad_norm": 0.51171875, "learning_rate": 1.4497645997991747e-07, "loss": 3.0266, "mean_token_accuracy": 0.4447379857301712, "num_tokens": 1370603520.0, "step": 37180 }, { "epoch": 9.872611464968152, "grad_norm": 0.53125, "learning_rate": 1.336342150655534e-07, "loss": 3.0182, "mean_token_accuracy": 0.4467982158064842, "num_tokens": 1371340800.0, "step": 37200 }, { "epoch": 9.87791932059448, "grad_norm": 0.51953125, "learning_rate": 1.2275370728450086e-07, "loss": 3.0221, "mean_token_accuracy": 0.44509540498256683, "num_tokens": 1372078080.0, "step": 37220 }, { "epoch": 9.883227176220807, "grad_norm": 0.51953125, "learning_rate": 1.123349701595444e-07, "loss": 3.0114, "mean_token_accuracy": 0.4472466841340065, "num_tokens": 1372815360.0, "step": 37240 }, { "epoch": 9.888535031847134, "grad_norm": 0.52734375, "learning_rate": 1.0237803579070669e-07, "loss": 3.0474, "mean_token_accuracy": 0.44162997156381606, "num_tokens": 1373552640.0, "step": 37260 }, { "epoch": 9.893842887473461, "grad_norm": 0.5078125, "learning_rate": 9.28829348552651e-08, "loss": 3.0184, "mean_token_accuracy": 0.4450084254145622, "num_tokens": 1374289920.0, "step": 37280 }, { "epoch": 9.899150743099788, "grad_norm": 0.5078125, "learning_rate": 8.384969660751861e-08, "loss": 3.0123, "mean_token_accuracy": 0.44656854718923567, "num_tokens": 1375027200.0, "step": 37300 }, { "epoch": 9.904458598726114, "grad_norm": 0.53125, "learning_rate": 7.527834887882111e-08, "loss": 3.036, "mean_token_accuracy": 0.444096539914608, "num_tokens": 1375764480.0, "step": 37320 }, { "epoch": 9.909766454352441, "grad_norm": 0.5234375, "learning_rate": 6.716891807741487e-08, "loss": 3.0303, "mean_token_accuracy": 0.4446958556771278, "num_tokens": 1376501760.0, "step": 37340 }, { "epoch": 9.915074309978769, "grad_norm": 0.5234375, "learning_rate": 5.952142918841385e-08, "loss": 2.9962, "mean_token_accuracy": 0.44854859709739686, "num_tokens": 1377239040.0, "step": 37360 }, { "epoch": 9.920382165605096, "grad_norm": 0.5390625, "learning_rate": 5.233590577367053e-08, "loss": 3.0277, "mean_token_accuracy": 0.444796422123909, "num_tokens": 1377976320.0, "step": 37380 }, { "epoch": 9.925690021231423, "grad_norm": 0.51171875, "learning_rate": 4.5612369971725905e-08, "loss": 3.0188, "mean_token_accuracy": 0.4468335509300232, "num_tokens": 1378713600.0, "step": 37400 }, { "epoch": 9.930997876857749, "grad_norm": 0.53125, "learning_rate": 3.935084249772624e-08, "loss": 3.0425, "mean_token_accuracy": 0.4416489988565445, "num_tokens": 1379450880.0, "step": 37420 }, { "epoch": 9.936305732484076, "grad_norm": 0.5234375, "learning_rate": 3.355134264340642e-08, "loss": 3.0141, "mean_token_accuracy": 0.4468118041753769, "num_tokens": 1380188160.0, "step": 37440 }, { "epoch": 9.941613588110403, "grad_norm": 0.52734375, "learning_rate": 2.8213888276990005e-08, "loss": 3.0598, "mean_token_accuracy": 0.4414302036166191, "num_tokens": 1380925440.0, "step": 37460 }, { "epoch": 9.94692144373673, "grad_norm": 0.5234375, "learning_rate": 2.3338495843105986e-08, "loss": 3.0377, "mean_token_accuracy": 0.4446347028017044, "num_tokens": 1381662720.0, "step": 37480 }, { "epoch": 9.952229299363058, "grad_norm": 0.51953125, "learning_rate": 1.8925180362838743e-08, "loss": 3.0037, "mean_token_accuracy": 0.4489875495433807, "num_tokens": 1382400000.0, "step": 37500 }, { "epoch": 9.957537154989385, "grad_norm": 0.5234375, "learning_rate": 1.497395543354485e-08, "loss": 3.0533, "mean_token_accuracy": 0.44137448370456694, "num_tokens": 1383137280.0, "step": 37520 }, { "epoch": 9.96284501061571, "grad_norm": 0.51171875, "learning_rate": 1.1484833228919688e-08, "loss": 3.012, "mean_token_accuracy": 0.44598825871944425, "num_tokens": 1383874560.0, "step": 37540 }, { "epoch": 9.968152866242038, "grad_norm": 0.515625, "learning_rate": 8.457824498947496e-09, "loss": 3.0071, "mean_token_accuracy": 0.4467791900038719, "num_tokens": 1384611840.0, "step": 37560 }, { "epoch": 9.973460721868365, "grad_norm": 0.515625, "learning_rate": 5.892938569818096e-09, "loss": 3.0121, "mean_token_accuracy": 0.44625054448843005, "num_tokens": 1385349120.0, "step": 37580 }, { "epoch": 9.978768577494693, "grad_norm": 0.52734375, "learning_rate": 3.790183343893582e-09, "loss": 3.0463, "mean_token_accuracy": 0.44417264610528945, "num_tokens": 1386086400.0, "step": 37600 }, { "epoch": 9.98407643312102, "grad_norm": 0.515625, "learning_rate": 2.1495652997749467e-09, "loss": 3.0153, "mean_token_accuracy": 0.44715019762516023, "num_tokens": 1386823680.0, "step": 37620 }, { "epoch": 9.989384288747345, "grad_norm": 0.5234375, "learning_rate": 9.71089492185495e-10, "loss": 3.026, "mean_token_accuracy": 0.44394705146551133, "num_tokens": 1387560960.0, "step": 37640 }, { "epoch": 9.994692144373673, "grad_norm": 0.515625, "learning_rate": 2.547595520041579e-10, "loss": 3.0244, "mean_token_accuracy": 0.44606028497219086, "num_tokens": 1388298240.0, "step": 37660 }, { "epoch": 10.0, "grad_norm": 0.5234375, "learning_rate": 5.776862155304484e-13, "loss": 3.0583, "mean_token_accuracy": 0.44099260717630384, "num_tokens": 1389035520.0, "step": 37680 } ], "logging_steps": 20, "max_steps": 37680, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9712374235725824e+18, "train_batch_size": 72, "trial_name": null, "trial_params": null }