diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100644--- "a/trainer_state.json" +++ /dev/null @@ -1,4528 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 5.0, - "eval_steps": 500, - "global_step": 555, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.009029345372460496, - "grad_norm": 4.1226773140670385, - "learning_rate": 0.0, - "loss": 1.3433, - "num_tokens": 417963.0, - "step": 1 - }, - { - "epoch": 0.01805869074492099, - "grad_norm": 3.9462061203782963, - "learning_rate": 5.882352941176471e-07, - "loss": 1.3076, - "num_tokens": 856730.0, - "step": 2 - }, - { - "epoch": 0.02708803611738149, - "grad_norm": 4.056538068886665, - "learning_rate": 1.1764705882352942e-06, - "loss": 1.3225, - "num_tokens": 1283679.0, - "step": 3 - }, - { - "epoch": 0.03611738148984198, - "grad_norm": 3.9035828019837617, - "learning_rate": 1.7647058823529414e-06, - "loss": 1.2971, - "num_tokens": 1722056.0, - "step": 4 - }, - { - "epoch": 0.045146726862302484, - "grad_norm": 3.818499985659685, - "learning_rate": 2.3529411764705885e-06, - "loss": 1.3056, - "num_tokens": 2150137.0, - "step": 5 - }, - { - "epoch": 0.05417607223476298, - "grad_norm": 3.3482582599671074, - "learning_rate": 2.9411764705882355e-06, - "loss": 1.2864, - "num_tokens": 2564649.0, - "step": 6 - }, - { - "epoch": 0.06320541760722348, - "grad_norm": 2.431710651256278, - "learning_rate": 3.529411764705883e-06, - "loss": 1.1649, - "num_tokens": 2985977.0, - "step": 7 - }, - { - "epoch": 0.07223476297968397, - "grad_norm": 2.291927554769836, - "learning_rate": 4.11764705882353e-06, - "loss": 1.1279, - "num_tokens": 3413212.0, - "step": 8 - }, - { - "epoch": 0.08126410835214447, - "grad_norm": 2.029519481294733, - "learning_rate": 4.705882352941177e-06, - "loss": 0.9535, - "num_tokens": 3843515.0, - "step": 9 - }, - { - "epoch": 0.09029345372460497, - "grad_norm": 1.8569383671360136, - "learning_rate": 5.294117647058824e-06, - "loss": 0.8774, - "num_tokens": 4289732.0, - "step": 10 - }, - { - "epoch": 0.09932279909706546, - "grad_norm": 1.9074099721896551, - "learning_rate": 5.882352941176471e-06, - "loss": 0.8476, - "num_tokens": 4715096.0, - "step": 11 - }, - { - "epoch": 0.10835214446952596, - "grad_norm": 2.2685460452669806, - "learning_rate": 6.470588235294119e-06, - "loss": 0.5251, - "num_tokens": 5128419.0, - "step": 12 - }, - { - "epoch": 0.11738148984198646, - "grad_norm": 2.261721528203699, - "learning_rate": 7.058823529411766e-06, - "loss": 0.4533, - "num_tokens": 5540634.0, - "step": 13 - }, - { - "epoch": 0.12641083521444696, - "grad_norm": 1.8091458825349906, - "learning_rate": 7.647058823529411e-06, - "loss": 0.3695, - "num_tokens": 5968005.0, - "step": 14 - }, - { - "epoch": 0.13544018058690746, - "grad_norm": 1.505512868285292, - "learning_rate": 8.23529411764706e-06, - "loss": 0.2675, - "num_tokens": 6397897.0, - "step": 15 - }, - { - "epoch": 0.14446952595936793, - "grad_norm": 0.8544464295725124, - "learning_rate": 8.823529411764707e-06, - "loss": 0.1612, - "num_tokens": 6815448.0, - "step": 16 - }, - { - "epoch": 0.15349887133182843, - "grad_norm": 0.6784675541905167, - "learning_rate": 9.411764705882354e-06, - "loss": 0.1197, - "num_tokens": 7230528.0, - "step": 17 - }, - { - "epoch": 0.16252821670428894, - "grad_norm": 0.6379689058024756, - "learning_rate": 1e-05, - "loss": 0.1302, - "num_tokens": 7649044.0, - "step": 18 - }, - { - "epoch": 0.17155756207674944, - "grad_norm": 0.25897815088971404, - "learning_rate": 9.999923278607256e-06, - "loss": 0.0892, - "num_tokens": 8067225.0, - "step": 19 - }, - { - "epoch": 0.18058690744920994, - "grad_norm": 0.244824125801969, - "learning_rate": 9.999693117045099e-06, - "loss": 0.0876, - "num_tokens": 8500120.0, - "step": 20 - }, - { - "epoch": 0.18961625282167044, - "grad_norm": 0.4453079318767393, - "learning_rate": 9.99930952316167e-06, - "loss": 0.0798, - "num_tokens": 8925262.0, - "step": 21 - }, - { - "epoch": 0.1986455981941309, - "grad_norm": 0.18027632385122908, - "learning_rate": 9.998772510036905e-06, - "loss": 0.0776, - "num_tokens": 9354973.0, - "step": 22 - }, - { - "epoch": 0.2076749435665914, - "grad_norm": 0.1682514733086765, - "learning_rate": 9.998082095982091e-06, - "loss": 0.0913, - "num_tokens": 9795967.0, - "step": 23 - }, - { - "epoch": 0.21670428893905191, - "grad_norm": 0.6839625538532086, - "learning_rate": 9.997238304539241e-06, - "loss": 0.0805, - "num_tokens": 10217971.0, - "step": 24 - }, - { - "epoch": 0.22573363431151242, - "grad_norm": 0.1801620006143571, - "learning_rate": 9.99624116448029e-06, - "loss": 0.1043, - "num_tokens": 10675155.0, - "step": 25 - }, - { - "epoch": 0.23476297968397292, - "grad_norm": 0.1510431579702823, - "learning_rate": 9.995090709806113e-06, - "loss": 0.0894, - "num_tokens": 11119872.0, - "step": 26 - }, - { - "epoch": 0.24379232505643342, - "grad_norm": 0.1552751490698237, - "learning_rate": 9.993786979745374e-06, - "loss": 0.0724, - "num_tokens": 11538902.0, - "step": 27 - }, - { - "epoch": 0.2528216704288939, - "grad_norm": 0.13901109032980072, - "learning_rate": 9.992330018753175e-06, - "loss": 0.0592, - "num_tokens": 11961057.0, - "step": 28 - }, - { - "epoch": 0.2618510158013544, - "grad_norm": 0.12982893321708955, - "learning_rate": 9.990719876509551e-06, - "loss": 0.0575, - "num_tokens": 12406878.0, - "step": 29 - }, - { - "epoch": 0.2708803611738149, - "grad_norm": 0.13249591564705812, - "learning_rate": 9.98895660791777e-06, - "loss": 0.053, - "num_tokens": 12819692.0, - "step": 30 - }, - { - "epoch": 0.2799097065462754, - "grad_norm": 0.13079157117035578, - "learning_rate": 9.987040273102466e-06, - "loss": 0.0567, - "num_tokens": 13240915.0, - "step": 31 - }, - { - "epoch": 0.28893905191873587, - "grad_norm": 0.12490016513516743, - "learning_rate": 9.984970937407583e-06, - "loss": 0.0597, - "num_tokens": 13660214.0, - "step": 32 - }, - { - "epoch": 0.2979683972911964, - "grad_norm": 0.12284775257936209, - "learning_rate": 9.98274867139415e-06, - "loss": 0.0485, - "num_tokens": 14114321.0, - "step": 33 - }, - { - "epoch": 0.30699774266365687, - "grad_norm": 0.10605884412119998, - "learning_rate": 9.980373550837877e-06, - "loss": 0.0507, - "num_tokens": 14548964.0, - "step": 34 - }, - { - "epoch": 0.3160270880361174, - "grad_norm": 0.0996863253036289, - "learning_rate": 9.977845656726565e-06, - "loss": 0.0533, - "num_tokens": 14981191.0, - "step": 35 - }, - { - "epoch": 0.32505643340857787, - "grad_norm": 0.099940697319749, - "learning_rate": 9.975165075257351e-06, - "loss": 0.0456, - "num_tokens": 15397252.0, - "step": 36 - }, - { - "epoch": 0.3340857787810384, - "grad_norm": 0.09953657791437007, - "learning_rate": 9.972331897833766e-06, - "loss": 0.0553, - "num_tokens": 15825845.0, - "step": 37 - }, - { - "epoch": 0.3431151241534989, - "grad_norm": 0.10522339551652994, - "learning_rate": 9.96934622106262e-06, - "loss": 0.048, - "num_tokens": 16263335.0, - "step": 38 - }, - { - "epoch": 0.35214446952595935, - "grad_norm": 0.09154677645576041, - "learning_rate": 9.966208146750697e-06, - "loss": 0.0497, - "num_tokens": 16685043.0, - "step": 39 - }, - { - "epoch": 0.3611738148984199, - "grad_norm": 0.09239203897782809, - "learning_rate": 9.962917781901308e-06, - "loss": 0.0447, - "num_tokens": 17112536.0, - "step": 40 - }, - { - "epoch": 0.37020316027088035, - "grad_norm": 0.09443543473738454, - "learning_rate": 9.959475238710617e-06, - "loss": 0.038, - "num_tokens": 17536800.0, - "step": 41 - }, - { - "epoch": 0.3792325056433409, - "grad_norm": 0.08866051485745102, - "learning_rate": 9.955880634563825e-06, - "loss": 0.038, - "num_tokens": 17972150.0, - "step": 42 - }, - { - "epoch": 0.38826185101580135, - "grad_norm": 0.0937980936876227, - "learning_rate": 9.952134092031174e-06, - "loss": 0.0454, - "num_tokens": 18396216.0, - "step": 43 - }, - { - "epoch": 0.3972911963882618, - "grad_norm": 0.08953079302859607, - "learning_rate": 9.948235738863755e-06, - "loss": 0.0395, - "num_tokens": 18821149.0, - "step": 44 - }, - { - "epoch": 0.40632054176072235, - "grad_norm": 0.09638013349344338, - "learning_rate": 9.944185707989163e-06, - "loss": 0.0389, - "num_tokens": 19245794.0, - "step": 45 - }, - { - "epoch": 0.4153498871331828, - "grad_norm": 0.0963287325583386, - "learning_rate": 9.939984137506954e-06, - "loss": 0.0423, - "num_tokens": 19678417.0, - "step": 46 - }, - { - "epoch": 0.42437923250564336, - "grad_norm": 0.09082871490975147, - "learning_rate": 9.93563117068395e-06, - "loss": 0.0377, - "num_tokens": 20092889.0, - "step": 47 - }, - { - "epoch": 0.43340857787810383, - "grad_norm": 0.09446657995542025, - "learning_rate": 9.93112695594934e-06, - "loss": 0.0537, - "num_tokens": 20547069.0, - "step": 48 - }, - { - "epoch": 0.44243792325056436, - "grad_norm": 0.08479659819661463, - "learning_rate": 9.926471646889624e-06, - "loss": 0.0355, - "num_tokens": 20966538.0, - "step": 49 - }, - { - "epoch": 0.45146726862302483, - "grad_norm": 0.10103007225011962, - "learning_rate": 9.921665402243376e-06, - "loss": 0.0447, - "num_tokens": 21400755.0, - "step": 50 - }, - { - "epoch": 0.4604966139954853, - "grad_norm": 0.08624391161599489, - "learning_rate": 9.916708385895837e-06, - "loss": 0.0389, - "num_tokens": 21840775.0, - "step": 51 - }, - { - "epoch": 0.46952595936794583, - "grad_norm": 0.08394108614320675, - "learning_rate": 9.911600766873312e-06, - "loss": 0.04, - "num_tokens": 22278786.0, - "step": 52 - }, - { - "epoch": 0.4785553047404063, - "grad_norm": 0.08578722894055574, - "learning_rate": 9.906342719337427e-06, - "loss": 0.0368, - "num_tokens": 22693466.0, - "step": 53 - }, - { - "epoch": 0.48758465011286684, - "grad_norm": 0.08306630403146477, - "learning_rate": 9.900934422579167e-06, - "loss": 0.0313, - "num_tokens": 23112540.0, - "step": 54 - }, - { - "epoch": 0.4966139954853273, - "grad_norm": 0.08907008819061979, - "learning_rate": 9.895376061012786e-06, - "loss": 0.0368, - "num_tokens": 23564619.0, - "step": 55 - }, - { - "epoch": 0.5056433408577878, - "grad_norm": 0.09449233534316975, - "learning_rate": 9.889667824169498e-06, - "loss": 0.0347, - "num_tokens": 23967845.0, - "step": 56 - }, - { - "epoch": 0.5146726862302483, - "grad_norm": 0.13169270387732046, - "learning_rate": 9.883809906691031e-06, - "loss": 0.1075, - "num_tokens": 24391020.0, - "step": 57 - }, - { - "epoch": 0.5237020316027088, - "grad_norm": 0.08626372320969251, - "learning_rate": 9.877802508322977e-06, - "loss": 0.0397, - "num_tokens": 24825527.0, - "step": 58 - }, - { - "epoch": 0.5327313769751693, - "grad_norm": 0.6199046568163001, - "learning_rate": 9.871645833907992e-06, - "loss": 0.085, - "num_tokens": 25263167.0, - "step": 59 - }, - { - "epoch": 0.5417607223476298, - "grad_norm": 0.08854553441636301, - "learning_rate": 9.865340093378799e-06, - "loss": 0.0347, - "num_tokens": 25693750.0, - "step": 60 - }, - { - "epoch": 0.5507900677200903, - "grad_norm": 0.08190394530809236, - "learning_rate": 9.858885501751044e-06, - "loss": 0.0301, - "num_tokens": 26127066.0, - "step": 61 - }, - { - "epoch": 0.5598194130925508, - "grad_norm": 0.07633990311461816, - "learning_rate": 9.852282279115951e-06, - "loss": 0.0331, - "num_tokens": 26553930.0, - "step": 62 - }, - { - "epoch": 0.5688487584650113, - "grad_norm": 0.10134813831731004, - "learning_rate": 9.84553065063283e-06, - "loss": 0.0482, - "num_tokens": 26997155.0, - "step": 63 - }, - { - "epoch": 0.5778781038374717, - "grad_norm": 0.07702444941063769, - "learning_rate": 9.838630846521381e-06, - "loss": 0.0289, - "num_tokens": 27416930.0, - "step": 64 - }, - { - "epoch": 0.5869074492099323, - "grad_norm": 0.08139774767813368, - "learning_rate": 9.831583102053868e-06, - "loss": 0.0342, - "num_tokens": 27849198.0, - "step": 65 - }, - { - "epoch": 0.5959367945823928, - "grad_norm": 0.08475766420893233, - "learning_rate": 9.824387657547074e-06, - "loss": 0.0326, - "num_tokens": 28281608.0, - "step": 66 - }, - { - "epoch": 0.6049661399548533, - "grad_norm": 0.08880561553586032, - "learning_rate": 9.817044758354123e-06, - "loss": 0.0275, - "num_tokens": 28703014.0, - "step": 67 - }, - { - "epoch": 0.6139954853273137, - "grad_norm": 0.16108769242249113, - "learning_rate": 9.809554654856106e-06, - "loss": 0.1026, - "num_tokens": 29154694.0, - "step": 68 - }, - { - "epoch": 0.6230248306997742, - "grad_norm": 0.08758330536346154, - "learning_rate": 9.80191760245354e-06, - "loss": 0.0309, - "num_tokens": 29581312.0, - "step": 69 - }, - { - "epoch": 0.6320541760722348, - "grad_norm": 0.08405143622629944, - "learning_rate": 9.794133861557674e-06, - "loss": 0.0311, - "num_tokens": 30010189.0, - "step": 70 - }, - { - "epoch": 0.6410835214446953, - "grad_norm": 0.09242952293604854, - "learning_rate": 9.78620369758159e-06, - "loss": 0.0308, - "num_tokens": 30423268.0, - "step": 71 - }, - { - "epoch": 0.6501128668171557, - "grad_norm": 0.08547346542642349, - "learning_rate": 9.778127380931165e-06, - "loss": 0.0309, - "num_tokens": 30854686.0, - "step": 72 - }, - { - "epoch": 0.6591422121896162, - "grad_norm": 0.07877474464629973, - "learning_rate": 9.76990518699585e-06, - "loss": 0.0264, - "num_tokens": 31260959.0, - "step": 73 - }, - { - "epoch": 0.6681715575620768, - "grad_norm": 0.0952585998223962, - "learning_rate": 9.761537396139277e-06, - "loss": 0.0268, - "num_tokens": 31676000.0, - "step": 74 - }, - { - "epoch": 0.6772009029345373, - "grad_norm": 0.08841152027185054, - "learning_rate": 9.753024293689696e-06, - "loss": 0.0324, - "num_tokens": 32086486.0, - "step": 75 - }, - { - "epoch": 0.6862302483069977, - "grad_norm": 0.08632580074796341, - "learning_rate": 9.744366169930254e-06, - "loss": 0.0372, - "num_tokens": 32498590.0, - "step": 76 - }, - { - "epoch": 0.6952595936794582, - "grad_norm": 0.08545023498371142, - "learning_rate": 9.735563320089088e-06, - "loss": 0.03, - "num_tokens": 32917173.0, - "step": 77 - }, - { - "epoch": 0.7042889390519187, - "grad_norm": 0.10023458177821297, - "learning_rate": 9.72661604432927e-06, - "loss": 0.0316, - "num_tokens": 33349789.0, - "step": 78 - }, - { - "epoch": 0.7133182844243793, - "grad_norm": 0.09633881794164335, - "learning_rate": 9.717524647738553e-06, - "loss": 0.031, - "num_tokens": 33785863.0, - "step": 79 - }, - { - "epoch": 0.7223476297968398, - "grad_norm": 0.08583028492237739, - "learning_rate": 9.70828944031899e-06, - "loss": 0.0299, - "num_tokens": 34216063.0, - "step": 80 - }, - { - "epoch": 0.7313769751693002, - "grad_norm": 0.08655284410423547, - "learning_rate": 9.698910736976344e-06, - "loss": 0.0261, - "num_tokens": 34655546.0, - "step": 81 - }, - { - "epoch": 0.7404063205417607, - "grad_norm": 0.08819526204263398, - "learning_rate": 9.689388857509365e-06, - "loss": 0.0335, - "num_tokens": 35085479.0, - "step": 82 - }, - { - "epoch": 0.7494356659142212, - "grad_norm": 0.0775793985272994, - "learning_rate": 9.679724126598878e-06, - "loss": 0.0254, - "num_tokens": 35510555.0, - "step": 83 - }, - { - "epoch": 0.7584650112866818, - "grad_norm": 0.09506327636871045, - "learning_rate": 9.669916873796709e-06, - "loss": 0.034, - "num_tokens": 35921862.0, - "step": 84 - }, - { - "epoch": 0.7674943566591422, - "grad_norm": 0.08310818380762558, - "learning_rate": 9.659967433514458e-06, - "loss": 0.0252, - "num_tokens": 36328530.0, - "step": 85 - }, - { - "epoch": 0.7765237020316027, - "grad_norm": 0.07740207993257811, - "learning_rate": 9.649876145012085e-06, - "loss": 0.0278, - "num_tokens": 36772328.0, - "step": 86 - }, - { - "epoch": 0.7855530474040632, - "grad_norm": 0.088775953244118, - "learning_rate": 9.639643352386353e-06, - "loss": 0.0316, - "num_tokens": 37186946.0, - "step": 87 - }, - { - "epoch": 0.7945823927765236, - "grad_norm": 0.08696148874584747, - "learning_rate": 9.629269404559081e-06, - "loss": 0.0249, - "num_tokens": 37591509.0, - "step": 88 - }, - { - "epoch": 0.8036117381489842, - "grad_norm": 0.1487676873622619, - "learning_rate": 9.618754655265262e-06, - "loss": 0.0498, - "num_tokens": 38045842.0, - "step": 89 - }, - { - "epoch": 0.8126410835214447, - "grad_norm": 0.08230381913162506, - "learning_rate": 9.608099463040989e-06, - "loss": 0.0258, - "num_tokens": 38465378.0, - "step": 90 - }, - { - "epoch": 0.8216704288939052, - "grad_norm": 0.07869393467806507, - "learning_rate": 9.597304191211228e-06, - "loss": 0.0255, - "num_tokens": 38900287.0, - "step": 91 - }, - { - "epoch": 0.8306997742663657, - "grad_norm": 0.2570682466701682, - "learning_rate": 9.586369207877449e-06, - "loss": 0.0945, - "num_tokens": 39323143.0, - "step": 92 - }, - { - "epoch": 0.8397291196388262, - "grad_norm": 0.0846434867175951, - "learning_rate": 9.575294885905051e-06, - "loss": 0.0302, - "num_tokens": 39745833.0, - "step": 93 - }, - { - "epoch": 0.8487584650112867, - "grad_norm": 0.07921974056879383, - "learning_rate": 9.564081602910654e-06, - "loss": 0.0248, - "num_tokens": 40155224.0, - "step": 94 - }, - { - "epoch": 0.8577878103837472, - "grad_norm": 0.08337454729573074, - "learning_rate": 9.552729741249235e-06, - "loss": 0.03, - "num_tokens": 40600001.0, - "step": 95 - }, - { - "epoch": 0.8668171557562077, - "grad_norm": 0.09668085159830789, - "learning_rate": 9.541239688001076e-06, - "loss": 0.0773, - "num_tokens": 41034208.0, - "step": 96 - }, - { - "epoch": 0.8758465011286681, - "grad_norm": 0.07704679084970735, - "learning_rate": 9.52961183495857e-06, - "loss": 0.0228, - "num_tokens": 41463499.0, - "step": 97 - }, - { - "epoch": 0.8848758465011287, - "grad_norm": 0.42293339948498004, - "learning_rate": 9.517846578612866e-06, - "loss": 0.0722, - "num_tokens": 41905599.0, - "step": 98 - }, - { - "epoch": 0.8939051918735892, - "grad_norm": 0.09304706758598777, - "learning_rate": 9.505944320140343e-06, - "loss": 0.0286, - "num_tokens": 42324749.0, - "step": 99 - }, - { - "epoch": 0.9029345372460497, - "grad_norm": 0.0756645296148333, - "learning_rate": 9.49390546538893e-06, - "loss": 0.0249, - "num_tokens": 42745369.0, - "step": 100 - }, - { - "epoch": 0.9119638826185101, - "grad_norm": 0.09351722784422487, - "learning_rate": 9.481730424864276e-06, - "loss": 0.0351, - "num_tokens": 43186876.0, - "step": 101 - }, - { - "epoch": 0.9209932279909706, - "grad_norm": 0.07742636072299149, - "learning_rate": 9.469419613715743e-06, - "loss": 0.0231, - "num_tokens": 43605408.0, - "step": 102 - }, - { - "epoch": 0.9300225733634312, - "grad_norm": 0.08249628831703604, - "learning_rate": 9.456973451722255e-06, - "loss": 0.0252, - "num_tokens": 44020619.0, - "step": 103 - }, - { - "epoch": 0.9390519187358917, - "grad_norm": 0.08239928085619691, - "learning_rate": 9.44439236327798e-06, - "loss": 0.0357, - "num_tokens": 44482117.0, - "step": 104 - }, - { - "epoch": 0.9480812641083521, - "grad_norm": 0.07649857341695085, - "learning_rate": 9.431676777377865e-06, - "loss": 0.0255, - "num_tokens": 44899252.0, - "step": 105 - }, - { - "epoch": 0.9571106094808126, - "grad_norm": 0.07792520551168008, - "learning_rate": 9.418827127603e-06, - "loss": 0.029, - "num_tokens": 45332722.0, - "step": 106 - }, - { - "epoch": 0.9661399548532731, - "grad_norm": 0.07612403843116701, - "learning_rate": 9.405843852105846e-06, - "loss": 0.0223, - "num_tokens": 45764262.0, - "step": 107 - }, - { - "epoch": 0.9751693002257337, - "grad_norm": 0.0785710592412512, - "learning_rate": 9.392727393595278e-06, - "loss": 0.0232, - "num_tokens": 46186006.0, - "step": 108 - }, - { - "epoch": 0.9841986455981941, - "grad_norm": 0.07886878116003551, - "learning_rate": 9.379478199321508e-06, - "loss": 0.038, - "num_tokens": 46639566.0, - "step": 109 - }, - { - "epoch": 0.9932279909706546, - "grad_norm": 0.07681915336448678, - "learning_rate": 9.366096721060817e-06, - "loss": 0.0264, - "num_tokens": 47064244.0, - "step": 110 - }, - { - "epoch": 1.0, - "grad_norm": 0.07681915336448678, - "learning_rate": 9.352583415100157e-06, - "loss": 0.1022, - "num_tokens": 47393855.0, - "step": 111 - }, - { - "epoch": 1.0, - "eval_loss": 0.08535497635602951, - "eval_num_tokens": 47393855.0, - "eval_runtime": 56.3284, - "eval_samples_per_second": 44.471, - "eval_steps_per_second": 5.574, - "step": 111 - }, - { - "epoch": 1.0090293453724606, - "grad_norm": 0.17356249840222848, - "learning_rate": 9.3389387422216e-06, - "loss": 0.0204, - "num_tokens": 47806383.0, - "step": 112 - }, - { - "epoch": 1.018058690744921, - "grad_norm": 0.07958262229456115, - "learning_rate": 9.325163167686615e-06, - "loss": 0.0202, - "num_tokens": 48241862.0, - "step": 113 - }, - { - "epoch": 1.0270880361173815, - "grad_norm": 0.07642681533916272, - "learning_rate": 9.311257161220207e-06, - "loss": 0.0241, - "num_tokens": 48651747.0, - "step": 114 - }, - { - "epoch": 1.036117381489842, - "grad_norm": 0.07480863020983759, - "learning_rate": 9.297221196994904e-06, - "loss": 0.0566, - "num_tokens": 49088950.0, - "step": 115 - }, - { - "epoch": 1.0451467268623025, - "grad_norm": 0.22260772897039935, - "learning_rate": 9.283055753614581e-06, - "loss": 0.0326, - "num_tokens": 49528452.0, - "step": 116 - }, - { - "epoch": 1.054176072234763, - "grad_norm": 0.08495744331654775, - "learning_rate": 9.268761314098148e-06, - "loss": 0.0251, - "num_tokens": 49951741.0, - "step": 117 - }, - { - "epoch": 1.0632054176072234, - "grad_norm": 0.08591356397624175, - "learning_rate": 9.254338365863079e-06, - "loss": 0.0408, - "num_tokens": 50388295.0, - "step": 118 - }, - { - "epoch": 1.072234762979684, - "grad_norm": 0.08378209224429746, - "learning_rate": 9.239787400708779e-06, - "loss": 0.0372, - "num_tokens": 50829191.0, - "step": 119 - }, - { - "epoch": 1.0812641083521444, - "grad_norm": 0.08856082886989099, - "learning_rate": 9.225108914799833e-06, - "loss": 0.0215, - "num_tokens": 51266209.0, - "step": 120 - }, - { - "epoch": 1.090293453724605, - "grad_norm": 0.08191730251281082, - "learning_rate": 9.21030340864908e-06, - "loss": 0.0241, - "num_tokens": 51687096.0, - "step": 121 - }, - { - "epoch": 1.0993227990970655, - "grad_norm": 0.08193039384365579, - "learning_rate": 9.195371387100544e-06, - "loss": 0.0185, - "num_tokens": 52114071.0, - "step": 122 - }, - { - "epoch": 1.108352144469526, - "grad_norm": 0.07999371830102796, - "learning_rate": 9.180313359312218e-06, - "loss": 0.0217, - "num_tokens": 52541213.0, - "step": 123 - }, - { - "epoch": 1.1173814898419865, - "grad_norm": 0.07935868903198422, - "learning_rate": 9.165129838738706e-06, - "loss": 0.0223, - "num_tokens": 52966113.0, - "step": 124 - }, - { - "epoch": 1.1264108352144468, - "grad_norm": 0.08121184818306772, - "learning_rate": 9.14982134311372e-06, - "loss": 0.0269, - "num_tokens": 53418877.0, - "step": 125 - }, - { - "epoch": 1.1354401805869074, - "grad_norm": 0.10411396152761351, - "learning_rate": 9.13438839443242e-06, - "loss": 0.0212, - "num_tokens": 53839935.0, - "step": 126 - }, - { - "epoch": 1.144469525959368, - "grad_norm": 0.07346431042302812, - "learning_rate": 9.11883151893361e-06, - "loss": 0.0179, - "num_tokens": 54258590.0, - "step": 127 - }, - { - "epoch": 1.1534988713318284, - "grad_norm": 0.0740206025776819, - "learning_rate": 9.103151247081803e-06, - "loss": 0.0212, - "num_tokens": 54666548.0, - "step": 128 - }, - { - "epoch": 1.162528216704289, - "grad_norm": 0.08228008762099617, - "learning_rate": 9.087348113549134e-06, - "loss": 0.0201, - "num_tokens": 55091316.0, - "step": 129 - }, - { - "epoch": 1.1715575620767495, - "grad_norm": 0.0779148259917351, - "learning_rate": 9.071422657197117e-06, - "loss": 0.0202, - "num_tokens": 55507416.0, - "step": 130 - }, - { - "epoch": 1.18058690744921, - "grad_norm": 0.08002903746993835, - "learning_rate": 9.05537542105828e-06, - "loss": 0.0238, - "num_tokens": 55947804.0, - "step": 131 - }, - { - "epoch": 1.1896162528216705, - "grad_norm": 0.07906565123280986, - "learning_rate": 9.039206952317655e-06, - "loss": 0.0186, - "num_tokens": 56371231.0, - "step": 132 - }, - { - "epoch": 1.1986455981941309, - "grad_norm": 0.07489036883955975, - "learning_rate": 9.022917802294098e-06, - "loss": 0.02, - "num_tokens": 56783891.0, - "step": 133 - }, - { - "epoch": 1.2076749435665914, - "grad_norm": 0.07820510364094234, - "learning_rate": 9.006508526421511e-06, - "loss": 0.0216, - "num_tokens": 57202386.0, - "step": 134 - }, - { - "epoch": 1.2167042889390518, - "grad_norm": 0.09106232664538468, - "learning_rate": 8.989979684229894e-06, - "loss": 0.0188, - "num_tokens": 57631769.0, - "step": 135 - }, - { - "epoch": 1.2257336343115124, - "grad_norm": 0.08349103688477277, - "learning_rate": 8.973331839326266e-06, - "loss": 0.022, - "num_tokens": 58062896.0, - "step": 136 - }, - { - "epoch": 1.234762979683973, - "grad_norm": 0.07090655801699461, - "learning_rate": 8.956565559375452e-06, - "loss": 0.0215, - "num_tokens": 58487448.0, - "step": 137 - }, - { - "epoch": 1.2437923250564333, - "grad_norm": 0.08633882584217765, - "learning_rate": 8.93968141608071e-06, - "loss": 0.0207, - "num_tokens": 58922017.0, - "step": 138 - }, - { - "epoch": 1.252821670428894, - "grad_norm": 0.07728014844526158, - "learning_rate": 8.922679985164262e-06, - "loss": 0.022, - "num_tokens": 59367436.0, - "step": 139 - }, - { - "epoch": 1.2618510158013545, - "grad_norm": 0.09084101538743762, - "learning_rate": 8.905561846347648e-06, - "loss": 0.0232, - "num_tokens": 59795521.0, - "step": 140 - }, - { - "epoch": 1.2708803611738149, - "grad_norm": 0.08500823309230846, - "learning_rate": 8.888327583331953e-06, - "loss": 0.0185, - "num_tokens": 60200448.0, - "step": 141 - }, - { - "epoch": 1.2799097065462754, - "grad_norm": 0.08835861679468875, - "learning_rate": 8.870977783777917e-06, - "loss": 0.0207, - "num_tokens": 60630275.0, - "step": 142 - }, - { - "epoch": 1.2889390519187358, - "grad_norm": 0.08076126658072942, - "learning_rate": 8.853513039285888e-06, - "loss": 0.0328, - "num_tokens": 61053633.0, - "step": 143 - }, - { - "epoch": 1.2979683972911964, - "grad_norm": 0.1384536355652561, - "learning_rate": 8.835933945375654e-06, - "loss": 0.0234, - "num_tokens": 61498301.0, - "step": 144 - }, - { - "epoch": 1.3069977426636568, - "grad_norm": 0.0829871254442218, - "learning_rate": 8.818241101466135e-06, - "loss": 0.0205, - "num_tokens": 61937666.0, - "step": 145 - }, - { - "epoch": 1.3160270880361173, - "grad_norm": 0.082910689437852, - "learning_rate": 8.800435110854943e-06, - "loss": 0.0201, - "num_tokens": 62349356.0, - "step": 146 - }, - { - "epoch": 1.325056433408578, - "grad_norm": 0.07645576769781585, - "learning_rate": 8.78251658069781e-06, - "loss": 0.0222, - "num_tokens": 62773669.0, - "step": 147 - }, - { - "epoch": 1.3340857787810383, - "grad_norm": 0.08787635923368299, - "learning_rate": 8.764486121987885e-06, - "loss": 0.0185, - "num_tokens": 63184808.0, - "step": 148 - }, - { - "epoch": 1.3431151241534989, - "grad_norm": 0.07676125126637708, - "learning_rate": 8.746344349534905e-06, - "loss": 0.0181, - "num_tokens": 63626245.0, - "step": 149 - }, - { - "epoch": 1.3521444695259595, - "grad_norm": 0.0694959330953695, - "learning_rate": 8.728091881944226e-06, - "loss": 0.0213, - "num_tokens": 64068177.0, - "step": 150 - }, - { - "epoch": 1.3611738148984198, - "grad_norm": 0.08269786083124728, - "learning_rate": 8.70972934159573e-06, - "loss": 0.0222, - "num_tokens": 64496374.0, - "step": 151 - }, - { - "epoch": 1.3702031602708804, - "grad_norm": 0.09481830509518524, - "learning_rate": 8.691257354622602e-06, - "loss": 0.0184, - "num_tokens": 64924919.0, - "step": 152 - }, - { - "epoch": 1.379232505643341, - "grad_norm": 0.07325200785757595, - "learning_rate": 8.672676550889985e-06, - "loss": 0.0228, - "num_tokens": 65360268.0, - "step": 153 - }, - { - "epoch": 1.3882618510158014, - "grad_norm": 0.08206745377927094, - "learning_rate": 8.653987563973494e-06, - "loss": 0.0182, - "num_tokens": 65781587.0, - "step": 154 - }, - { - "epoch": 1.3972911963882617, - "grad_norm": 0.07582039793883952, - "learning_rate": 8.635191031137624e-06, - "loss": 0.0183, - "num_tokens": 66191116.0, - "step": 155 - }, - { - "epoch": 1.4063205417607223, - "grad_norm": 0.0746601188535464, - "learning_rate": 8.616287593314006e-06, - "loss": 0.0205, - "num_tokens": 66617225.0, - "step": 156 - }, - { - "epoch": 1.4153498871331829, - "grad_norm": 0.0802654573159223, - "learning_rate": 8.597277895079568e-06, - "loss": 0.0409, - "num_tokens": 67046148.0, - "step": 157 - }, - { - "epoch": 1.4243792325056432, - "grad_norm": 0.11469234518973448, - "learning_rate": 8.578162584634537e-06, - "loss": 0.02, - "num_tokens": 67476904.0, - "step": 158 - }, - { - "epoch": 1.4334085778781038, - "grad_norm": 0.07976570567339254, - "learning_rate": 8.558942313780357e-06, - "loss": 0.078, - "num_tokens": 67946595.0, - "step": 159 - }, - { - "epoch": 1.4424379232505644, - "grad_norm": 0.20695421562611824, - "learning_rate": 8.539617737897452e-06, - "loss": 0.0194, - "num_tokens": 68359880.0, - "step": 160 - }, - { - "epoch": 1.4514672686230248, - "grad_norm": 0.08200978654094307, - "learning_rate": 8.520189515922872e-06, - "loss": 0.0253, - "num_tokens": 68795114.0, - "step": 161 - }, - { - "epoch": 1.4604966139954854, - "grad_norm": 0.08580590592640575, - "learning_rate": 8.500658310327842e-06, - "loss": 0.0215, - "num_tokens": 69225525.0, - "step": 162 - }, - { - "epoch": 1.469525959367946, - "grad_norm": 0.0789594468693246, - "learning_rate": 8.48102478709516e-06, - "loss": 0.0205, - "num_tokens": 69645836.0, - "step": 163 - }, - { - "epoch": 1.4785553047404063, - "grad_norm": 0.0735719259083779, - "learning_rate": 8.461289615696489e-06, - "loss": 0.0188, - "num_tokens": 70067001.0, - "step": 164 - }, - { - "epoch": 1.487584650112867, - "grad_norm": 0.07613369106659294, - "learning_rate": 8.441453469069536e-06, - "loss": 0.0296, - "num_tokens": 70492907.0, - "step": 165 - }, - { - "epoch": 1.4966139954853273, - "grad_norm": 0.07543177380385463, - "learning_rate": 8.4215170235951e-06, - "loss": 0.0316, - "num_tokens": 70942223.0, - "step": 166 - }, - { - "epoch": 1.5056433408577878, - "grad_norm": 0.09008361072430175, - "learning_rate": 8.401480959074006e-06, - "loss": 0.0176, - "num_tokens": 71371574.0, - "step": 167 - }, - { - "epoch": 1.5146726862302482, - "grad_norm": 0.08346708149449512, - "learning_rate": 8.381345958703933e-06, - "loss": 0.0502, - "num_tokens": 71822208.0, - "step": 168 - }, - { - "epoch": 1.5237020316027088, - "grad_norm": 0.12181670655808274, - "learning_rate": 8.361112709056115e-06, - "loss": 0.0168, - "num_tokens": 72234675.0, - "step": 169 - }, - { - "epoch": 1.5327313769751694, - "grad_norm": 0.07503087447812291, - "learning_rate": 8.340781900051924e-06, - "loss": 0.0185, - "num_tokens": 72652286.0, - "step": 170 - }, - { - "epoch": 1.5417607223476297, - "grad_norm": 0.08398432499044427, - "learning_rate": 8.32035422493935e-06, - "loss": 0.0253, - "num_tokens": 73068061.0, - "step": 171 - }, - { - "epoch": 1.5507900677200903, - "grad_norm": 0.07872465338491443, - "learning_rate": 8.299830380269372e-06, - "loss": 0.0213, - "num_tokens": 73496804.0, - "step": 172 - }, - { - "epoch": 1.559819413092551, - "grad_norm": 0.0839265636209512, - "learning_rate": 8.27921106587218e-06, - "loss": 0.0182, - "num_tokens": 73913610.0, - "step": 173 - }, - { - "epoch": 1.5688487584650113, - "grad_norm": 0.07717964434903289, - "learning_rate": 8.258496984833344e-06, - "loss": 0.0191, - "num_tokens": 74340086.0, - "step": 174 - }, - { - "epoch": 1.5778781038374716, - "grad_norm": 0.09088946896779272, - "learning_rate": 8.237688843469815e-06, - "loss": 0.018, - "num_tokens": 74761072.0, - "step": 175 - }, - { - "epoch": 1.5869074492099324, - "grad_norm": 0.08587530602264679, - "learning_rate": 8.216787351305854e-06, - "loss": 0.0341, - "num_tokens": 75200788.0, - "step": 176 - }, - { - "epoch": 1.5959367945823928, - "grad_norm": 0.0895332050746481, - "learning_rate": 8.195793221048834e-06, - "loss": 0.0172, - "num_tokens": 75626000.0, - "step": 177 - }, - { - "epoch": 1.6049661399548532, - "grad_norm": 0.07005785017925037, - "learning_rate": 8.17470716856494e-06, - "loss": 0.0203, - "num_tokens": 76049835.0, - "step": 178 - }, - { - "epoch": 1.6139954853273137, - "grad_norm": 0.06991469620387697, - "learning_rate": 8.153529912854751e-06, - "loss": 0.0161, - "num_tokens": 76485166.0, - "step": 179 - }, - { - "epoch": 1.6230248306997743, - "grad_norm": 0.07435841338293188, - "learning_rate": 8.13226217602874e-06, - "loss": 0.0181, - "num_tokens": 76889061.0, - "step": 180 - }, - { - "epoch": 1.6320541760722347, - "grad_norm": 0.07690872751853474, - "learning_rate": 8.110904683282635e-06, - "loss": 0.0924, - "num_tokens": 77325361.0, - "step": 181 - }, - { - "epoch": 1.6410835214446953, - "grad_norm": 0.17850649612473476, - "learning_rate": 8.089458162872697e-06, - "loss": 0.0168, - "num_tokens": 77759756.0, - "step": 182 - }, - { - "epoch": 1.6501128668171559, - "grad_norm": 0.06746756510504061, - "learning_rate": 8.067923346090888e-06, - "loss": 0.0179, - "num_tokens": 78187190.0, - "step": 183 - }, - { - "epoch": 1.6591422121896162, - "grad_norm": 0.07885448482479361, - "learning_rate": 8.046300967239934e-06, - "loss": 0.0166, - "num_tokens": 78598237.0, - "step": 184 - }, - { - "epoch": 1.6681715575620768, - "grad_norm": 0.06890297475547125, - "learning_rate": 8.024591763608291e-06, - "loss": 0.0171, - "num_tokens": 79028466.0, - "step": 185 - }, - { - "epoch": 1.6772009029345374, - "grad_norm": 0.0751884704197381, - "learning_rate": 8.002796475444995e-06, - "loss": 0.0179, - "num_tokens": 79444122.0, - "step": 186 - }, - { - "epoch": 1.6862302483069977, - "grad_norm": 0.07290866223712479, - "learning_rate": 7.980915845934433e-06, - "loss": 0.0194, - "num_tokens": 79857405.0, - "step": 187 - }, - { - "epoch": 1.695259593679458, - "grad_norm": 0.0792398036644149, - "learning_rate": 7.95895062117099e-06, - "loss": 0.0171, - "num_tokens": 80293603.0, - "step": 188 - }, - { - "epoch": 1.7042889390519187, - "grad_norm": 0.07077113041595946, - "learning_rate": 7.936901550133616e-06, - "loss": 0.0536, - "num_tokens": 80728108.0, - "step": 189 - }, - { - "epoch": 1.7133182844243793, - "grad_norm": 0.47740128118493913, - "learning_rate": 7.914769384660283e-06, - "loss": 0.0178, - "num_tokens": 81173331.0, - "step": 190 - }, - { - "epoch": 1.7223476297968396, - "grad_norm": 0.08235180008975489, - "learning_rate": 7.892554879422351e-06, - "loss": 0.024, - "num_tokens": 81579387.0, - "step": 191 - }, - { - "epoch": 1.7313769751693002, - "grad_norm": 0.07101875141394902, - "learning_rate": 7.870258791898832e-06, - "loss": 0.0189, - "num_tokens": 82013021.0, - "step": 192 - }, - { - "epoch": 1.7404063205417608, - "grad_norm": 0.07249721287890425, - "learning_rate": 7.847881882350568e-06, - "loss": 0.0169, - "num_tokens": 82440490.0, - "step": 193 - }, - { - "epoch": 1.7494356659142212, - "grad_norm": 0.07095993054906387, - "learning_rate": 7.825424913794299e-06, - "loss": 0.0193, - "num_tokens": 82879243.0, - "step": 194 - }, - { - "epoch": 1.7584650112866818, - "grad_norm": 0.06608741875722214, - "learning_rate": 7.802888651976647e-06, - "loss": 0.017, - "num_tokens": 83320595.0, - "step": 195 - }, - { - "epoch": 1.7674943566591423, - "grad_norm": 0.07043212785004925, - "learning_rate": 7.78027386534801e-06, - "loss": 0.0167, - "num_tokens": 83736643.0, - "step": 196 - }, - { - "epoch": 1.7765237020316027, - "grad_norm": 0.07424356993552451, - "learning_rate": 7.757581325036357e-06, - "loss": 0.027, - "num_tokens": 84180990.0, - "step": 197 - }, - { - "epoch": 1.785553047404063, - "grad_norm": 0.07082819077709723, - "learning_rate": 7.73481180482093e-06, - "loss": 0.0184, - "num_tokens": 84610497.0, - "step": 198 - }, - { - "epoch": 1.7945823927765236, - "grad_norm": 0.0765803358413884, - "learning_rate": 7.711966081105863e-06, - "loss": 0.0299, - "num_tokens": 85060786.0, - "step": 199 - }, - { - "epoch": 1.8036117381489842, - "grad_norm": 0.09743570524376206, - "learning_rate": 7.68904493289371e-06, - "loss": 0.0159, - "num_tokens": 85477616.0, - "step": 200 - }, - { - "epoch": 1.8126410835214446, - "grad_norm": 0.06763165044353958, - "learning_rate": 7.666049141758878e-06, - "loss": 0.0668, - "num_tokens": 85930357.0, - "step": 201 - }, - { - "epoch": 1.8216704288939052, - "grad_norm": 0.13642266006295956, - "learning_rate": 7.642979491820974e-06, - "loss": 0.0169, - "num_tokens": 86344995.0, - "step": 202 - }, - { - "epoch": 1.8306997742663658, - "grad_norm": 0.07068370731607548, - "learning_rate": 7.619836769718075e-06, - "loss": 0.0158, - "num_tokens": 86757489.0, - "step": 203 - }, - { - "epoch": 1.8397291196388261, - "grad_norm": 0.07345491867594905, - "learning_rate": 7.596621764579904e-06, - "loss": 0.0175, - "num_tokens": 87187973.0, - "step": 204 - }, - { - "epoch": 1.8487584650112867, - "grad_norm": 0.0842192458018367, - "learning_rate": 7.573335268000918e-06, - "loss": 0.0253, - "num_tokens": 87614553.0, - "step": 205 - }, - { - "epoch": 1.8577878103837473, - "grad_norm": 0.06522749322895638, - "learning_rate": 7.549978074013314e-06, - "loss": 0.0168, - "num_tokens": 88048570.0, - "step": 206 - }, - { - "epoch": 1.8668171557562077, - "grad_norm": 0.07495632464623402, - "learning_rate": 7.5265509790599625e-06, - "loss": 0.0166, - "num_tokens": 88466238.0, - "step": 207 - }, - { - "epoch": 1.875846501128668, - "grad_norm": 0.07359735329146656, - "learning_rate": 7.503054781967241e-06, - "loss": 0.0185, - "num_tokens": 88895453.0, - "step": 208 - }, - { - "epoch": 1.8848758465011288, - "grad_norm": 0.09710562224494881, - "learning_rate": 7.479490283917802e-06, - "loss": 0.0659, - "num_tokens": 89349875.0, - "step": 209 - }, - { - "epoch": 1.8939051918735892, - "grad_norm": 0.0702463525638772, - "learning_rate": 7.455858288423249e-06, - "loss": 0.0166, - "num_tokens": 89792563.0, - "step": 210 - }, - { - "epoch": 1.9029345372460496, - "grad_norm": 0.06912331304375105, - "learning_rate": 7.43215960129674e-06, - "loss": 0.0453, - "num_tokens": 90234898.0, - "step": 211 - }, - { - "epoch": 1.9119638826185101, - "grad_norm": 0.25433782045873404, - "learning_rate": 7.408395030625513e-06, - "loss": 0.0176, - "num_tokens": 90648682.0, - "step": 212 - }, - { - "epoch": 1.9209932279909707, - "grad_norm": 0.07360863662739528, - "learning_rate": 7.384565386743327e-06, - "loss": 0.0181, - "num_tokens": 91063468.0, - "step": 213 - }, - { - "epoch": 1.930022573363431, - "grad_norm": 0.07509698446261776, - "learning_rate": 7.360671482202838e-06, - "loss": 0.0252, - "num_tokens": 91494817.0, - "step": 214 - }, - { - "epoch": 1.9390519187358917, - "grad_norm": 0.07276762866373235, - "learning_rate": 7.336714131747878e-06, - "loss": 0.0173, - "num_tokens": 91925573.0, - "step": 215 - }, - { - "epoch": 1.9480812641083523, - "grad_norm": 0.0674231394947175, - "learning_rate": 7.312694152285691e-06, - "loss": 0.0657, - "num_tokens": 92350433.0, - "step": 216 - }, - { - "epoch": 1.9571106094808126, - "grad_norm": 0.07775923659337187, - "learning_rate": 7.288612362859066e-06, - "loss": 0.0163, - "num_tokens": 92765584.0, - "step": 217 - }, - { - "epoch": 1.966139954853273, - "grad_norm": 0.075222930525113, - "learning_rate": 7.2644695846184165e-06, - "loss": 0.0178, - "num_tokens": 93229192.0, - "step": 218 - }, - { - "epoch": 1.9751693002257338, - "grad_norm": 0.06923809192473053, - "learning_rate": 7.240266640793774e-06, - "loss": 0.0148, - "num_tokens": 93636026.0, - "step": 219 - }, - { - "epoch": 1.9841986455981941, - "grad_norm": 0.06904441894343259, - "learning_rate": 7.216004356666717e-06, - "loss": 0.0173, - "num_tokens": 94060859.0, - "step": 220 - }, - { - "epoch": 1.9932279909706545, - "grad_norm": 0.07164737021846543, - "learning_rate": 7.191683559542238e-06, - "loss": 0.0154, - "num_tokens": 94478604.0, - "step": 221 - }, - { - "epoch": 2.0, - "grad_norm": 0.07299986656925679, - "learning_rate": 7.167305078720527e-06, - "loss": 0.0152, - "num_tokens": 94789780.0, - "step": 222 - }, - { - "epoch": 2.0, - "eval_loss": 0.09336748719215393, - "eval_num_tokens": 94789780.0, - "eval_runtime": 55.4758, - "eval_samples_per_second": 45.155, - "eval_steps_per_second": 5.66, - "step": 222 - }, - { - "epoch": 2.0090293453724604, - "grad_norm": 0.08097562146839203, - "learning_rate": 7.142869745468697e-06, - "loss": 0.0141, - "num_tokens": 95206304.0, - "step": 223 - }, - { - "epoch": 2.018058690744921, - "grad_norm": 0.0657521707564096, - "learning_rate": 7.118378392992436e-06, - "loss": 0.0128, - "num_tokens": 95629591.0, - "step": 224 - }, - { - "epoch": 2.0270880361173815, - "grad_norm": 0.06260152099136218, - "learning_rate": 7.093831856407599e-06, - "loss": 0.0247, - "num_tokens": 96043349.0, - "step": 225 - }, - { - "epoch": 2.036117381489842, - "grad_norm": 0.0787462901317735, - "learning_rate": 7.069230972711727e-06, - "loss": 0.0136, - "num_tokens": 96461415.0, - "step": 226 - }, - { - "epoch": 2.0451467268623027, - "grad_norm": 0.06859199368166498, - "learning_rate": 7.044576580755517e-06, - "loss": 0.0125, - "num_tokens": 96920134.0, - "step": 227 - }, - { - "epoch": 2.054176072234763, - "grad_norm": 0.07142924118779401, - "learning_rate": 7.019869521214206e-06, - "loss": 0.0149, - "num_tokens": 97365121.0, - "step": 228 - }, - { - "epoch": 2.0632054176072234, - "grad_norm": 0.06946075051983241, - "learning_rate": 6.995110636558916e-06, - "loss": 0.0132, - "num_tokens": 97777134.0, - "step": 229 - }, - { - "epoch": 2.072234762979684, - "grad_norm": 0.07132704808421908, - "learning_rate": 6.970300771027914e-06, - "loss": 0.0135, - "num_tokens": 98209859.0, - "step": 230 - }, - { - "epoch": 2.0812641083521446, - "grad_norm": 0.0787937957547939, - "learning_rate": 6.945440770597845e-06, - "loss": 0.0166, - "num_tokens": 98622007.0, - "step": 231 - }, - { - "epoch": 2.090293453724605, - "grad_norm": 0.06889250982502683, - "learning_rate": 6.920531482954863e-06, - "loss": 0.0139, - "num_tokens": 99052847.0, - "step": 232 - }, - { - "epoch": 2.0993227990970653, - "grad_norm": 0.06756115890886172, - "learning_rate": 6.895573757465745e-06, - "loss": 0.0128, - "num_tokens": 99471292.0, - "step": 233 - }, - { - "epoch": 2.108352144469526, - "grad_norm": 0.06696980846993505, - "learning_rate": 6.870568445148915e-06, - "loss": 0.0133, - "num_tokens": 99901083.0, - "step": 234 - }, - { - "epoch": 2.1173814898419865, - "grad_norm": 0.07298831986544797, - "learning_rate": 6.845516398645434e-06, - "loss": 0.0222, - "num_tokens": 100323717.0, - "step": 235 - }, - { - "epoch": 2.126410835214447, - "grad_norm": 0.07277884406791328, - "learning_rate": 6.820418472189926e-06, - "loss": 0.0137, - "num_tokens": 100746774.0, - "step": 236 - }, - { - "epoch": 2.1354401805869077, - "grad_norm": 0.07572502256810043, - "learning_rate": 6.795275521581443e-06, - "loss": 0.0592, - "num_tokens": 101186553.0, - "step": 237 - }, - { - "epoch": 2.144469525959368, - "grad_norm": 0.1286025399754837, - "learning_rate": 6.770088404154293e-06, - "loss": 0.0146, - "num_tokens": 101595403.0, - "step": 238 - }, - { - "epoch": 2.1534988713318284, - "grad_norm": 0.07448253421632307, - "learning_rate": 6.744857978748795e-06, - "loss": 0.0153, - "num_tokens": 102015527.0, - "step": 239 - }, - { - "epoch": 2.1625282167042887, - "grad_norm": 0.07103847764519645, - "learning_rate": 6.719585105682012e-06, - "loss": 0.0166, - "num_tokens": 102435412.0, - "step": 240 - }, - { - "epoch": 2.1715575620767495, - "grad_norm": 0.07961645748931483, - "learning_rate": 6.6942706467183916e-06, - "loss": 0.0157, - "num_tokens": 102851570.0, - "step": 241 - }, - { - "epoch": 2.18058690744921, - "grad_norm": 0.07140779969396704, - "learning_rate": 6.668915465040403e-06, - "loss": 0.0133, - "num_tokens": 103270322.0, - "step": 242 - }, - { - "epoch": 2.1896162528216703, - "grad_norm": 0.06501016705122462, - "learning_rate": 6.643520425219093e-06, - "loss": 0.0126, - "num_tokens": 103685477.0, - "step": 243 - }, - { - "epoch": 2.198645598194131, - "grad_norm": 0.07536879705533626, - "learning_rate": 6.618086393184601e-06, - "loss": 0.0123, - "num_tokens": 104120983.0, - "step": 244 - }, - { - "epoch": 2.2076749435665914, - "grad_norm": 0.06681362355147086, - "learning_rate": 6.592614236196646e-06, - "loss": 0.0194, - "num_tokens": 104536337.0, - "step": 245 - }, - { - "epoch": 2.216704288939052, - "grad_norm": 0.0693251653826736, - "learning_rate": 6.567104822814942e-06, - "loss": 0.013, - "num_tokens": 104955016.0, - "step": 246 - }, - { - "epoch": 2.2257336343115126, - "grad_norm": 0.06358470526760071, - "learning_rate": 6.541559022869589e-06, - "loss": 0.0136, - "num_tokens": 105388070.0, - "step": 247 - }, - { - "epoch": 2.234762979683973, - "grad_norm": 0.2564353264908074, - "learning_rate": 6.515977707431411e-06, - "loss": 0.0434, - "num_tokens": 105818809.0, - "step": 248 - }, - { - "epoch": 2.2437923250564333, - "grad_norm": 0.07335440119765696, - "learning_rate": 6.490361748782248e-06, - "loss": 0.0136, - "num_tokens": 106245854.0, - "step": 249 - }, - { - "epoch": 2.2528216704288937, - "grad_norm": 0.06532922181091284, - "learning_rate": 6.464712020385223e-06, - "loss": 0.013, - "num_tokens": 106670473.0, - "step": 250 - }, - { - "epoch": 2.2618510158013545, - "grad_norm": 0.7266023360680285, - "learning_rate": 6.439029396854955e-06, - "loss": 0.048, - "num_tokens": 107095545.0, - "step": 251 - }, - { - "epoch": 2.270880361173815, - "grad_norm": 0.07370448837002078, - "learning_rate": 6.4133147539277295e-06, - "loss": 0.0131, - "num_tokens": 107520274.0, - "step": 252 - }, - { - "epoch": 2.2799097065462752, - "grad_norm": 0.08390458225564246, - "learning_rate": 6.3875689684316435e-06, - "loss": 0.0339, - "num_tokens": 107953982.0, - "step": 253 - }, - { - "epoch": 2.288939051918736, - "grad_norm": 0.06777882375271017, - "learning_rate": 6.361792918256705e-06, - "loss": 0.0142, - "num_tokens": 108388280.0, - "step": 254 - }, - { - "epoch": 2.2979683972911964, - "grad_norm": 0.07626936803178642, - "learning_rate": 6.335987482324904e-06, - "loss": 0.0142, - "num_tokens": 108824002.0, - "step": 255 - }, - { - "epoch": 2.3069977426636568, - "grad_norm": 0.06449258340863874, - "learning_rate": 6.310153540560229e-06, - "loss": 0.0219, - "num_tokens": 109244245.0, - "step": 256 - }, - { - "epoch": 2.3160270880361176, - "grad_norm": 0.06691990520571968, - "learning_rate": 6.284291973858682e-06, - "loss": 0.0113, - "num_tokens": 109673712.0, - "step": 257 - }, - { - "epoch": 2.325056433408578, - "grad_norm": 0.06859871365700514, - "learning_rate": 6.25840366405822e-06, - "loss": 0.0134, - "num_tokens": 110102425.0, - "step": 258 - }, - { - "epoch": 2.3340857787810383, - "grad_norm": 0.07117195067564289, - "learning_rate": 6.232489493908706e-06, - "loss": 0.0143, - "num_tokens": 110528736.0, - "step": 259 - }, - { - "epoch": 2.343115124153499, - "grad_norm": 0.07013656339118007, - "learning_rate": 6.2065503470417956e-06, - "loss": 0.0122, - "num_tokens": 110947222.0, - "step": 260 - }, - { - "epoch": 2.3521444695259595, - "grad_norm": 0.06746840266813196, - "learning_rate": 6.180587107940809e-06, - "loss": 0.0405, - "num_tokens": 111396704.0, - "step": 261 - }, - { - "epoch": 2.36117381489842, - "grad_norm": 0.20371704046600575, - "learning_rate": 6.154600661910577e-06, - "loss": 0.0568, - "num_tokens": 111831800.0, - "step": 262 - }, - { - "epoch": 2.37020316027088, - "grad_norm": 0.1429920710461871, - "learning_rate": 6.128591895047243e-06, - "loss": 0.0127, - "num_tokens": 112263654.0, - "step": 263 - }, - { - "epoch": 2.379232505643341, - "grad_norm": 0.06796938770368956, - "learning_rate": 6.102561694208064e-06, - "loss": 0.0121, - "num_tokens": 112691946.0, - "step": 264 - }, - { - "epoch": 2.3882618510158014, - "grad_norm": 0.0666953830758214, - "learning_rate": 6.076510946981155e-06, - "loss": 0.0235, - "num_tokens": 113132551.0, - "step": 265 - }, - { - "epoch": 2.3972911963882617, - "grad_norm": 0.13625856338482667, - "learning_rate": 6.05044054165523e-06, - "loss": 0.0328, - "num_tokens": 113569796.0, - "step": 266 - }, - { - "epoch": 2.4063205417607225, - "grad_norm": 0.15413072191676425, - "learning_rate": 6.024351367189314e-06, - "loss": 0.0129, - "num_tokens": 114001801.0, - "step": 267 - }, - { - "epoch": 2.415349887133183, - "grad_norm": 0.06757492167260408, - "learning_rate": 5.998244313182431e-06, - "loss": 0.0125, - "num_tokens": 114434376.0, - "step": 268 - }, - { - "epoch": 2.4243792325056432, - "grad_norm": 0.07107258708384646, - "learning_rate": 5.972120269843263e-06, - "loss": 0.0145, - "num_tokens": 114881898.0, - "step": 269 - }, - { - "epoch": 2.4334085778781036, - "grad_norm": 0.061338195451882724, - "learning_rate": 5.945980127959812e-06, - "loss": 0.0121, - "num_tokens": 115303494.0, - "step": 270 - }, - { - "epoch": 2.4424379232505644, - "grad_norm": 0.06762066726047494, - "learning_rate": 5.919824778869002e-06, - "loss": 0.0142, - "num_tokens": 115722287.0, - "step": 271 - }, - { - "epoch": 2.4514672686230248, - "grad_norm": 0.0692200341605998, - "learning_rate": 5.893655114426306e-06, - "loss": 0.012, - "num_tokens": 116161502.0, - "step": 272 - }, - { - "epoch": 2.460496613995485, - "grad_norm": 0.06754090447134138, - "learning_rate": 5.867472026975326e-06, - "loss": 0.0127, - "num_tokens": 116581091.0, - "step": 273 - }, - { - "epoch": 2.469525959367946, - "grad_norm": 0.06225621229357672, - "learning_rate": 5.841276409317366e-06, - "loss": 0.0125, - "num_tokens": 117012555.0, - "step": 274 - }, - { - "epoch": 2.4785553047404063, - "grad_norm": 0.06196719021324254, - "learning_rate": 5.815069154680991e-06, - "loss": 0.0122, - "num_tokens": 117435175.0, - "step": 275 - }, - { - "epoch": 2.4875846501128667, - "grad_norm": 0.06697708798601215, - "learning_rate": 5.788851156691569e-06, - "loss": 0.0658, - "num_tokens": 117873663.0, - "step": 276 - }, - { - "epoch": 2.4966139954853275, - "grad_norm": 0.34075070167121463, - "learning_rate": 5.7626233093407955e-06, - "loss": 0.0128, - "num_tokens": 118286469.0, - "step": 277 - }, - { - "epoch": 2.505643340857788, - "grad_norm": 0.07629756246894763, - "learning_rate": 5.7363865069562195e-06, - "loss": 0.0115, - "num_tokens": 118713374.0, - "step": 278 - }, - { - "epoch": 2.514672686230248, - "grad_norm": 0.07034796548205811, - "learning_rate": 5.710141644170734e-06, - "loss": 0.0139, - "num_tokens": 119126272.0, - "step": 279 - }, - { - "epoch": 2.523702031602709, - "grad_norm": 0.08603521531576876, - "learning_rate": 5.683889615892091e-06, - "loss": 0.0212, - "num_tokens": 119554963.0, - "step": 280 - }, - { - "epoch": 2.5327313769751694, - "grad_norm": 0.07088091524565285, - "learning_rate": 5.65763131727236e-06, - "loss": 0.0123, - "num_tokens": 119977012.0, - "step": 281 - }, - { - "epoch": 2.5417607223476297, - "grad_norm": 0.06614360737927173, - "learning_rate": 5.631367643677428e-06, - "loss": 0.0128, - "num_tokens": 120396254.0, - "step": 282 - }, - { - "epoch": 2.55079006772009, - "grad_norm": 0.07042295702919123, - "learning_rate": 5.605099490656459e-06, - "loss": 0.0121, - "num_tokens": 120811188.0, - "step": 283 - }, - { - "epoch": 2.559819413092551, - "grad_norm": 0.06916246634023543, - "learning_rate": 5.578827753911357e-06, - "loss": 0.0129, - "num_tokens": 121249730.0, - "step": 284 - }, - { - "epoch": 2.5688487584650113, - "grad_norm": 0.07209978464901166, - "learning_rate": 5.5525533292662246e-06, - "loss": 0.0136, - "num_tokens": 121653968.0, - "step": 285 - }, - { - "epoch": 2.5778781038374716, - "grad_norm": 0.0787892316770162, - "learning_rate": 5.52627711263682e-06, - "loss": 0.0127, - "num_tokens": 122090338.0, - "step": 286 - }, - { - "epoch": 2.5869074492099324, - "grad_norm": 0.07387994099802492, - "learning_rate": 5.500000000000001e-06, - "loss": 0.0126, - "num_tokens": 122507656.0, - "step": 287 - }, - { - "epoch": 2.595936794582393, - "grad_norm": 0.06821405288648451, - "learning_rate": 5.4737228873631835e-06, - "loss": 0.012, - "num_tokens": 122940825.0, - "step": 288 - }, - { - "epoch": 2.604966139954853, - "grad_norm": 0.06756484850678789, - "learning_rate": 5.447446670733777e-06, - "loss": 0.0127, - "num_tokens": 123349895.0, - "step": 289 - }, - { - "epoch": 2.6139954853273135, - "grad_norm": 0.06308494712154893, - "learning_rate": 5.421172246088645e-06, - "loss": 0.0122, - "num_tokens": 123771878.0, - "step": 290 - }, - { - "epoch": 2.6230248306997743, - "grad_norm": 0.08082656510725997, - "learning_rate": 5.394900509343543e-06, - "loss": 0.0129, - "num_tokens": 124201272.0, - "step": 291 - }, - { - "epoch": 2.6320541760722347, - "grad_norm": 0.06299806239016838, - "learning_rate": 5.368632356322574e-06, - "loss": 0.0197, - "num_tokens": 124649372.0, - "step": 292 - }, - { - "epoch": 2.6410835214446955, - "grad_norm": 0.09342033450420674, - "learning_rate": 5.342368682727641e-06, - "loss": 0.0122, - "num_tokens": 125066840.0, - "step": 293 - }, - { - "epoch": 2.650112866817156, - "grad_norm": 0.07602506163920753, - "learning_rate": 5.3161103841079105e-06, - "loss": 0.0141, - "num_tokens": 125494250.0, - "step": 294 - }, - { - "epoch": 2.659142212189616, - "grad_norm": 0.06866876545387215, - "learning_rate": 5.2898583558292645e-06, - "loss": 0.012, - "num_tokens": 125918388.0, - "step": 295 - }, - { - "epoch": 2.6681715575620766, - "grad_norm": 0.0686873223442973, - "learning_rate": 5.2636134930437836e-06, - "loss": 0.013, - "num_tokens": 126360809.0, - "step": 296 - }, - { - "epoch": 2.6772009029345374, - "grad_norm": 0.06418767945659551, - "learning_rate": 5.237376690659206e-06, - "loss": 0.0118, - "num_tokens": 126789778.0, - "step": 297 - }, - { - "epoch": 2.6862302483069977, - "grad_norm": 0.06467304405936973, - "learning_rate": 5.211148843308432e-06, - "loss": 0.0188, - "num_tokens": 127228131.0, - "step": 298 - }, - { - "epoch": 2.695259593679458, - "grad_norm": 0.327408568479494, - "learning_rate": 5.1849308453190105e-06, - "loss": 0.045, - "num_tokens": 127662535.0, - "step": 299 - }, - { - "epoch": 2.704288939051919, - "grad_norm": 0.07065815550062918, - "learning_rate": 5.158723590682636e-06, - "loss": 0.0153, - "num_tokens": 128099563.0, - "step": 300 - }, - { - "epoch": 2.7133182844243793, - "grad_norm": 0.0854618622284164, - "learning_rate": 5.132527973024677e-06, - "loss": 0.0217, - "num_tokens": 128526843.0, - "step": 301 - }, - { - "epoch": 2.7223476297968396, - "grad_norm": 0.06637294397685695, - "learning_rate": 5.106344885573695e-06, - "loss": 0.0145, - "num_tokens": 128943620.0, - "step": 302 - }, - { - "epoch": 2.7313769751693, - "grad_norm": 0.06892427094488447, - "learning_rate": 5.0801752211309995e-06, - "loss": 0.0144, - "num_tokens": 129365819.0, - "step": 303 - }, - { - "epoch": 2.740406320541761, - "grad_norm": 0.07451856933575882, - "learning_rate": 5.05401987204019e-06, - "loss": 0.0127, - "num_tokens": 129798803.0, - "step": 304 - }, - { - "epoch": 2.749435665914221, - "grad_norm": 0.08191501668420725, - "learning_rate": 5.027879730156738e-06, - "loss": 0.013, - "num_tokens": 130218483.0, - "step": 305 - }, - { - "epoch": 2.758465011286682, - "grad_norm": 0.07021794114794579, - "learning_rate": 5.001755686817573e-06, - "loss": 0.0121, - "num_tokens": 130649485.0, - "step": 306 - }, - { - "epoch": 2.7674943566591423, - "grad_norm": 0.08004562057562054, - "learning_rate": 4.975648632810686e-06, - "loss": 0.017, - "num_tokens": 131078439.0, - "step": 307 - }, - { - "epoch": 2.7765237020316027, - "grad_norm": 0.0684239650400617, - "learning_rate": 4.949559458344771e-06, - "loss": 0.0128, - "num_tokens": 131532361.0, - "step": 308 - }, - { - "epoch": 2.785553047404063, - "grad_norm": 0.0660413280467444, - "learning_rate": 4.923489053018846e-06, - "loss": 0.012, - "num_tokens": 131952787.0, - "step": 309 - }, - { - "epoch": 2.7945823927765234, - "grad_norm": 0.06861103363424428, - "learning_rate": 4.897438305791937e-06, - "loss": 0.013, - "num_tokens": 132386508.0, - "step": 310 - }, - { - "epoch": 2.8036117381489842, - "grad_norm": 0.07134277327598654, - "learning_rate": 4.8714081049527565e-06, - "loss": 0.0126, - "num_tokens": 132820484.0, - "step": 311 - }, - { - "epoch": 2.8126410835214446, - "grad_norm": 0.06294849212931829, - "learning_rate": 4.845399338089425e-06, - "loss": 0.0118, - "num_tokens": 133254008.0, - "step": 312 - }, - { - "epoch": 2.8216704288939054, - "grad_norm": 0.07024694450157741, - "learning_rate": 4.819412892059192e-06, - "loss": 0.0678, - "num_tokens": 133686531.0, - "step": 313 - }, - { - "epoch": 2.8306997742663658, - "grad_norm": 0.2582020873633315, - "learning_rate": 4.793449652958207e-06, - "loss": 0.0131, - "num_tokens": 134117853.0, - "step": 314 - }, - { - "epoch": 2.839729119638826, - "grad_norm": 0.07077002016575418, - "learning_rate": 4.767510506091296e-06, - "loss": 0.0138, - "num_tokens": 134542656.0, - "step": 315 - }, - { - "epoch": 2.8487584650112865, - "grad_norm": 0.06199513656975843, - "learning_rate": 4.741596335941782e-06, - "loss": 0.012, - "num_tokens": 134983147.0, - "step": 316 - }, - { - "epoch": 2.8577878103837473, - "grad_norm": 0.06740110933280159, - "learning_rate": 4.715708026141321e-06, - "loss": 0.0123, - "num_tokens": 135415172.0, - "step": 317 - }, - { - "epoch": 2.8668171557562077, - "grad_norm": 0.06838469318720133, - "learning_rate": 4.6898464594397715e-06, - "loss": 0.0112, - "num_tokens": 135853491.0, - "step": 318 - }, - { - "epoch": 2.875846501128668, - "grad_norm": 0.07024949444733218, - "learning_rate": 4.664012517675098e-06, - "loss": 0.0311, - "num_tokens": 136297933.0, - "step": 319 - }, - { - "epoch": 2.884875846501129, - "grad_norm": 0.1250997981580887, - "learning_rate": 4.638207081743295e-06, - "loss": 0.0556, - "num_tokens": 136746456.0, - "step": 320 - }, - { - "epoch": 2.893905191873589, - "grad_norm": 0.10424208944448676, - "learning_rate": 4.612431031568359e-06, - "loss": 0.0169, - "num_tokens": 137185823.0, - "step": 321 - }, - { - "epoch": 2.9029345372460496, - "grad_norm": 0.08211764206486848, - "learning_rate": 4.586685246072272e-06, - "loss": 0.0143, - "num_tokens": 137610632.0, - "step": 322 - }, - { - "epoch": 2.91196388261851, - "grad_norm": 0.0752587009855602, - "learning_rate": 4.560970603145046e-06, - "loss": 0.0124, - "num_tokens": 138036129.0, - "step": 323 - }, - { - "epoch": 2.9209932279909707, - "grad_norm": 0.06185391843833257, - "learning_rate": 4.535287979614777e-06, - "loss": 0.0112, - "num_tokens": 138460480.0, - "step": 324 - }, - { - "epoch": 2.930022573363431, - "grad_norm": 0.06723587071308086, - "learning_rate": 4.5096382512177535e-06, - "loss": 0.0115, - "num_tokens": 138890848.0, - "step": 325 - }, - { - "epoch": 2.939051918735892, - "grad_norm": 0.06238988762166912, - "learning_rate": 4.484022292568593e-06, - "loss": 0.0115, - "num_tokens": 139300121.0, - "step": 326 - }, - { - "epoch": 2.9480812641083523, - "grad_norm": 0.06938937112343414, - "learning_rate": 4.458440977130413e-06, - "loss": 0.0206, - "num_tokens": 139730061.0, - "step": 327 - }, - { - "epoch": 2.9571106094808126, - "grad_norm": 0.062009976633795545, - "learning_rate": 4.432895177185061e-06, - "loss": 0.0119, - "num_tokens": 140161307.0, - "step": 328 - }, - { - "epoch": 2.966139954853273, - "grad_norm": 0.06784066321825818, - "learning_rate": 4.407385763803355e-06, - "loss": 0.0119, - "num_tokens": 140580963.0, - "step": 329 - }, - { - "epoch": 2.975169300225734, - "grad_norm": 0.06292128977616586, - "learning_rate": 4.381913606815401e-06, - "loss": 0.0127, - "num_tokens": 140998977.0, - "step": 330 - }, - { - "epoch": 2.984198645598194, - "grad_norm": 0.07569293102716057, - "learning_rate": 4.356479574780909e-06, - "loss": 0.0126, - "num_tokens": 141427751.0, - "step": 331 - }, - { - "epoch": 2.9932279909706545, - "grad_norm": 0.05892529849159098, - "learning_rate": 4.331084534959598e-06, - "loss": 0.0125, - "num_tokens": 141871576.0, - "step": 332 - }, - { - "epoch": 3.0, - "grad_norm": 0.07768349752650722, - "learning_rate": 4.305729353281608e-06, - "loss": 0.0112, - "num_tokens": 142185499.0, - "step": 333 - }, - { - "epoch": 3.0, - "eval_loss": 0.10246679186820984, - "eval_num_tokens": 142185499.0, - "eval_runtime": 176.189, - "eval_samples_per_second": 14.218, - "eval_steps_per_second": 1.782, - "step": 333 - }, - { - "epoch": 3.0090293453724604, - "grad_norm": 0.07152061537608126, - "learning_rate": 4.28041489431799e-06, - "loss": 0.0113, - "num_tokens": 142609048.0, - "step": 334 - }, - { - "epoch": 3.018058690744921, - "grad_norm": 0.05547975846200711, - "learning_rate": 4.255142021251206e-06, - "loss": 0.0097, - "num_tokens": 143034905.0, - "step": 335 - }, - { - "epoch": 3.0270880361173815, - "grad_norm": 0.06145285775610661, - "learning_rate": 4.22991159584571e-06, - "loss": 0.0099, - "num_tokens": 143441637.0, - "step": 336 - }, - { - "epoch": 3.036117381489842, - "grad_norm": 0.058702561810255335, - "learning_rate": 4.204724478418558e-06, - "loss": 0.0093, - "num_tokens": 143864737.0, - "step": 337 - }, - { - "epoch": 3.0451467268623027, - "grad_norm": 0.06716554695100217, - "learning_rate": 4.1795815278100746e-06, - "loss": 0.0102, - "num_tokens": 144287169.0, - "step": 338 - }, - { - "epoch": 3.054176072234763, - "grad_norm": 0.10156337114344059, - "learning_rate": 4.154483601354566e-06, - "loss": 0.0161, - "num_tokens": 144729276.0, - "step": 339 - }, - { - "epoch": 3.0632054176072234, - "grad_norm": 0.06503277327720089, - "learning_rate": 4.129431554851086e-06, - "loss": 0.0105, - "num_tokens": 145152649.0, - "step": 340 - }, - { - "epoch": 3.072234762979684, - "grad_norm": 0.06496822163807517, - "learning_rate": 4.104426242534256e-06, - "loss": 0.0102, - "num_tokens": 145603813.0, - "step": 341 - }, - { - "epoch": 3.0812641083521446, - "grad_norm": 0.056389637609083165, - "learning_rate": 4.079468517045136e-06, - "loss": 0.0091, - "num_tokens": 146042430.0, - "step": 342 - }, - { - "epoch": 3.090293453724605, - "grad_norm": 0.08215256814716321, - "learning_rate": 4.054559229402157e-06, - "loss": 0.0182, - "num_tokens": 146486450.0, - "step": 343 - }, - { - "epoch": 3.0993227990970653, - "grad_norm": 0.0658823665069719, - "learning_rate": 4.029699228972087e-06, - "loss": 0.0099, - "num_tokens": 146918914.0, - "step": 344 - }, - { - "epoch": 3.108352144469526, - "grad_norm": 0.06379382035081951, - "learning_rate": 4.0048893634410865e-06, - "loss": 0.0098, - "num_tokens": 147357955.0, - "step": 345 - }, - { - "epoch": 3.1173814898419865, - "grad_norm": 0.08656012715805109, - "learning_rate": 3.980130478785794e-06, - "loss": 0.0092, - "num_tokens": 147800926.0, - "step": 346 - }, - { - "epoch": 3.126410835214447, - "grad_norm": 0.06257455234456158, - "learning_rate": 3.955423419244484e-06, - "loss": 0.0092, - "num_tokens": 148218943.0, - "step": 347 - }, - { - "epoch": 3.1354401805869077, - "grad_norm": 0.0627738180223448, - "learning_rate": 3.930769027288273e-06, - "loss": 0.0107, - "num_tokens": 148653844.0, - "step": 348 - }, - { - "epoch": 3.144469525959368, - "grad_norm": 0.06243500821032468, - "learning_rate": 3.9061681435924014e-06, - "loss": 0.0092, - "num_tokens": 149091233.0, - "step": 349 - }, - { - "epoch": 3.1534988713318284, - "grad_norm": 0.060114887497339085, - "learning_rate": 3.881621607007565e-06, - "loss": 0.0093, - "num_tokens": 149519351.0, - "step": 350 - }, - { - "epoch": 3.1625282167042887, - "grad_norm": 0.06684460684294827, - "learning_rate": 3.857130254531303e-06, - "loss": 0.0102, - "num_tokens": 149963954.0, - "step": 351 - }, - { - "epoch": 3.1715575620767495, - "grad_norm": 0.0581834905948131, - "learning_rate": 3.832694921279474e-06, - "loss": 0.0117, - "num_tokens": 150388491.0, - "step": 352 - }, - { - "epoch": 3.18058690744921, - "grad_norm": 0.07664972688212673, - "learning_rate": 3.8083164404577654e-06, - "loss": 0.047, - "num_tokens": 150847232.0, - "step": 353 - }, - { - "epoch": 3.1896162528216703, - "grad_norm": 0.05840055533816688, - "learning_rate": 3.7839956433332847e-06, - "loss": 0.0297, - "num_tokens": 151284392.0, - "step": 354 - }, - { - "epoch": 3.198645598194131, - "grad_norm": 0.09943187660436438, - "learning_rate": 3.759733359206229e-06, - "loss": 0.0095, - "num_tokens": 151731845.0, - "step": 355 - }, - { - "epoch": 3.2076749435665914, - "grad_norm": 0.058677854163930766, - "learning_rate": 3.735530415381584e-06, - "loss": 0.0095, - "num_tokens": 152170900.0, - "step": 356 - }, - { - "epoch": 3.216704288939052, - "grad_norm": 0.060264490178896846, - "learning_rate": 3.7113876371409354e-06, - "loss": 0.0088, - "num_tokens": 152607162.0, - "step": 357 - }, - { - "epoch": 3.2257336343115126, - "grad_norm": 0.05927279793724808, - "learning_rate": 3.687305847714311e-06, - "loss": 0.009, - "num_tokens": 153028418.0, - "step": 358 - }, - { - "epoch": 3.234762979683973, - "grad_norm": 0.06139721772997134, - "learning_rate": 3.6632858682521233e-06, - "loss": 0.0094, - "num_tokens": 153425933.0, - "step": 359 - }, - { - "epoch": 3.2437923250564333, - "grad_norm": 0.0829022135578992, - "learning_rate": 3.639328517797164e-06, - "loss": 0.0473, - "num_tokens": 153862042.0, - "step": 360 - }, - { - "epoch": 3.2528216704288937, - "grad_norm": 0.06461822963842173, - "learning_rate": 3.6154346132566732e-06, - "loss": 0.012, - "num_tokens": 154292718.0, - "step": 361 - }, - { - "epoch": 3.2618510158013545, - "grad_norm": 0.06483450196299163, - "learning_rate": 3.5916049693744883e-06, - "loss": 0.0098, - "num_tokens": 154735278.0, - "step": 362 - }, - { - "epoch": 3.270880361173815, - "grad_norm": 0.09379714259866676, - "learning_rate": 3.5678403987032616e-06, - "loss": 0.0124, - "num_tokens": 155179493.0, - "step": 363 - }, - { - "epoch": 3.2799097065462752, - "grad_norm": 0.06565894474694481, - "learning_rate": 3.544141711576754e-06, - "loss": 0.0097, - "num_tokens": 155601091.0, - "step": 364 - }, - { - "epoch": 3.288939051918736, - "grad_norm": 0.06116750216624111, - "learning_rate": 3.5205097160821987e-06, - "loss": 0.0094, - "num_tokens": 156037078.0, - "step": 365 - }, - { - "epoch": 3.2979683972911964, - "grad_norm": 0.06928926670826958, - "learning_rate": 3.4969452180327614e-06, - "loss": 0.0101, - "num_tokens": 156459746.0, - "step": 366 - }, - { - "epoch": 3.3069977426636568, - "grad_norm": 0.08409965398766228, - "learning_rate": 3.4734490209400397e-06, - "loss": 0.0274, - "num_tokens": 156901429.0, - "step": 367 - }, - { - "epoch": 3.3160270880361176, - "grad_norm": 0.061886724574349766, - "learning_rate": 3.450021925986687e-06, - "loss": 0.0084, - "num_tokens": 157331010.0, - "step": 368 - }, - { - "epoch": 3.325056433408578, - "grad_norm": 0.05625118622282191, - "learning_rate": 3.4266647319990832e-06, - "loss": 0.009, - "num_tokens": 157772602.0, - "step": 369 - }, - { - "epoch": 3.3340857787810383, - "grad_norm": 0.059888736611577655, - "learning_rate": 3.403378235420096e-06, - "loss": 0.0092, - "num_tokens": 158189457.0, - "step": 370 - }, - { - "epoch": 3.343115124153499, - "grad_norm": 0.055637198763357, - "learning_rate": 3.380163230281928e-06, - "loss": 0.0089, - "num_tokens": 158626349.0, - "step": 371 - }, - { - "epoch": 3.3521444695259595, - "grad_norm": 0.058711357359014525, - "learning_rate": 3.3570205081790285e-06, - "loss": 0.01, - "num_tokens": 159042993.0, - "step": 372 - }, - { - "epoch": 3.36117381489842, - "grad_norm": 0.15325418912567096, - "learning_rate": 3.3339508582411245e-06, - "loss": 0.03, - "num_tokens": 159461779.0, - "step": 373 - }, - { - "epoch": 3.37020316027088, - "grad_norm": 0.06129832967810523, - "learning_rate": 3.3109550671062907e-06, - "loss": 0.0106, - "num_tokens": 159889249.0, - "step": 374 - }, - { - "epoch": 3.379232505643341, - "grad_norm": 0.06713273236259697, - "learning_rate": 3.288033918894137e-06, - "loss": 0.0095, - "num_tokens": 160318538.0, - "step": 375 - }, - { - "epoch": 3.3882618510158014, - "grad_norm": 0.057780993532694884, - "learning_rate": 3.265188195179071e-06, - "loss": 0.0093, - "num_tokens": 160741586.0, - "step": 376 - }, - { - "epoch": 3.3972911963882617, - "grad_norm": 0.06396483661772823, - "learning_rate": 3.2424186749636455e-06, - "loss": 0.0099, - "num_tokens": 161157068.0, - "step": 377 - }, - { - "epoch": 3.4063205417607225, - "grad_norm": 0.11037209671799429, - "learning_rate": 3.2197261346519905e-06, - "loss": 0.0359, - "num_tokens": 161591338.0, - "step": 378 - }, - { - "epoch": 3.415349887133183, - "grad_norm": 0.06141290629847806, - "learning_rate": 3.1971113480233556e-06, - "loss": 0.0266, - "num_tokens": 162030602.0, - "step": 379 - }, - { - "epoch": 3.4243792325056432, - "grad_norm": 0.06327066709618506, - "learning_rate": 3.1745750862057033e-06, - "loss": 0.0096, - "num_tokens": 162466006.0, - "step": 380 - }, - { - "epoch": 3.4334085778781036, - "grad_norm": 0.058909300821178116, - "learning_rate": 3.152118117649433e-06, - "loss": 0.0094, - "num_tokens": 162897575.0, - "step": 381 - }, - { - "epoch": 3.4424379232505644, - "grad_norm": 0.05997809135270737, - "learning_rate": 3.1297412081011686e-06, - "loss": 0.0096, - "num_tokens": 163303958.0, - "step": 382 - }, - { - "epoch": 3.4514672686230248, - "grad_norm": 0.06445221190776994, - "learning_rate": 3.1074451205776505e-06, - "loss": 0.0093, - "num_tokens": 163713904.0, - "step": 383 - }, - { - "epoch": 3.460496613995485, - "grad_norm": 0.05870409054047719, - "learning_rate": 3.0852306153397194e-06, - "loss": 0.0089, - "num_tokens": 164137525.0, - "step": 384 - }, - { - "epoch": 3.469525959367946, - "grad_norm": 0.07569408829366625, - "learning_rate": 3.063098449866384e-06, - "loss": 0.009, - "num_tokens": 164577195.0, - "step": 385 - }, - { - "epoch": 3.4785553047404063, - "grad_norm": 0.0588097227923788, - "learning_rate": 3.0410493788290114e-06, - "loss": 0.0086, - "num_tokens": 165007790.0, - "step": 386 - }, - { - "epoch": 3.4875846501128667, - "grad_norm": 0.07652438346762197, - "learning_rate": 3.019084154065568e-06, - "loss": 0.0236, - "num_tokens": 165448590.0, - "step": 387 - }, - { - "epoch": 3.4966139954853275, - "grad_norm": 0.06437216729466062, - "learning_rate": 2.997203524555005e-06, - "loss": 0.0094, - "num_tokens": 165878673.0, - "step": 388 - }, - { - "epoch": 3.505643340857788, - "grad_norm": 0.06358680380643286, - "learning_rate": 2.97540823639171e-06, - "loss": 0.0093, - "num_tokens": 166305211.0, - "step": 389 - }, - { - "epoch": 3.514672686230248, - "grad_norm": 0.07750794447714442, - "learning_rate": 2.953699032760067e-06, - "loss": 0.0167, - "num_tokens": 166738863.0, - "step": 390 - }, - { - "epoch": 3.523702031602709, - "grad_norm": 0.07534509702038769, - "learning_rate": 2.932076653909115e-06, - "loss": 0.009, - "num_tokens": 167150386.0, - "step": 391 - }, - { - "epoch": 3.5327313769751694, - "grad_norm": 0.06216853830987747, - "learning_rate": 2.910541837127305e-06, - "loss": 0.009, - "num_tokens": 167571071.0, - "step": 392 - }, - { - "epoch": 3.5417607223476297, - "grad_norm": 0.06388977277596672, - "learning_rate": 2.889095316717366e-06, - "loss": 0.0096, - "num_tokens": 167985607.0, - "step": 393 - }, - { - "epoch": 3.55079006772009, - "grad_norm": 0.05749415988844605, - "learning_rate": 2.8677378239712607e-06, - "loss": 0.009, - "num_tokens": 168420127.0, - "step": 394 - }, - { - "epoch": 3.559819413092551, - "grad_norm": 0.06339615887091614, - "learning_rate": 2.846470087145249e-06, - "loss": 0.0091, - "num_tokens": 168853493.0, - "step": 395 - }, - { - "epoch": 3.5688487584650113, - "grad_norm": 0.06137274100197093, - "learning_rate": 2.8252928314350626e-06, - "loss": 0.0087, - "num_tokens": 169267253.0, - "step": 396 - }, - { - "epoch": 3.5778781038374716, - "grad_norm": 0.059678184298786935, - "learning_rate": 2.804206778951168e-06, - "loss": 0.0086, - "num_tokens": 169692290.0, - "step": 397 - }, - { - "epoch": 3.5869074492099324, - "grad_norm": 0.09399964545190312, - "learning_rate": 2.7832126486941456e-06, - "loss": 0.0387, - "num_tokens": 170118921.0, - "step": 398 - }, - { - "epoch": 3.595936794582393, - "grad_norm": 0.05833904735584078, - "learning_rate": 2.7623111565301863e-06, - "loss": 0.0097, - "num_tokens": 170541906.0, - "step": 399 - }, - { - "epoch": 3.604966139954853, - "grad_norm": 0.08684244774836425, - "learning_rate": 2.7415030151666567e-06, - "loss": 0.0435, - "num_tokens": 170969534.0, - "step": 400 - }, - { - "epoch": 3.6139954853273135, - "grad_norm": 0.06469737843216128, - "learning_rate": 2.720788934127819e-06, - "loss": 0.0099, - "num_tokens": 171393139.0, - "step": 401 - }, - { - "epoch": 3.6230248306997743, - "grad_norm": 0.07008366163987513, - "learning_rate": 2.700169619730631e-06, - "loss": 0.0098, - "num_tokens": 171822275.0, - "step": 402 - }, - { - "epoch": 3.6320541760722347, - "grad_norm": 0.06074005907395048, - "learning_rate": 2.6796457750606487e-06, - "loss": 0.0097, - "num_tokens": 172269333.0, - "step": 403 - }, - { - "epoch": 3.6410835214446955, - "grad_norm": 0.06078788588742423, - "learning_rate": 2.659218099948079e-06, - "loss": 0.0093, - "num_tokens": 172666432.0, - "step": 404 - }, - { - "epoch": 3.650112866817156, - "grad_norm": 0.06624525044121411, - "learning_rate": 2.6388872909438875e-06, - "loss": 0.0094, - "num_tokens": 173085250.0, - "step": 405 - }, - { - "epoch": 3.659142212189616, - "grad_norm": 0.05869490549109586, - "learning_rate": 2.618654041296068e-06, - "loss": 0.0093, - "num_tokens": 173503425.0, - "step": 406 - }, - { - "epoch": 3.6681715575620766, - "grad_norm": 0.06550847475102616, - "learning_rate": 2.5985190409259957e-06, - "loss": 0.0094, - "num_tokens": 173915204.0, - "step": 407 - }, - { - "epoch": 3.6772009029345374, - "grad_norm": 0.11285521045144888, - "learning_rate": 2.5784829764049013e-06, - "loss": 0.0109, - "num_tokens": 174374964.0, - "step": 408 - }, - { - "epoch": 3.6862302483069977, - "grad_norm": 0.05975449002422044, - "learning_rate": 2.558546530930466e-06, - "loss": 0.009, - "num_tokens": 174779332.0, - "step": 409 - }, - { - "epoch": 3.695259593679458, - "grad_norm": 0.08349376485094143, - "learning_rate": 2.5387103843035126e-06, - "loss": 0.0179, - "num_tokens": 175200386.0, - "step": 410 - }, - { - "epoch": 3.704288939051919, - "grad_norm": 0.06215689171823831, - "learning_rate": 2.5189752129048428e-06, - "loss": 0.0088, - "num_tokens": 175605971.0, - "step": 411 - }, - { - "epoch": 3.7133182844243793, - "grad_norm": 0.05902745197195644, - "learning_rate": 2.49934168967216e-06, - "loss": 0.0365, - "num_tokens": 176029017.0, - "step": 412 - }, - { - "epoch": 3.7223476297968396, - "grad_norm": 0.13669531400358878, - "learning_rate": 2.4798104840771294e-06, - "loss": 0.0091, - "num_tokens": 176454891.0, - "step": 413 - }, - { - "epoch": 3.7313769751693, - "grad_norm": 0.06004009215428138, - "learning_rate": 2.46038226210255e-06, - "loss": 0.0093, - "num_tokens": 176885789.0, - "step": 414 - }, - { - "epoch": 3.740406320541761, - "grad_norm": 0.07913705101953389, - "learning_rate": 2.4410576862196435e-06, - "loss": 0.0142, - "num_tokens": 177301433.0, - "step": 415 - }, - { - "epoch": 3.749435665914221, - "grad_norm": 0.06212903945062577, - "learning_rate": 2.4218374153654627e-06, - "loss": 0.0095, - "num_tokens": 177722579.0, - "step": 416 - }, - { - "epoch": 3.758465011286682, - "grad_norm": 0.06270055687189864, - "learning_rate": 2.4027221049204347e-06, - "loss": 0.0094, - "num_tokens": 178144731.0, - "step": 417 - }, - { - "epoch": 3.7674943566591423, - "grad_norm": 0.0668935291153066, - "learning_rate": 2.383712406685995e-06, - "loss": 0.0101, - "num_tokens": 178574868.0, - "step": 418 - }, - { - "epoch": 3.7765237020316027, - "grad_norm": 0.0640944822370456, - "learning_rate": 2.364808968862378e-06, - "loss": 0.0096, - "num_tokens": 179005190.0, - "step": 419 - }, - { - "epoch": 3.785553047404063, - "grad_norm": 0.06369669742321124, - "learning_rate": 2.346012436026508e-06, - "loss": 0.0096, - "num_tokens": 179421217.0, - "step": 420 - }, - { - "epoch": 3.7945823927765234, - "grad_norm": 0.06441713353622074, - "learning_rate": 2.327323449110017e-06, - "loss": 0.0093, - "num_tokens": 179838546.0, - "step": 421 - }, - { - "epoch": 3.8036117381489842, - "grad_norm": 0.08338546996857724, - "learning_rate": 2.3087426453774002e-06, - "loss": 0.0107, - "num_tokens": 180250539.0, - "step": 422 - }, - { - "epoch": 3.8126410835214446, - "grad_norm": 0.06738272796438924, - "learning_rate": 2.290270658404271e-06, - "loss": 0.011, - "num_tokens": 180672562.0, - "step": 423 - }, - { - "epoch": 3.8216704288939054, - "grad_norm": 0.0625991588242634, - "learning_rate": 2.2719081180557757e-06, - "loss": 0.0096, - "num_tokens": 181091537.0, - "step": 424 - }, - { - "epoch": 3.8306997742663658, - "grad_norm": 0.0556196250975855, - "learning_rate": 2.253655650465096e-06, - "loss": 0.0089, - "num_tokens": 181506969.0, - "step": 425 - }, - { - "epoch": 3.839729119638826, - "grad_norm": 0.06481115036912619, - "learning_rate": 2.2355138780121166e-06, - "loss": 0.0564, - "num_tokens": 181969132.0, - "step": 426 - }, - { - "epoch": 3.8487584650112865, - "grad_norm": 0.08347930182037203, - "learning_rate": 2.2174834193021934e-06, - "loss": 0.0091, - "num_tokens": 182397825.0, - "step": 427 - }, - { - "epoch": 3.8577878103837473, - "grad_norm": 0.059621287329162206, - "learning_rate": 2.199564889145058e-06, - "loss": 0.0172, - "num_tokens": 182846790.0, - "step": 428 - }, - { - "epoch": 3.8668171557562077, - "grad_norm": 0.07303880000577928, - "learning_rate": 2.181758898533866e-06, - "loss": 0.0093, - "num_tokens": 183262540.0, - "step": 429 - }, - { - "epoch": 3.875846501128668, - "grad_norm": 0.0673061690918613, - "learning_rate": 2.164066054624347e-06, - "loss": 0.0095, - "num_tokens": 183697485.0, - "step": 430 - }, - { - "epoch": 3.884875846501129, - "grad_norm": 0.0661976596717091, - "learning_rate": 2.146486960714114e-06, - "loss": 0.0089, - "num_tokens": 184125479.0, - "step": 431 - }, - { - "epoch": 3.893905191873589, - "grad_norm": 0.06398778686313247, - "learning_rate": 2.129022216222085e-06, - "loss": 0.0088, - "num_tokens": 184544539.0, - "step": 432 - }, - { - "epoch": 3.9029345372460496, - "grad_norm": 0.06215599177607287, - "learning_rate": 2.111672416668048e-06, - "loss": 0.0091, - "num_tokens": 184949325.0, - "step": 433 - }, - { - "epoch": 3.91196388261851, - "grad_norm": 0.062320619384830064, - "learning_rate": 2.0944381536523526e-06, - "loss": 0.0091, - "num_tokens": 185353902.0, - "step": 434 - }, - { - "epoch": 3.9209932279909707, - "grad_norm": 0.06084848690325647, - "learning_rate": 2.077320014835738e-06, - "loss": 0.0083, - "num_tokens": 185775888.0, - "step": 435 - }, - { - "epoch": 3.930022573363431, - "grad_norm": 0.06353024998165271, - "learning_rate": 2.0603185839192914e-06, - "loss": 0.009, - "num_tokens": 186203975.0, - "step": 436 - }, - { - "epoch": 3.939051918735892, - "grad_norm": 0.058439153731873986, - "learning_rate": 2.043434440624551e-06, - "loss": 0.0086, - "num_tokens": 186639222.0, - "step": 437 - }, - { - "epoch": 3.9480812641083523, - "grad_norm": 0.05830094939756939, - "learning_rate": 2.0266681606737335e-06, - "loss": 0.0093, - "num_tokens": 187080093.0, - "step": 438 - }, - { - "epoch": 3.9571106094808126, - "grad_norm": 0.059003884205823975, - "learning_rate": 2.0100203157701066e-06, - "loss": 0.0087, - "num_tokens": 187510794.0, - "step": 439 - }, - { - "epoch": 3.966139954853273, - "grad_norm": 0.06298185573366215, - "learning_rate": 1.993491473578491e-06, - "loss": 0.01, - "num_tokens": 187954041.0, - "step": 440 - }, - { - "epoch": 3.975169300225734, - "grad_norm": 0.059253636545730855, - "learning_rate": 1.9770821977059026e-06, - "loss": 0.0086, - "num_tokens": 188383605.0, - "step": 441 - }, - { - "epoch": 3.984198645598194, - "grad_norm": 0.060746167690560224, - "learning_rate": 1.9607930476823467e-06, - "loss": 0.009, - "num_tokens": 188821681.0, - "step": 442 - }, - { - "epoch": 3.9932279909706545, - "grad_norm": 0.059588516531004394, - "learning_rate": 1.9446245789417194e-06, - "loss": 0.0082, - "num_tokens": 189274421.0, - "step": 443 - }, - { - "epoch": 4.0, - "grad_norm": 0.0817118222108563, - "learning_rate": 1.928577342802885e-06, - "loss": 0.0086, - "num_tokens": 189586064.0, - "step": 444 - }, - { - "epoch": 4.0, - "eval_loss": 0.11091578751802444, - "eval_num_tokens": 189586064.0, - "eval_runtime": 55.4572, - "eval_samples_per_second": 45.17, - "eval_steps_per_second": 5.662, - "step": 444 - }, - { - "epoch": 4.00902934537246, - "grad_norm": 0.052009943856600654, - "learning_rate": 1.9126518864508685e-06, - "loss": 0.0074, - "num_tokens": 190012482.0, - "step": 445 - }, - { - "epoch": 4.018058690744921, - "grad_norm": 0.0558999480030762, - "learning_rate": 1.8968487529181967e-06, - "loss": 0.0077, - "num_tokens": 190450990.0, - "step": 446 - }, - { - "epoch": 4.027088036117381, - "grad_norm": 0.052664936025694234, - "learning_rate": 1.8811684810663915e-06, - "loss": 0.0075, - "num_tokens": 190867207.0, - "step": 447 - }, - { - "epoch": 4.036117381489842, - "grad_norm": 0.052591105688649366, - "learning_rate": 1.8656116055675816e-06, - "loss": 0.0072, - "num_tokens": 191302488.0, - "step": 448 - }, - { - "epoch": 4.045146726862303, - "grad_norm": 0.05229870599777959, - "learning_rate": 1.85017865688628e-06, - "loss": 0.0074, - "num_tokens": 191733936.0, - "step": 449 - }, - { - "epoch": 4.054176072234763, - "grad_norm": 0.05419506215141233, - "learning_rate": 1.8348701612612951e-06, - "loss": 0.0071, - "num_tokens": 192167735.0, - "step": 450 - }, - { - "epoch": 4.063205417607223, - "grad_norm": 0.05154741924536706, - "learning_rate": 1.819686640687785e-06, - "loss": 0.0072, - "num_tokens": 192575157.0, - "step": 451 - }, - { - "epoch": 4.072234762979684, - "grad_norm": 0.1279411133607819, - "learning_rate": 1.8046286128994578e-06, - "loss": 0.0185, - "num_tokens": 193021330.0, - "step": 452 - }, - { - "epoch": 4.081264108352144, - "grad_norm": 0.05804375874747993, - "learning_rate": 1.7896965913509213e-06, - "loss": 0.0075, - "num_tokens": 193464296.0, - "step": 453 - }, - { - "epoch": 4.090293453724605, - "grad_norm": 0.05478585010295768, - "learning_rate": 1.7748910852001684e-06, - "loss": 0.0073, - "num_tokens": 193896598.0, - "step": 454 - }, - { - "epoch": 4.099322799097066, - "grad_norm": 0.09510833668893832, - "learning_rate": 1.7602125992912239e-06, - "loss": 0.0116, - "num_tokens": 194322822.0, - "step": 455 - }, - { - "epoch": 4.108352144469526, - "grad_norm": 0.09148589236158025, - "learning_rate": 1.7456616341369237e-06, - "loss": 0.0266, - "num_tokens": 194756207.0, - "step": 456 - }, - { - "epoch": 4.1173814898419865, - "grad_norm": 0.0936052964125868, - "learning_rate": 1.7312386859018517e-06, - "loss": 0.0306, - "num_tokens": 195185054.0, - "step": 457 - }, - { - "epoch": 4.126410835214447, - "grad_norm": 0.0630492348104541, - "learning_rate": 1.7169442463854208e-06, - "loss": 0.0081, - "num_tokens": 195599914.0, - "step": 458 - }, - { - "epoch": 4.135440180586907, - "grad_norm": 0.058377931570083946, - "learning_rate": 1.7027788030050967e-06, - "loss": 0.0072, - "num_tokens": 196011879.0, - "step": 459 - }, - { - "epoch": 4.144469525959368, - "grad_norm": 0.06609945281438827, - "learning_rate": 1.6887428387797942e-06, - "loss": 0.0087, - "num_tokens": 196428514.0, - "step": 460 - }, - { - "epoch": 4.153498871331829, - "grad_norm": 0.058518152958306584, - "learning_rate": 1.6748368323133868e-06, - "loss": 0.0078, - "num_tokens": 196847106.0, - "step": 461 - }, - { - "epoch": 4.162528216704289, - "grad_norm": 0.06538274508219379, - "learning_rate": 1.6610612577784009e-06, - "loss": 0.015, - "num_tokens": 197290686.0, - "step": 462 - }, - { - "epoch": 4.1715575620767495, - "grad_norm": 0.05765426974466294, - "learning_rate": 1.6474165848998439e-06, - "loss": 0.0077, - "num_tokens": 197727840.0, - "step": 463 - }, - { - "epoch": 4.18058690744921, - "grad_norm": 0.05678297815227037, - "learning_rate": 1.633903278939185e-06, - "loss": 0.0075, - "num_tokens": 198145792.0, - "step": 464 - }, - { - "epoch": 4.18961625282167, - "grad_norm": 0.05345688980658021, - "learning_rate": 1.6205218006784934e-06, - "loss": 0.0074, - "num_tokens": 198577289.0, - "step": 465 - }, - { - "epoch": 4.198645598194131, - "grad_norm": 0.05771798802728127, - "learning_rate": 1.6072726064047212e-06, - "loss": 0.0073, - "num_tokens": 199021897.0, - "step": 466 - }, - { - "epoch": 4.207674943566591, - "grad_norm": 0.05564057777959995, - "learning_rate": 1.5941561478941563e-06, - "loss": 0.0075, - "num_tokens": 199437131.0, - "step": 467 - }, - { - "epoch": 4.216704288939052, - "grad_norm": 0.05403994059952407, - "learning_rate": 1.5811728723970019e-06, - "loss": 0.0072, - "num_tokens": 199863916.0, - "step": 468 - }, - { - "epoch": 4.225733634311513, - "grad_norm": 0.05449636042777582, - "learning_rate": 1.568323222622138e-06, - "loss": 0.0075, - "num_tokens": 200298569.0, - "step": 469 - }, - { - "epoch": 4.234762979683973, - "grad_norm": 0.056568063508186425, - "learning_rate": 1.5556076367220218e-06, - "loss": 0.0075, - "num_tokens": 200746877.0, - "step": 470 - }, - { - "epoch": 4.243792325056433, - "grad_norm": 0.04916400724026921, - "learning_rate": 1.543026548277746e-06, - "loss": 0.0068, - "num_tokens": 201152308.0, - "step": 471 - }, - { - "epoch": 4.252821670428894, - "grad_norm": 0.05590743077242364, - "learning_rate": 1.5305803862842569e-06, - "loss": 0.0076, - "num_tokens": 201575609.0, - "step": 472 - }, - { - "epoch": 4.261851015801354, - "grad_norm": 0.06612932945772447, - "learning_rate": 1.5182695751357245e-06, - "loss": 0.0082, - "num_tokens": 201998154.0, - "step": 473 - }, - { - "epoch": 4.270880361173815, - "grad_norm": 0.16852448868178485, - "learning_rate": 1.5060945346110707e-06, - "loss": 0.0314, - "num_tokens": 202418768.0, - "step": 474 - }, - { - "epoch": 4.279909706546276, - "grad_norm": 0.12451963953682407, - "learning_rate": 1.4940556798596585e-06, - "loss": 0.0109, - "num_tokens": 202847509.0, - "step": 475 - }, - { - "epoch": 4.288939051918736, - "grad_norm": 0.05636694817589496, - "learning_rate": 1.4821534213871344e-06, - "loss": 0.0077, - "num_tokens": 203293633.0, - "step": 476 - }, - { - "epoch": 4.297968397291196, - "grad_norm": 0.05468619950050741, - "learning_rate": 1.4703881650414304e-06, - "loss": 0.0077, - "num_tokens": 203726576.0, - "step": 477 - }, - { - "epoch": 4.306997742663657, - "grad_norm": 0.05296646383507029, - "learning_rate": 1.4587603119989263e-06, - "loss": 0.0069, - "num_tokens": 204143709.0, - "step": 478 - }, - { - "epoch": 4.316027088036117, - "grad_norm": 0.058621647518717554, - "learning_rate": 1.4472702587507655e-06, - "loss": 0.0073, - "num_tokens": 204573732.0, - "step": 479 - }, - { - "epoch": 4.3250564334085775, - "grad_norm": 0.06658969833141296, - "learning_rate": 1.435918397089347e-06, - "loss": 0.0247, - "num_tokens": 205005583.0, - "step": 480 - }, - { - "epoch": 4.334085778781039, - "grad_norm": 0.05509992027896923, - "learning_rate": 1.4247051140949513e-06, - "loss": 0.0074, - "num_tokens": 205437157.0, - "step": 481 - }, - { - "epoch": 4.343115124153499, - "grad_norm": 0.056995599613870046, - "learning_rate": 1.4136307921225513e-06, - "loss": 0.0075, - "num_tokens": 205850496.0, - "step": 482 - }, - { - "epoch": 4.3521444695259595, - "grad_norm": 0.05649954152945347, - "learning_rate": 1.4026958087887723e-06, - "loss": 0.0073, - "num_tokens": 206281600.0, - "step": 483 - }, - { - "epoch": 4.36117381489842, - "grad_norm": 0.08462415808186972, - "learning_rate": 1.3919005369590132e-06, - "loss": 0.0158, - "num_tokens": 206692358.0, - "step": 484 - }, - { - "epoch": 4.37020316027088, - "grad_norm": 0.05861019810430756, - "learning_rate": 1.381245344734739e-06, - "loss": 0.0076, - "num_tokens": 207116740.0, - "step": 485 - }, - { - "epoch": 4.3792325056433405, - "grad_norm": 0.05590320171665174, - "learning_rate": 1.3707305954409194e-06, - "loss": 0.0073, - "num_tokens": 207550858.0, - "step": 486 - }, - { - "epoch": 4.388261851015802, - "grad_norm": 0.1440931849337204, - "learning_rate": 1.3603566476136488e-06, - "loss": 0.0304, - "num_tokens": 207978354.0, - "step": 487 - }, - { - "epoch": 4.397291196388262, - "grad_norm": 0.0575755327672531, - "learning_rate": 1.3501238549879156e-06, - "loss": 0.0072, - "num_tokens": 208410302.0, - "step": 488 - }, - { - "epoch": 4.4063205417607225, - "grad_norm": 0.05585899759462395, - "learning_rate": 1.3400325664855437e-06, - "loss": 0.0074, - "num_tokens": 208812870.0, - "step": 489 - }, - { - "epoch": 4.415349887133183, - "grad_norm": 0.05686170026770136, - "learning_rate": 1.3300831262032925e-06, - "loss": 0.0069, - "num_tokens": 209254543.0, - "step": 490 - }, - { - "epoch": 4.424379232505643, - "grad_norm": 0.05803957066651316, - "learning_rate": 1.3202758734011244e-06, - "loss": 0.0075, - "num_tokens": 209669464.0, - "step": 491 - }, - { - "epoch": 4.433408577878104, - "grad_norm": 0.05687101353160547, - "learning_rate": 1.3106111424906355e-06, - "loss": 0.0073, - "num_tokens": 210092796.0, - "step": 492 - }, - { - "epoch": 4.442437923250564, - "grad_norm": 0.06650218713012311, - "learning_rate": 1.3010892630236568e-06, - "loss": 0.0545, - "num_tokens": 210539090.0, - "step": 493 - }, - { - "epoch": 4.451467268623025, - "grad_norm": 0.059652537040254104, - "learning_rate": 1.2917105596810112e-06, - "loss": 0.0081, - "num_tokens": 210962860.0, - "step": 494 - }, - { - "epoch": 4.460496613995486, - "grad_norm": 0.05691116296595083, - "learning_rate": 1.2824753522614473e-06, - "loss": 0.0073, - "num_tokens": 211389239.0, - "step": 495 - }, - { - "epoch": 4.469525959367946, - "grad_norm": 0.05271407561778469, - "learning_rate": 1.273383955670732e-06, - "loss": 0.0067, - "num_tokens": 211848433.0, - "step": 496 - }, - { - "epoch": 4.478555304740406, - "grad_norm": 0.05811717147816959, - "learning_rate": 1.2644366799109118e-06, - "loss": 0.0073, - "num_tokens": 212286944.0, - "step": 497 - }, - { - "epoch": 4.487584650112867, - "grad_norm": 0.05983073556707705, - "learning_rate": 1.2556338300697485e-06, - "loss": 0.0076, - "num_tokens": 212714521.0, - "step": 498 - }, - { - "epoch": 4.496613995485327, - "grad_norm": 0.05684584946279294, - "learning_rate": 1.2469757063103061e-06, - "loss": 0.0072, - "num_tokens": 213148368.0, - "step": 499 - }, - { - "epoch": 4.505643340857787, - "grad_norm": 0.05067708037080972, - "learning_rate": 1.2384626038607255e-06, - "loss": 0.0069, - "num_tokens": 213576096.0, - "step": 500 - }, - { - "epoch": 4.514672686230249, - "grad_norm": 0.060108535921653126, - "learning_rate": 1.2300948130041515e-06, - "loss": 0.0072, - "num_tokens": 213992716.0, - "step": 501 - }, - { - "epoch": 4.523702031602709, - "grad_norm": 0.0552555230197704, - "learning_rate": 1.2218726190688356e-06, - "loss": 0.0073, - "num_tokens": 214418244.0, - "step": 502 - }, - { - "epoch": 4.532731376975169, - "grad_norm": 0.08684486719951047, - "learning_rate": 1.2137963024184115e-06, - "loss": 0.0151, - "num_tokens": 214857117.0, - "step": 503 - }, - { - "epoch": 4.54176072234763, - "grad_norm": 0.055314422617567714, - "learning_rate": 1.2058661384423267e-06, - "loss": 0.0071, - "num_tokens": 215275416.0, - "step": 504 - }, - { - "epoch": 4.55079006772009, - "grad_norm": 0.05865873272038585, - "learning_rate": 1.1980823975464593e-06, - "loss": 0.0079, - "num_tokens": 215695066.0, - "step": 505 - }, - { - "epoch": 4.5598194130925505, - "grad_norm": 0.05975989236654091, - "learning_rate": 1.1904453451438951e-06, - "loss": 0.0074, - "num_tokens": 216118838.0, - "step": 506 - }, - { - "epoch": 4.568848758465011, - "grad_norm": 0.05892246505853784, - "learning_rate": 1.1829552416458775e-06, - "loss": 0.0073, - "num_tokens": 216542622.0, - "step": 507 - }, - { - "epoch": 4.577878103837472, - "grad_norm": 0.053580754460193454, - "learning_rate": 1.1756123424529266e-06, - "loss": 0.0067, - "num_tokens": 216996879.0, - "step": 508 - }, - { - "epoch": 4.586907449209932, - "grad_norm": 0.05682211652915156, - "learning_rate": 1.1684168979461336e-06, - "loss": 0.007, - "num_tokens": 217423398.0, - "step": 509 - }, - { - "epoch": 4.595936794582393, - "grad_norm": 0.0552314425067665, - "learning_rate": 1.1613691534786196e-06, - "loss": 0.0074, - "num_tokens": 217840389.0, - "step": 510 - }, - { - "epoch": 4.604966139954853, - "grad_norm": 0.11448108264486681, - "learning_rate": 1.1544693493671712e-06, - "loss": 0.0117, - "num_tokens": 218270640.0, - "step": 511 - }, - { - "epoch": 4.6139954853273135, - "grad_norm": 0.16768787318826778, - "learning_rate": 1.1477177208840482e-06, - "loss": 0.0166, - "num_tokens": 218713265.0, - "step": 512 - }, - { - "epoch": 4.623024830699774, - "grad_norm": 0.055188358952264455, - "learning_rate": 1.1411144982489562e-06, - "loss": 0.0069, - "num_tokens": 219137609.0, - "step": 513 - }, - { - "epoch": 4.632054176072235, - "grad_norm": 0.17922405875168793, - "learning_rate": 1.1346599066212008e-06, - "loss": 0.0266, - "num_tokens": 219578547.0, - "step": 514 - }, - { - "epoch": 4.6410835214446955, - "grad_norm": 0.07132874505614062, - "learning_rate": 1.128354166092009e-06, - "loss": 0.0136, - "num_tokens": 220026371.0, - "step": 515 - }, - { - "epoch": 4.650112866817156, - "grad_norm": 0.06058796922864307, - "learning_rate": 1.1221974916770236e-06, - "loss": 0.0075, - "num_tokens": 220433942.0, - "step": 516 - }, - { - "epoch": 4.659142212189616, - "grad_norm": 0.05833795875496949, - "learning_rate": 1.11619009330897e-06, - "loss": 0.0073, - "num_tokens": 220867424.0, - "step": 517 - }, - { - "epoch": 4.668171557562077, - "grad_norm": 0.05870831578481961, - "learning_rate": 1.1103321758305028e-06, - "loss": 0.0072, - "num_tokens": 221290561.0, - "step": 518 - }, - { - "epoch": 4.677200902934537, - "grad_norm": 0.054615134252345106, - "learning_rate": 1.104623938987216e-06, - "loss": 0.0072, - "num_tokens": 221699415.0, - "step": 519 - }, - { - "epoch": 4.686230248306998, - "grad_norm": 0.05316946880846092, - "learning_rate": 1.0990655774208339e-06, - "loss": 0.0071, - "num_tokens": 222150407.0, - "step": 520 - }, - { - "epoch": 4.6952595936794586, - "grad_norm": 0.07788416053210236, - "learning_rate": 1.0936572806625755e-06, - "loss": 0.0233, - "num_tokens": 222572283.0, - "step": 521 - }, - { - "epoch": 4.704288939051919, - "grad_norm": 0.05880412240959133, - "learning_rate": 1.0883992331266883e-06, - "loss": 0.0076, - "num_tokens": 222986703.0, - "step": 522 - }, - { - "epoch": 4.713318284424379, - "grad_norm": 0.05632076628688802, - "learning_rate": 1.0832916141041655e-06, - "loss": 0.0072, - "num_tokens": 223408969.0, - "step": 523 - }, - { - "epoch": 4.72234762979684, - "grad_norm": 0.058100528164223286, - "learning_rate": 1.078334597756625e-06, - "loss": 0.0075, - "num_tokens": 223835474.0, - "step": 524 - }, - { - "epoch": 4.7313769751693, - "grad_norm": 0.052063222481905536, - "learning_rate": 1.0735283531103781e-06, - "loss": 0.0066, - "num_tokens": 224259986.0, - "step": 525 - }, - { - "epoch": 4.74040632054176, - "grad_norm": 0.055167878798155864, - "learning_rate": 1.0688730440506611e-06, - "loss": 0.008, - "num_tokens": 224699745.0, - "step": 526 - }, - { - "epoch": 4.749435665914222, - "grad_norm": 0.05267890945523617, - "learning_rate": 1.0643688293160503e-06, - "loss": 0.0067, - "num_tokens": 225145143.0, - "step": 527 - }, - { - "epoch": 4.758465011286682, - "grad_norm": 0.05312403103681725, - "learning_rate": 1.0600158624930462e-06, - "loss": 0.0071, - "num_tokens": 225578145.0, - "step": 528 - }, - { - "epoch": 4.767494356659142, - "grad_norm": 0.06796402119311354, - "learning_rate": 1.0558142920108394e-06, - "loss": 0.0067, - "num_tokens": 226003984.0, - "step": 529 - }, - { - "epoch": 4.776523702031603, - "grad_norm": 0.05356999529102921, - "learning_rate": 1.0517642611362464e-06, - "loss": 0.0068, - "num_tokens": 226426088.0, - "step": 530 - }, - { - "epoch": 4.785553047404063, - "grad_norm": 0.059876875495227244, - "learning_rate": 1.047865907968827e-06, - "loss": 0.0075, - "num_tokens": 226858806.0, - "step": 531 - }, - { - "epoch": 4.794582392776523, - "grad_norm": 0.0626550898148628, - "learning_rate": 1.0441193654361755e-06, - "loss": 0.0089, - "num_tokens": 227291465.0, - "step": 532 - }, - { - "epoch": 4.803611738148984, - "grad_norm": 0.05255502973330509, - "learning_rate": 1.0405247612893841e-06, - "loss": 0.007, - "num_tokens": 227721309.0, - "step": 533 - }, - { - "epoch": 4.812641083521445, - "grad_norm": 0.05438779778903025, - "learning_rate": 1.037082218098692e-06, - "loss": 0.0069, - "num_tokens": 228142508.0, - "step": 534 - }, - { - "epoch": 4.821670428893905, - "grad_norm": 0.0593963271616768, - "learning_rate": 1.0337918532493027e-06, - "loss": 0.0073, - "num_tokens": 228560446.0, - "step": 535 - }, - { - "epoch": 4.830699774266366, - "grad_norm": 0.05474712434489935, - "learning_rate": 1.0306537789373832e-06, - "loss": 0.0065, - "num_tokens": 228983150.0, - "step": 536 - }, - { - "epoch": 4.839729119638826, - "grad_norm": 0.058896302816248663, - "learning_rate": 1.027668102166235e-06, - "loss": 0.0074, - "num_tokens": 229412211.0, - "step": 537 - }, - { - "epoch": 4.8487584650112865, - "grad_norm": 0.05470480123832545, - "learning_rate": 1.02483492474265e-06, - "loss": 0.0074, - "num_tokens": 229854229.0, - "step": 538 - }, - { - "epoch": 4.857787810383747, - "grad_norm": 0.07890252865945219, - "learning_rate": 1.0221543432734369e-06, - "loss": 0.008, - "num_tokens": 230272495.0, - "step": 539 - }, - { - "epoch": 4.866817155756207, - "grad_norm": 0.056142774309556866, - "learning_rate": 1.0196264491621247e-06, - "loss": 0.0071, - "num_tokens": 230670995.0, - "step": 540 - }, - { - "epoch": 4.8758465011286685, - "grad_norm": 0.05995160644721174, - "learning_rate": 1.0172513286058505e-06, - "loss": 0.0072, - "num_tokens": 231097273.0, - "step": 541 - }, - { - "epoch": 4.884875846501129, - "grad_norm": 0.057626877086779195, - "learning_rate": 1.015029062592418e-06, - "loss": 0.0069, - "num_tokens": 231529230.0, - "step": 542 - }, - { - "epoch": 4.893905191873589, - "grad_norm": 0.058716050916906605, - "learning_rate": 1.012959726897535e-06, - "loss": 0.0074, - "num_tokens": 231936065.0, - "step": 543 - }, - { - "epoch": 4.9029345372460496, - "grad_norm": 0.05639352371278753, - "learning_rate": 1.0110433920822306e-06, - "loss": 0.0069, - "num_tokens": 232357557.0, - "step": 544 - }, - { - "epoch": 4.91196388261851, - "grad_norm": 0.06136250042932289, - "learning_rate": 1.009280123490451e-06, - "loss": 0.0081, - "num_tokens": 232776441.0, - "step": 545 - }, - { - "epoch": 4.92099322799097, - "grad_norm": 0.058152440154199, - "learning_rate": 1.0076699812468264e-06, - "loss": 0.0072, - "num_tokens": 233194997.0, - "step": 546 - }, - { - "epoch": 4.9300225733634315, - "grad_norm": 0.054470578247469356, - "learning_rate": 1.0062130202546278e-06, - "loss": 0.007, - "num_tokens": 233614735.0, - "step": 547 - }, - { - "epoch": 4.939051918735892, - "grad_norm": 0.05476103959928163, - "learning_rate": 1.0049092901938875e-06, - "loss": 0.0067, - "num_tokens": 234054550.0, - "step": 548 - }, - { - "epoch": 4.948081264108352, - "grad_norm": 0.05951797614235394, - "learning_rate": 1.0037588355197116e-06, - "loss": 0.0073, - "num_tokens": 234481341.0, - "step": 549 - }, - { - "epoch": 4.957110609480813, - "grad_norm": 0.06242793615084626, - "learning_rate": 1.00276169546076e-06, - "loss": 0.0078, - "num_tokens": 234907817.0, - "step": 550 - }, - { - "epoch": 4.966139954853273, - "grad_norm": 0.0636923634854018, - "learning_rate": 1.0019179040179093e-06, - "loss": 0.0125, - "num_tokens": 235357272.0, - "step": 551 - }, - { - "epoch": 4.975169300225733, - "grad_norm": 0.055465368286989225, - "learning_rate": 1.0012274899630954e-06, - "loss": 0.0231, - "num_tokens": 235805693.0, - "step": 552 - }, - { - "epoch": 4.984198645598195, - "grad_norm": 0.05982694743764175, - "learning_rate": 1.0006904768383305e-06, - "loss": 0.007, - "num_tokens": 236221003.0, - "step": 553 - }, - { - "epoch": 4.993227990970655, - "grad_norm": 0.06260234129879096, - "learning_rate": 1.0003068829549017e-06, - "loss": 0.0073, - "num_tokens": 236660222.0, - "step": 554 - }, - { - "epoch": 5.0, - "grad_norm": 0.06260234129879096, - "learning_rate": 1.0000767213927445e-06, - "loss": 0.0071, - "num_tokens": 236984937.0, - "step": 555 - }, - { - "epoch": 5.0, - "eval_loss": 0.12006451934576035, - "eval_num_tokens": 236984937.0, - "eval_runtime": 55.4823, - "eval_samples_per_second": 45.15, - "eval_steps_per_second": 5.659, - "step": 555 - }, - { - "epoch": 5.0, - "step": 555, - "total_flos": 7.804790123203133e+17, - "train_loss": 0.04724993688717216, - "train_runtime": 7963.8173, - "train_samples_per_second": 8.896, - "train_steps_per_second": 0.07 - } - ], - "logging_steps": 1, - "max_steps": 555, - "num_input_tokens_seen": 0, - "num_train_epochs": 5, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 7.804790123203133e+17, - "train_batch_size": 2, - "trial_name": null, - "trial_params": null -}