{ "best_global_step": 700, "best_metric": 0.59086859, "best_model_checkpoint": "/home/raid/models/25Blowfish_v1.1.5/v1-20260204-180539/checkpoint-500", "epoch": 3.0, "eval_steps": 50, "global_step": 1116, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0026936026936026937, "grad_norm": 6.327178478240967, "learning_rate": 1.4705882352941178e-07, "loss": 1.2658028602600098, "step": 1, "token_acc": 0.7048856012386031 }, { "epoch": 0.0053872053872053875, "grad_norm": 5.692208766937256, "learning_rate": 2.9411764705882356e-07, "loss": 1.2563695907592773, "step": 2, "token_acc": 0.7006555564331411 }, { "epoch": 0.00808080808080808, "grad_norm": 5.91769552230835, "learning_rate": 4.4117647058823536e-07, "loss": 1.2924132347106934, "step": 3, "token_acc": 0.6982949701619778 }, { "epoch": 0.010774410774410775, "grad_norm": 6.101612091064453, "learning_rate": 5.882352941176471e-07, "loss": 1.2813005447387695, "step": 4, "token_acc": 0.6978171896316507 }, { "epoch": 0.013468013468013467, "grad_norm": 5.960460186004639, "learning_rate": 7.352941176470589e-07, "loss": 1.2544516324996948, "step": 5, "token_acc": 0.7031041257367387 }, { "epoch": 0.01616161616161616, "grad_norm": 5.8821563720703125, "learning_rate": 8.823529411764707e-07, "loss": 1.268875002861023, "step": 6, "token_acc": 0.7004374316513045 }, { "epoch": 0.018855218855218854, "grad_norm": 6.048320770263672, "learning_rate": 1.0294117647058825e-06, "loss": 1.2701635360717773, "step": 7, "token_acc": 0.6967006109979633 }, { "epoch": 0.02154882154882155, "grad_norm": 5.733208656311035, "learning_rate": 1.1764705882352942e-06, "loss": 1.2684029340744019, "step": 8, "token_acc": 0.7 }, { "epoch": 0.024242424242424242, "grad_norm": 4.4054107666015625, "learning_rate": 1.323529411764706e-06, "loss": 1.2304329872131348, "step": 9, "token_acc": 0.7058823529411765 }, { "epoch": 0.026936026936026935, "grad_norm": 4.2520527839660645, "learning_rate": 1.4705882352941177e-06, "loss": 1.2263442277908325, "step": 10, "token_acc": 0.7016870010543756 }, { "epoch": 0.02962962962962963, "grad_norm": 4.984673976898193, "learning_rate": 1.6176470588235297e-06, "loss": 1.2646002769470215, "step": 11, "token_acc": 0.6933640552995392 }, { "epoch": 0.03232323232323232, "grad_norm": 6.6318864822387695, "learning_rate": 1.7647058823529414e-06, "loss": 1.1595369577407837, "step": 12, "token_acc": 0.711726525313717 }, { "epoch": 0.035016835016835016, "grad_norm": 3.30735182762146, "learning_rate": 1.9117647058823528e-06, "loss": 1.1208479404449463, "step": 13, "token_acc": 0.7198084160109477 }, { "epoch": 0.03771043771043771, "grad_norm": 2.9693126678466797, "learning_rate": 2.058823529411765e-06, "loss": 1.1334466934204102, "step": 14, "token_acc": 0.7061919730974486 }, { "epoch": 0.04040404040404041, "grad_norm": 3.123236894607544, "learning_rate": 2.2058823529411767e-06, "loss": 1.125007152557373, "step": 15, "token_acc": 0.70644370513045 }, { "epoch": 0.0430976430976431, "grad_norm": 2.90378475189209, "learning_rate": 2.3529411764705885e-06, "loss": 1.1151888370513916, "step": 16, "token_acc": 0.7181445453814616 }, { "epoch": 0.04579124579124579, "grad_norm": 2.767350435256958, "learning_rate": 2.5e-06, "loss": 1.0844428539276123, "step": 17, "token_acc": 0.7160035366931918 }, { "epoch": 0.048484848484848485, "grad_norm": 2.4120895862579346, "learning_rate": 2.647058823529412e-06, "loss": 1.0506455898284912, "step": 18, "token_acc": 0.7185920737654047 }, { "epoch": 0.05117845117845118, "grad_norm": 2.179903507232666, "learning_rate": 2.7941176470588237e-06, "loss": 1.0126980543136597, "step": 19, "token_acc": 0.7299469671859463 }, { "epoch": 0.05387205387205387, "grad_norm": 2.1633050441741943, "learning_rate": 2.9411764705882355e-06, "loss": 1.0257422924041748, "step": 20, "token_acc": 0.7273593898951383 }, { "epoch": 0.05656565656565657, "grad_norm": 2.281221389770508, "learning_rate": 3.0882352941176476e-06, "loss": 0.9861394762992859, "step": 21, "token_acc": 0.7393435043920481 }, { "epoch": 0.05925925925925926, "grad_norm": 2.2489614486694336, "learning_rate": 3.2352941176470594e-06, "loss": 1.0492191314697266, "step": 22, "token_acc": 0.7239882025233492 }, { "epoch": 0.061952861952861954, "grad_norm": 2.27473521232605, "learning_rate": 3.382352941176471e-06, "loss": 0.9766485095024109, "step": 23, "token_acc": 0.7353722265056113 }, { "epoch": 0.06464646464646465, "grad_norm": 2.233490467071533, "learning_rate": 3.529411764705883e-06, "loss": 0.9662765860557556, "step": 24, "token_acc": 0.7422263419566866 }, { "epoch": 0.06734006734006734, "grad_norm": 2.5038654804229736, "learning_rate": 3.6764705882352946e-06, "loss": 0.937242865562439, "step": 25, "token_acc": 0.7478929332221913 }, { "epoch": 0.07003367003367003, "grad_norm": 2.1911282539367676, "learning_rate": 3.8235294117647055e-06, "loss": 0.8835188150405884, "step": 26, "token_acc": 0.7588371324743971 }, { "epoch": 0.07272727272727272, "grad_norm": 2.094655990600586, "learning_rate": 3.970588235294118e-06, "loss": 0.9517135620117188, "step": 27, "token_acc": 0.7397373420335344 }, { "epoch": 0.07542087542087542, "grad_norm": 1.9400707483291626, "learning_rate": 4.11764705882353e-06, "loss": 0.9086529612541199, "step": 28, "token_acc": 0.75 }, { "epoch": 0.07811447811447811, "grad_norm": 1.8915455341339111, "learning_rate": 4.264705882352942e-06, "loss": 0.8676010370254517, "step": 29, "token_acc": 0.7584742301723434 }, { "epoch": 0.08080808080808081, "grad_norm": 1.9338808059692383, "learning_rate": 4.411764705882353e-06, "loss": 0.839089572429657, "step": 30, "token_acc": 0.7676996724615772 }, { "epoch": 0.08350168350168351, "grad_norm": 1.7812891006469727, "learning_rate": 4.558823529411765e-06, "loss": 0.8629589080810547, "step": 31, "token_acc": 0.7561735261401558 }, { "epoch": 0.0861952861952862, "grad_norm": 1.8797152042388916, "learning_rate": 4.705882352941177e-06, "loss": 0.8803250193595886, "step": 32, "token_acc": 0.7531634598717282 }, { "epoch": 0.08888888888888889, "grad_norm": 1.7441165447235107, "learning_rate": 4.852941176470589e-06, "loss": 0.8753576278686523, "step": 33, "token_acc": 0.75742833620238 }, { "epoch": 0.09158249158249158, "grad_norm": 1.8013807535171509, "learning_rate": 5e-06, "loss": 0.8573440313339233, "step": 34, "token_acc": 0.7638546291728334 }, { "epoch": 0.09427609427609428, "grad_norm": 1.752218246459961, "learning_rate": 4.999989462079163e-06, "loss": 0.8698481917381287, "step": 35, "token_acc": 0.7554357130628317 }, { "epoch": 0.09696969696969697, "grad_norm": 1.8150097131729126, "learning_rate": 4.99995784840549e-06, "loss": 0.8450948596000671, "step": 36, "token_acc": 0.7601817569841804 }, { "epoch": 0.09966329966329966, "grad_norm": 1.5605357885360718, "learning_rate": 4.999905159245494e-06, "loss": 0.851024329662323, "step": 37, "token_acc": 0.7571258021190868 }, { "epoch": 0.10235690235690235, "grad_norm": 1.7855019569396973, "learning_rate": 4.999831395043363e-06, "loss": 0.767600417137146, "step": 38, "token_acc": 0.7822522522522523 }, { "epoch": 0.10505050505050505, "grad_norm": 1.9658234119415283, "learning_rate": 4.999736556420954e-06, "loss": 0.772192656993866, "step": 39, "token_acc": 0.7747822864554382 }, { "epoch": 0.10774410774410774, "grad_norm": 1.7635198831558228, "learning_rate": 4.999620644177788e-06, "loss": 0.7825717329978943, "step": 40, "token_acc": 0.7758901758901758 }, { "epoch": 0.11043771043771043, "grad_norm": 1.8620353937149048, "learning_rate": 4.999483659291046e-06, "loss": 0.7993900179862976, "step": 41, "token_acc": 0.7736837269135348 }, { "epoch": 0.11313131313131314, "grad_norm": 1.7334764003753662, "learning_rate": 4.9993256029155554e-06, "loss": 0.8015269041061401, "step": 42, "token_acc": 0.7708086785009862 }, { "epoch": 0.11582491582491583, "grad_norm": 1.7795546054840088, "learning_rate": 4.9991464763837844e-06, "loss": 0.7840657234191895, "step": 43, "token_acc": 0.7772016320979259 }, { "epoch": 0.11851851851851852, "grad_norm": 1.5327906608581543, "learning_rate": 4.99894628120583e-06, "loss": 0.7725589275360107, "step": 44, "token_acc": 0.7754705241872764 }, { "epoch": 0.12121212121212122, "grad_norm": 1.8035056591033936, "learning_rate": 4.998725019069406e-06, "loss": 0.7932400703430176, "step": 45, "token_acc": 0.7727597104699766 }, { "epoch": 0.12390572390572391, "grad_norm": 1.5412386655807495, "learning_rate": 4.998482691839825e-06, "loss": 0.7506765127182007, "step": 46, "token_acc": 0.7807536592961695 }, { "epoch": 0.1265993265993266, "grad_norm": 1.7333848476409912, "learning_rate": 4.99821930155999e-06, "loss": 0.7840985059738159, "step": 47, "token_acc": 0.7734573119188504 }, { "epoch": 0.1292929292929293, "grad_norm": 1.601171612739563, "learning_rate": 4.997934850450366e-06, "loss": 0.7393255233764648, "step": 48, "token_acc": 0.7823415537412676 }, { "epoch": 0.13198653198653199, "grad_norm": 1.7907651662826538, "learning_rate": 4.997629340908974e-06, "loss": 0.7617444396018982, "step": 49, "token_acc": 0.7781643029167782 }, { "epoch": 0.13468013468013468, "grad_norm": 1.656044363975525, "learning_rate": 4.997302775511362e-06, "loss": 0.7794668078422546, "step": 50, "token_acc": 0.7733383685800604 }, { "epoch": 0.13468013468013468, "eval_loss": 0.7521641850471497, "eval_runtime": 9.9453, "eval_samples_per_second": 4.525, "eval_steps_per_second": 1.508, "eval_token_acc": 0.7802433360458885, "step": 50 }, { "epoch": 0.13737373737373737, "grad_norm": 1.690145492553711, "learning_rate": 4.996955157010586e-06, "loss": 0.7708606123924255, "step": 51, "token_acc": 0.7751225490196079 }, { "epoch": 0.14006734006734006, "grad_norm": 1.5344611406326294, "learning_rate": 4.9965864883371876e-06, "loss": 0.7536622285842896, "step": 52, "token_acc": 0.7794153972828325 }, { "epoch": 0.14276094276094276, "grad_norm": 1.671951174736023, "learning_rate": 4.996196772599166e-06, "loss": 0.7434553503990173, "step": 53, "token_acc": 0.7819257248133424 }, { "epoch": 0.14545454545454545, "grad_norm": 1.653519868850708, "learning_rate": 4.995786013081958e-06, "loss": 0.7024948000907898, "step": 54, "token_acc": 0.7904866648336542 }, { "epoch": 0.14814814814814814, "grad_norm": 1.5602867603302002, "learning_rate": 4.995354213248404e-06, "loss": 0.7687169313430786, "step": 55, "token_acc": 0.7793266487323715 }, { "epoch": 0.15084175084175083, "grad_norm": 1.575129747390747, "learning_rate": 4.994901376738721e-06, "loss": 0.7831870913505554, "step": 56, "token_acc": 0.7737327991639087 }, { "epoch": 0.15353535353535352, "grad_norm": 1.6578128337860107, "learning_rate": 4.994427507370476e-06, "loss": 0.7633312940597534, "step": 57, "token_acc": 0.7800054431642929 }, { "epoch": 0.15622895622895622, "grad_norm": 1.4568887948989868, "learning_rate": 4.993932609138544e-06, "loss": 0.6862384676933289, "step": 58, "token_acc": 0.7965183408828492 }, { "epoch": 0.1589225589225589, "grad_norm": 1.4144127368927002, "learning_rate": 4.9934166862150855e-06, "loss": 0.7921740412712097, "step": 59, "token_acc": 0.7686545454545455 }, { "epoch": 0.16161616161616163, "grad_norm": 1.562859296798706, "learning_rate": 4.992879742949505e-06, "loss": 0.7341197729110718, "step": 60, "token_acc": 0.785241480755087 }, { "epoch": 0.16430976430976432, "grad_norm": 1.4484463930130005, "learning_rate": 4.9923217838684125e-06, "loss": 0.7432484030723572, "step": 61, "token_acc": 0.7860258345915115 }, { "epoch": 0.16700336700336701, "grad_norm": 1.501795768737793, "learning_rate": 4.991742813675594e-06, "loss": 0.7038604021072388, "step": 62, "token_acc": 0.7938489371325193 }, { "epoch": 0.1696969696969697, "grad_norm": 1.5137748718261719, "learning_rate": 4.991142837251961e-06, "loss": 0.7845227122306824, "step": 63, "token_acc": 0.7753755663301805 }, { "epoch": 0.1723905723905724, "grad_norm": 1.3648817539215088, "learning_rate": 4.990521859655517e-06, "loss": 0.7713663578033447, "step": 64, "token_acc": 0.7795909486510009 }, { "epoch": 0.1750841750841751, "grad_norm": 1.349432349205017, "learning_rate": 4.9898798861213124e-06, "loss": 0.7677146792411804, "step": 65, "token_acc": 0.7803008388776396 }, { "epoch": 0.17777777777777778, "grad_norm": 1.4505559206008911, "learning_rate": 4.989216922061402e-06, "loss": 0.7482562065124512, "step": 66, "token_acc": 0.7821600931935431 }, { "epoch": 0.18047138047138048, "grad_norm": 1.5365347862243652, "learning_rate": 4.988532973064793e-06, "loss": 0.759559154510498, "step": 67, "token_acc": 0.776214405360134 }, { "epoch": 0.18316498316498317, "grad_norm": 1.608627200126648, "learning_rate": 4.987828044897408e-06, "loss": 0.7093471884727478, "step": 68, "token_acc": 0.7875183314322959 }, { "epoch": 0.18585858585858586, "grad_norm": 1.4796956777572632, "learning_rate": 4.987102143502027e-06, "loss": 0.7373017072677612, "step": 69, "token_acc": 0.7837740471215505 }, { "epoch": 0.18855218855218855, "grad_norm": 1.319777488708496, "learning_rate": 4.986355274998245e-06, "loss": 0.7051599025726318, "step": 70, "token_acc": 0.7861350574712643 }, { "epoch": 0.19124579124579125, "grad_norm": 1.4464823007583618, "learning_rate": 4.985587445682414e-06, "loss": 0.700887143611908, "step": 71, "token_acc": 0.7884523620168602 }, { "epoch": 0.19393939393939394, "grad_norm": 1.5826096534729004, "learning_rate": 4.9847986620275935e-06, "loss": 0.7095004916191101, "step": 72, "token_acc": 0.7861657937511287 }, { "epoch": 0.19663299663299663, "grad_norm": 1.6072183847427368, "learning_rate": 4.983988930683496e-06, "loss": 0.6962832808494568, "step": 73, "token_acc": 0.791550058680148 }, { "epoch": 0.19932659932659932, "grad_norm": 1.4740757942199707, "learning_rate": 4.983158258476427e-06, "loss": 0.6713933348655701, "step": 74, "token_acc": 0.7946585272796642 }, { "epoch": 0.20202020202020202, "grad_norm": 1.5663025379180908, "learning_rate": 4.982306652409236e-06, "loss": 0.7387303709983826, "step": 75, "token_acc": 0.7880771580693026 }, { "epoch": 0.2047138047138047, "grad_norm": 1.5372223854064941, "learning_rate": 4.981434119661247e-06, "loss": 0.734302282333374, "step": 76, "token_acc": 0.7818578135479544 }, { "epoch": 0.2074074074074074, "grad_norm": 1.5432430505752563, "learning_rate": 4.980540667588206e-06, "loss": 0.7205886244773865, "step": 77, "token_acc": 0.7871540553511438 }, { "epoch": 0.2101010101010101, "grad_norm": 1.5569109916687012, "learning_rate": 4.979626303722213e-06, "loss": 0.7142922878265381, "step": 78, "token_acc": 0.7915945611866502 }, { "epoch": 0.2127946127946128, "grad_norm": 1.8364050388336182, "learning_rate": 4.978691035771666e-06, "loss": 0.7019655108451843, "step": 79, "token_acc": 0.792203301927565 }, { "epoch": 0.21548821548821548, "grad_norm": 1.5502214431762695, "learning_rate": 4.977734871621186e-06, "loss": 0.6758935451507568, "step": 80, "token_acc": 0.7997789491583064 }, { "epoch": 0.21818181818181817, "grad_norm": 1.5628806352615356, "learning_rate": 4.97675781933156e-06, "loss": 0.7367017269134521, "step": 81, "token_acc": 0.7840257879656161 }, { "epoch": 0.22087542087542086, "grad_norm": 1.3852771520614624, "learning_rate": 4.975759887139667e-06, "loss": 0.7794051170349121, "step": 82, "token_acc": 0.7735262918624034 }, { "epoch": 0.22356902356902356, "grad_norm": 1.44141685962677, "learning_rate": 4.9747410834584134e-06, "loss": 0.6803838610649109, "step": 83, "token_acc": 0.7969268782333411 }, { "epoch": 0.22626262626262628, "grad_norm": 1.4274157285690308, "learning_rate": 4.973701416876654e-06, "loss": 0.7624625563621521, "step": 84, "token_acc": 0.779842852825627 }, { "epoch": 0.22895622895622897, "grad_norm": 1.5253498554229736, "learning_rate": 4.97264089615913e-06, "loss": 0.7066762447357178, "step": 85, "token_acc": 0.7901936547177586 }, { "epoch": 0.23164983164983166, "grad_norm": 1.566710352897644, "learning_rate": 4.971559530246388e-06, "loss": 0.7388700246810913, "step": 86, "token_acc": 0.7812698125169821 }, { "epoch": 0.23434343434343435, "grad_norm": 1.608980655670166, "learning_rate": 4.970457328254707e-06, "loss": 0.7376335859298706, "step": 87, "token_acc": 0.7849904397705545 }, { "epoch": 0.23703703703703705, "grad_norm": 1.5348403453826904, "learning_rate": 4.96933429947602e-06, "loss": 0.7051222324371338, "step": 88, "token_acc": 0.7933425797503467 }, { "epoch": 0.23973063973063974, "grad_norm": 1.6195021867752075, "learning_rate": 4.968190453377838e-06, "loss": 0.7433156967163086, "step": 89, "token_acc": 0.7756093675322606 }, { "epoch": 0.24242424242424243, "grad_norm": 1.5101776123046875, "learning_rate": 4.96702579960317e-06, "loss": 0.6855279207229614, "step": 90, "token_acc": 0.7947722946258603 }, { "epoch": 0.24511784511784512, "grad_norm": 1.422930359840393, "learning_rate": 4.965840347970436e-06, "loss": 0.7089034914970398, "step": 91, "token_acc": 0.7886250601636451 }, { "epoch": 0.24781144781144782, "grad_norm": 1.5137948989868164, "learning_rate": 4.964634108473397e-06, "loss": 0.7032789587974548, "step": 92, "token_acc": 0.7931537405628003 }, { "epoch": 0.2505050505050505, "grad_norm": 1.7953946590423584, "learning_rate": 4.963407091281054e-06, "loss": 0.7121573686599731, "step": 93, "token_acc": 0.7858306188925082 }, { "epoch": 0.2531986531986532, "grad_norm": 1.7075124979019165, "learning_rate": 4.962159306737579e-06, "loss": 0.6887596845626831, "step": 94, "token_acc": 0.7907410284293925 }, { "epoch": 0.2558922558922559, "grad_norm": 1.8178852796554565, "learning_rate": 4.96089076536221e-06, "loss": 0.708042323589325, "step": 95, "token_acc": 0.7874444723828682 }, { "epoch": 0.2585858585858586, "grad_norm": 1.612705111503601, "learning_rate": 4.9596014778491845e-06, "loss": 0.823356568813324, "step": 96, "token_acc": 0.763885323063757 }, { "epoch": 0.2612794612794613, "grad_norm": 1.498325228691101, "learning_rate": 4.958291455067626e-06, "loss": 0.6871433258056641, "step": 97, "token_acc": 0.794099351284102 }, { "epoch": 0.26397306397306397, "grad_norm": 1.5064069032669067, "learning_rate": 4.956960708061469e-06, "loss": 0.659827709197998, "step": 98, "token_acc": 0.7992806386525134 }, { "epoch": 0.26666666666666666, "grad_norm": 2.216805934906006, "learning_rate": 4.9556092480493584e-06, "loss": 0.7191430926322937, "step": 99, "token_acc": 0.783303249097473 }, { "epoch": 0.26936026936026936, "grad_norm": 1.6004130840301514, "learning_rate": 4.954237086424557e-06, "loss": 0.6730518341064453, "step": 100, "token_acc": 0.7957275223061084 }, { "epoch": 0.26936026936026936, "eval_loss": 0.688450813293457, "eval_runtime": 10.7004, "eval_samples_per_second": 4.205, "eval_steps_per_second": 1.402, "eval_token_acc": 0.7928269455967291, "step": 100 }, { "epoch": 0.27205387205387205, "grad_norm": 1.4950542449951172, "learning_rate": 4.95284423475485e-06, "loss": 0.6506053805351257, "step": 101, "token_acc": 0.801009483022331 }, { "epoch": 0.27474747474747474, "grad_norm": 1.5452213287353516, "learning_rate": 4.951430704782445e-06, "loss": 0.6577173471450806, "step": 102, "token_acc": 0.8011915673693859 }, { "epoch": 0.27744107744107743, "grad_norm": 1.4081379175186157, "learning_rate": 4.949996508423877e-06, "loss": 0.7011491656303406, "step": 103, "token_acc": 0.7885227693446871 }, { "epoch": 0.2801346801346801, "grad_norm": 1.5239752531051636, "learning_rate": 4.948541657769902e-06, "loss": 0.6554501056671143, "step": 104, "token_acc": 0.8022690437601296 }, { "epoch": 0.2828282828282828, "grad_norm": 1.6855943202972412, "learning_rate": 4.9470661650854025e-06, "loss": 0.683285117149353, "step": 105, "token_acc": 0.7906821963394343 }, { "epoch": 0.2855218855218855, "grad_norm": 1.6443266868591309, "learning_rate": 4.945570042809278e-06, "loss": 0.6812073588371277, "step": 106, "token_acc": 0.7941420511118487 }, { "epoch": 0.2882154882154882, "grad_norm": 1.583317518234253, "learning_rate": 4.944053303554343e-06, "loss": 0.701356053352356, "step": 107, "token_acc": 0.7903143585386576 }, { "epoch": 0.2909090909090909, "grad_norm": 1.8499690294265747, "learning_rate": 4.94251596010722e-06, "loss": 0.7044717669487, "step": 108, "token_acc": 0.7875288683602771 }, { "epoch": 0.2936026936026936, "grad_norm": 1.6126689910888672, "learning_rate": 4.940958025428232e-06, "loss": 0.6549851894378662, "step": 109, "token_acc": 0.7947176839075595 }, { "epoch": 0.2962962962962963, "grad_norm": 2.0003018379211426, "learning_rate": 4.939379512651294e-06, "loss": 0.7019288539886475, "step": 110, "token_acc": 0.7900698347811276 }, { "epoch": 0.298989898989899, "grad_norm": 1.57441246509552, "learning_rate": 4.937780435083797e-06, "loss": 0.6819396018981934, "step": 111, "token_acc": 0.7925862068965517 }, { "epoch": 0.30168350168350166, "grad_norm": 1.470357894897461, "learning_rate": 4.9361608062065056e-06, "loss": 0.6797043085098267, "step": 112, "token_acc": 0.7984410743031433 }, { "epoch": 0.30437710437710436, "grad_norm": 1.6336921453475952, "learning_rate": 4.934520639673437e-06, "loss": 0.6929810047149658, "step": 113, "token_acc": 0.7940769443675616 }, { "epoch": 0.30707070707070705, "grad_norm": 1.454089879989624, "learning_rate": 4.9328599493117455e-06, "loss": 0.6688606142997742, "step": 114, "token_acc": 0.7972887767969735 }, { "epoch": 0.30976430976430974, "grad_norm": 1.562909722328186, "learning_rate": 4.931178749121612e-06, "loss": 0.6929804682731628, "step": 115, "token_acc": 0.7909925666812418 }, { "epoch": 0.31245791245791243, "grad_norm": 1.4460086822509766, "learning_rate": 4.929477053276118e-06, "loss": 0.6604232788085938, "step": 116, "token_acc": 0.7979707549985079 }, { "epoch": 0.3151515151515151, "grad_norm": 1.4917417764663696, "learning_rate": 4.927754876121133e-06, "loss": 0.6617642641067505, "step": 117, "token_acc": 0.7941540748080258 }, { "epoch": 0.3178451178451178, "grad_norm": 1.8659400939941406, "learning_rate": 4.926012232175191e-06, "loss": 0.660190224647522, "step": 118, "token_acc": 0.7996340347666971 }, { "epoch": 0.3205387205387205, "grad_norm": 1.3923507928848267, "learning_rate": 4.924249136129368e-06, "loss": 0.6587837934494019, "step": 119, "token_acc": 0.7988585531389789 }, { "epoch": 0.32323232323232326, "grad_norm": 1.4862157106399536, "learning_rate": 4.922465602847154e-06, "loss": 0.6848689913749695, "step": 120, "token_acc": 0.7927036886907175 }, { "epoch": 0.32592592592592595, "grad_norm": 1.5733389854431152, "learning_rate": 4.920661647364337e-06, "loss": 0.6536937952041626, "step": 121, "token_acc": 0.8047337278106509 }, { "epoch": 0.32861952861952864, "grad_norm": 1.5016732215881348, "learning_rate": 4.918837284888871e-06, "loss": 0.6825423240661621, "step": 122, "token_acc": 0.7921384516247814 }, { "epoch": 0.33131313131313134, "grad_norm": 1.4066386222839355, "learning_rate": 4.916992530800743e-06, "loss": 0.6302779912948608, "step": 123, "token_acc": 0.8045621181262729 }, { "epoch": 0.33400673400673403, "grad_norm": 1.6674556732177734, "learning_rate": 4.915127400651851e-06, "loss": 0.6894584894180298, "step": 124, "token_acc": 0.7919569626951231 }, { "epoch": 0.3367003367003367, "grad_norm": 1.409942388534546, "learning_rate": 4.9132419101658715e-06, "loss": 0.695641279220581, "step": 125, "token_acc": 0.7940739640576266 }, { "epoch": 0.3393939393939394, "grad_norm": 1.4408330917358398, "learning_rate": 4.911336075238124e-06, "loss": 0.6507354974746704, "step": 126, "token_acc": 0.798090467991861 }, { "epoch": 0.3420875420875421, "grad_norm": 1.447801947593689, "learning_rate": 4.909409911935439e-06, "loss": 0.6877073049545288, "step": 127, "token_acc": 0.7891542830467595 }, { "epoch": 0.3447811447811448, "grad_norm": 1.4129462242126465, "learning_rate": 4.907463436496019e-06, "loss": 0.7069047689437866, "step": 128, "token_acc": 0.7859568754442777 }, { "epoch": 0.3474747474747475, "grad_norm": 2.6743760108947754, "learning_rate": 4.90549666532931e-06, "loss": 0.6493954658508301, "step": 129, "token_acc": 0.8025272547076313 }, { "epoch": 0.3501683501683502, "grad_norm": 1.632967233657837, "learning_rate": 4.903509615015854e-06, "loss": 0.6996077299118042, "step": 130, "token_acc": 0.7915197806502154 }, { "epoch": 0.3528619528619529, "grad_norm": 1.4613730907440186, "learning_rate": 4.901502302307155e-06, "loss": 0.6708221435546875, "step": 131, "token_acc": 0.7932509505703422 }, { "epoch": 0.35555555555555557, "grad_norm": 1.577831745147705, "learning_rate": 4.899474744125534e-06, "loss": 0.6807202696800232, "step": 132, "token_acc": 0.7891966519975839 }, { "epoch": 0.35824915824915826, "grad_norm": 1.5211666822433472, "learning_rate": 4.897426957563989e-06, "loss": 0.694576621055603, "step": 133, "token_acc": 0.7932587885125895 }, { "epoch": 0.36094276094276095, "grad_norm": 1.5447793006896973, "learning_rate": 4.89535895988605e-06, "loss": 0.7088102102279663, "step": 134, "token_acc": 0.7862270923861704 }, { "epoch": 0.36363636363636365, "grad_norm": 1.5023735761642456, "learning_rate": 4.893270768525636e-06, "loss": 0.6190232038497925, "step": 135, "token_acc": 0.8081836519066372 }, { "epoch": 0.36632996632996634, "grad_norm": 1.5723309516906738, "learning_rate": 4.8911624010869e-06, "loss": 0.6615859270095825, "step": 136, "token_acc": 0.7973154362416107 }, { "epoch": 0.36902356902356903, "grad_norm": 1.676498293876648, "learning_rate": 4.8890338753440905e-06, "loss": 0.681195855140686, "step": 137, "token_acc": 0.792660702451955 }, { "epoch": 0.3717171717171717, "grad_norm": 1.3704557418823242, "learning_rate": 4.8868852092413965e-06, "loss": 0.633265495300293, "step": 138, "token_acc": 0.8052318851763254 }, { "epoch": 0.3744107744107744, "grad_norm": 1.5271934270858765, "learning_rate": 4.884716420892796e-06, "loss": 0.6930828094482422, "step": 139, "token_acc": 0.7903361666283789 }, { "epoch": 0.3771043771043771, "grad_norm": 1.5779917240142822, "learning_rate": 4.882527528581906e-06, "loss": 0.6777030825614929, "step": 140, "token_acc": 0.7896715968914515 }, { "epoch": 0.3797979797979798, "grad_norm": 1.5151053667068481, "learning_rate": 4.880318550761826e-06, "loss": 0.6391350626945496, "step": 141, "token_acc": 0.8044003666972248 }, { "epoch": 0.3824915824915825, "grad_norm": 1.4474672079086304, "learning_rate": 4.878089506054981e-06, "loss": 0.6872603893280029, "step": 142, "token_acc": 0.796155324875048 }, { "epoch": 0.3851851851851852, "grad_norm": 1.5188560485839844, "learning_rate": 4.875840413252968e-06, "loss": 0.6529471278190613, "step": 143, "token_acc": 0.796944261058191 }, { "epoch": 0.3878787878787879, "grad_norm": 1.4625803232192993, "learning_rate": 4.873571291316398e-06, "loss": 0.6514662504196167, "step": 144, "token_acc": 0.8014099981688335 }, { "epoch": 0.39057239057239057, "grad_norm": 2.1492409706115723, "learning_rate": 4.871282159374733e-06, "loss": 0.6540035009384155, "step": 145, "token_acc": 0.8008797917227758 }, { "epoch": 0.39326599326599326, "grad_norm": 1.3394057750701904, "learning_rate": 4.868973036726124e-06, "loss": 0.6402072906494141, "step": 146, "token_acc": 0.8034829721362229 }, { "epoch": 0.39595959595959596, "grad_norm": 4.576254844665527, "learning_rate": 4.866643942837256e-06, "loss": 0.6581475138664246, "step": 147, "token_acc": 0.8016129032258065 }, { "epoch": 0.39865319865319865, "grad_norm": 1.8025342226028442, "learning_rate": 4.864294897343171e-06, "loss": 0.6960440874099731, "step": 148, "token_acc": 0.7942329873125721 }, { "epoch": 0.40134680134680134, "grad_norm": 1.4524977207183838, "learning_rate": 4.8619259200471145e-06, "loss": 0.6716012954711914, "step": 149, "token_acc": 0.7917597227570273 }, { "epoch": 0.40404040404040403, "grad_norm": 1.4746606349945068, "learning_rate": 4.859537030920363e-06, "loss": 0.6893396377563477, "step": 150, "token_acc": 0.7934661354581674 }, { "epoch": 0.40404040404040403, "eval_loss": 0.663612425327301, "eval_runtime": 10.2215, "eval_samples_per_second": 4.402, "eval_steps_per_second": 1.467, "eval_token_acc": 0.7968759303733403, "step": 150 }, { "epoch": 0.4067340067340067, "grad_norm": 1.3243526220321655, "learning_rate": 4.857128250102057e-06, "loss": 0.6411515474319458, "step": 151, "token_acc": 0.8044450964571261 }, { "epoch": 0.4094276094276094, "grad_norm": 1.4105066061019897, "learning_rate": 4.854699597899028e-06, "loss": 0.6668831706047058, "step": 152, "token_acc": 0.7967024768725753 }, { "epoch": 0.4121212121212121, "grad_norm": 1.4559847116470337, "learning_rate": 4.8522510947856315e-06, "loss": 0.7121236324310303, "step": 153, "token_acc": 0.7892894384030985 }, { "epoch": 0.4148148148148148, "grad_norm": 1.5681324005126953, "learning_rate": 4.849782761403575e-06, "loss": 0.6802405118942261, "step": 154, "token_acc": 0.7976823211640672 }, { "epoch": 0.4175084175084175, "grad_norm": 1.3430850505828857, "learning_rate": 4.8472946185617395e-06, "loss": 0.6321492195129395, "step": 155, "token_acc": 0.8030325443786982 }, { "epoch": 0.4202020202020202, "grad_norm": 1.6316030025482178, "learning_rate": 4.844786687236006e-06, "loss": 0.644690752029419, "step": 156, "token_acc": 0.8052529182879378 }, { "epoch": 0.4228956228956229, "grad_norm": 1.5189744234085083, "learning_rate": 4.84225898856908e-06, "loss": 0.6942604780197144, "step": 157, "token_acc": 0.7909098248163398 }, { "epoch": 0.4255892255892256, "grad_norm": 1.4366726875305176, "learning_rate": 4.8397115438703124e-06, "loss": 0.674608588218689, "step": 158, "token_acc": 0.8000956480153036 }, { "epoch": 0.42828282828282827, "grad_norm": 1.6782517433166504, "learning_rate": 4.83714437461552e-06, "loss": 0.6574329137802124, "step": 159, "token_acc": 0.8037568861193896 }, { "epoch": 0.43097643097643096, "grad_norm": 1.4071483612060547, "learning_rate": 4.834557502446804e-06, "loss": 0.6415087580680847, "step": 160, "token_acc": 0.8019604585479315 }, { "epoch": 0.43367003367003365, "grad_norm": 1.6236811876296997, "learning_rate": 4.831950949172366e-06, "loss": 0.6526669263839722, "step": 161, "token_acc": 0.8004621882221691 }, { "epoch": 0.43636363636363634, "grad_norm": 1.5462931394577026, "learning_rate": 4.82932473676633e-06, "loss": 0.6682183742523193, "step": 162, "token_acc": 0.7942937324602433 }, { "epoch": 0.43905723905723903, "grad_norm": 1.499746322631836, "learning_rate": 4.82667888736855e-06, "loss": 0.6426495313644409, "step": 163, "token_acc": 0.7994458843507016 }, { "epoch": 0.4417508417508417, "grad_norm": 1.7462161779403687, "learning_rate": 4.824013423284427e-06, "loss": 0.6573182344436646, "step": 164, "token_acc": 0.7937711206502877 }, { "epoch": 0.4444444444444444, "grad_norm": 1.4604578018188477, "learning_rate": 4.82132836698472e-06, "loss": 0.6727365255355835, "step": 165, "token_acc": 0.79563852635057 }, { "epoch": 0.4471380471380471, "grad_norm": 1.3735791444778442, "learning_rate": 4.8186237411053586e-06, "loss": 0.6955207586288452, "step": 166, "token_acc": 0.7889630078835658 }, { "epoch": 0.4498316498316498, "grad_norm": 1.3954068422317505, "learning_rate": 4.815899568447249e-06, "loss": 0.6537412405014038, "step": 167, "token_acc": 0.7951333119177643 }, { "epoch": 0.45252525252525255, "grad_norm": 1.9232374429702759, "learning_rate": 4.813155871976083e-06, "loss": 0.6367711424827576, "step": 168, "token_acc": 0.8020715326104547 }, { "epoch": 0.45521885521885525, "grad_norm": 1.522840142250061, "learning_rate": 4.810392674822148e-06, "loss": 0.6694404482841492, "step": 169, "token_acc": 0.7986868955727087 }, { "epoch": 0.45791245791245794, "grad_norm": 1.5240240097045898, "learning_rate": 4.807610000280124e-06, "loss": 0.6188982129096985, "step": 170, "token_acc": 0.8056549437123658 }, { "epoch": 0.46060606060606063, "grad_norm": 1.5429683923721313, "learning_rate": 4.804807871808896e-06, "loss": 0.6906000971794128, "step": 171, "token_acc": 0.786556396269576 }, { "epoch": 0.4632996632996633, "grad_norm": 1.476540207862854, "learning_rate": 4.801986313031348e-06, "loss": 0.6591665744781494, "step": 172, "token_acc": 0.7989451668636901 }, { "epoch": 0.465993265993266, "grad_norm": 1.4720929861068726, "learning_rate": 4.799145347734173e-06, "loss": 0.6707397699356079, "step": 173, "token_acc": 0.7953920220082531 }, { "epoch": 0.4686868686868687, "grad_norm": 1.3819103240966797, "learning_rate": 4.796284999867663e-06, "loss": 0.6798187494277954, "step": 174, "token_acc": 0.7970967480334117 }, { "epoch": 0.4713804713804714, "grad_norm": 1.5060410499572754, "learning_rate": 4.793405293545515e-06, "loss": 0.642928957939148, "step": 175, "token_acc": 0.7990887975701269 }, { "epoch": 0.4740740740740741, "grad_norm": 1.4929656982421875, "learning_rate": 4.7905062530446215e-06, "loss": 0.6785327792167664, "step": 176, "token_acc": 0.7943346620303622 }, { "epoch": 0.4767676767676768, "grad_norm": 1.5510073900222778, "learning_rate": 4.787587902804871e-06, "loss": 0.6635218858718872, "step": 177, "token_acc": 0.7946361307819798 }, { "epoch": 0.4794612794612795, "grad_norm": 1.4254915714263916, "learning_rate": 4.784650267428938e-06, "loss": 0.6418628692626953, "step": 178, "token_acc": 0.7978632478632479 }, { "epoch": 0.48215488215488217, "grad_norm": 1.3948981761932373, "learning_rate": 4.781693371682078e-06, "loss": 0.6414530277252197, "step": 179, "token_acc": 0.8002401182120429 }, { "epoch": 0.48484848484848486, "grad_norm": 1.351783037185669, "learning_rate": 4.778717240491917e-06, "loss": 0.6842470169067383, "step": 180, "token_acc": 0.7921679822208598 }, { "epoch": 0.48754208754208755, "grad_norm": 1.3929100036621094, "learning_rate": 4.7757218989482435e-06, "loss": 0.6762309074401855, "step": 181, "token_acc": 0.7978657593566347 }, { "epoch": 0.49023569023569025, "grad_norm": 1.2857002019882202, "learning_rate": 4.772707372302796e-06, "loss": 0.6315745115280151, "step": 182, "token_acc": 0.804658952496955 }, { "epoch": 0.49292929292929294, "grad_norm": 1.4794925451278687, "learning_rate": 4.769673685969047e-06, "loss": 0.6472344994544983, "step": 183, "token_acc": 0.8007412636780797 }, { "epoch": 0.49562289562289563, "grad_norm": 1.36336088180542, "learning_rate": 4.766620865521995e-06, "loss": 0.6527389883995056, "step": 184, "token_acc": 0.8003865883577429 }, { "epoch": 0.4983164983164983, "grad_norm": 1.4888983964920044, "learning_rate": 4.763548936697944e-06, "loss": 0.6935924887657166, "step": 185, "token_acc": 0.7931911030413074 }, { "epoch": 0.501010101010101, "grad_norm": 1.5464200973510742, "learning_rate": 4.760457925394287e-06, "loss": 0.6785871982574463, "step": 186, "token_acc": 0.7926067337729955 }, { "epoch": 0.5037037037037037, "grad_norm": 1.5041953325271606, "learning_rate": 4.757347857669292e-06, "loss": 0.626428484916687, "step": 187, "token_acc": 0.8077879749549705 }, { "epoch": 0.5063973063973064, "grad_norm": 1.4720526933670044, "learning_rate": 4.7542187597418765e-06, "loss": 0.6494168043136597, "step": 188, "token_acc": 0.798493686775456 }, { "epoch": 0.509090909090909, "grad_norm": 1.410652995109558, "learning_rate": 4.751070657991388e-06, "loss": 0.6165794730186462, "step": 189, "token_acc": 0.8126244696510521 }, { "epoch": 0.5117845117845118, "grad_norm": 1.5380399227142334, "learning_rate": 4.747903578957386e-06, "loss": 0.6231393218040466, "step": 190, "token_acc": 0.8051296093464768 }, { "epoch": 0.5144781144781144, "grad_norm": 1.6040352582931519, "learning_rate": 4.744717549339412e-06, "loss": 0.6653971076011658, "step": 191, "token_acc": 0.797651040111476 }, { "epoch": 0.5171717171717172, "grad_norm": 1.4092189073562622, "learning_rate": 4.7415125959967675e-06, "loss": 0.6882997751235962, "step": 192, "token_acc": 0.7899116410740328 }, { "epoch": 0.5198653198653199, "grad_norm": 1.3522484302520752, "learning_rate": 4.73828874594829e-06, "loss": 0.6539648771286011, "step": 193, "token_acc": 0.7986216902430178 }, { "epoch": 0.5225589225589226, "grad_norm": 1.383599877357483, "learning_rate": 4.73504602637212e-06, "loss": 0.6455013155937195, "step": 194, "token_acc": 0.8032252673796791 }, { "epoch": 0.5252525252525253, "grad_norm": 1.49760901927948, "learning_rate": 4.731784464605474e-06, "loss": 0.6501985788345337, "step": 195, "token_acc": 0.8007028973084176 }, { "epoch": 0.5279461279461279, "grad_norm": 1.3667562007904053, "learning_rate": 4.728504088144418e-06, "loss": 0.6648920774459839, "step": 196, "token_acc": 0.7954490438150569 }, { "epoch": 0.5306397306397307, "grad_norm": 1.4059678316116333, "learning_rate": 4.72520492464363e-06, "loss": 0.6432155966758728, "step": 197, "token_acc": 0.801195049082373 }, { "epoch": 0.5333333333333333, "grad_norm": 1.510453462600708, "learning_rate": 4.721887001916166e-06, "loss": 0.6649519205093384, "step": 198, "token_acc": 0.7971860260612359 }, { "epoch": 0.5360269360269361, "grad_norm": 1.379142165184021, "learning_rate": 4.718550347933234e-06, "loss": 0.6348834037780762, "step": 199, "token_acc": 0.8074321697893182 }, { "epoch": 0.5387205387205387, "grad_norm": 1.5178873538970947, "learning_rate": 4.7151949908239505e-06, "loss": 0.6440741419792175, "step": 200, "token_acc": 0.7974092351075079 }, { "epoch": 0.5387205387205387, "eval_loss": 0.6449831128120422, "eval_runtime": 10.6846, "eval_samples_per_second": 4.212, "eval_steps_per_second": 1.404, "eval_token_acc": 0.8011035468312725, "step": 200 }, { "epoch": 0.5414141414141415, "grad_norm": 1.463577151298523, "learning_rate": 4.711820958875105e-06, "loss": 0.6602286696434021, "step": 201, "token_acc": 0.794498809838667 }, { "epoch": 0.5441077441077441, "grad_norm": 1.387987494468689, "learning_rate": 4.708428280530923e-06, "loss": 0.6514633893966675, "step": 202, "token_acc": 0.8016805351607114 }, { "epoch": 0.5468013468013468, "grad_norm": 1.563628911972046, "learning_rate": 4.705016984392825e-06, "loss": 0.6323914527893066, "step": 203, "token_acc": 0.8050993124522536 }, { "epoch": 0.5494949494949495, "grad_norm": 1.469205617904663, "learning_rate": 4.701587099219186e-06, "loss": 0.6253467798233032, "step": 204, "token_acc": 0.8067083570210347 }, { "epoch": 0.5521885521885522, "grad_norm": 2.464313507080078, "learning_rate": 4.6981386539250924e-06, "loss": 0.6150229573249817, "step": 205, "token_acc": 0.8108945300695074 }, { "epoch": 0.5548821548821549, "grad_norm": 1.3276612758636475, "learning_rate": 4.6946716775820994e-06, "loss": 0.6248606443405151, "step": 206, "token_acc": 0.8043144424131627 }, { "epoch": 0.5575757575757576, "grad_norm": 1.4308656454086304, "learning_rate": 4.691186199417985e-06, "loss": 0.6651084423065186, "step": 207, "token_acc": 0.7983706720977597 }, { "epoch": 0.5602693602693603, "grad_norm": 1.4694702625274658, "learning_rate": 4.687682248816503e-06, "loss": 0.6575374007225037, "step": 208, "token_acc": 0.7988240235195296 }, { "epoch": 0.562962962962963, "grad_norm": 1.4247393608093262, "learning_rate": 4.6841598553171365e-06, "loss": 0.6926687955856323, "step": 209, "token_acc": 0.7923339264531435 }, { "epoch": 0.5656565656565656, "grad_norm": 1.426153540611267, "learning_rate": 4.6806190486148496e-06, "loss": 0.6178348064422607, "step": 210, "token_acc": 0.8086300930381778 }, { "epoch": 0.5683501683501684, "grad_norm": 1.383428692817688, "learning_rate": 4.677059858559835e-06, "loss": 0.610993504524231, "step": 211, "token_acc": 0.8073745002221235 }, { "epoch": 0.571043771043771, "grad_norm": 1.4248594045639038, "learning_rate": 4.673482315157261e-06, "loss": 0.6640750765800476, "step": 212, "token_acc": 0.794893401870734 }, { "epoch": 0.5737373737373738, "grad_norm": 1.4956046342849731, "learning_rate": 4.669886448567025e-06, "loss": 0.6169435977935791, "step": 213, "token_acc": 0.8054385964912281 }, { "epoch": 0.5764309764309764, "grad_norm": 1.3690674304962158, "learning_rate": 4.6662722891034925e-06, "loss": 0.648930549621582, "step": 214, "token_acc": 0.8020551218735065 }, { "epoch": 0.5791245791245792, "grad_norm": 1.4141255617141724, "learning_rate": 4.662639867235244e-06, "loss": 0.6473791599273682, "step": 215, "token_acc": 0.8010534236267871 }, { "epoch": 0.5818181818181818, "grad_norm": 4.767739772796631, "learning_rate": 4.658989213584819e-06, "loss": 0.6729704737663269, "step": 216, "token_acc": 0.7912264995523725 }, { "epoch": 0.5845117845117845, "grad_norm": 1.5111453533172607, "learning_rate": 4.655320358928458e-06, "loss": 0.6705379486083984, "step": 217, "token_acc": 0.7926394582070933 }, { "epoch": 0.5872053872053872, "grad_norm": 1.3978606462478638, "learning_rate": 4.65163333419584e-06, "loss": 0.6085922718048096, "step": 218, "token_acc": 0.8078737595575077 }, { "epoch": 0.5898989898989899, "grad_norm": 1.4686959981918335, "learning_rate": 4.647928170469824e-06, "loss": 0.6249260902404785, "step": 219, "token_acc": 0.8083245606931735 }, { "epoch": 0.5925925925925926, "grad_norm": 1.7406705617904663, "learning_rate": 4.644204898986188e-06, "loss": 0.6442081928253174, "step": 220, "token_acc": 0.8012058720727019 }, { "epoch": 0.5952861952861953, "grad_norm": 1.613510251045227, "learning_rate": 4.640463551133365e-06, "loss": 0.6724644899368286, "step": 221, "token_acc": 0.7930131004366813 }, { "epoch": 0.597979797979798, "grad_norm": 1.3158519268035889, "learning_rate": 4.636704158452177e-06, "loss": 0.654852569103241, "step": 222, "token_acc": 0.8033882128370318 }, { "epoch": 0.6006734006734007, "grad_norm": 1.4011584520339966, "learning_rate": 4.632926752635569e-06, "loss": 0.6084727048873901, "step": 223, "token_acc": 0.8089878476470108 }, { "epoch": 0.6033670033670033, "grad_norm": 1.579725980758667, "learning_rate": 4.629131365528344e-06, "loss": 0.6124751567840576, "step": 224, "token_acc": 0.8060446285660484 }, { "epoch": 0.6060606060606061, "grad_norm": 1.3729747533798218, "learning_rate": 4.625318029126894e-06, "loss": 0.6324076652526855, "step": 225, "token_acc": 0.8005574227395689 }, { "epoch": 0.6087542087542087, "grad_norm": 1.74544358253479, "learning_rate": 4.621486775578928e-06, "loss": 0.6855249404907227, "step": 226, "token_acc": 0.7887133899515835 }, { "epoch": 0.6114478114478115, "grad_norm": 1.4112721681594849, "learning_rate": 4.617637637183204e-06, "loss": 0.6279335618019104, "step": 227, "token_acc": 0.8057792091608517 }, { "epoch": 0.6141414141414141, "grad_norm": 1.4111381769180298, "learning_rate": 4.6137706463892525e-06, "loss": 0.6664260625839233, "step": 228, "token_acc": 0.7987636245322922 }, { "epoch": 0.6168350168350168, "grad_norm": 1.3960089683532715, "learning_rate": 4.60988583579711e-06, "loss": 0.6487903594970703, "step": 229, "token_acc": 0.7964207769532955 }, { "epoch": 0.6195286195286195, "grad_norm": 1.2427705526351929, "learning_rate": 4.6059832381570365e-06, "loss": 0.6533714532852173, "step": 230, "token_acc": 0.7994145740255739 }, { "epoch": 0.6222222222222222, "grad_norm": 1.3796330690383911, "learning_rate": 4.602062886369244e-06, "loss": 0.6828411817550659, "step": 231, "token_acc": 0.7927759613887592 }, { "epoch": 0.6249158249158249, "grad_norm": 1.4528419971466064, "learning_rate": 4.598124813483619e-06, "loss": 0.6494259834289551, "step": 232, "token_acc": 0.8018725700230104 }, { "epoch": 0.6276094276094276, "grad_norm": 1.435736060142517, "learning_rate": 4.594169052699441e-06, "loss": 0.622862696647644, "step": 233, "token_acc": 0.8094949319934159 }, { "epoch": 0.6303030303030303, "grad_norm": 1.5834755897521973, "learning_rate": 4.590195637365105e-06, "loss": 0.6480487585067749, "step": 234, "token_acc": 0.801642888024211 }, { "epoch": 0.632996632996633, "grad_norm": 1.4489094018936157, "learning_rate": 4.58620460097784e-06, "loss": 0.6775221824645996, "step": 235, "token_acc": 0.790010460251046 }, { "epoch": 0.6356902356902356, "grad_norm": 1.4737313985824585, "learning_rate": 4.582195977183428e-06, "loss": 0.6557160019874573, "step": 236, "token_acc": 0.7978380263272601 }, { "epoch": 0.6383838383838384, "grad_norm": 1.378084659576416, "learning_rate": 4.578169799775915e-06, "loss": 0.6423713564872742, "step": 237, "token_acc": 0.7974491829414109 }, { "epoch": 0.641077441077441, "grad_norm": 1.398463487625122, "learning_rate": 4.5741261026973326e-06, "loss": 0.6303367614746094, "step": 238, "token_acc": 0.8042439458977889 }, { "epoch": 0.6437710437710438, "grad_norm": 1.241166353225708, "learning_rate": 4.5700649200374095e-06, "loss": 0.6047324538230896, "step": 239, "token_acc": 0.8104506662674053 }, { "epoch": 0.6464646464646465, "grad_norm": 1.3392280340194702, "learning_rate": 4.565986286033281e-06, "loss": 0.6553015112876892, "step": 240, "token_acc": 0.8002187999663385 }, { "epoch": 0.6491582491582492, "grad_norm": 1.2629891633987427, "learning_rate": 4.561890235069207e-06, "loss": 0.6453153491020203, "step": 241, "token_acc": 0.8027916697768157 }, { "epoch": 0.6518518518518519, "grad_norm": 1.2458757162094116, "learning_rate": 4.557776801676276e-06, "loss": 0.6232687830924988, "step": 242, "token_acc": 0.8055068653228162 }, { "epoch": 0.6545454545454545, "grad_norm": 1.3733580112457275, "learning_rate": 4.553646020532114e-06, "loss": 0.6175482869148254, "step": 243, "token_acc": 0.8033527316946061 }, { "epoch": 0.6572390572390573, "grad_norm": 1.2111055850982666, "learning_rate": 4.5494979264605984e-06, "loss": 0.658862829208374, "step": 244, "token_acc": 0.79887190143981 }, { "epoch": 0.6599326599326599, "grad_norm": 1.4032889604568481, "learning_rate": 4.5453325544315595e-06, "loss": 0.6743654608726501, "step": 245, "token_acc": 0.7968588057083632 }, { "epoch": 0.6626262626262627, "grad_norm": 1.5625030994415283, "learning_rate": 4.5411499395604855e-06, "loss": 0.6618049144744873, "step": 246, "token_acc": 0.7974051021302709 }, { "epoch": 0.6653198653198653, "grad_norm": 1.3218791484832764, "learning_rate": 4.536950117108226e-06, "loss": 0.670454204082489, "step": 247, "token_acc": 0.7965987539989897 }, { "epoch": 0.6680134680134681, "grad_norm": 1.3733317852020264, "learning_rate": 4.532733122480701e-06, "loss": 0.6310135722160339, "step": 248, "token_acc": 0.8073618598382749 }, { "epoch": 0.6707070707070707, "grad_norm": 1.4527934789657593, "learning_rate": 4.528498991228592e-06, "loss": 0.6445793509483337, "step": 249, "token_acc": 0.8020671361931766 }, { "epoch": 0.6734006734006734, "grad_norm": 1.4612786769866943, "learning_rate": 4.524247759047054e-06, "loss": 0.6323232054710388, "step": 250, "token_acc": 0.8020689093426733 }, { "epoch": 0.6734006734006734, "eval_loss": 0.6333047151565552, "eval_runtime": 10.6774, "eval_samples_per_second": 4.215, "eval_steps_per_second": 1.405, "eval_token_acc": 0.8027707758569359, "step": 250 }, { "epoch": 0.6760942760942761, "grad_norm": 1.3326635360717773, "learning_rate": 4.519979461775404e-06, "loss": 0.6178995966911316, "step": 251, "token_acc": 0.807043689701153 }, { "epoch": 0.6787878787878788, "grad_norm": 1.3488386869430542, "learning_rate": 4.515694135396825e-06, "loss": 0.6283398866653442, "step": 252, "token_acc": 0.8057862309134745 }, { "epoch": 0.6814814814814815, "grad_norm": 1.3067935705184937, "learning_rate": 4.511391816038062e-06, "loss": 0.6029171347618103, "step": 253, "token_acc": 0.8103040806680322 }, { "epoch": 0.6841750841750842, "grad_norm": 1.4021010398864746, "learning_rate": 4.507072539969114e-06, "loss": 0.6080332398414612, "step": 254, "token_acc": 0.8116283124128313 }, { "epoch": 0.6868686868686869, "grad_norm": 1.329466700553894, "learning_rate": 4.502736343602934e-06, "loss": 0.6389551162719727, "step": 255, "token_acc": 0.8020992043338412 }, { "epoch": 0.6895622895622896, "grad_norm": 1.3976147174835205, "learning_rate": 4.498383263495115e-06, "loss": 0.6327084898948669, "step": 256, "token_acc": 0.802124183006536 }, { "epoch": 0.6922558922558922, "grad_norm": 1.4792864322662354, "learning_rate": 4.494013336343591e-06, "loss": 0.6102913618087769, "step": 257, "token_acc": 0.8098326753149088 }, { "epoch": 0.694949494949495, "grad_norm": 1.400282859802246, "learning_rate": 4.489626598988317e-06, "loss": 0.622687816619873, "step": 258, "token_acc": 0.8013112656912129 }, { "epoch": 0.6976430976430976, "grad_norm": 2.8782758712768555, "learning_rate": 4.485223088410967e-06, "loss": 0.6606238484382629, "step": 259, "token_acc": 0.7951414068165337 }, { "epoch": 0.7003367003367004, "grad_norm": 1.6955695152282715, "learning_rate": 4.480802841734616e-06, "loss": 0.7015712261199951, "step": 260, "token_acc": 0.7882762709352304 }, { "epoch": 0.703030303030303, "grad_norm": 1.354624629020691, "learning_rate": 4.476365896223433e-06, "loss": 0.6276664137840271, "step": 261, "token_acc": 0.8033577270770556 }, { "epoch": 0.7057239057239058, "grad_norm": 1.3531155586242676, "learning_rate": 4.471912289282362e-06, "loss": 0.6379947066307068, "step": 262, "token_acc": 0.8018305140148729 }, { "epoch": 0.7084175084175084, "grad_norm": 1.3611644506454468, "learning_rate": 4.4674420584568105e-06, "loss": 0.6564674377441406, "step": 263, "token_acc": 0.7969466856510046 }, { "epoch": 0.7111111111111111, "grad_norm": 1.366257667541504, "learning_rate": 4.4629552414323265e-06, "loss": 0.6314417123794556, "step": 264, "token_acc": 0.8047823650289557 }, { "epoch": 0.7138047138047138, "grad_norm": 1.4048471450805664, "learning_rate": 4.458451876034289e-06, "loss": 0.6412490606307983, "step": 265, "token_acc": 0.8024264762343546 }, { "epoch": 0.7164983164983165, "grad_norm": 1.2517162561416626, "learning_rate": 4.453932000227586e-06, "loss": 0.6598700284957886, "step": 266, "token_acc": 0.7968887584385089 }, { "epoch": 0.7191919191919192, "grad_norm": 1.2877793312072754, "learning_rate": 4.449395652116291e-06, "loss": 0.6938184499740601, "step": 267, "token_acc": 0.7894219977553311 }, { "epoch": 0.7218855218855219, "grad_norm": 1.5130810737609863, "learning_rate": 4.4448428699433454e-06, "loss": 0.6534684896469116, "step": 268, "token_acc": 0.7929938900203666 }, { "epoch": 0.7245791245791245, "grad_norm": 1.393796682357788, "learning_rate": 4.4402736920902365e-06, "loss": 0.6696171164512634, "step": 269, "token_acc": 0.8024032042723631 }, { "epoch": 0.7272727272727273, "grad_norm": 1.883646845817566, "learning_rate": 4.435688157076672e-06, "loss": 0.6155288219451904, "step": 270, "token_acc": 0.8048249287987937 }, { "epoch": 0.7299663299663299, "grad_norm": 1.3048070669174194, "learning_rate": 4.431086303560256e-06, "loss": 0.6105529069900513, "step": 271, "token_acc": 0.8063757300454251 }, { "epoch": 0.7326599326599327, "grad_norm": 1.2713009119033813, "learning_rate": 4.426468170336162e-06, "loss": 0.6412911415100098, "step": 272, "token_acc": 0.7992995279427441 }, { "epoch": 0.7353535353535353, "grad_norm": 1.3200876712799072, "learning_rate": 4.421833796336808e-06, "loss": 0.6293469071388245, "step": 273, "token_acc": 0.8056821882123087 }, { "epoch": 0.7380471380471381, "grad_norm": 1.2717974185943604, "learning_rate": 4.417183220631529e-06, "loss": 0.6326468586921692, "step": 274, "token_acc": 0.8046588270541533 }, { "epoch": 0.7407407407407407, "grad_norm": 1.2860804796218872, "learning_rate": 4.412516482426241e-06, "loss": 0.6422741413116455, "step": 275, "token_acc": 0.798630258827715 }, { "epoch": 0.7434343434343434, "grad_norm": 1.2367945909500122, "learning_rate": 4.40783362106312e-06, "loss": 0.60337895154953, "step": 276, "token_acc": 0.8106986570092047 }, { "epoch": 0.7461279461279461, "grad_norm": 2.803143262863159, "learning_rate": 4.403134676020264e-06, "loss": 0.6581419706344604, "step": 277, "token_acc": 0.7994444898292623 }, { "epoch": 0.7488215488215488, "grad_norm": 3.547379732131958, "learning_rate": 4.398419686911361e-06, "loss": 0.6723440885543823, "step": 278, "token_acc": 0.7926359256710255 }, { "epoch": 0.7515151515151515, "grad_norm": 1.2745715379714966, "learning_rate": 4.393688693485355e-06, "loss": 0.6211361885070801, "step": 279, "token_acc": 0.8053908781914473 }, { "epoch": 0.7542087542087542, "grad_norm": 1.3655184507369995, "learning_rate": 4.388941735626117e-06, "loss": 0.6414827704429626, "step": 280, "token_acc": 0.8040871729130533 }, { "epoch": 0.7569023569023569, "grad_norm": 1.3749865293502808, "learning_rate": 4.384178853352098e-06, "loss": 0.6513990163803101, "step": 281, "token_acc": 0.8015166501813387 }, { "epoch": 0.7595959595959596, "grad_norm": 1.3521699905395508, "learning_rate": 4.379400086815999e-06, "loss": 0.6124094724655151, "step": 282, "token_acc": 0.8062779450679807 }, { "epoch": 0.7622895622895622, "grad_norm": 1.3598625659942627, "learning_rate": 4.374605476304431e-06, "loss": 0.658810019493103, "step": 283, "token_acc": 0.8020377298415984 }, { "epoch": 0.764983164983165, "grad_norm": 1.352273941040039, "learning_rate": 4.369795062237574e-06, "loss": 0.6353999376296997, "step": 284, "token_acc": 0.8041209040573659 }, { "epoch": 0.7676767676767676, "grad_norm": 1.3834254741668701, "learning_rate": 4.3649688851688385e-06, "loss": 0.6343318819999695, "step": 285, "token_acc": 0.8049785100286533 }, { "epoch": 0.7703703703703704, "grad_norm": 1.3714983463287354, "learning_rate": 4.3601269857845235e-06, "loss": 0.628028154373169, "step": 286, "token_acc": 0.803818301514154 }, { "epoch": 0.773063973063973, "grad_norm": 1.3499248027801514, "learning_rate": 4.355269404903469e-06, "loss": 0.6142684817314148, "step": 287, "token_acc": 0.8069987649238369 }, { "epoch": 0.7757575757575758, "grad_norm": 1.3227360248565674, "learning_rate": 4.350396183476719e-06, "loss": 0.634795069694519, "step": 288, "token_acc": 0.8028880296693759 }, { "epoch": 0.7784511784511785, "grad_norm": 1.311724305152893, "learning_rate": 4.345507362587169e-06, "loss": 0.6354604363441467, "step": 289, "token_acc": 0.800047288776797 }, { "epoch": 0.7811447811447811, "grad_norm": 1.4476823806762695, "learning_rate": 4.3406029834492255e-06, "loss": 0.6149255633354187, "step": 290, "token_acc": 0.8058300888489568 }, { "epoch": 0.7838383838383839, "grad_norm": 1.427740454673767, "learning_rate": 4.335683087408456e-06, "loss": 0.5945029854774475, "step": 291, "token_acc": 0.8152855727963024 }, { "epoch": 0.7865319865319865, "grad_norm": 1.3106461763381958, "learning_rate": 4.330747715941241e-06, "loss": 0.6137604713439941, "step": 292, "token_acc": 0.8070275095350158 }, { "epoch": 0.7892255892255893, "grad_norm": 1.2887437343597412, "learning_rate": 4.325796910654424e-06, "loss": 0.6244805455207825, "step": 293, "token_acc": 0.8020366289231704 }, { "epoch": 0.7919191919191919, "grad_norm": 1.572029948234558, "learning_rate": 4.320830713284958e-06, "loss": 0.6702027320861816, "step": 294, "token_acc": 0.7933612985591829 }, { "epoch": 0.7946127946127947, "grad_norm": 1.394925832748413, "learning_rate": 4.315849165699562e-06, "loss": 0.6500352621078491, "step": 295, "token_acc": 0.7955105392827813 }, { "epoch": 0.7973063973063973, "grad_norm": 1.4325146675109863, "learning_rate": 4.310852309894355e-06, "loss": 0.631843090057373, "step": 296, "token_acc": 0.8035635529681574 }, { "epoch": 0.8, "grad_norm": 1.3451151847839355, "learning_rate": 4.305840187994519e-06, "loss": 0.6260757446289062, "step": 297, "token_acc": 0.8042101800838747 }, { "epoch": 0.8026936026936027, "grad_norm": 1.3584214448928833, "learning_rate": 4.300812842253925e-06, "loss": 0.6340255737304688, "step": 298, "token_acc": 0.805345436207766 }, { "epoch": 0.8053872053872054, "grad_norm": 1.4243718385696411, "learning_rate": 4.295770315054792e-06, "loss": 0.6472539901733398, "step": 299, "token_acc": 0.796274093885456 }, { "epoch": 0.8080808080808081, "grad_norm": 1.2927734851837158, "learning_rate": 4.290712648907321e-06, "loss": 0.6308562755584717, "step": 300, "token_acc": 0.8003868160206302 }, { "epoch": 0.8080808080808081, "eval_loss": 0.6202093958854675, "eval_runtime": 9.3016, "eval_samples_per_second": 4.838, "eval_steps_per_second": 1.613, "eval_token_acc": 0.8064823452354961, "step": 300 }, { "epoch": 0.8107744107744108, "grad_norm": 1.2718696594238281, "learning_rate": 4.285639886449341e-06, "loss": 0.6473962068557739, "step": 301, "token_acc": 0.8020563535044345 }, { "epoch": 0.8134680134680135, "grad_norm": 1.323541283607483, "learning_rate": 4.280552070445947e-06, "loss": 0.6537383198738098, "step": 302, "token_acc": 0.8004378578646009 }, { "epoch": 0.8161616161616162, "grad_norm": 1.4289733171463013, "learning_rate": 4.275449243789141e-06, "loss": 0.5881657600402832, "step": 303, "token_acc": 0.813713182037663 }, { "epoch": 0.8188552188552188, "grad_norm": 1.2583872079849243, "learning_rate": 4.2703314494974706e-06, "loss": 0.6576846837997437, "step": 304, "token_acc": 0.7964076532604452 }, { "epoch": 0.8215488215488216, "grad_norm": 1.2593477964401245, "learning_rate": 4.265198730715663e-06, "loss": 0.6494150161743164, "step": 305, "token_acc": 0.8000819000819 }, { "epoch": 0.8242424242424242, "grad_norm": 3.2120871543884277, "learning_rate": 4.260051130714267e-06, "loss": 0.6383477449417114, "step": 306, "token_acc": 0.8003783579265986 }, { "epoch": 0.826936026936027, "grad_norm": 1.3929730653762817, "learning_rate": 4.254888692889283e-06, "loss": 0.7026652097702026, "step": 307, "token_acc": 0.7872829183129652 }, { "epoch": 0.8296296296296296, "grad_norm": 1.2780957221984863, "learning_rate": 4.249711460761798e-06, "loss": 0.6049579977989197, "step": 308, "token_acc": 0.8100491122024934 }, { "epoch": 0.8323232323232324, "grad_norm": 1.3311514854431152, "learning_rate": 4.244519477977626e-06, "loss": 0.6329802870750427, "step": 309, "token_acc": 0.8046345811051694 }, { "epoch": 0.835016835016835, "grad_norm": 1.2647929191589355, "learning_rate": 4.239312788306927e-06, "loss": 0.5801459550857544, "step": 310, "token_acc": 0.817417470572617 }, { "epoch": 0.8377104377104377, "grad_norm": 1.312452793121338, "learning_rate": 4.234091435643848e-06, "loss": 0.634760320186615, "step": 311, "token_acc": 0.7996171582702515 }, { "epoch": 0.8404040404040404, "grad_norm": 1.3058985471725464, "learning_rate": 4.228855464006151e-06, "loss": 0.6680688261985779, "step": 312, "token_acc": 0.7919860046651116 }, { "epoch": 0.8430976430976431, "grad_norm": 1.2756344079971313, "learning_rate": 4.223604917534839e-06, "loss": 0.6050231456756592, "step": 313, "token_acc": 0.8082277983166837 }, { "epoch": 0.8457912457912458, "grad_norm": 1.3164767026901245, "learning_rate": 4.218339840493786e-06, "loss": 0.6186568737030029, "step": 314, "token_acc": 0.8093800978792822 }, { "epoch": 0.8484848484848485, "grad_norm": 1.2220737934112549, "learning_rate": 4.213060277269364e-06, "loss": 0.6095640659332275, "step": 315, "token_acc": 0.8092312539382482 }, { "epoch": 0.8511784511784511, "grad_norm": 1.33602774143219, "learning_rate": 4.20776627237007e-06, "loss": 0.6227450370788574, "step": 316, "token_acc": 0.8077814996135017 }, { "epoch": 0.8538720538720539, "grad_norm": 1.4767441749572754, "learning_rate": 4.202457870426146e-06, "loss": 0.6034683585166931, "step": 317, "token_acc": 0.8091102445941155 }, { "epoch": 0.8565656565656565, "grad_norm": 1.2231708765029907, "learning_rate": 4.197135116189209e-06, "loss": 0.5930472612380981, "step": 318, "token_acc": 0.8129340277777778 }, { "epoch": 0.8592592592592593, "grad_norm": 1.3354220390319824, "learning_rate": 4.191798054531868e-06, "loss": 0.6018139123916626, "step": 319, "token_acc": 0.8121427467943766 }, { "epoch": 0.8619528619528619, "grad_norm": 1.2527638673782349, "learning_rate": 4.18644673044735e-06, "loss": 0.588690459728241, "step": 320, "token_acc": 0.8173191930468064 }, { "epoch": 0.8646464646464647, "grad_norm": 1.4055863618850708, "learning_rate": 4.181081189049118e-06, "loss": 0.6474159955978394, "step": 321, "token_acc": 0.8031363515548862 }, { "epoch": 0.8673400673400673, "grad_norm": 1.2820357084274292, "learning_rate": 4.175701475570494e-06, "loss": 0.6186543703079224, "step": 322, "token_acc": 0.8074764937822263 }, { "epoch": 0.87003367003367, "grad_norm": 1.3725589513778687, "learning_rate": 4.170307635364272e-06, "loss": 0.6288110017776489, "step": 323, "token_acc": 0.8019865722431712 }, { "epoch": 0.8727272727272727, "grad_norm": 1.4308316707611084, "learning_rate": 4.1648997139023415e-06, "loss": 0.6475309133529663, "step": 324, "token_acc": 0.7991119971003987 }, { "epoch": 0.8754208754208754, "grad_norm": 1.4370498657226562, "learning_rate": 4.159477756775302e-06, "loss": 0.6345180869102478, "step": 325, "token_acc": 0.8039657503379901 }, { "epoch": 0.8781144781144781, "grad_norm": 1.3458034992218018, "learning_rate": 4.1540418096920775e-06, "loss": 0.5873920321464539, "step": 326, "token_acc": 0.8150874761822277 }, { "epoch": 0.8808080808080808, "grad_norm": 1.429321050643921, "learning_rate": 4.148591918479531e-06, "loss": 0.656361997127533, "step": 327, "token_acc": 0.8012454592631033 }, { "epoch": 0.8835016835016835, "grad_norm": 1.4021203517913818, "learning_rate": 4.143128129082079e-06, "loss": 0.610224187374115, "step": 328, "token_acc": 0.8053464582127002 }, { "epoch": 0.8861952861952862, "grad_norm": 1.2797141075134277, "learning_rate": 4.137650487561309e-06, "loss": 0.5894150137901306, "step": 329, "token_acc": 0.813053613053613 }, { "epoch": 0.8888888888888888, "grad_norm": 1.3794041872024536, "learning_rate": 4.132159040095581e-06, "loss": 0.6062744855880737, "step": 330, "token_acc": 0.8098327698071827 }, { "epoch": 0.8915824915824916, "grad_norm": 1.4458425045013428, "learning_rate": 4.126653832979647e-06, "loss": 0.6741775274276733, "step": 331, "token_acc": 0.793698175787728 }, { "epoch": 0.8942760942760942, "grad_norm": 1.3147836923599243, "learning_rate": 4.121134912624255e-06, "loss": 0.6521025896072388, "step": 332, "token_acc": 0.7987073608617594 }, { "epoch": 0.896969696969697, "grad_norm": 2.282785654067993, "learning_rate": 4.115602325555762e-06, "loss": 0.5951105952262878, "step": 333, "token_acc": 0.809300444803882 }, { "epoch": 0.8996632996632996, "grad_norm": 1.470426321029663, "learning_rate": 4.110056118415741e-06, "loss": 0.6331968307495117, "step": 334, "token_acc": 0.8072904203557469 }, { "epoch": 0.9023569023569024, "grad_norm": 1.181194543838501, "learning_rate": 4.104496337960583e-06, "loss": 0.6103177070617676, "step": 335, "token_acc": 0.811405352388959 }, { "epoch": 0.9050505050505051, "grad_norm": 1.2480560541152954, "learning_rate": 4.098923031061112e-06, "loss": 0.594902753829956, "step": 336, "token_acc": 0.8151051481118565 }, { "epoch": 0.9077441077441077, "grad_norm": 1.606463074684143, "learning_rate": 4.093336244702179e-06, "loss": 0.6445667147636414, "step": 337, "token_acc": 0.8001265593925149 }, { "epoch": 0.9104377104377105, "grad_norm": 2.0144426822662354, "learning_rate": 4.0877360259822766e-06, "loss": 0.6311855316162109, "step": 338, "token_acc": 0.8043667278523814 }, { "epoch": 0.9131313131313131, "grad_norm": 1.3138763904571533, "learning_rate": 4.08212242211313e-06, "loss": 0.6070884466171265, "step": 339, "token_acc": 0.8056159117499582 }, { "epoch": 0.9158249158249159, "grad_norm": 1.2995100021362305, "learning_rate": 4.076495480419314e-06, "loss": 0.622340738773346, "step": 340, "token_acc": 0.8035791475749978 }, { "epoch": 0.9185185185185185, "grad_norm": 1.3418028354644775, "learning_rate": 4.07085524833784e-06, "loss": 0.6580623984336853, "step": 341, "token_acc": 0.79566627561935 }, { "epoch": 0.9212121212121213, "grad_norm": 1.238775610923767, "learning_rate": 4.065201773417761e-06, "loss": 0.622681736946106, "step": 342, "token_acc": 0.8105924596050269 }, { "epoch": 0.9239057239057239, "grad_norm": 1.3434486389160156, "learning_rate": 4.059535103319777e-06, "loss": 0.5811754465103149, "step": 343, "token_acc": 0.8166681997976267 }, { "epoch": 0.9265993265993266, "grad_norm": 1.2851468324661255, "learning_rate": 4.053855285815825e-06, "loss": 0.6614164113998413, "step": 344, "token_acc": 0.7950795302512264 }, { "epoch": 0.9292929292929293, "grad_norm": 1.277409315109253, "learning_rate": 4.048162368788675e-06, "loss": 0.5891940593719482, "step": 345, "token_acc": 0.8148665154423311 }, { "epoch": 0.931986531986532, "grad_norm": 1.249624252319336, "learning_rate": 4.042456400231538e-06, "loss": 0.6209766864776611, "step": 346, "token_acc": 0.8062995660631453 }, { "epoch": 0.9346801346801347, "grad_norm": 1.1767240762710571, "learning_rate": 4.036737428247648e-06, "loss": 0.6419370770454407, "step": 347, "token_acc": 0.7993218462390146 }, { "epoch": 0.9373737373737374, "grad_norm": 1.326284646987915, "learning_rate": 4.031005501049865e-06, "loss": 0.6018705368041992, "step": 348, "token_acc": 0.812960235640648 }, { "epoch": 0.94006734006734, "grad_norm": 1.4354084730148315, "learning_rate": 4.025260666960265e-06, "loss": 0.6035539507865906, "step": 349, "token_acc": 0.815 }, { "epoch": 0.9427609427609428, "grad_norm": 1.3850467205047607, "learning_rate": 4.019502974409734e-06, "loss": 0.6294830441474915, "step": 350, "token_acc": 0.8029786171894501 }, { "epoch": 0.9427609427609428, "eval_loss": 0.6160600781440735, "eval_runtime": 9.3174, "eval_samples_per_second": 4.83, "eval_steps_per_second": 1.61, "eval_token_acc": 0.8070777841732331, "step": 350 }, { "epoch": 0.9454545454545454, "grad_norm": 1.38473379611969, "learning_rate": 4.013732471937557e-06, "loss": 0.6183304190635681, "step": 351, "token_acc": 0.8079681274900399 }, { "epoch": 0.9481481481481482, "grad_norm": 1.2525638341903687, "learning_rate": 4.0079492081910135e-06, "loss": 0.641049861907959, "step": 352, "token_acc": 0.8025520483546004 }, { "epoch": 0.9508417508417508, "grad_norm": 1.278059720993042, "learning_rate": 4.002153231924964e-06, "loss": 0.60345458984375, "step": 353, "token_acc": 0.8105279974438853 }, { "epoch": 0.9535353535353536, "grad_norm": 1.2627270221710205, "learning_rate": 3.99634459200144e-06, "loss": 0.6103197336196899, "step": 354, "token_acc": 0.8081699346405229 }, { "epoch": 0.9562289562289562, "grad_norm": 1.3325384855270386, "learning_rate": 3.9905233373892316e-06, "loss": 0.6163114309310913, "step": 355, "token_acc": 0.8084607447706594 }, { "epoch": 0.958922558922559, "grad_norm": 1.2898777723312378, "learning_rate": 3.984689517163475e-06, "loss": 0.6009296774864197, "step": 356, "token_acc": 0.8126103404791929 }, { "epoch": 0.9616161616161616, "grad_norm": 1.2848820686340332, "learning_rate": 3.978843180505238e-06, "loss": 0.608750581741333, "step": 357, "token_acc": 0.8123308008034232 }, { "epoch": 0.9643097643097643, "grad_norm": 1.3651995658874512, "learning_rate": 3.972984376701108e-06, "loss": 0.642690122127533, "step": 358, "token_acc": 0.8009699853587116 }, { "epoch": 0.967003367003367, "grad_norm": 1.2806298732757568, "learning_rate": 3.9671131551427726e-06, "loss": 0.6235046982765198, "step": 359, "token_acc": 0.8060428152072743 }, { "epoch": 0.9696969696969697, "grad_norm": 1.362613558769226, "learning_rate": 3.9612295653266064e-06, "loss": 0.6590780019760132, "step": 360, "token_acc": 0.7958989031950405 }, { "epoch": 0.9723905723905724, "grad_norm": 1.2619017362594604, "learning_rate": 3.955333656853253e-06, "loss": 0.6305599212646484, "step": 361, "token_acc": 0.8033726812816189 }, { "epoch": 0.9750841750841751, "grad_norm": 1.282491683959961, "learning_rate": 3.949425479427206e-06, "loss": 0.6032223105430603, "step": 362, "token_acc": 0.8121951219512196 }, { "epoch": 0.9777777777777777, "grad_norm": 1.2996447086334229, "learning_rate": 3.943505082856389e-06, "loss": 0.6222999095916748, "step": 363, "token_acc": 0.8028031290743155 }, { "epoch": 0.9804713804713805, "grad_norm": 1.2675151824951172, "learning_rate": 3.93757251705174e-06, "loss": 0.6197265386581421, "step": 364, "token_acc": 0.8095277893857084 }, { "epoch": 0.9831649831649831, "grad_norm": 1.2647666931152344, "learning_rate": 3.931627832026783e-06, "loss": 0.6092216372489929, "step": 365, "token_acc": 0.8077023158990372 }, { "epoch": 0.9858585858585859, "grad_norm": 1.2002745866775513, "learning_rate": 3.9256710778972175e-06, "loss": 0.6214941740036011, "step": 366, "token_acc": 0.8103213094877235 }, { "epoch": 0.9885521885521885, "grad_norm": 1.3631004095077515, "learning_rate": 3.919702304880485e-06, "loss": 0.5962947607040405, "step": 367, "token_acc": 0.8148809523809524 }, { "epoch": 0.9912457912457913, "grad_norm": 1.2404322624206543, "learning_rate": 3.913721563295351e-06, "loss": 0.5918201804161072, "step": 368, "token_acc": 0.814863423837257 }, { "epoch": 0.9939393939393939, "grad_norm": 1.8101868629455566, "learning_rate": 3.907728903561481e-06, "loss": 0.6121522784233093, "step": 369, "token_acc": 0.8078007091553777 }, { "epoch": 0.9966329966329966, "grad_norm": 1.2212328910827637, "learning_rate": 3.901724376199014e-06, "loss": 0.621286928653717, "step": 370, "token_acc": 0.8086637403756206 }, { "epoch": 0.9993265993265993, "grad_norm": 1.2790559530258179, "learning_rate": 3.895708031828138e-06, "loss": 0.6123597025871277, "step": 371, "token_acc": 0.8084513582540052 }, { "epoch": 1.0, "grad_norm": 2.5845959186553955, "learning_rate": 3.889679921168661e-06, "loss": 0.6059808731079102, "step": 372, "token_acc": 0.8147672552166935 }, { "epoch": 1.0026936026936026, "grad_norm": 1.2107398509979248, "learning_rate": 3.883640095039585e-06, "loss": 0.5354050993919373, "step": 373, "token_acc": 0.8307560137457045 }, { "epoch": 1.0053872053872055, "grad_norm": 1.3025703430175781, "learning_rate": 3.877588604358678e-06, "loss": 0.5310766100883484, "step": 374, "token_acc": 0.830131700360708 }, { "epoch": 1.0080808080808081, "grad_norm": 1.1903347969055176, "learning_rate": 3.871525500142044e-06, "loss": 0.5627352595329285, "step": 375, "token_acc": 0.8214966816690823 }, { "epoch": 1.0107744107744108, "grad_norm": 1.181552529335022, "learning_rate": 3.865450833503692e-06, "loss": 0.5286879539489746, "step": 376, "token_acc": 0.8341930116472546 }, { "epoch": 1.0134680134680134, "grad_norm": 1.224401593208313, "learning_rate": 3.859364655655108e-06, "loss": 0.5587303638458252, "step": 377, "token_acc": 0.8240906866742782 }, { "epoch": 1.0161616161616163, "grad_norm": 1.3323237895965576, "learning_rate": 3.85326701790482e-06, "loss": 0.5411142110824585, "step": 378, "token_acc": 0.82635564452955 }, { "epoch": 1.018855218855219, "grad_norm": 1.3120346069335938, "learning_rate": 3.847157971657966e-06, "loss": 0.5225459337234497, "step": 379, "token_acc": 0.833174527552803 }, { "epoch": 1.0215488215488215, "grad_norm": 1.3633784055709839, "learning_rate": 3.841037568415865e-06, "loss": 0.5734001398086548, "step": 380, "token_acc": 0.8235608625913384 }, { "epoch": 1.0242424242424242, "grad_norm": 1.3613659143447876, "learning_rate": 3.834905859775574e-06, "loss": 0.574379026889801, "step": 381, "token_acc": 0.8142807505211953 }, { "epoch": 1.026936026936027, "grad_norm": 1.3735803365707397, "learning_rate": 3.8287628974294636e-06, "loss": 0.6180782318115234, "step": 382, "token_acc": 0.8045180971225987 }, { "epoch": 1.0296296296296297, "grad_norm": 1.4102107286453247, "learning_rate": 3.822608733164773e-06, "loss": 0.5362061262130737, "step": 383, "token_acc": 0.8302898155719088 }, { "epoch": 1.0323232323232323, "grad_norm": 1.265261173248291, "learning_rate": 3.816443418863179e-06, "loss": 0.5303924083709717, "step": 384, "token_acc": 0.8299191586217722 }, { "epoch": 1.035016835016835, "grad_norm": 1.2556697130203247, "learning_rate": 3.810267006500358e-06, "loss": 0.5327094197273254, "step": 385, "token_acc": 0.8293135435992579 }, { "epoch": 1.0377104377104378, "grad_norm": 1.4236747026443481, "learning_rate": 3.804079548145545e-06, "loss": 0.5340734124183655, "step": 386, "token_acc": 0.8251402491204716 }, { "epoch": 1.0404040404040404, "grad_norm": 1.4386234283447266, "learning_rate": 3.7978810959610963e-06, "loss": 0.5607868432998657, "step": 387, "token_acc": 0.8208308126638435 }, { "epoch": 1.043097643097643, "grad_norm": 1.2860978841781616, "learning_rate": 3.791671702202052e-06, "loss": 0.5836136341094971, "step": 388, "token_acc": 0.8140788415124698 }, { "epoch": 1.0457912457912457, "grad_norm": 1.3116002082824707, "learning_rate": 3.785451419215691e-06, "loss": 0.5193370580673218, "step": 389, "token_acc": 0.8313313733725326 }, { "epoch": 1.0484848484848486, "grad_norm": 1.3094614744186401, "learning_rate": 3.7792202994410926e-06, "loss": 0.5170670747756958, "step": 390, "token_acc": 0.83340643916133 }, { "epoch": 1.0511784511784512, "grad_norm": 1.30987548828125, "learning_rate": 3.7729783954086957e-06, "loss": 0.48863929510116577, "step": 391, "token_acc": 0.8401282262041756 }, { "epoch": 1.0538720538720538, "grad_norm": 1.2975558042526245, "learning_rate": 3.766725759739852e-06, "loss": 0.5394982695579529, "step": 392, "token_acc": 0.8306819158999054 }, { "epoch": 1.0565656565656565, "grad_norm": 1.279407262802124, "learning_rate": 3.760462445146386e-06, "loss": 0.5402341485023499, "step": 393, "token_acc": 0.8242831471208688 }, { "epoch": 1.0592592592592593, "grad_norm": 1.3311103582382202, "learning_rate": 3.754188504430147e-06, "loss": 0.5199674367904663, "step": 394, "token_acc": 0.8314483765049252 }, { "epoch": 1.061952861952862, "grad_norm": 1.5482090711593628, "learning_rate": 3.7479039904825687e-06, "loss": 0.5613769888877869, "step": 395, "token_acc": 0.8199751029397683 }, { "epoch": 1.0646464646464646, "grad_norm": 1.3084946870803833, "learning_rate": 3.7416089562842183e-06, "loss": 0.5345817804336548, "step": 396, "token_acc": 0.8300970873786407 }, { "epoch": 1.0673400673400673, "grad_norm": 1.3354644775390625, "learning_rate": 3.7353034549043547e-06, "loss": 0.5638906955718994, "step": 397, "token_acc": 0.8189598897512149 }, { "epoch": 1.0700336700336701, "grad_norm": 1.3179502487182617, "learning_rate": 3.728987539500477e-06, "loss": 0.5404822826385498, "step": 398, "token_acc": 0.8283151825752723 }, { "epoch": 1.0727272727272728, "grad_norm": 1.2330291271209717, "learning_rate": 3.722661263317878e-06, "loss": 0.5371161103248596, "step": 399, "token_acc": 0.8319857697283312 }, { "epoch": 1.0754208754208754, "grad_norm": 1.2985093593597412, "learning_rate": 3.7163246796891966e-06, "loss": 0.5526595115661621, "step": 400, "token_acc": 0.8253691062073345 }, { "epoch": 1.0754208754208754, "eval_loss": 0.6135379076004028, "eval_runtime": 10.8399, "eval_samples_per_second": 4.151, "eval_steps_per_second": 1.384, "eval_token_acc": 0.8076335271817875, "step": 400 }, { "epoch": 1.078114478114478, "grad_norm": 1.3081196546554565, "learning_rate": 3.709977842033965e-06, "loss": 0.5595347285270691, "step": 401, "token_acc": 0.8191884901390236 }, { "epoch": 1.0808080808080809, "grad_norm": 1.3319206237792969, "learning_rate": 3.7036208038581636e-06, "loss": 0.5521378517150879, "step": 402, "token_acc": 0.8206453518422802 }, { "epoch": 1.0835016835016835, "grad_norm": 1.2976937294006348, "learning_rate": 3.697253618753761e-06, "loss": 0.5452443957328796, "step": 403, "token_acc": 0.8250878847637829 }, { "epoch": 1.0861952861952862, "grad_norm": 1.263430118560791, "learning_rate": 3.690876340398274e-06, "loss": 0.5694794058799744, "step": 404, "token_acc": 0.8206374501992032 }, { "epoch": 1.0888888888888888, "grad_norm": 1.3104808330535889, "learning_rate": 3.684489022554305e-06, "loss": 0.5729893445968628, "step": 405, "token_acc": 0.8231562922868741 }, { "epoch": 1.0915824915824917, "grad_norm": 1.2618986368179321, "learning_rate": 3.6780917190690947e-06, "loss": 0.5527254343032837, "step": 406, "token_acc": 0.8256124371859297 }, { "epoch": 1.0942760942760943, "grad_norm": 1.3281644582748413, "learning_rate": 3.6716844838740644e-06, "loss": 0.5191165804862976, "step": 407, "token_acc": 0.8306807766019498 }, { "epoch": 1.096969696969697, "grad_norm": 2.0966298580169678, "learning_rate": 3.6652673709843644e-06, "loss": 0.565962016582489, "step": 408, "token_acc": 0.8215366991244579 }, { "epoch": 1.0996632996632996, "grad_norm": 1.2551101446151733, "learning_rate": 3.658840434498417e-06, "loss": 0.5210585594177246, "step": 409, "token_acc": 0.836066931742876 }, { "epoch": 1.1023569023569024, "grad_norm": 1.2106307744979858, "learning_rate": 3.6524037285974597e-06, "loss": 0.5300991535186768, "step": 410, "token_acc": 0.8302979501638345 }, { "epoch": 1.105050505050505, "grad_norm": 1.2738215923309326, "learning_rate": 3.6459573075450904e-06, "loss": 0.5216208100318909, "step": 411, "token_acc": 0.8291586384071933 }, { "epoch": 1.1077441077441077, "grad_norm": 1.3951414823532104, "learning_rate": 3.63950122568681e-06, "loss": 0.5257141590118408, "step": 412, "token_acc": 0.8331073139860772 }, { "epoch": 1.1104377104377103, "grad_norm": 1.2970991134643555, "learning_rate": 3.6330355374495617e-06, "loss": 0.5708835124969482, "step": 413, "token_acc": 0.8206222806009359 }, { "epoch": 1.1131313131313132, "grad_norm": 1.3784953355789185, "learning_rate": 3.6265602973412734e-06, "loss": 0.519316554069519, "step": 414, "token_acc": 0.831505124977522 }, { "epoch": 1.1158249158249158, "grad_norm": 1.451308012008667, "learning_rate": 3.6200755599503987e-06, "loss": 0.5285217761993408, "step": 415, "token_acc": 0.8323894775301425 }, { "epoch": 1.1185185185185185, "grad_norm": 1.2700974941253662, "learning_rate": 3.6135813799454594e-06, "loss": 0.516826868057251, "step": 416, "token_acc": 0.833495855680156 }, { "epoch": 1.121212121212121, "grad_norm": 1.3021750450134277, "learning_rate": 3.607077812074578e-06, "loss": 0.5593036413192749, "step": 417, "token_acc": 0.8223140495867769 }, { "epoch": 1.123905723905724, "grad_norm": 1.320070505142212, "learning_rate": 3.6005649111650204e-06, "loss": 0.5939220786094666, "step": 418, "token_acc": 0.8156046814044213 }, { "epoch": 1.1265993265993266, "grad_norm": 1.4927456378936768, "learning_rate": 3.594042732122735e-06, "loss": 0.569314181804657, "step": 419, "token_acc": 0.8175260746661468 }, { "epoch": 1.1292929292929292, "grad_norm": 1.2169201374053955, "learning_rate": 3.587511329931887e-06, "loss": 0.5512276887893677, "step": 420, "token_acc": 0.8272996603800853 }, { "epoch": 1.131986531986532, "grad_norm": 1.1618006229400635, "learning_rate": 3.5809707596543952e-06, "loss": 0.5184974670410156, "step": 421, "token_acc": 0.8334866605335787 }, { "epoch": 1.1346801346801347, "grad_norm": 1.2485661506652832, "learning_rate": 3.574421076429469e-06, "loss": 0.545513927936554, "step": 422, "token_acc": 0.8271564978151804 }, { "epoch": 1.1373737373737374, "grad_norm": 1.2175967693328857, "learning_rate": 3.567862335473144e-06, "loss": 0.5255947113037109, "step": 423, "token_acc": 0.827901838554407 }, { "epoch": 1.14006734006734, "grad_norm": 1.2209067344665527, "learning_rate": 3.561294592077813e-06, "loss": 0.5185179114341736, "step": 424, "token_acc": 0.830621002347988 }, { "epoch": 1.1427609427609426, "grad_norm": 1.4181879758834839, "learning_rate": 3.5547179016117654e-06, "loss": 0.5795246958732605, "step": 425, "token_acc": 0.8156288156288156 }, { "epoch": 1.1454545454545455, "grad_norm": 1.2633882761001587, "learning_rate": 3.5481323195187158e-06, "loss": 0.540844202041626, "step": 426, "token_acc": 0.8282099115905589 }, { "epoch": 1.1481481481481481, "grad_norm": 1.4012353420257568, "learning_rate": 3.541537901317338e-06, "loss": 0.5333842039108276, "step": 427, "token_acc": 0.8260448746150462 }, { "epoch": 1.1508417508417508, "grad_norm": 1.2287529706954956, "learning_rate": 3.534934702600798e-06, "loss": 0.5648769736289978, "step": 428, "token_acc": 0.8178366588577429 }, { "epoch": 1.1535353535353536, "grad_norm": 1.4325664043426514, "learning_rate": 3.528322779036283e-06, "loss": 0.557217001914978, "step": 429, "token_acc": 0.8254905523255814 }, { "epoch": 1.1562289562289563, "grad_norm": 1.2846944332122803, "learning_rate": 3.521702186364536e-06, "loss": 0.544150710105896, "step": 430, "token_acc": 0.8289528927701738 }, { "epoch": 1.158922558922559, "grad_norm": 1.2782920598983765, "learning_rate": 3.5150729803993822e-06, "loss": 0.5318150520324707, "step": 431, "token_acc": 0.8285758133824432 }, { "epoch": 1.1616161616161615, "grad_norm": 1.3036065101623535, "learning_rate": 3.5084352170272583e-06, "loss": 0.5590845346450806, "step": 432, "token_acc": 0.822337493921219 }, { "epoch": 1.1643097643097644, "grad_norm": 1.4358584880828857, "learning_rate": 3.501788952206746e-06, "loss": 0.5511110424995422, "step": 433, "token_acc": 0.8255236066737665 }, { "epoch": 1.167003367003367, "grad_norm": 1.37118399143219, "learning_rate": 3.4951342419680945e-06, "loss": 0.5491899251937866, "step": 434, "token_acc": 0.8254078444984381 }, { "epoch": 1.1696969696969697, "grad_norm": 1.2891911268234253, "learning_rate": 3.4884711424127505e-06, "loss": 0.5500789284706116, "step": 435, "token_acc": 0.8237688237688238 }, { "epoch": 1.1723905723905723, "grad_norm": 1.3558974266052246, "learning_rate": 3.4817997097128887e-06, "loss": 0.5416366457939148, "step": 436, "token_acc": 0.830026455026455 }, { "epoch": 1.1750841750841752, "grad_norm": 1.3657586574554443, "learning_rate": 3.475120000110931e-06, "loss": 0.5585858225822449, "step": 437, "token_acc": 0.8221683929168059 }, { "epoch": 1.1777777777777778, "grad_norm": 1.3856815099716187, "learning_rate": 3.4684320699190793e-06, "loss": 0.5819730758666992, "step": 438, "token_acc": 0.8174688057040999 }, { "epoch": 1.1804713804713804, "grad_norm": 1.3156895637512207, "learning_rate": 3.461735975518836e-06, "loss": 0.5601409077644348, "step": 439, "token_acc": 0.8225970896283251 }, { "epoch": 1.183164983164983, "grad_norm": 1.2738596200942993, "learning_rate": 3.4550317733605323e-06, "loss": 0.5344141721725464, "step": 440, "token_acc": 0.8315357372826787 }, { "epoch": 1.185858585858586, "grad_norm": 1.2126624584197998, "learning_rate": 3.4483195199628485e-06, "loss": 0.5381360054016113, "step": 441, "token_acc": 0.8226922777820982 }, { "epoch": 1.1885521885521886, "grad_norm": 1.329675316810608, "learning_rate": 3.4415992719123407e-06, "loss": 0.5552048087120056, "step": 442, "token_acc": 0.822452616231978 }, { "epoch": 1.1912457912457912, "grad_norm": 1.272271752357483, "learning_rate": 3.4348710858629626e-06, "loss": 0.5635249614715576, "step": 443, "token_acc": 0.8222959844559585 }, { "epoch": 1.1939393939393939, "grad_norm": 1.3355112075805664, "learning_rate": 3.428135018535588e-06, "loss": 0.5478355288505554, "step": 444, "token_acc": 0.8247084233261339 }, { "epoch": 1.1966329966329967, "grad_norm": 1.2914936542510986, "learning_rate": 3.421391126717533e-06, "loss": 0.5437073707580566, "step": 445, "token_acc": 0.8206349206349206 }, { "epoch": 1.1993265993265994, "grad_norm": 1.2925519943237305, "learning_rate": 3.4146394672620733e-06, "loss": 0.5275407433509827, "step": 446, "token_acc": 0.8253012048192772 }, { "epoch": 1.202020202020202, "grad_norm": 1.2825841903686523, "learning_rate": 3.4078800970879742e-06, "loss": 0.5082290172576904, "step": 447, "token_acc": 0.836911246395386 }, { "epoch": 1.2047138047138046, "grad_norm": 1.2897424697875977, "learning_rate": 3.401113073179001e-06, "loss": 0.513004720211029, "step": 448, "token_acc": 0.8352039907357919 }, { "epoch": 1.2074074074074075, "grad_norm": 1.2468048334121704, "learning_rate": 3.3943384525834422e-06, "loss": 0.5425832271575928, "step": 449, "token_acc": 0.8297776442307693 }, { "epoch": 1.2101010101010101, "grad_norm": 1.3281177282333374, "learning_rate": 3.387556292413633e-06, "loss": 0.5295072197914124, "step": 450, "token_acc": 0.8288854003139717 }, { "epoch": 1.2101010101010101, "eval_loss": 0.6087726354598999, "eval_runtime": 10.4455, "eval_samples_per_second": 4.308, "eval_steps_per_second": 1.436, "eval_token_acc": 0.809737411428458, "step": 450 }, { "epoch": 1.2127946127946128, "grad_norm": 1.1957502365112305, "learning_rate": 3.3807666498454637e-06, "loss": 0.538082480430603, "step": 451, "token_acc": 0.8266778758582037 }, { "epoch": 1.2154882154882154, "grad_norm": 1.403494954109192, "learning_rate": 3.3739695821179097e-06, "loss": 0.5686549544334412, "step": 452, "token_acc": 0.820275693596231 }, { "epoch": 1.2181818181818183, "grad_norm": 1.2056913375854492, "learning_rate": 3.3671651465325395e-06, "loss": 0.5350559949874878, "step": 453, "token_acc": 0.825503355704698 }, { "epoch": 1.220875420875421, "grad_norm": 1.327231764793396, "learning_rate": 3.360353400453035e-06, "loss": 0.5617187023162842, "step": 454, "token_acc": 0.8209196641762304 }, { "epoch": 1.2235690235690235, "grad_norm": 1.3351339101791382, "learning_rate": 3.3535344013047107e-06, "loss": 0.5405756831169128, "step": 455, "token_acc": 0.8280983270740867 }, { "epoch": 1.2262626262626264, "grad_norm": 1.3544232845306396, "learning_rate": 3.346708206574023e-06, "loss": 0.5182971954345703, "step": 456, "token_acc": 0.830613632500425 }, { "epoch": 1.228956228956229, "grad_norm": 1.3298012018203735, "learning_rate": 3.3398748738080934e-06, "loss": 0.5590284466743469, "step": 457, "token_acc": 0.8227119480305896 }, { "epoch": 1.2316498316498317, "grad_norm": 1.311065435409546, "learning_rate": 3.333034460614217e-06, "loss": 0.5234758257865906, "step": 458, "token_acc": 0.8302091143880697 }, { "epoch": 1.2343434343434343, "grad_norm": 1.2877165079116821, "learning_rate": 3.3261870246593804e-06, "loss": 0.5486662983894348, "step": 459, "token_acc": 0.8230109023428439 }, { "epoch": 1.237037037037037, "grad_norm": 1.2355042695999146, "learning_rate": 3.319332623669773e-06, "loss": 0.5302286148071289, "step": 460, "token_acc": 0.8306136520340152 }, { "epoch": 1.2397306397306398, "grad_norm": 1.1928359270095825, "learning_rate": 3.312471315430304e-06, "loss": 0.5469911098480225, "step": 461, "token_acc": 0.8247140215083225 }, { "epoch": 1.2424242424242424, "grad_norm": 1.2898064851760864, "learning_rate": 3.305603157784111e-06, "loss": 0.5500743389129639, "step": 462, "token_acc": 0.8241236336223143 }, { "epoch": 1.245117845117845, "grad_norm": 1.3768513202667236, "learning_rate": 3.2987282086320756e-06, "loss": 0.5643160343170166, "step": 463, "token_acc": 0.82232070910556 }, { "epoch": 1.247811447811448, "grad_norm": 1.271673560142517, "learning_rate": 3.2918465259323348e-06, "loss": 0.5550126433372498, "step": 464, "token_acc": 0.8225634652938044 }, { "epoch": 1.2505050505050506, "grad_norm": 1.32992422580719, "learning_rate": 3.2849581676997884e-06, "loss": 0.5668783783912659, "step": 465, "token_acc": 0.8185292768689802 }, { "epoch": 1.2531986531986532, "grad_norm": 1.4883946180343628, "learning_rate": 3.2780631920056166e-06, "loss": 0.5236798524856567, "step": 466, "token_acc": 0.8318674750234822 }, { "epoch": 1.2558922558922558, "grad_norm": 1.340959906578064, "learning_rate": 3.271161656976787e-06, "loss": 0.5666428804397583, "step": 467, "token_acc": 0.8210294117647059 }, { "epoch": 1.2585858585858585, "grad_norm": 1.3402849435806274, "learning_rate": 3.2642536207955606e-06, "loss": 0.5475507974624634, "step": 468, "token_acc": 0.8269214650909396 }, { "epoch": 1.2612794612794613, "grad_norm": 1.4390969276428223, "learning_rate": 3.2573391416990115e-06, "loss": 0.5739935040473938, "step": 469, "token_acc": 0.8182610210838125 }, { "epoch": 1.263973063973064, "grad_norm": 1.4115692377090454, "learning_rate": 3.2504182779785233e-06, "loss": 0.5649391412734985, "step": 470, "token_acc": 0.8186932849364791 }, { "epoch": 1.2666666666666666, "grad_norm": 1.5037870407104492, "learning_rate": 3.243491087979309e-06, "loss": 0.5666574835777283, "step": 471, "token_acc": 0.8210151380231523 }, { "epoch": 1.2693602693602695, "grad_norm": 1.428099513053894, "learning_rate": 3.2365576300999117e-06, "loss": 0.5119885206222534, "step": 472, "token_acc": 0.8306535025858016 }, { "epoch": 1.272053872053872, "grad_norm": 1.2497246265411377, "learning_rate": 3.2296179627917167e-06, "loss": 0.5266587734222412, "step": 473, "token_acc": 0.8308429884139034 }, { "epoch": 1.2747474747474747, "grad_norm": 1.2414636611938477, "learning_rate": 3.2226721445584546e-06, "loss": 0.5784754753112793, "step": 474, "token_acc": 0.8196862745098039 }, { "epoch": 1.2774410774410774, "grad_norm": 1.3926820755004883, "learning_rate": 3.2157202339557124e-06, "loss": 0.5508587956428528, "step": 475, "token_acc": 0.8268912203202651 }, { "epoch": 1.28013468013468, "grad_norm": 1.309871792793274, "learning_rate": 3.208762289590436e-06, "loss": 0.52978515625, "step": 476, "token_acc": 0.830207551887972 }, { "epoch": 1.2828282828282829, "grad_norm": 1.3359417915344238, "learning_rate": 3.20179837012044e-06, "loss": 0.5271775722503662, "step": 477, "token_acc": 0.8285899094437258 }, { "epoch": 1.2855218855218855, "grad_norm": 2.925462007522583, "learning_rate": 3.1948285342539094e-06, "loss": 0.5068267583847046, "step": 478, "token_acc": 0.831629392971246 }, { "epoch": 1.2882154882154881, "grad_norm": 1.2830573320388794, "learning_rate": 3.1878528407489068e-06, "loss": 0.4996684491634369, "step": 479, "token_acc": 0.8383691886229957 }, { "epoch": 1.290909090909091, "grad_norm": 1.2911094427108765, "learning_rate": 3.180871348412878e-06, "loss": 0.53104567527771, "step": 480, "token_acc": 0.8277979797979798 }, { "epoch": 1.2936026936026936, "grad_norm": 1.318032145500183, "learning_rate": 3.173884116102153e-06, "loss": 0.5269219875335693, "step": 481, "token_acc": 0.8309563391372824 }, { "epoch": 1.2962962962962963, "grad_norm": 1.4209520816802979, "learning_rate": 3.166891202721452e-06, "loss": 0.5224370360374451, "step": 482, "token_acc": 0.8288948069241012 }, { "epoch": 1.298989898989899, "grad_norm": 1.3289130926132202, "learning_rate": 3.1598926672233916e-06, "loss": 0.5280287861824036, "step": 483, "token_acc": 0.8303893294881038 }, { "epoch": 1.3016835016835016, "grad_norm": 1.4162652492523193, "learning_rate": 3.1528885686079803e-06, "loss": 0.5535218715667725, "step": 484, "token_acc": 0.8216313398940197 }, { "epoch": 1.3043771043771044, "grad_norm": 1.2746840715408325, "learning_rate": 3.1458789659221267e-06, "loss": 0.5383344888687134, "step": 485, "token_acc": 0.8252516010978957 }, { "epoch": 1.307070707070707, "grad_norm": 1.3036350011825562, "learning_rate": 3.138863918259143e-06, "loss": 0.5644752979278564, "step": 486, "token_acc": 0.8222005525759792 }, { "epoch": 1.3097643097643097, "grad_norm": 1.3619461059570312, "learning_rate": 3.131843484758242e-06, "loss": 0.5206588506698608, "step": 487, "token_acc": 0.8331756814859403 }, { "epoch": 1.3124579124579125, "grad_norm": 1.257187843322754, "learning_rate": 3.124817724604042e-06, "loss": 0.5301422476768494, "step": 488, "token_acc": 0.8304655870445344 }, { "epoch": 1.3151515151515152, "grad_norm": 1.3443907499313354, "learning_rate": 3.1177866970260665e-06, "loss": 0.5144416689872742, "step": 489, "token_acc": 0.8325577066269546 }, { "epoch": 1.3178451178451178, "grad_norm": 1.4184023141860962, "learning_rate": 3.1107504612982442e-06, "loss": 0.5325008630752563, "step": 490, "token_acc": 0.8345790715971676 }, { "epoch": 1.3205387205387205, "grad_norm": 1.5279210805892944, "learning_rate": 3.1037090767384128e-06, "loss": 0.5602642297744751, "step": 491, "token_acc": 0.825359477124183 }, { "epoch": 1.3232323232323233, "grad_norm": 1.207060694694519, "learning_rate": 3.0966626027078146e-06, "loss": 0.5232222080230713, "step": 492, "token_acc": 0.8313089454129752 }, { "epoch": 1.325925925925926, "grad_norm": 1.3826746940612793, "learning_rate": 3.0896110986105954e-06, "loss": 0.5378683805465698, "step": 493, "token_acc": 0.8286690680705643 }, { "epoch": 1.3286195286195286, "grad_norm": 1.3640094995498657, "learning_rate": 3.082554623893312e-06, "loss": 0.5667814016342163, "step": 494, "token_acc": 0.8192684100754051 }, { "epoch": 1.3313131313131312, "grad_norm": 1.313131332397461, "learning_rate": 3.0754932380444204e-06, "loss": 0.5575794577598572, "step": 495, "token_acc": 0.8202969007726365 }, { "epoch": 1.334006734006734, "grad_norm": 1.2927592992782593, "learning_rate": 3.06842700059378e-06, "loss": 0.5171545743942261, "step": 496, "token_acc": 0.830954356846473 }, { "epoch": 1.3367003367003367, "grad_norm": 1.3114001750946045, "learning_rate": 3.0613559711121536e-06, "loss": 0.5419325828552246, "step": 497, "token_acc": 0.8285569363428861 }, { "epoch": 1.3393939393939394, "grad_norm": 1.359096884727478, "learning_rate": 3.054280209210699e-06, "loss": 0.5572642087936401, "step": 498, "token_acc": 0.821904924760602 }, { "epoch": 1.3420875420875422, "grad_norm": 1.3104668855667114, "learning_rate": 3.04719977454047e-06, "loss": 0.507584273815155, "step": 499, "token_acc": 0.835638157315247 }, { "epoch": 1.3447811447811449, "grad_norm": 1.2429931163787842, "learning_rate": 3.040114726791917e-06, "loss": 0.5190429091453552, "step": 500, "token_acc": 0.8307447232178415 }, { "epoch": 1.3447811447811449, "eval_loss": 0.6031205058097839, "eval_runtime": 10.1782, "eval_samples_per_second": 4.421, "eval_steps_per_second": 1.474, "eval_token_acc": 0.8107695055872021, "step": 500 }, { "epoch": 1.3474747474747475, "grad_norm": 1.259064793586731, "learning_rate": 3.0330251256943773e-06, "loss": 0.5427736639976501, "step": 501, "token_acc": 0.8283856818359834 }, { "epoch": 1.3501683501683501, "grad_norm": 1.3342455625534058, "learning_rate": 3.0259310310155737e-06, "loss": 0.5549718141555786, "step": 502, "token_acc": 0.8202182653647329 }, { "epoch": 1.3528619528619528, "grad_norm": 1.2504429817199707, "learning_rate": 3.018832502561114e-06, "loss": 0.5487222671508789, "step": 503, "token_acc": 0.8213312293788553 }, { "epoch": 1.3555555555555556, "grad_norm": 1.2704901695251465, "learning_rate": 3.011729600173982e-06, "loss": 0.5619911551475525, "step": 504, "token_acc": 0.8213736939151813 }, { "epoch": 1.3582491582491583, "grad_norm": 1.2943352460861206, "learning_rate": 3.004622383734037e-06, "loss": 0.5145593881607056, "step": 505, "token_acc": 0.8308698606863745 }, { "epoch": 1.360942760942761, "grad_norm": 1.319124460220337, "learning_rate": 2.997510913157506e-06, "loss": 0.5448309779167175, "step": 506, "token_acc": 0.8268982715624731 }, { "epoch": 1.3636363636363638, "grad_norm": 1.7210705280303955, "learning_rate": 2.9903952483964804e-06, "loss": 0.5392090082168579, "step": 507, "token_acc": 0.8238273142382732 }, { "epoch": 1.3663299663299664, "grad_norm": 1.5822205543518066, "learning_rate": 2.983275449438409e-06, "loss": 0.5423921942710876, "step": 508, "token_acc": 0.823862675460849 }, { "epoch": 1.369023569023569, "grad_norm": 1.2416781187057495, "learning_rate": 2.976151576305595e-06, "loss": 0.5292619466781616, "step": 509, "token_acc": 0.8302878395860285 }, { "epoch": 1.3717171717171717, "grad_norm": 1.346342921257019, "learning_rate": 2.9690236890546863e-06, "loss": 0.536871612071991, "step": 510, "token_acc": 0.8296441341733356 }, { "epoch": 1.3744107744107743, "grad_norm": 1.2723190784454346, "learning_rate": 2.961891847776174e-06, "loss": 0.5455788373947144, "step": 511, "token_acc": 0.8278379058645859 }, { "epoch": 1.3771043771043772, "grad_norm": 1.39217209815979, "learning_rate": 2.9547561125938795e-06, "loss": 0.5269730687141418, "step": 512, "token_acc": 0.830226792764668 }, { "epoch": 1.3797979797979798, "grad_norm": 1.3171322345733643, "learning_rate": 2.947616543664452e-06, "loss": 0.5228381156921387, "step": 513, "token_acc": 0.8275237273511648 }, { "epoch": 1.3824915824915824, "grad_norm": 1.208625316619873, "learning_rate": 2.940473201176863e-06, "loss": 0.5182669758796692, "step": 514, "token_acc": 0.829126213592233 }, { "epoch": 1.3851851851851853, "grad_norm": 1.3438341617584229, "learning_rate": 2.933326145351895e-06, "loss": 0.5639322996139526, "step": 515, "token_acc": 0.8174266365688487 }, { "epoch": 1.387878787878788, "grad_norm": 1.3947347402572632, "learning_rate": 2.9261754364416335e-06, "loss": 0.549400269985199, "step": 516, "token_acc": 0.8239555890101619 }, { "epoch": 1.3905723905723906, "grad_norm": 1.374576449394226, "learning_rate": 2.919021134728962e-06, "loss": 0.5237486958503723, "step": 517, "token_acc": 0.8260982809515541 }, { "epoch": 1.3932659932659932, "grad_norm": 1.30466890335083, "learning_rate": 2.9118633005270526e-06, "loss": 0.5331214666366577, "step": 518, "token_acc": 0.8332927126492811 }, { "epoch": 1.3959595959595958, "grad_norm": 1.4099949598312378, "learning_rate": 2.9047019941788584e-06, "loss": 0.5340089201927185, "step": 519, "token_acc": 0.8248608195673999 }, { "epoch": 1.3986531986531987, "grad_norm": 1.2624729871749878, "learning_rate": 2.8975372760566025e-06, "loss": 0.513485848903656, "step": 520, "token_acc": 0.836085577157732 }, { "epoch": 1.4013468013468013, "grad_norm": 1.4065229892730713, "learning_rate": 2.8903692065612696e-06, "loss": 0.5167554616928101, "step": 521, "token_acc": 0.8322787630335977 }, { "epoch": 1.404040404040404, "grad_norm": 1.3203905820846558, "learning_rate": 2.883197846122101e-06, "loss": 0.5349131226539612, "step": 522, "token_acc": 0.8233560663467205 }, { "epoch": 1.4067340067340068, "grad_norm": 1.28236722946167, "learning_rate": 2.8760232551960782e-06, "loss": 0.5219249129295349, "step": 523, "token_acc": 0.8281660104986877 }, { "epoch": 1.4094276094276095, "grad_norm": 1.4474546909332275, "learning_rate": 2.868845494267418e-06, "loss": 0.5671615600585938, "step": 524, "token_acc": 0.8192450824029771 }, { "epoch": 1.412121212121212, "grad_norm": 1.4233651161193848, "learning_rate": 2.8616646238470623e-06, "loss": 0.5198219418525696, "step": 525, "token_acc": 0.8312307692307692 }, { "epoch": 1.4148148148148147, "grad_norm": 1.2015082836151123, "learning_rate": 2.854480704472167e-06, "loss": 0.5247815847396851, "step": 526, "token_acc": 0.8337353029846246 }, { "epoch": 1.4175084175084174, "grad_norm": 1.2745591402053833, "learning_rate": 2.847293796705588e-06, "loss": 0.5137478709220886, "step": 527, "token_acc": 0.8291554508748318 }, { "epoch": 1.4202020202020202, "grad_norm": 1.418441891670227, "learning_rate": 2.840103961135381e-06, "loss": 0.5737914443016052, "step": 528, "token_acc": 0.8170220053086736 }, { "epoch": 1.4228956228956229, "grad_norm": 1.2372572422027588, "learning_rate": 2.8329112583742784e-06, "loss": 0.5425206422805786, "step": 529, "token_acc": 0.8266031513374863 }, { "epoch": 1.4255892255892255, "grad_norm": 1.2991135120391846, "learning_rate": 2.825715749059186e-06, "loss": 0.4841446280479431, "step": 530, "token_acc": 0.8445903408579908 }, { "epoch": 1.4282828282828284, "grad_norm": 1.3004472255706787, "learning_rate": 2.8185174938506714e-06, "loss": 0.502529501914978, "step": 531, "token_acc": 0.8362589138782227 }, { "epoch": 1.430976430976431, "grad_norm": 1.36200749874115, "learning_rate": 2.811316553432448e-06, "loss": 0.5651768445968628, "step": 532, "token_acc": 0.8221194677738983 }, { "epoch": 1.4336700336700336, "grad_norm": 1.2359154224395752, "learning_rate": 2.804112988510868e-06, "loss": 0.522117018699646, "step": 533, "token_acc": 0.8353997774598633 }, { "epoch": 1.4363636363636363, "grad_norm": 1.348143219947815, "learning_rate": 2.7969068598144093e-06, "loss": 0.5406180024147034, "step": 534, "token_acc": 0.8245795601552394 }, { "epoch": 1.439057239057239, "grad_norm": 1.3418415784835815, "learning_rate": 2.7896982280931627e-06, "loss": 0.562376856803894, "step": 535, "token_acc": 0.8199601835021206 }, { "epoch": 1.4417508417508418, "grad_norm": 1.3659731149673462, "learning_rate": 2.7824871541183214e-06, "loss": 0.519110381603241, "step": 536, "token_acc": 0.8348350975417723 }, { "epoch": 1.4444444444444444, "grad_norm": 1.2767136096954346, "learning_rate": 2.7752736986816655e-06, "loss": 0.5346537828445435, "step": 537, "token_acc": 0.827257564927329 }, { "epoch": 1.447138047138047, "grad_norm": 1.3904614448547363, "learning_rate": 2.7680579225950528e-06, "loss": 0.518096923828125, "step": 538, "token_acc": 0.8289608813369433 }, { "epoch": 1.44983164983165, "grad_norm": 1.2587382793426514, "learning_rate": 2.760839886689906e-06, "loss": 0.5264335870742798, "step": 539, "token_acc": 0.8294718081659106 }, { "epoch": 1.4525252525252526, "grad_norm": 1.2666003704071045, "learning_rate": 2.753619651816698e-06, "loss": 0.5176980495452881, "step": 540, "token_acc": 0.8303348124243647 }, { "epoch": 1.4552188552188552, "grad_norm": 1.2171458005905151, "learning_rate": 2.7463972788444386e-06, "loss": 0.5227718949317932, "step": 541, "token_acc": 0.8319059483027095 }, { "epoch": 1.457912457912458, "grad_norm": 1.251086711883545, "learning_rate": 2.7391728286601644e-06, "loss": 0.5308544635772705, "step": 542, "token_acc": 0.8298146655922644 }, { "epoch": 1.4606060606060607, "grad_norm": 1.2548424005508423, "learning_rate": 2.731946362168422e-06, "loss": 0.5205682516098022, "step": 543, "token_acc": 0.8365216005101227 }, { "epoch": 1.4632996632996633, "grad_norm": 1.3236373662948608, "learning_rate": 2.7247179402907566e-06, "loss": 0.5258733034133911, "step": 544, "token_acc": 0.8308457711442786 }, { "epoch": 1.465993265993266, "grad_norm": 1.2871671915054321, "learning_rate": 2.7174876239652003e-06, "loss": 0.5433669090270996, "step": 545, "token_acc": 0.8254850088183422 }, { "epoch": 1.4686868686868686, "grad_norm": 1.237703800201416, "learning_rate": 2.710255474145751e-06, "loss": 0.5113095641136169, "step": 546, "token_acc": 0.8326550155614078 }, { "epoch": 1.4713804713804715, "grad_norm": 1.3590874671936035, "learning_rate": 2.703021551801869e-06, "loss": 0.5083249807357788, "step": 547, "token_acc": 0.8392263781204737 }, { "epoch": 1.474074074074074, "grad_norm": 1.179952621459961, "learning_rate": 2.6957859179179535e-06, "loss": 0.49858736991882324, "step": 548, "token_acc": 0.8342109593549448 }, { "epoch": 1.4767676767676767, "grad_norm": 1.2405438423156738, "learning_rate": 2.688548633492835e-06, "loss": 0.5498375296592712, "step": 549, "token_acc": 0.8245572794232879 }, { "epoch": 1.4794612794612796, "grad_norm": 1.2973542213439941, "learning_rate": 2.681309759539257e-06, "loss": 0.5828981399536133, "step": 550, "token_acc": 0.8183702758730665 }, { "epoch": 1.4794612794612796, "eval_loss": 0.6028392314910889, "eval_runtime": 9.9553, "eval_samples_per_second": 4.52, "eval_steps_per_second": 1.507, "eval_token_acc": 0.8104916340829248, "step": 550 }, { "epoch": 1.4821548821548822, "grad_norm": 1.3448232412338257, "learning_rate": 2.6740693570833655e-06, "loss": 0.5428876280784607, "step": 551, "token_acc": 0.8226091567968888 }, { "epoch": 1.4848484848484849, "grad_norm": 1.3072960376739502, "learning_rate": 2.666827487164189e-06, "loss": 0.5359790325164795, "step": 552, "token_acc": 0.8246982510879383 }, { "epoch": 1.4875420875420875, "grad_norm": 1.3080604076385498, "learning_rate": 2.6595842108331306e-06, "loss": 0.5467526912689209, "step": 553, "token_acc": 0.82443724420191 }, { "epoch": 1.4902356902356901, "grad_norm": 1.2727245092391968, "learning_rate": 2.652339589153447e-06, "loss": 0.5319467782974243, "step": 554, "token_acc": 0.8287709721441759 }, { "epoch": 1.492929292929293, "grad_norm": 1.3607213497161865, "learning_rate": 2.645093683199739e-06, "loss": 0.5611762404441833, "step": 555, "token_acc": 0.82168409466153 }, { "epoch": 1.4956228956228956, "grad_norm": 1.2150683403015137, "learning_rate": 2.637846554057434e-06, "loss": 0.5310118794441223, "step": 556, "token_acc": 0.8300183598531212 }, { "epoch": 1.4983164983164983, "grad_norm": 1.3266682624816895, "learning_rate": 2.6305982628222675e-06, "loss": 0.5346218347549438, "step": 557, "token_acc": 0.8243365228380574 }, { "epoch": 1.5010101010101011, "grad_norm": 1.32683527469635, "learning_rate": 2.6233488705997785e-06, "loss": 0.5147403478622437, "step": 558, "token_acc": 0.8308162531463502 }, { "epoch": 1.5037037037037035, "grad_norm": 1.3884978294372559, "learning_rate": 2.616098438504783e-06, "loss": 0.5621322989463806, "step": 559, "token_acc": 0.8179541871464342 }, { "epoch": 1.5063973063973064, "grad_norm": 1.2559587955474854, "learning_rate": 2.6088470276608622e-06, "loss": 0.5357745885848999, "step": 560, "token_acc": 0.8261898723476202 }, { "epoch": 1.509090909090909, "grad_norm": 1.2674764394760132, "learning_rate": 2.6015946991998538e-06, "loss": 0.5309295654296875, "step": 561, "token_acc": 0.8305371834166801 }, { "epoch": 1.5117845117845117, "grad_norm": 1.3178752660751343, "learning_rate": 2.594341514261327e-06, "loss": 0.5390812754631042, "step": 562, "token_acc": 0.8273782737827379 }, { "epoch": 1.5144781144781145, "grad_norm": 1.3601493835449219, "learning_rate": 2.5870875339920727e-06, "loss": 0.531241774559021, "step": 563, "token_acc": 0.8280414438502673 }, { "epoch": 1.5171717171717172, "grad_norm": 1.180949091911316, "learning_rate": 2.579832819545588e-06, "loss": 0.5350216627120972, "step": 564, "token_acc": 0.827735806717032 }, { "epoch": 1.5198653198653198, "grad_norm": 1.3770560026168823, "learning_rate": 2.572577432081557e-06, "loss": 0.5127845406532288, "step": 565, "token_acc": 0.830991984831509 }, { "epoch": 1.5225589225589227, "grad_norm": 1.2565054893493652, "learning_rate": 2.565321432765338e-06, "loss": 0.5962499380111694, "step": 566, "token_acc": 0.8098079161816065 }, { "epoch": 1.5252525252525253, "grad_norm": 1.25298273563385, "learning_rate": 2.55806488276745e-06, "loss": 0.5310741662979126, "step": 567, "token_acc": 0.8283845425596007 }, { "epoch": 1.527946127946128, "grad_norm": 1.4767351150512695, "learning_rate": 2.550807843263051e-06, "loss": 0.5730972290039062, "step": 568, "token_acc": 0.824765478424015 }, { "epoch": 1.5306397306397308, "grad_norm": 1.3834646940231323, "learning_rate": 2.5435503754314284e-06, "loss": 0.5368469953536987, "step": 569, "token_acc": 0.8276893391220453 }, { "epoch": 1.5333333333333332, "grad_norm": 1.2553482055664062, "learning_rate": 2.536292540455479e-06, "loss": 0.5239652991294861, "step": 570, "token_acc": 0.8319018404907975 }, { "epoch": 1.536026936026936, "grad_norm": 1.2867164611816406, "learning_rate": 2.5290343995211936e-06, "loss": 0.5163215398788452, "step": 571, "token_acc": 0.8320152713389691 }, { "epoch": 1.5387205387205387, "grad_norm": 1.2642592191696167, "learning_rate": 2.5217760138171465e-06, "loss": 0.5208362936973572, "step": 572, "token_acc": 0.8308771376783041 }, { "epoch": 1.5414141414141413, "grad_norm": 1.272409439086914, "learning_rate": 2.5145174445339717e-06, "loss": 0.5341788530349731, "step": 573, "token_acc": 0.8269955744959843 }, { "epoch": 1.5441077441077442, "grad_norm": 1.2027113437652588, "learning_rate": 2.5072587528638508e-06, "loss": 0.5663927793502808, "step": 574, "token_acc": 0.8192544716669172 }, { "epoch": 1.5468013468013468, "grad_norm": 1.2623393535614014, "learning_rate": 2.5e-06, "loss": 0.5190889239311218, "step": 575, "token_acc": 0.8292145084024029 }, { "epoch": 1.5494949494949495, "grad_norm": 1.2543082237243652, "learning_rate": 2.4927412471361496e-06, "loss": 0.5369378924369812, "step": 576, "token_acc": 0.8267584354540877 }, { "epoch": 1.5521885521885523, "grad_norm": 1.3330254554748535, "learning_rate": 2.4854825554660287e-06, "loss": 0.48903486132621765, "step": 577, "token_acc": 0.8397146469207152 }, { "epoch": 1.5548821548821548, "grad_norm": 1.3317588567733765, "learning_rate": 2.478223986182854e-06, "loss": 0.5207327604293823, "step": 578, "token_acc": 0.8307900983384198 }, { "epoch": 1.5575757575757576, "grad_norm": 1.2640886306762695, "learning_rate": 2.4709656004788064e-06, "loss": 0.5404777526855469, "step": 579, "token_acc": 0.8259972489683631 }, { "epoch": 1.5602693602693603, "grad_norm": 1.2670387029647827, "learning_rate": 2.4637074595445226e-06, "loss": 0.5143177509307861, "step": 580, "token_acc": 0.8324677500418831 }, { "epoch": 1.5629629629629629, "grad_norm": 1.3045003414154053, "learning_rate": 2.4564496245685725e-06, "loss": 0.529194712638855, "step": 581, "token_acc": 0.8295454545454546 }, { "epoch": 1.5656565656565657, "grad_norm": 1.3042017221450806, "learning_rate": 2.4491921567369493e-06, "loss": 0.49675196409225464, "step": 582, "token_acc": 0.8343526574209665 }, { "epoch": 1.5683501683501684, "grad_norm": 1.653615951538086, "learning_rate": 2.441935117232551e-06, "loss": 0.6656795740127563, "step": 583, "token_acc": 0.7994200497100249 }, { "epoch": 1.571043771043771, "grad_norm": 1.288814663887024, "learning_rate": 2.4346785672346625e-06, "loss": 0.550855278968811, "step": 584, "token_acc": 0.8255979605608458 }, { "epoch": 1.5737373737373739, "grad_norm": 1.2563583850860596, "learning_rate": 2.427422567918444e-06, "loss": 0.5222804546356201, "step": 585, "token_acc": 0.831921672235605 }, { "epoch": 1.5764309764309763, "grad_norm": 1.1677101850509644, "learning_rate": 2.420167180454413e-06, "loss": 0.514311671257019, "step": 586, "token_acc": 0.8355374953823421 }, { "epoch": 1.5791245791245792, "grad_norm": 1.3954452276229858, "learning_rate": 2.4129124660079277e-06, "loss": 0.513930082321167, "step": 587, "token_acc": 0.8323309640969564 }, { "epoch": 1.5818181818181818, "grad_norm": 1.279220461845398, "learning_rate": 2.405658485738673e-06, "loss": 0.5450801253318787, "step": 588, "token_acc": 0.8231040716191788 }, { "epoch": 1.5845117845117844, "grad_norm": 1.2687897682189941, "learning_rate": 2.3984053008001466e-06, "loss": 0.5601872205734253, "step": 589, "token_acc": 0.8187315862924485 }, { "epoch": 1.5872053872053873, "grad_norm": 1.2388274669647217, "learning_rate": 2.3911529723391386e-06, "loss": 0.5085536241531372, "step": 590, "token_acc": 0.8335733039722198 }, { "epoch": 1.58989898989899, "grad_norm": 1.2651325464248657, "learning_rate": 2.383901561495219e-06, "loss": 0.5268658399581909, "step": 591, "token_acc": 0.8272446825783406 }, { "epoch": 1.5925925925925926, "grad_norm": 1.295714020729065, "learning_rate": 2.3766511294002223e-06, "loss": 0.5428168177604675, "step": 592, "token_acc": 0.8279340984820437 }, { "epoch": 1.5952861952861954, "grad_norm": 1.3340668678283691, "learning_rate": 2.369401737177733e-06, "loss": 0.5438997149467468, "step": 593, "token_acc": 0.8215983496277693 }, { "epoch": 1.5979797979797978, "grad_norm": 1.2575315237045288, "learning_rate": 2.3621534459425673e-06, "loss": 0.5158650875091553, "step": 594, "token_acc": 0.8297910341132345 }, { "epoch": 1.6006734006734007, "grad_norm": 1.281363844871521, "learning_rate": 2.354906316800261e-06, "loss": 0.5311204195022583, "step": 595, "token_acc": 0.8302522389286008 }, { "epoch": 1.6033670033670033, "grad_norm": 1.2831847667694092, "learning_rate": 2.3476604108465533e-06, "loss": 0.5498952865600586, "step": 596, "token_acc": 0.8255871709359256 }, { "epoch": 1.606060606060606, "grad_norm": 1.2498493194580078, "learning_rate": 2.34041578916687e-06, "loss": 0.5144690871238708, "step": 597, "token_acc": 0.8301239572776176 }, { "epoch": 1.6087542087542088, "grad_norm": 1.2545496225357056, "learning_rate": 2.3331725128358112e-06, "loss": 0.5205769538879395, "step": 598, "token_acc": 0.8280215550423402 }, { "epoch": 1.6114478114478115, "grad_norm": 1.2974599599838257, "learning_rate": 2.3259306429166358e-06, "loss": 0.5149669647216797, "step": 599, "token_acc": 0.8335331550308955 }, { "epoch": 1.614141414141414, "grad_norm": 1.2752381563186646, "learning_rate": 2.318690240460744e-06, "loss": 0.5539026260375977, "step": 600, "token_acc": 0.8221780604133545 }, { "epoch": 1.614141414141414, "eval_loss": 0.5976670980453491, "eval_runtime": 10.3311, "eval_samples_per_second": 4.356, "eval_steps_per_second": 1.452, "eval_token_acc": 0.8109084413393407, "step": 600 }, { "epoch": 1.616835016835017, "grad_norm": 1.3146109580993652, "learning_rate": 2.3114513665071663e-06, "loss": 0.5532636046409607, "step": 601, "token_acc": 0.8253512380656679 }, { "epoch": 1.6195286195286194, "grad_norm": 1.298060655593872, "learning_rate": 2.3042140820820474e-06, "loss": 0.540547251701355, "step": 602, "token_acc": 0.8259932776006206 }, { "epoch": 1.6222222222222222, "grad_norm": 1.254390835762024, "learning_rate": 2.2969784481981324e-06, "loss": 0.5156750082969666, "step": 603, "token_acc": 0.8339814749345966 }, { "epoch": 1.6249158249158249, "grad_norm": 1.2328174114227295, "learning_rate": 2.2897445258542496e-06, "loss": 0.5261924266815186, "step": 604, "token_acc": 0.8278160246081734 }, { "epoch": 1.6276094276094275, "grad_norm": 1.2821675539016724, "learning_rate": 2.2825123760348005e-06, "loss": 0.5245330333709717, "step": 605, "token_acc": 0.8320428336079078 }, { "epoch": 1.6303030303030304, "grad_norm": 1.3222136497497559, "learning_rate": 2.2752820597092434e-06, "loss": 0.5352270007133484, "step": 606, "token_acc": 0.8269348491473546 }, { "epoch": 1.632996632996633, "grad_norm": 1.4110769033432007, "learning_rate": 2.2680536378315786e-06, "loss": 0.5218610763549805, "step": 607, "token_acc": 0.8341478009374689 }, { "epoch": 1.6356902356902356, "grad_norm": 1.1906623840332031, "learning_rate": 2.260827171339836e-06, "loss": 0.5340770483016968, "step": 608, "token_acc": 0.8314176245210728 }, { "epoch": 1.6383838383838385, "grad_norm": 1.3124808073043823, "learning_rate": 2.2536027211555626e-06, "loss": 0.5174824595451355, "step": 609, "token_acc": 0.8308198299374298 }, { "epoch": 1.641077441077441, "grad_norm": 1.3049713373184204, "learning_rate": 2.246380348183303e-06, "loss": 0.5074042081832886, "step": 610, "token_acc": 0.8355952864689151 }, { "epoch": 1.6437710437710438, "grad_norm": 1.3064461946487427, "learning_rate": 2.2391601133100947e-06, "loss": 0.5343910455703735, "step": 611, "token_acc": 0.8336621163102659 }, { "epoch": 1.6464646464646466, "grad_norm": 1.2432670593261719, "learning_rate": 2.231942077404948e-06, "loss": 0.5154263973236084, "step": 612, "token_acc": 0.8325506197551774 }, { "epoch": 1.649158249158249, "grad_norm": 1.2782169580459595, "learning_rate": 2.2247263013183354e-06, "loss": 0.5306354761123657, "step": 613, "token_acc": 0.8307692307692308 }, { "epoch": 1.651851851851852, "grad_norm": 1.3431081771850586, "learning_rate": 2.2175128458816794e-06, "loss": 0.5445829629898071, "step": 614, "token_acc": 0.8258225739275302 }, { "epoch": 1.6545454545454545, "grad_norm": 1.338997721672058, "learning_rate": 2.2103017719068377e-06, "loss": 0.5337082743644714, "step": 615, "token_acc": 0.8259402121504339 }, { "epoch": 1.6572390572390572, "grad_norm": 1.3283079862594604, "learning_rate": 2.2030931401855907e-06, "loss": 0.5346357822418213, "step": 616, "token_acc": 0.8253311258278145 }, { "epoch": 1.65993265993266, "grad_norm": 1.3279273509979248, "learning_rate": 2.1958870114891323e-06, "loss": 0.5013105869293213, "step": 617, "token_acc": 0.8338644007788989 }, { "epoch": 1.6626262626262627, "grad_norm": 1.3095924854278564, "learning_rate": 2.188683446567553e-06, "loss": 0.533822774887085, "step": 618, "token_acc": 0.8239015512609411 }, { "epoch": 1.6653198653198653, "grad_norm": 1.3109451532363892, "learning_rate": 2.18148250614933e-06, "loss": 0.5089848041534424, "step": 619, "token_acc": 0.8327678571428572 }, { "epoch": 1.6680134680134682, "grad_norm": 1.316028356552124, "learning_rate": 2.1742842509408147e-06, "loss": 0.5245605707168579, "step": 620, "token_acc": 0.8330134357005758 }, { "epoch": 1.6707070707070706, "grad_norm": 1.248524785041809, "learning_rate": 2.1670887416257224e-06, "loss": 0.5462322235107422, "step": 621, "token_acc": 0.8228379513014273 }, { "epoch": 1.6734006734006734, "grad_norm": 1.2953842878341675, "learning_rate": 2.15989603886462e-06, "loss": 0.5244719982147217, "step": 622, "token_acc": 0.829001820890581 }, { "epoch": 1.676094276094276, "grad_norm": 1.2619190216064453, "learning_rate": 2.1527062032944125e-06, "loss": 0.5725884437561035, "step": 623, "token_acc": 0.8206841046277666 }, { "epoch": 1.6787878787878787, "grad_norm": 1.226233959197998, "learning_rate": 2.145519295527834e-06, "loss": 0.5763660669326782, "step": 624, "token_acc": 0.8239312627649834 }, { "epoch": 1.6814814814814816, "grad_norm": 1.2436288595199585, "learning_rate": 2.1383353761529377e-06, "loss": 0.5172542929649353, "step": 625, "token_acc": 0.834533637800808 }, { "epoch": 1.6841750841750842, "grad_norm": 1.2660465240478516, "learning_rate": 2.1311545057325823e-06, "loss": 0.5646339654922485, "step": 626, "token_acc": 0.8192951541850221 }, { "epoch": 1.6868686868686869, "grad_norm": 1.23878812789917, "learning_rate": 2.123976744803922e-06, "loss": 0.5387336015701294, "step": 627, "token_acc": 0.825098814229249 }, { "epoch": 1.6895622895622897, "grad_norm": 1.3175020217895508, "learning_rate": 2.1168021538779e-06, "loss": 0.5135160684585571, "step": 628, "token_acc": 0.8383429372371904 }, { "epoch": 1.6922558922558921, "grad_norm": 1.448819875717163, "learning_rate": 2.109630793438731e-06, "loss": 0.5551309585571289, "step": 629, "token_acc": 0.8227941176470588 }, { "epoch": 1.694949494949495, "grad_norm": 1.2088183164596558, "learning_rate": 2.102462723943399e-06, "loss": 0.577957034111023, "step": 630, "token_acc": 0.8213105938388261 }, { "epoch": 1.6976430976430976, "grad_norm": 1.2394400835037231, "learning_rate": 2.0952980058211425e-06, "loss": 0.5343284606933594, "step": 631, "token_acc": 0.824252518042668 }, { "epoch": 1.7003367003367003, "grad_norm": 1.3279739618301392, "learning_rate": 2.0881366994729483e-06, "loss": 0.5638749599456787, "step": 632, "token_acc": 0.823017003099087 }, { "epoch": 1.7030303030303031, "grad_norm": 1.2435907125473022, "learning_rate": 2.080978865271039e-06, "loss": 0.5501276254653931, "step": 633, "token_acc": 0.8228162884675163 }, { "epoch": 1.7057239057239058, "grad_norm": 1.4053887128829956, "learning_rate": 2.0738245635583677e-06, "loss": 0.5238338708877563, "step": 634, "token_acc": 0.8280698659139026 }, { "epoch": 1.7084175084175084, "grad_norm": 1.3361895084381104, "learning_rate": 2.066673854648106e-06, "loss": 0.5295643210411072, "step": 635, "token_acc": 0.826449119793903 }, { "epoch": 1.7111111111111112, "grad_norm": 1.2041525840759277, "learning_rate": 2.0595267988231364e-06, "loss": 0.5745410919189453, "step": 636, "token_acc": 0.8177246556373442 }, { "epoch": 1.7138047138047137, "grad_norm": 1.4238810539245605, "learning_rate": 2.052383456335549e-06, "loss": 0.5795806050300598, "step": 637, "token_acc": 0.8120729914382081 }, { "epoch": 1.7164983164983165, "grad_norm": 1.4430276155471802, "learning_rate": 2.045243887406122e-06, "loss": 0.531055748462677, "step": 638, "token_acc": 0.8295709957979087 }, { "epoch": 1.7191919191919192, "grad_norm": 1.3926464319229126, "learning_rate": 2.038108152223827e-06, "loss": 0.5470781326293945, "step": 639, "token_acc": 0.8267962513016314 }, { "epoch": 1.7218855218855218, "grad_norm": 1.273355484008789, "learning_rate": 2.030976310945314e-06, "loss": 0.541801929473877, "step": 640, "token_acc": 0.826353760676673 }, { "epoch": 1.7245791245791247, "grad_norm": 1.255687952041626, "learning_rate": 2.0238484236944057e-06, "loss": 0.5213568210601807, "step": 641, "token_acc": 0.8273381294964028 }, { "epoch": 1.7272727272727273, "grad_norm": 1.324515461921692, "learning_rate": 2.0167245505615913e-06, "loss": 0.5439770817756653, "step": 642, "token_acc": 0.8232744520771998 }, { "epoch": 1.72996632996633, "grad_norm": 1.2916676998138428, "learning_rate": 2.0096047516035205e-06, "loss": 0.591974139213562, "step": 643, "token_acc": 0.816005142214366 }, { "epoch": 1.7326599326599328, "grad_norm": 1.2665143013000488, "learning_rate": 2.002489086842494e-06, "loss": 0.5224664211273193, "step": 644, "token_acc": 0.8290283954840917 }, { "epoch": 1.7353535353535352, "grad_norm": 1.299863576889038, "learning_rate": 1.9953776162659634e-06, "loss": 0.5518234968185425, "step": 645, "token_acc": 0.8229669269234143 }, { "epoch": 1.738047138047138, "grad_norm": 1.2852531671524048, "learning_rate": 1.9882703998260185e-06, "loss": 0.5503530502319336, "step": 646, "token_acc": 0.8266446007447249 }, { "epoch": 1.7407407407407407, "grad_norm": 1.2648011445999146, "learning_rate": 1.981167497438887e-06, "loss": 0.49466848373413086, "step": 647, "token_acc": 0.8402766607688595 }, { "epoch": 1.7434343434343433, "grad_norm": 1.2010273933410645, "learning_rate": 1.974068968984427e-06, "loss": 0.5226249694824219, "step": 648, "token_acc": 0.833119931711481 }, { "epoch": 1.7461279461279462, "grad_norm": 1.3490245342254639, "learning_rate": 1.9669748743056236e-06, "loss": 0.5106188058853149, "step": 649, "token_acc": 0.8338640194793032 }, { "epoch": 1.7488215488215488, "grad_norm": 1.2186928987503052, "learning_rate": 1.9598852732080837e-06, "loss": 0.5778922438621521, "step": 650, "token_acc": 0.819358140214928 }, { "epoch": 1.7488215488215488, "eval_loss": 0.5938109159469604, "eval_runtime": 9.9367, "eval_samples_per_second": 4.529, "eval_steps_per_second": 1.51, "eval_token_acc": 0.8122382549669531, "step": 650 }, { "epoch": 1.7515151515151515, "grad_norm": 1.348225474357605, "learning_rate": 1.9528002254595307e-06, "loss": 0.5642192363739014, "step": 651, "token_acc": 0.8158061953931692 }, { "epoch": 1.7542087542087543, "grad_norm": 1.2582446336746216, "learning_rate": 1.945719790789302e-06, "loss": 0.594222903251648, "step": 652, "token_acc": 0.8187897300421035 }, { "epoch": 1.7569023569023567, "grad_norm": 1.302146553993225, "learning_rate": 1.9386440288878472e-06, "loss": 0.5239572525024414, "step": 653, "token_acc": 0.8287427230862803 }, { "epoch": 1.7595959595959596, "grad_norm": 1.2902361154556274, "learning_rate": 1.9315729994062203e-06, "loss": 0.5335549116134644, "step": 654, "token_acc": 0.8269704957061474 }, { "epoch": 1.7622895622895622, "grad_norm": 1.317023754119873, "learning_rate": 1.92450676195558e-06, "loss": 0.5502699613571167, "step": 655, "token_acc": 0.8189782288142944 }, { "epoch": 1.7649831649831649, "grad_norm": 1.2522796392440796, "learning_rate": 1.9174453761066892e-06, "loss": 0.5344890356063843, "step": 656, "token_acc": 0.8274484220826878 }, { "epoch": 1.7676767676767677, "grad_norm": 1.400010347366333, "learning_rate": 1.910388901389405e-06, "loss": 0.5248029232025146, "step": 657, "token_acc": 0.8231552162849872 }, { "epoch": 1.7703703703703704, "grad_norm": 1.1979683637619019, "learning_rate": 1.903337397292187e-06, "loss": 0.5697048306465149, "step": 658, "token_acc": 0.8234200743494424 }, { "epoch": 1.773063973063973, "grad_norm": 1.3244385719299316, "learning_rate": 1.8962909232615879e-06, "loss": 0.5362570285797119, "step": 659, "token_acc": 0.8254422476586889 }, { "epoch": 1.7757575757575759, "grad_norm": 1.3624088764190674, "learning_rate": 1.8892495387017562e-06, "loss": 0.53550785779953, "step": 660, "token_acc": 0.8256700270469634 }, { "epoch": 1.7784511784511785, "grad_norm": 1.4096314907073975, "learning_rate": 1.8822133029739343e-06, "loss": 0.5697856545448303, "step": 661, "token_acc": 0.8196874707588659 }, { "epoch": 1.7811447811447811, "grad_norm": 1.406380534172058, "learning_rate": 1.8751822753959587e-06, "loss": 0.5291132926940918, "step": 662, "token_acc": 0.8274296990626542 }, { "epoch": 1.783838383838384, "grad_norm": 1.3589434623718262, "learning_rate": 1.8681565152417585e-06, "loss": 0.5520192384719849, "step": 663, "token_acc": 0.8268347591175146 }, { "epoch": 1.7865319865319864, "grad_norm": 1.4493297338485718, "learning_rate": 1.8611360817408576e-06, "loss": 0.5677556991577148, "step": 664, "token_acc": 0.8153485545620279 }, { "epoch": 1.7892255892255893, "grad_norm": 1.362634539604187, "learning_rate": 1.8541210340778737e-06, "loss": 0.5682973265647888, "step": 665, "token_acc": 0.8178981184029371 }, { "epoch": 1.791919191919192, "grad_norm": 1.2826429605484009, "learning_rate": 1.8471114313920214e-06, "loss": 0.4945811629295349, "step": 666, "token_acc": 0.8422693266832918 }, { "epoch": 1.7946127946127945, "grad_norm": 1.2596176862716675, "learning_rate": 1.8401073327766095e-06, "loss": 0.5566763281822205, "step": 667, "token_acc": 0.8207394194908383 }, { "epoch": 1.7973063973063974, "grad_norm": 1.3357676267623901, "learning_rate": 1.8331087972785484e-06, "loss": 0.5182400941848755, "step": 668, "token_acc": 0.8333333333333334 }, { "epoch": 1.8, "grad_norm": 1.2208564281463623, "learning_rate": 1.8261158838978476e-06, "loss": 0.5057079792022705, "step": 669, "token_acc": 0.8304025009769441 }, { "epoch": 1.8026936026936027, "grad_norm": 1.2862075567245483, "learning_rate": 1.819128651587123e-06, "loss": 0.5547915697097778, "step": 670, "token_acc": 0.8239161336176262 }, { "epoch": 1.8053872053872055, "grad_norm": 1.1941677331924438, "learning_rate": 1.8121471592510939e-06, "loss": 0.5653791427612305, "step": 671, "token_acc": 0.8214179326779864 }, { "epoch": 1.808080808080808, "grad_norm": 1.2401403188705444, "learning_rate": 1.8051714657460912e-06, "loss": 0.5270607471466064, "step": 672, "token_acc": 0.831774941052118 }, { "epoch": 1.8107744107744108, "grad_norm": 1.2294210195541382, "learning_rate": 1.7982016298795608e-06, "loss": 0.5383309125900269, "step": 673, "token_acc": 0.829323720711543 }, { "epoch": 1.8134680134680135, "grad_norm": 1.3348898887634277, "learning_rate": 1.7912377104095647e-06, "loss": 0.5016952753067017, "step": 674, "token_acc": 0.8358699736941 }, { "epoch": 1.816161616161616, "grad_norm": 1.2346816062927246, "learning_rate": 1.784279766044289e-06, "loss": 0.5267403721809387, "step": 675, "token_acc": 0.8340515489064152 }, { "epoch": 1.818855218855219, "grad_norm": 1.3472524881362915, "learning_rate": 1.7773278554415463e-06, "loss": 0.5835098624229431, "step": 676, "token_acc": 0.8139516193843929 }, { "epoch": 1.8215488215488216, "grad_norm": 1.30152428150177, "learning_rate": 1.7703820372082841e-06, "loss": 0.5219746232032776, "step": 677, "token_acc": 0.8316998986143967 }, { "epoch": 1.8242424242424242, "grad_norm": 1.3686583042144775, "learning_rate": 1.7634423699000885e-06, "loss": 0.5688308477401733, "step": 678, "token_acc": 0.8201884004432951 }, { "epoch": 1.826936026936027, "grad_norm": 1.3115898370742798, "learning_rate": 1.756508912020692e-06, "loss": 0.5253725051879883, "step": 679, "token_acc": 0.8297489503898552 }, { "epoch": 1.8296296296296295, "grad_norm": 1.2729463577270508, "learning_rate": 1.7495817220214773e-06, "loss": 0.5547829866409302, "step": 680, "token_acc": 0.8240281554951208 }, { "epoch": 1.8323232323232324, "grad_norm": 1.16997230052948, "learning_rate": 1.7426608583009897e-06, "loss": 0.500565767288208, "step": 681, "token_acc": 0.8362385679668569 }, { "epoch": 1.835016835016835, "grad_norm": 1.2954226732254028, "learning_rate": 1.7357463792044396e-06, "loss": 0.5582454204559326, "step": 682, "token_acc": 0.8207501995211492 }, { "epoch": 1.8377104377104376, "grad_norm": 1.2927864789962769, "learning_rate": 1.7288383430232137e-06, "loss": 0.48352307081222534, "step": 683, "token_acc": 0.8400272874562974 }, { "epoch": 1.8404040404040405, "grad_norm": 1.1866931915283203, "learning_rate": 1.7219368079943832e-06, "loss": 0.5158557891845703, "step": 684, "token_acc": 0.833374133006936 }, { "epoch": 1.8430976430976431, "grad_norm": 1.361531138420105, "learning_rate": 1.7150418323002126e-06, "loss": 0.5504743456840515, "step": 685, "token_acc": 0.8234368167905554 }, { "epoch": 1.8457912457912458, "grad_norm": 1.2374825477600098, "learning_rate": 1.7081534740676667e-06, "loss": 0.5149903297424316, "step": 686, "token_acc": 0.834185923443288 }, { "epoch": 1.8484848484848486, "grad_norm": 1.2474849224090576, "learning_rate": 1.701271791367925e-06, "loss": 0.5427623987197876, "step": 687, "token_acc": 0.8264128757257294 }, { "epoch": 1.851178451178451, "grad_norm": 1.2757774591445923, "learning_rate": 1.6943968422158897e-06, "loss": 0.5179962515830994, "step": 688, "token_acc": 0.8321573337336736 }, { "epoch": 1.853872053872054, "grad_norm": 1.3033387660980225, "learning_rate": 1.687528684569697e-06, "loss": 0.5499054193496704, "step": 689, "token_acc": 0.8222606404873631 }, { "epoch": 1.8565656565656565, "grad_norm": 1.3101298809051514, "learning_rate": 1.6806673763302277e-06, "loss": 0.526096761226654, "step": 690, "token_acc": 0.8283924062470776 }, { "epoch": 1.8592592592592592, "grad_norm": 1.1969289779663086, "learning_rate": 1.6738129753406202e-06, "loss": 0.5161941051483154, "step": 691, "token_acc": 0.831053084199828 }, { "epoch": 1.861952861952862, "grad_norm": 1.2458161115646362, "learning_rate": 1.6669655393857834e-06, "loss": 0.5472921133041382, "step": 692, "token_acc": 0.8268748038908064 }, { "epoch": 1.8646464646464647, "grad_norm": 1.2256419658660889, "learning_rate": 1.660125126191907e-06, "loss": 0.5722250938415527, "step": 693, "token_acc": 0.8247653429602888 }, { "epoch": 1.8673400673400673, "grad_norm": 1.3248149156570435, "learning_rate": 1.6532917934259778e-06, "loss": 0.5472182035446167, "step": 694, "token_acc": 0.8264470500651405 }, { "epoch": 1.8700336700336702, "grad_norm": 1.3485087156295776, "learning_rate": 1.6464655986952908e-06, "loss": 0.5427552461624146, "step": 695, "token_acc": 0.8312013828867761 }, { "epoch": 1.8727272727272726, "grad_norm": 1.3065476417541504, "learning_rate": 1.6396465995469657e-06, "loss": 0.5140445232391357, "step": 696, "token_acc": 0.830106237148732 }, { "epoch": 1.8754208754208754, "grad_norm": 1.2525544166564941, "learning_rate": 1.6328348534674611e-06, "loss": 0.5383151173591614, "step": 697, "token_acc": 0.8297469685287727 }, { "epoch": 1.878114478114478, "grad_norm": 1.36207115650177, "learning_rate": 1.6260304178820907e-06, "loss": 0.528560996055603, "step": 698, "token_acc": 0.8284442116291252 }, { "epoch": 1.8808080808080807, "grad_norm": 1.2991206645965576, "learning_rate": 1.6192333501545365e-06, "loss": 0.5305230021476746, "step": 699, "token_acc": 0.8292322406747239 }, { "epoch": 1.8835016835016836, "grad_norm": 1.3228743076324463, "learning_rate": 1.6124437075863677e-06, "loss": 0.5044213533401489, "step": 700, "token_acc": 0.8337653360981511 }, { "epoch": 1.8835016835016836, "eval_loss": 0.5908685922622681, "eval_runtime": 10.1916, "eval_samples_per_second": 4.415, "eval_steps_per_second": 1.472, "eval_token_acc": 0.813091717444376, "step": 700 }, { "epoch": 1.8861952861952862, "grad_norm": 1.3232721090316772, "learning_rate": 1.6056615474165576e-06, "loss": 0.5056213736534119, "step": 701, "token_acc": 0.8368498982047011 }, { "epoch": 1.8888888888888888, "grad_norm": 1.2825931310653687, "learning_rate": 1.598886926821e-06, "loss": 0.5258585810661316, "step": 702, "token_acc": 0.8325717734287439 }, { "epoch": 1.8915824915824917, "grad_norm": 1.3526164293289185, "learning_rate": 1.592119902912026e-06, "loss": 0.4964277744293213, "step": 703, "token_acc": 0.8393481784830645 }, { "epoch": 1.8942760942760941, "grad_norm": 1.2393525838851929, "learning_rate": 1.585360532737928e-06, "loss": 0.5068771839141846, "step": 704, "token_acc": 0.8355304456999372 }, { "epoch": 1.896969696969697, "grad_norm": 1.2667325735092163, "learning_rate": 1.5786088732824687e-06, "loss": 0.5010797381401062, "step": 705, "token_acc": 0.8334483022917145 }, { "epoch": 1.8996632996632996, "grad_norm": 1.2145148515701294, "learning_rate": 1.5718649814644128e-06, "loss": 0.5344822406768799, "step": 706, "token_acc": 0.8261214767764986 }, { "epoch": 1.9023569023569022, "grad_norm": 1.3118435144424438, "learning_rate": 1.5651289141370382e-06, "loss": 0.5301315188407898, "step": 707, "token_acc": 0.8279668049792531 }, { "epoch": 1.905050505050505, "grad_norm": 1.34031343460083, "learning_rate": 1.5584007280876597e-06, "loss": 0.49499744176864624, "step": 708, "token_acc": 0.8381543921916593 }, { "epoch": 1.9077441077441077, "grad_norm": 1.3371686935424805, "learning_rate": 1.5516804800371522e-06, "loss": 0.5466580390930176, "step": 709, "token_acc": 0.8279836708556791 }, { "epoch": 1.9104377104377104, "grad_norm": 1.2403314113616943, "learning_rate": 1.5449682266394683e-06, "loss": 0.5002031326293945, "step": 710, "token_acc": 0.8336476182667869 }, { "epoch": 1.9131313131313132, "grad_norm": 1.3555506467819214, "learning_rate": 1.5382640244811637e-06, "loss": 0.5605737566947937, "step": 711, "token_acc": 0.8189495696935575 }, { "epoch": 1.9158249158249159, "grad_norm": 1.2753198146820068, "learning_rate": 1.5315679300809209e-06, "loss": 0.5408428907394409, "step": 712, "token_acc": 0.8257171576587026 }, { "epoch": 1.9185185185185185, "grad_norm": 1.2395925521850586, "learning_rate": 1.5248799998890695e-06, "loss": 0.5435607433319092, "step": 713, "token_acc": 0.8229604709840201 }, { "epoch": 1.9212121212121214, "grad_norm": 1.2461837530136108, "learning_rate": 1.5182002902871123e-06, "loss": 0.543999433517456, "step": 714, "token_acc": 0.8252886030835981 }, { "epoch": 1.9239057239057238, "grad_norm": 1.3267256021499634, "learning_rate": 1.5115288575872503e-06, "loss": 0.5384064316749573, "step": 715, "token_acc": 0.8225376262145171 }, { "epoch": 1.9265993265993266, "grad_norm": 1.2665374279022217, "learning_rate": 1.5048657580319065e-06, "loss": 0.5020281076431274, "step": 716, "token_acc": 0.8385009436505797 }, { "epoch": 1.9292929292929293, "grad_norm": 1.2413218021392822, "learning_rate": 1.4982110477932549e-06, "loss": 0.5284104347229004, "step": 717, "token_acc": 0.8333204304405047 }, { "epoch": 1.931986531986532, "grad_norm": 1.2744404077529907, "learning_rate": 1.491564782972742e-06, "loss": 0.5155625939369202, "step": 718, "token_acc": 0.833970552346864 }, { "epoch": 1.9346801346801348, "grad_norm": 1.2459313869476318, "learning_rate": 1.4849270196006182e-06, "loss": 0.5720392465591431, "step": 719, "token_acc": 0.8193439088248884 }, { "epoch": 1.9373737373737374, "grad_norm": 1.1901607513427734, "learning_rate": 1.4782978136354641e-06, "loss": 0.5340495109558105, "step": 720, "token_acc": 0.8253644771907822 }, { "epoch": 1.94006734006734, "grad_norm": 1.1660528182983398, "learning_rate": 1.471677220963717e-06, "loss": 0.49016767740249634, "step": 721, "token_acc": 0.8396541262135923 }, { "epoch": 1.942760942760943, "grad_norm": 1.2249945402145386, "learning_rate": 1.4650652973992024e-06, "loss": 0.52580326795578, "step": 722, "token_acc": 0.8312557008209183 }, { "epoch": 1.9454545454545453, "grad_norm": 1.534612774848938, "learning_rate": 1.4584620986826624e-06, "loss": 0.5356962084770203, "step": 723, "token_acc": 0.8272711298541556 }, { "epoch": 1.9481481481481482, "grad_norm": 1.2436175346374512, "learning_rate": 1.4518676804812849e-06, "loss": 0.5211765766143799, "step": 724, "token_acc": 0.8303682250724038 }, { "epoch": 1.9508417508417508, "grad_norm": 1.5046383142471313, "learning_rate": 1.4452820983882355e-06, "loss": 0.5644816756248474, "step": 725, "token_acc": 0.8248088895894081 }, { "epoch": 1.9535353535353535, "grad_norm": 1.193669080734253, "learning_rate": 1.438705407922188e-06, "loss": 0.5343789458274841, "step": 726, "token_acc": 0.8293079252529729 }, { "epoch": 1.9562289562289563, "grad_norm": 1.3214915990829468, "learning_rate": 1.4321376645268575e-06, "loss": 0.5013030171394348, "step": 727, "token_acc": 0.8385492583383349 }, { "epoch": 1.958922558922559, "grad_norm": 1.4772727489471436, "learning_rate": 1.425578923570532e-06, "loss": 0.5212735533714294, "step": 728, "token_acc": 0.8286651053864169 }, { "epoch": 1.9616161616161616, "grad_norm": 1.2685352563858032, "learning_rate": 1.4190292403456052e-06, "loss": 0.5028020143508911, "step": 729, "token_acc": 0.8346644010195412 }, { "epoch": 1.9643097643097645, "grad_norm": 1.3483223915100098, "learning_rate": 1.4124886700681133e-06, "loss": 0.526803195476532, "step": 730, "token_acc": 0.8280333984721976 }, { "epoch": 1.9670033670033669, "grad_norm": 1.214234471321106, "learning_rate": 1.4059572678772649e-06, "loss": 0.5196961164474487, "step": 731, "token_acc": 0.8321319018404908 }, { "epoch": 1.9696969696969697, "grad_norm": 1.3940813541412354, "learning_rate": 1.3994350888349806e-06, "loss": 0.5086055994033813, "step": 732, "token_acc": 0.8320145070886911 }, { "epoch": 1.9723905723905724, "grad_norm": 1.3331146240234375, "learning_rate": 1.3929221879254235e-06, "loss": 0.5352920889854431, "step": 733, "token_acc": 0.8305500090760574 }, { "epoch": 1.975084175084175, "grad_norm": 1.3154863119125366, "learning_rate": 1.3864186200545404e-06, "loss": 0.5398126840591431, "step": 734, "token_acc": 0.825450999512433 }, { "epoch": 1.9777777777777779, "grad_norm": 1.2367801666259766, "learning_rate": 1.3799244400496008e-06, "loss": 0.5438534617424011, "step": 735, "token_acc": 0.8239122209610291 }, { "epoch": 1.9804713804713805, "grad_norm": 1.2550886869430542, "learning_rate": 1.3734397026587274e-06, "loss": 0.5108153820037842, "step": 736, "token_acc": 0.8327215144317283 }, { "epoch": 1.9831649831649831, "grad_norm": 1.3578271865844727, "learning_rate": 1.3669644625504391e-06, "loss": 0.48666584491729736, "step": 737, "token_acc": 0.841148244801751 }, { "epoch": 1.985858585858586, "grad_norm": 1.2612732648849487, "learning_rate": 1.3604987743131904e-06, "loss": 0.5176371335983276, "step": 738, "token_acc": 0.8317484150817485 }, { "epoch": 1.9885521885521884, "grad_norm": 1.3087098598480225, "learning_rate": 1.35404269245491e-06, "loss": 0.49360013008117676, "step": 739, "token_acc": 0.840167714884696 }, { "epoch": 1.9912457912457913, "grad_norm": 1.2846872806549072, "learning_rate": 1.3475962714025403e-06, "loss": 0.5183590650558472, "step": 740, "token_acc": 0.8321240105540897 }, { "epoch": 1.993939393939394, "grad_norm": 1.155347228050232, "learning_rate": 1.341159565501583e-06, "loss": 0.5489883422851562, "step": 741, "token_acc": 0.8227894124490983 }, { "epoch": 1.9966329966329965, "grad_norm": 1.261084794998169, "learning_rate": 1.3347326290156364e-06, "loss": 0.509272575378418, "step": 742, "token_acc": 0.831580575317043 }, { "epoch": 1.9993265993265994, "grad_norm": 1.1923277378082275, "learning_rate": 1.3283155161259364e-06, "loss": 0.5400043725967407, "step": 743, "token_acc": 0.8252063015753939 }, { "epoch": 2.0, "grad_norm": 2.4817962646484375, "learning_rate": 1.3219082809309063e-06, "loss": 0.4832876920700073, "step": 744, "token_acc": 0.8439765274421815 }, { "epoch": 2.002693602693603, "grad_norm": 1.5201246738433838, "learning_rate": 1.3155109774456959e-06, "loss": 0.46549856662750244, "step": 745, "token_acc": 0.8457242582897033 }, { "epoch": 2.0053872053872053, "grad_norm": 1.3259707689285278, "learning_rate": 1.3091236596017261e-06, "loss": 0.4740462005138397, "step": 746, "token_acc": 0.8441335740072202 }, { "epoch": 2.008080808080808, "grad_norm": 1.2127329111099243, "learning_rate": 1.3027463812462393e-06, "loss": 0.457489550113678, "step": 747, "token_acc": 0.8515366430260047 }, { "epoch": 2.010774410774411, "grad_norm": 1.29241144657135, "learning_rate": 1.2963791961418377e-06, "loss": 0.48088178038597107, "step": 748, "token_acc": 0.8416610398379474 }, { "epoch": 2.0134680134680134, "grad_norm": 1.3490641117095947, "learning_rate": 1.2900221579660349e-06, "loss": 0.49098795652389526, "step": 749, "token_acc": 0.8423091110291405 }, { "epoch": 2.0161616161616163, "grad_norm": 1.2735066413879395, "learning_rate": 1.2836753203108038e-06, "loss": 0.4601465165615082, "step": 750, "token_acc": 0.8497678885865215 }, { "epoch": 2.0161616161616163, "eval_loss": 0.591926634311676, "eval_runtime": 10.4395, "eval_samples_per_second": 4.311, "eval_steps_per_second": 1.437, "eval_token_acc": 0.8128733898338726, "step": 750 }, { "epoch": 2.0188552188552187, "grad_norm": 1.3356337547302246, "learning_rate": 1.2773387366821221e-06, "loss": 0.4521638751029968, "step": 751, "token_acc": 0.8504950495049505 }, { "epoch": 2.0215488215488215, "grad_norm": 1.299659013748169, "learning_rate": 1.2710124604995236e-06, "loss": 0.4591137766838074, "step": 752, "token_acc": 0.8477296005462616 }, { "epoch": 2.0242424242424244, "grad_norm": 1.251734972000122, "learning_rate": 1.2646965450956461e-06, "loss": 0.49253854155540466, "step": 753, "token_acc": 0.8398543426047881 }, { "epoch": 2.026936026936027, "grad_norm": 1.2952572107315063, "learning_rate": 1.2583910437157826e-06, "loss": 0.46003422141075134, "step": 754, "token_acc": 0.8512418300653595 }, { "epoch": 2.0296296296296297, "grad_norm": 1.3239188194274902, "learning_rate": 1.2520960095174328e-06, "loss": 0.4658181369304657, "step": 755, "token_acc": 0.844964095010127 }, { "epoch": 2.0323232323232325, "grad_norm": 1.3352136611938477, "learning_rate": 1.2458114955698542e-06, "loss": 0.44239068031311035, "step": 756, "token_acc": 0.8557759831460674 }, { "epoch": 2.035016835016835, "grad_norm": 1.327793836593628, "learning_rate": 1.2395375548536146e-06, "loss": 0.44215822219848633, "step": 757, "token_acc": 0.8547748039136411 }, { "epoch": 2.037710437710438, "grad_norm": 1.27240788936615, "learning_rate": 1.233274240260148e-06, "loss": 0.4459666907787323, "step": 758, "token_acc": 0.8562292358803987 }, { "epoch": 2.04040404040404, "grad_norm": 2.3232054710388184, "learning_rate": 1.2270216045913045e-06, "loss": 0.4521208703517914, "step": 759, "token_acc": 0.8478436439993539 }, { "epoch": 2.043097643097643, "grad_norm": 1.4111300706863403, "learning_rate": 1.2207797005589076e-06, "loss": 0.495017945766449, "step": 760, "token_acc": 0.8407483100141487 }, { "epoch": 2.045791245791246, "grad_norm": 1.354033350944519, "learning_rate": 1.2145485807843108e-06, "loss": 0.4807535409927368, "step": 761, "token_acc": 0.8409781045229343 }, { "epoch": 2.0484848484848484, "grad_norm": 1.3360085487365723, "learning_rate": 1.2083282977979489e-06, "loss": 0.40576547384262085, "step": 762, "token_acc": 0.8633187772925764 }, { "epoch": 2.051178451178451, "grad_norm": 1.3525052070617676, "learning_rate": 1.2021189040389043e-06, "loss": 0.4770188331604004, "step": 763, "token_acc": 0.8465214222078238 }, { "epoch": 2.053872053872054, "grad_norm": 1.4380874633789062, "learning_rate": 1.1959204518544557e-06, "loss": 0.46124327182769775, "step": 764, "token_acc": 0.8492949710652865 }, { "epoch": 2.0565656565656565, "grad_norm": 1.3977106809616089, "learning_rate": 1.1897329934996425e-06, "loss": 0.45053279399871826, "step": 765, "token_acc": 0.8521248915871639 }, { "epoch": 2.0592592592592593, "grad_norm": 1.4043830633163452, "learning_rate": 1.1835565811368214e-06, "loss": 0.4243241548538208, "step": 766, "token_acc": 0.8593791456619793 }, { "epoch": 2.0619528619528618, "grad_norm": 1.3883718252182007, "learning_rate": 1.1773912668352277e-06, "loss": 0.46199798583984375, "step": 767, "token_acc": 0.8463699038216018 }, { "epoch": 2.0646464646464646, "grad_norm": 1.3109889030456543, "learning_rate": 1.1712371025705366e-06, "loss": 0.43962642550468445, "step": 768, "token_acc": 0.8544589383870703 }, { "epoch": 2.0673400673400675, "grad_norm": 1.3482389450073242, "learning_rate": 1.1650941402244256e-06, "loss": 0.4773801267147064, "step": 769, "token_acc": 0.8456292106795309 }, { "epoch": 2.07003367003367, "grad_norm": 1.3810179233551025, "learning_rate": 1.1589624315841364e-06, "loss": 0.4833033084869385, "step": 770, "token_acc": 0.8437526507761473 }, { "epoch": 2.0727272727272728, "grad_norm": 1.3291088342666626, "learning_rate": 1.1528420283420344e-06, "loss": 0.4300866723060608, "step": 771, "token_acc": 0.8581143480816052 }, { "epoch": 2.0754208754208756, "grad_norm": 1.3594409227371216, "learning_rate": 1.146732982095181e-06, "loss": 0.43691128492355347, "step": 772, "token_acc": 0.8547653770266792 }, { "epoch": 2.078114478114478, "grad_norm": 1.2811214923858643, "learning_rate": 1.140635344344892e-06, "loss": 0.49896928668022156, "step": 773, "token_acc": 0.8399723693299562 }, { "epoch": 2.080808080808081, "grad_norm": 1.3307174444198608, "learning_rate": 1.1345491664963079e-06, "loss": 0.4128844141960144, "step": 774, "token_acc": 0.8628727594244836 }, { "epoch": 2.0835016835016833, "grad_norm": 1.4902700185775757, "learning_rate": 1.1284744998579563e-06, "loss": 0.4534751772880554, "step": 775, "token_acc": 0.8469096105752054 }, { "epoch": 2.086195286195286, "grad_norm": 1.3512163162231445, "learning_rate": 1.1224113956413223e-06, "loss": 0.4707953631877899, "step": 776, "token_acc": 0.8491567502782296 }, { "epoch": 2.088888888888889, "grad_norm": 1.320871114730835, "learning_rate": 1.1163599049604153e-06, "loss": 0.4697865843772888, "step": 777, "token_acc": 0.8442389758179232 }, { "epoch": 2.0915824915824914, "grad_norm": 1.3463060855865479, "learning_rate": 1.1103200788313395e-06, "loss": 0.45691460371017456, "step": 778, "token_acc": 0.8487209893274607 }, { "epoch": 2.0942760942760943, "grad_norm": 1.3180919885635376, "learning_rate": 1.1042919681718622e-06, "loss": 0.4619991183280945, "step": 779, "token_acc": 0.8504467666212328 }, { "epoch": 2.096969696969697, "grad_norm": 1.365323781967163, "learning_rate": 1.0982756238009862e-06, "loss": 0.4676735997200012, "step": 780, "token_acc": 0.8483736854976767 }, { "epoch": 2.0996632996632996, "grad_norm": 1.340248942375183, "learning_rate": 1.0922710964385196e-06, "loss": 0.4222599267959595, "step": 781, "token_acc": 0.856798937406608 }, { "epoch": 2.1023569023569024, "grad_norm": 1.347386121749878, "learning_rate": 1.08627843670465e-06, "loss": 0.44251081347465515, "step": 782, "token_acc": 0.8517746975402612 }, { "epoch": 2.105050505050505, "grad_norm": 1.365870714187622, "learning_rate": 1.0802976951195162e-06, "loss": 0.4731717109680176, "step": 783, "token_acc": 0.8458721704394141 }, { "epoch": 2.1077441077441077, "grad_norm": 1.2980282306671143, "learning_rate": 1.0743289221027835e-06, "loss": 0.43013912439346313, "step": 784, "token_acc": 0.8557708229839377 }, { "epoch": 2.1104377104377106, "grad_norm": 1.3515002727508545, "learning_rate": 1.068372167973217e-06, "loss": 0.4812471866607666, "step": 785, "token_acc": 0.8425378885178526 }, { "epoch": 2.113131313131313, "grad_norm": 1.3523588180541992, "learning_rate": 1.0624274829482612e-06, "loss": 0.4659135043621063, "step": 786, "token_acc": 0.8484217272872937 }, { "epoch": 2.115824915824916, "grad_norm": 1.287131428718567, "learning_rate": 1.0564949171436115e-06, "loss": 0.46489572525024414, "step": 787, "token_acc": 0.8497056799938842 }, { "epoch": 2.1185185185185187, "grad_norm": 1.3072798252105713, "learning_rate": 1.0505745205727947e-06, "loss": 0.4592118263244629, "step": 788, "token_acc": 0.8513568576318149 }, { "epoch": 2.121212121212121, "grad_norm": 1.3331784009933472, "learning_rate": 1.044666343146748e-06, "loss": 0.45482009649276733, "step": 789, "token_acc": 0.8482496435925867 }, { "epoch": 2.123905723905724, "grad_norm": 1.4434539079666138, "learning_rate": 1.038770434673394e-06, "loss": 0.48350390791893005, "step": 790, "token_acc": 0.840907116169954 }, { "epoch": 2.126599326599327, "grad_norm": 1.2732055187225342, "learning_rate": 1.0328868448572283e-06, "loss": 0.4215206503868103, "step": 791, "token_acc": 0.8594700097590271 }, { "epoch": 2.1292929292929292, "grad_norm": 1.2999489307403564, "learning_rate": 1.027015623298893e-06, "loss": 0.5205777883529663, "step": 792, "token_acc": 0.8326462660535913 }, { "epoch": 2.131986531986532, "grad_norm": 1.3253458738327026, "learning_rate": 1.021156819494763e-06, "loss": 0.47617095708847046, "step": 793, "token_acc": 0.845251857968621 }, { "epoch": 2.1346801346801345, "grad_norm": 1.3700222969055176, "learning_rate": 1.0153104828365263e-06, "loss": 0.460770845413208, "step": 794, "token_acc": 0.8474403599626454 }, { "epoch": 2.1373737373737374, "grad_norm": 1.3877501487731934, "learning_rate": 1.0094766626107693e-06, "loss": 0.4602683186531067, "step": 795, "token_acc": 0.8503518105371546 }, { "epoch": 2.1400673400673402, "grad_norm": 1.3150943517684937, "learning_rate": 1.0036554079985597e-06, "loss": 0.4521007537841797, "step": 796, "token_acc": 0.851318338808862 }, { "epoch": 2.1427609427609426, "grad_norm": 1.4103986024856567, "learning_rate": 9.97846768075036e-07, "loss": 0.4490131735801697, "step": 797, "token_acc": 0.8515500608785239 }, { "epoch": 2.1454545454545455, "grad_norm": 1.3065500259399414, "learning_rate": 9.920507918089867e-07, "loss": 0.4458083510398865, "step": 798, "token_acc": 0.8549029367844699 }, { "epoch": 2.148148148148148, "grad_norm": 1.4567983150482178, "learning_rate": 9.862675280624445e-07, "loss": 0.4766731262207031, "step": 799, "token_acc": 0.8456146207126763 }, { "epoch": 2.1508417508417508, "grad_norm": 1.4127930402755737, "learning_rate": 9.804970255902677e-07, "loss": 0.5290741324424744, "step": 800, "token_acc": 0.83719606408067 }, { "epoch": 2.1508417508417508, "eval_loss": 0.6022514700889587, "eval_runtime": 9.9907, "eval_samples_per_second": 4.504, "eval_steps_per_second": 1.501, "eval_token_acc": 0.8126947581525514, "step": 800 }, { "epoch": 2.1535353535353536, "grad_norm": 1.2756026983261108, "learning_rate": 9.74739333039735e-07, "loss": 0.4524504840373993, "step": 801, "token_acc": 0.8523999686790384 }, { "epoch": 2.156228956228956, "grad_norm": 1.3844348192214966, "learning_rate": 9.689944989501352e-07, "loss": 0.4393746256828308, "step": 802, "token_acc": 0.8529206016089542 }, { "epoch": 2.158922558922559, "grad_norm": 1.3835124969482422, "learning_rate": 9.632625717523523e-07, "loss": 0.44908076524734497, "step": 803, "token_acc": 0.8519647696476965 }, { "epoch": 2.1616161616161618, "grad_norm": 1.444185733795166, "learning_rate": 9.575435997684626e-07, "loss": 0.4466667175292969, "step": 804, "token_acc": 0.8521380279216788 }, { "epoch": 2.164309764309764, "grad_norm": 1.2958163022994995, "learning_rate": 9.518376312113254e-07, "loss": 0.48053178191185, "step": 805, "token_acc": 0.8461009174311926 }, { "epoch": 2.167003367003367, "grad_norm": 1.2664419412612915, "learning_rate": 9.461447141841768e-07, "loss": 0.4667130708694458, "step": 806, "token_acc": 0.8458058295631536 }, { "epoch": 2.16969696969697, "grad_norm": 1.4141355752944946, "learning_rate": 9.404648966802224e-07, "loss": 0.4576037526130676, "step": 807, "token_acc": 0.848497123875881 }, { "epoch": 2.1723905723905723, "grad_norm": 1.4003151655197144, "learning_rate": 9.347982265822395e-07, "loss": 0.46019813418388367, "step": 808, "token_acc": 0.8516360376512775 }, { "epoch": 2.175084175084175, "grad_norm": 1.3728474378585815, "learning_rate": 9.291447516621616e-07, "loss": 0.4723733365535736, "step": 809, "token_acc": 0.8496134342006817 }, { "epoch": 2.1777777777777776, "grad_norm": 1.2846670150756836, "learning_rate": 9.235045195806868e-07, "loss": 0.4514288902282715, "step": 810, "token_acc": 0.8517991004497751 }, { "epoch": 2.1804713804713804, "grad_norm": 1.4378248453140259, "learning_rate": 9.178775778868704e-07, "loss": 0.45471686124801636, "step": 811, "token_acc": 0.8521775197013687 }, { "epoch": 2.1831649831649833, "grad_norm": 1.5887566804885864, "learning_rate": 9.122639740177252e-07, "loss": 0.4699516296386719, "step": 812, "token_acc": 0.8463956000785701 }, { "epoch": 2.1858585858585857, "grad_norm": 1.3284804821014404, "learning_rate": 9.066637552978211e-07, "loss": 0.45408710837364197, "step": 813, "token_acc": 0.8497267759562842 }, { "epoch": 2.1885521885521886, "grad_norm": 1.2986119985580444, "learning_rate": 9.010769689388885e-07, "loss": 0.4095146059989929, "step": 814, "token_acc": 0.8644397600129723 }, { "epoch": 2.1912457912457914, "grad_norm": 1.2913188934326172, "learning_rate": 8.955036620394172e-07, "loss": 0.46850842237472534, "step": 815, "token_acc": 0.8480884528536946 }, { "epoch": 2.193939393939394, "grad_norm": 1.2667969465255737, "learning_rate": 8.899438815842601e-07, "loss": 0.4501957297325134, "step": 816, "token_acc": 0.8509140977587394 }, { "epoch": 2.1966329966329967, "grad_norm": 1.3271890878677368, "learning_rate": 8.843976744442387e-07, "loss": 0.4614192247390747, "step": 817, "token_acc": 0.8503440366972477 }, { "epoch": 2.199326599326599, "grad_norm": 1.3917871713638306, "learning_rate": 8.788650873757462e-07, "loss": 0.44763433933258057, "step": 818, "token_acc": 0.8540184921763869 }, { "epoch": 2.202020202020202, "grad_norm": 1.4307794570922852, "learning_rate": 8.733461670203545e-07, "loss": 0.4758141040802002, "step": 819, "token_acc": 0.8440308087291399 }, { "epoch": 2.204713804713805, "grad_norm": 1.3300832509994507, "learning_rate": 8.678409599044196e-07, "loss": 0.4450075030326843, "step": 820, "token_acc": 0.8542254690353644 }, { "epoch": 2.2074074074074073, "grad_norm": 1.4743781089782715, "learning_rate": 8.623495124386916e-07, "loss": 0.47731345891952515, "step": 821, "token_acc": 0.8429677651719791 }, { "epoch": 2.21010101010101, "grad_norm": 1.3866221904754639, "learning_rate": 8.568718709179211e-07, "loss": 0.47740742564201355, "step": 822, "token_acc": 0.84375 }, { "epoch": 2.212794612794613, "grad_norm": 1.231095314025879, "learning_rate": 8.514080815204703e-07, "loss": 0.44215211272239685, "step": 823, "token_acc": 0.8542775873386332 }, { "epoch": 2.2154882154882154, "grad_norm": 1.2378566265106201, "learning_rate": 8.459581903079228e-07, "loss": 0.45330843329429626, "step": 824, "token_acc": 0.8464812712826334 }, { "epoch": 2.2181818181818183, "grad_norm": 1.4313534498214722, "learning_rate": 8.405222432246976e-07, "loss": 0.45744794607162476, "step": 825, "token_acc": 0.8494499645138396 }, { "epoch": 2.2208754208754207, "grad_norm": 1.2900134325027466, "learning_rate": 8.351002860976582e-07, "loss": 0.42397648096084595, "step": 826, "token_acc": 0.8588516746411483 }, { "epoch": 2.2235690235690235, "grad_norm": 1.5118861198425293, "learning_rate": 8.296923646357292e-07, "loss": 0.46082139015197754, "step": 827, "token_acc": 0.8493660024787874 }, { "epoch": 2.2262626262626264, "grad_norm": 1.503380537033081, "learning_rate": 8.242985244295077e-07, "loss": 0.4637534022331238, "step": 828, "token_acc": 0.8446020633750921 }, { "epoch": 2.228956228956229, "grad_norm": 1.3085763454437256, "learning_rate": 8.189188109508825e-07, "loss": 0.46453648805618286, "step": 829, "token_acc": 0.8482813117344923 }, { "epoch": 2.2316498316498317, "grad_norm": 1.4059942960739136, "learning_rate": 8.135532695526507e-07, "loss": 0.461564302444458, "step": 830, "token_acc": 0.8483900954956478 }, { "epoch": 2.2343434343434345, "grad_norm": 1.3793410062789917, "learning_rate": 8.082019454681328e-07, "loss": 0.4185619354248047, "step": 831, "token_acc": 0.8574900972272236 }, { "epoch": 2.237037037037037, "grad_norm": 1.3347095251083374, "learning_rate": 8.028648838107916e-07, "loss": 0.46459540724754333, "step": 832, "token_acc": 0.8487435181491823 }, { "epoch": 2.23973063973064, "grad_norm": 1.3014055490493774, "learning_rate": 7.975421295738542e-07, "loss": 0.4530065953731537, "step": 833, "token_acc": 0.8489507264201707 }, { "epoch": 2.242424242424242, "grad_norm": 1.4138000011444092, "learning_rate": 7.922337276299305e-07, "loss": 0.49346980452537537, "step": 834, "token_acc": 0.8456919060052219 }, { "epoch": 2.245117845117845, "grad_norm": 1.344438076019287, "learning_rate": 7.869397227306352e-07, "loss": 0.4876825511455536, "step": 835, "token_acc": 0.8429926238145417 }, { "epoch": 2.247811447811448, "grad_norm": 1.2805230617523193, "learning_rate": 7.81660159506214e-07, "loss": 0.4434286952018738, "step": 836, "token_acc": 0.8529250528624012 }, { "epoch": 2.2505050505050503, "grad_norm": 1.294522762298584, "learning_rate": 7.76395082465162e-07, "loss": 0.44889816641807556, "step": 837, "token_acc": 0.8515925005647165 }, { "epoch": 2.253198653198653, "grad_norm": 1.4283506870269775, "learning_rate": 7.711445359938499e-07, "loss": 0.4760248064994812, "step": 838, "token_acc": 0.8434281005356407 }, { "epoch": 2.255892255892256, "grad_norm": 1.2840797901153564, "learning_rate": 7.659085643561526e-07, "loss": 0.47841233015060425, "step": 839, "token_acc": 0.8491068630523347 }, { "epoch": 2.2585858585858585, "grad_norm": 1.3654531240463257, "learning_rate": 7.606872116930733e-07, "loss": 0.42784351110458374, "step": 840, "token_acc": 0.8596114244784672 }, { "epoch": 2.2612794612794613, "grad_norm": 1.3194488286972046, "learning_rate": 7.554805220223743e-07, "loss": 0.4382828176021576, "step": 841, "token_acc": 0.8525582803511959 }, { "epoch": 2.263973063973064, "grad_norm": 1.5039162635803223, "learning_rate": 7.502885392382017e-07, "loss": 0.45981162786483765, "step": 842, "token_acc": 0.8490455784962991 }, { "epoch": 2.2666666666666666, "grad_norm": 1.2707375288009644, "learning_rate": 7.451113071107182e-07, "loss": 0.4490881860256195, "step": 843, "token_acc": 0.8533079847908746 }, { "epoch": 2.2693602693602695, "grad_norm": 1.3103524446487427, "learning_rate": 7.39948869285734e-07, "loss": 0.47044140100479126, "step": 844, "token_acc": 0.8487388185994037 }, { "epoch": 2.272053872053872, "grad_norm": 1.376499891281128, "learning_rate": 7.348012692843376e-07, "loss": 0.46952003240585327, "step": 845, "token_acc": 0.8475584902584169 }, { "epoch": 2.2747474747474747, "grad_norm": 1.5059535503387451, "learning_rate": 7.296685505025303e-07, "loss": 0.46225064992904663, "step": 846, "token_acc": 0.8477751756440282 }, { "epoch": 2.2774410774410776, "grad_norm": 1.3220504522323608, "learning_rate": 7.245507562108592e-07, "loss": 0.45228156447410583, "step": 847, "token_acc": 0.8521097571085137 }, { "epoch": 2.28013468013468, "grad_norm": 1.3019322156906128, "learning_rate": 7.194479295540535e-07, "loss": 0.46064242720603943, "step": 848, "token_acc": 0.8524292666831642 }, { "epoch": 2.282828282828283, "grad_norm": 1.7063854932785034, "learning_rate": 7.143601135506598e-07, "loss": 0.4349842071533203, "step": 849, "token_acc": 0.8575779290576854 }, { "epoch": 2.2855218855218853, "grad_norm": 1.2563503980636597, "learning_rate": 7.092873510926801e-07, "loss": 0.4257044196128845, "step": 850, "token_acc": 0.8611906789502206 }, { "epoch": 2.2855218855218853, "eval_loss": 0.6023553609848022, "eval_runtime": 9.8902, "eval_samples_per_second": 4.55, "eval_steps_per_second": 1.517, "eval_token_acc": 0.8122184070023619, "step": 850 }, { "epoch": 2.288215488215488, "grad_norm": 1.372205376625061, "learning_rate": 7.042296849452093e-07, "loss": 0.467682421207428, "step": 851, "token_acc": 0.8497198626423279 }, { "epoch": 2.290909090909091, "grad_norm": 1.3362534046173096, "learning_rate": 6.991871577460751e-07, "loss": 0.42746520042419434, "step": 852, "token_acc": 0.8592249368155013 }, { "epoch": 2.2936026936026934, "grad_norm": 1.3404808044433594, "learning_rate": 6.941598120054815e-07, "loss": 0.4520101547241211, "step": 853, "token_acc": 0.8519880290722531 }, { "epoch": 2.2962962962962963, "grad_norm": 1.5364707708358765, "learning_rate": 6.891476901056446e-07, "loss": 0.4610576629638672, "step": 854, "token_acc": 0.8434170471841704 }, { "epoch": 2.298989898989899, "grad_norm": 1.2862346172332764, "learning_rate": 6.841508343004399e-07, "loss": 0.47874677181243896, "step": 855, "token_acc": 0.8417160846328429 }, { "epoch": 2.3016835016835016, "grad_norm": 1.3739854097366333, "learning_rate": 6.791692867150429e-07, "loss": 0.47005438804626465, "step": 856, "token_acc": 0.8483781127129751 }, { "epoch": 2.3043771043771044, "grad_norm": 1.3806734085083008, "learning_rate": 6.74203089345577e-07, "loss": 0.4687688946723938, "step": 857, "token_acc": 0.8475224476222148 }, { "epoch": 2.3070707070707073, "grad_norm": 1.445547342300415, "learning_rate": 6.69252284058759e-07, "loss": 0.48147639632225037, "step": 858, "token_acc": 0.8418171340760661 }, { "epoch": 2.3097643097643097, "grad_norm": 1.3702841997146606, "learning_rate": 6.643169125915442e-07, "loss": 0.5029208660125732, "step": 859, "token_acc": 0.8394388152766953 }, { "epoch": 2.3124579124579125, "grad_norm": 1.334344506263733, "learning_rate": 6.593970165507752e-07, "loss": 0.440528005361557, "step": 860, "token_acc": 0.8553194993412385 }, { "epoch": 2.315151515151515, "grad_norm": 1.4151922464370728, "learning_rate": 6.544926374128321e-07, "loss": 0.4657013416290283, "step": 861, "token_acc": 0.846997893258427 }, { "epoch": 2.317845117845118, "grad_norm": 1.325297236442566, "learning_rate": 6.496038165232821e-07, "loss": 0.42371731996536255, "step": 862, "token_acc": 0.8604111918379966 }, { "epoch": 2.3205387205387207, "grad_norm": 1.3620318174362183, "learning_rate": 6.447305950965307e-07, "loss": 0.45599111914634705, "step": 863, "token_acc": 0.8482734866098338 }, { "epoch": 2.323232323232323, "grad_norm": 1.4733456373214722, "learning_rate": 6.398730142154766e-07, "loss": 0.4494168162345886, "step": 864, "token_acc": 0.854647989812625 }, { "epoch": 2.325925925925926, "grad_norm": 1.313315987586975, "learning_rate": 6.350311148311619e-07, "loss": 0.4472826421260834, "step": 865, "token_acc": 0.8516280631084256 }, { "epoch": 2.328619528619529, "grad_norm": 1.479888677597046, "learning_rate": 6.30204937762427e-07, "loss": 0.46487200260162354, "step": 866, "token_acc": 0.8468937875751503 }, { "epoch": 2.3313131313131312, "grad_norm": 1.3210628032684326, "learning_rate": 6.253945236955706e-07, "loss": 0.42386215925216675, "step": 867, "token_acc": 0.8580370942812983 }, { "epoch": 2.334006734006734, "grad_norm": 1.538174033164978, "learning_rate": 6.205999131840015e-07, "loss": 0.4789707362651825, "step": 868, "token_acc": 0.8452251360712518 }, { "epoch": 2.3367003367003365, "grad_norm": 1.3956888914108276, "learning_rate": 6.158211466479022e-07, "loss": 0.4531639516353607, "step": 869, "token_acc": 0.8502792305646663 }, { "epoch": 2.3393939393939394, "grad_norm": 1.2421443462371826, "learning_rate": 6.11058264373883e-07, "loss": 0.45980918407440186, "step": 870, "token_acc": 0.8500284575981787 }, { "epoch": 2.342087542087542, "grad_norm": 1.3761396408081055, "learning_rate": 6.063113065146448e-07, "loss": 0.45985326170921326, "step": 871, "token_acc": 0.849802371541502 }, { "epoch": 2.3447811447811446, "grad_norm": 1.26430082321167, "learning_rate": 6.015803130886402e-07, "loss": 0.40828320384025574, "step": 872, "token_acc": 0.8613094333890323 }, { "epoch": 2.3474747474747475, "grad_norm": 1.3693711757659912, "learning_rate": 5.96865323979737e-07, "loss": 0.4402793347835541, "step": 873, "token_acc": 0.8529512111907199 }, { "epoch": 2.3501683501683504, "grad_norm": 1.2828086614608765, "learning_rate": 5.921663789368806e-07, "loss": 0.44907480478286743, "step": 874, "token_acc": 0.8534435691443725 }, { "epoch": 2.3528619528619528, "grad_norm": 1.2835546731948853, "learning_rate": 5.874835175737598e-07, "loss": 0.4376440644264221, "step": 875, "token_acc": 0.8549034175334324 }, { "epoch": 2.3555555555555556, "grad_norm": 1.331415057182312, "learning_rate": 5.828167793684722e-07, "loss": 0.4712797999382019, "step": 876, "token_acc": 0.8490415335463258 }, { "epoch": 2.3582491582491585, "grad_norm": 1.353811264038086, "learning_rate": 5.781662036631921e-07, "loss": 0.4760933518409729, "step": 877, "token_acc": 0.8459950781932206 }, { "epoch": 2.360942760942761, "grad_norm": 1.3675670623779297, "learning_rate": 5.735318296638389e-07, "loss": 0.49946328997612, "step": 878, "token_acc": 0.8390758005674909 }, { "epoch": 2.3636363636363638, "grad_norm": 1.339974045753479, "learning_rate": 5.689136964397443e-07, "loss": 0.4295516014099121, "step": 879, "token_acc": 0.856322781695916 }, { "epoch": 2.366329966329966, "grad_norm": 1.3538150787353516, "learning_rate": 5.64311842923328e-07, "loss": 0.4291786849498749, "step": 880, "token_acc": 0.8576 }, { "epoch": 2.369023569023569, "grad_norm": 1.3167110681533813, "learning_rate": 5.597263079097637e-07, "loss": 0.44421300292015076, "step": 881, "token_acc": 0.8532186000330961 }, { "epoch": 2.371717171717172, "grad_norm": 1.429537057876587, "learning_rate": 5.55157130056655e-07, "loss": 0.45758193731307983, "step": 882, "token_acc": 0.8546593716907872 }, { "epoch": 2.3744107744107743, "grad_norm": 1.2422311305999756, "learning_rate": 5.506043478837098e-07, "loss": 0.4379339814186096, "step": 883, "token_acc": 0.854500993428091 }, { "epoch": 2.377104377104377, "grad_norm": 1.4464815855026245, "learning_rate": 5.460679997724155e-07, "loss": 0.4881293475627899, "step": 884, "token_acc": 0.8459272972063279 }, { "epoch": 2.3797979797979796, "grad_norm": 1.4152475595474243, "learning_rate": 5.415481239657114e-07, "loss": 0.43724876642227173, "step": 885, "token_acc": 0.8497981157469717 }, { "epoch": 2.3824915824915824, "grad_norm": 1.5903568267822266, "learning_rate": 5.370447585676747e-07, "loss": 0.505699872970581, "step": 886, "token_acc": 0.8395534550195567 }, { "epoch": 2.3851851851851853, "grad_norm": 1.372292160987854, "learning_rate": 5.325579415431909e-07, "loss": 0.46757423877716064, "step": 887, "token_acc": 0.8443985189012314 }, { "epoch": 2.3878787878787877, "grad_norm": 1.3767039775848389, "learning_rate": 5.280877107176383e-07, "loss": 0.5052520036697388, "step": 888, "token_acc": 0.8430573248407643 }, { "epoch": 2.3905723905723906, "grad_norm": 1.3825987577438354, "learning_rate": 5.236341037765677e-07, "loss": 0.46855825185775757, "step": 889, "token_acc": 0.8485026917900403 }, { "epoch": 2.3932659932659934, "grad_norm": 1.329728126525879, "learning_rate": 5.191971582653848e-07, "loss": 0.4602469801902771, "step": 890, "token_acc": 0.8495636395057461 }, { "epoch": 2.395959595959596, "grad_norm": 1.2817330360412598, "learning_rate": 5.147769115890338e-07, "loss": 0.4555991291999817, "step": 891, "token_acc": 0.8515343610430773 }, { "epoch": 2.3986531986531987, "grad_norm": 1.3989284038543701, "learning_rate": 5.103734010116831e-07, "loss": 0.44095277786254883, "step": 892, "token_acc": 0.8555868203685214 }, { "epoch": 2.4013468013468016, "grad_norm": 1.3820195198059082, "learning_rate": 5.059866636564101e-07, "loss": 0.4718220829963684, "step": 893, "token_acc": 0.8430028559771522 }, { "epoch": 2.404040404040404, "grad_norm": 1.3960022926330566, "learning_rate": 5.016167365048858e-07, "loss": 0.4572494626045227, "step": 894, "token_acc": 0.8523166023166023 }, { "epoch": 2.406734006734007, "grad_norm": 1.3680237531661987, "learning_rate": 4.972636563970678e-07, "loss": 0.44302961230278015, "step": 895, "token_acc": 0.854449133518901 }, { "epoch": 2.4094276094276093, "grad_norm": 1.338860273361206, "learning_rate": 4.929274600308867e-07, "loss": 0.4547927975654602, "step": 896, "token_acc": 0.8519148247105498 }, { "epoch": 2.412121212121212, "grad_norm": 1.386505126953125, "learning_rate": 4.886081839619389e-07, "loss": 0.4274322986602783, "step": 897, "token_acc": 0.8625258086717137 }, { "epoch": 2.414814814814815, "grad_norm": 1.4120967388153076, "learning_rate": 4.843058646031751e-07, "loss": 0.4994282126426697, "step": 898, "token_acc": 0.843308961061044 }, { "epoch": 2.4175084175084174, "grad_norm": 1.3236873149871826, "learning_rate": 4.800205382245962e-07, "loss": 0.44663071632385254, "step": 899, "token_acc": 0.853766851704996 }, { "epoch": 2.4202020202020202, "grad_norm": 1.2947866916656494, "learning_rate": 4.757522409529461e-07, "loss": 0.41150960326194763, "step": 900, "token_acc": 0.8659107715060389 }, { "epoch": 2.4202020202020202, "eval_loss": 0.6026682257652283, "eval_runtime": 10.071, "eval_samples_per_second": 4.468, "eval_steps_per_second": 1.489, "eval_token_acc": 0.8119405354980846, "step": 900 }, { "epoch": 2.4228956228956227, "grad_norm": 1.4493576288223267, "learning_rate": 4.715010087714078e-07, "loss": 0.495847225189209, "step": 901, "token_acc": 0.8388524880467505 }, { "epoch": 2.4255892255892255, "grad_norm": 1.3524755239486694, "learning_rate": 4.6726687751929925e-07, "loss": 0.4947206377983093, "step": 902, "token_acc": 0.837675851161242 }, { "epoch": 2.4282828282828284, "grad_norm": 1.2880852222442627, "learning_rate": 4.630498828917743e-07, "loss": 0.4585961699485779, "step": 903, "token_acc": 0.848380064647072 }, { "epoch": 2.430976430976431, "grad_norm": 1.2679166793823242, "learning_rate": 4.5885006043951546e-07, "loss": 0.4075949788093567, "step": 904, "token_acc": 0.8661844484629295 }, { "epoch": 2.4336700336700336, "grad_norm": 1.5219398736953735, "learning_rate": 4.5466744556844077e-07, "loss": 0.5048119425773621, "step": 905, "token_acc": 0.843663969300075 }, { "epoch": 2.4363636363636365, "grad_norm": 1.394505262374878, "learning_rate": 4.5050207353940166e-07, "loss": 0.5007883906364441, "step": 906, "token_acc": 0.8368416603263167 }, { "epoch": 2.439057239057239, "grad_norm": 1.5569199323654175, "learning_rate": 4.463539794678862e-07, "loss": 0.47455352544784546, "step": 907, "token_acc": 0.8457467507199004 }, { "epoch": 2.441750841750842, "grad_norm": 1.430761694908142, "learning_rate": 4.422231983237246e-07, "loss": 0.49366435408592224, "step": 908, "token_acc": 0.8462658444672834 }, { "epoch": 2.4444444444444446, "grad_norm": 1.3397084474563599, "learning_rate": 4.381097649307928e-07, "loss": 0.4224032759666443, "step": 909, "token_acc": 0.8576549306993787 }, { "epoch": 2.447138047138047, "grad_norm": 1.3525537252426147, "learning_rate": 4.3401371396671857e-07, "loss": 0.4352490305900574, "step": 910, "token_acc": 0.8554785894206549 }, { "epoch": 2.44983164983165, "grad_norm": 1.4554191827774048, "learning_rate": 4.2993507996259094e-07, "loss": 0.4493456482887268, "step": 911, "token_acc": 0.8498639391948953 }, { "epoch": 2.4525252525252528, "grad_norm": 1.2684801816940308, "learning_rate": 4.258738973026677e-07, "loss": 0.4421004056930542, "step": 912, "token_acc": 0.8534445250181291 }, { "epoch": 2.455218855218855, "grad_norm": 1.2649563550949097, "learning_rate": 4.218302002240857e-07, "loss": 0.44410380721092224, "step": 913, "token_acc": 0.8532309475514344 }, { "epoch": 2.457912457912458, "grad_norm": 1.3275678157806396, "learning_rate": 4.178040228165725e-07, "loss": 0.47560450434684753, "step": 914, "token_acc": 0.8455473098330241 }, { "epoch": 2.4606060606060605, "grad_norm": 1.4092402458190918, "learning_rate": 4.137953990221599e-07, "loss": 0.445614755153656, "step": 915, "token_acc": 0.8520164046479836 }, { "epoch": 2.4632996632996633, "grad_norm": 1.3378640413284302, "learning_rate": 4.098043626348955e-07, "loss": 0.4307944178581238, "step": 916, "token_acc": 0.854629702348229 }, { "epoch": 2.465993265993266, "grad_norm": 1.4262300729751587, "learning_rate": 4.0583094730055976e-07, "loss": 0.45210617780685425, "step": 917, "token_acc": 0.8523188530961223 }, { "epoch": 2.4686868686868686, "grad_norm": 1.272359013557434, "learning_rate": 4.018751865163817e-07, "loss": 0.4466434121131897, "step": 918, "token_acc": 0.8541245223645763 }, { "epoch": 2.4713804713804715, "grad_norm": 1.3830268383026123, "learning_rate": 3.9793711363075583e-07, "loss": 0.4729907512664795, "step": 919, "token_acc": 0.8462392891145668 }, { "epoch": 2.474074074074074, "grad_norm": 1.3965145349502563, "learning_rate": 3.940167618429641e-07, "loss": 0.46166881918907166, "step": 920, "token_acc": 0.849703469006418 }, { "epoch": 2.4767676767676767, "grad_norm": 1.3572782278060913, "learning_rate": 3.9011416420289076e-07, "loss": 0.4217594265937805, "step": 921, "token_acc": 0.8621292902365878 }, { "epoch": 2.4794612794612796, "grad_norm": 1.3453633785247803, "learning_rate": 3.8622935361074863e-07, "loss": 0.4615384042263031, "step": 922, "token_acc": 0.8465617232808617 }, { "epoch": 2.482154882154882, "grad_norm": 1.352277398109436, "learning_rate": 3.823623628167977e-07, "loss": 0.4489099383354187, "step": 923, "token_acc": 0.8492134648300596 }, { "epoch": 2.484848484848485, "grad_norm": 1.5038037300109863, "learning_rate": 3.785132244210724e-07, "loss": 0.4499627947807312, "step": 924, "token_acc": 0.8507930291756413 }, { "epoch": 2.4875420875420877, "grad_norm": 1.28714919090271, "learning_rate": 3.7468197087310617e-07, "loss": 0.4156167507171631, "step": 925, "token_acc": 0.8607904272351901 }, { "epoch": 2.49023569023569, "grad_norm": 1.4358834028244019, "learning_rate": 3.708686344716561e-07, "loss": 0.47591906785964966, "step": 926, "token_acc": 0.8454036635006784 }, { "epoch": 2.492929292929293, "grad_norm": 1.372659683227539, "learning_rate": 3.6707324736443147e-07, "loss": 0.45664089918136597, "step": 927, "token_acc": 0.8487684729064039 }, { "epoch": 2.495622895622896, "grad_norm": 1.3613438606262207, "learning_rate": 3.6329584154782357e-07, "loss": 0.4475294053554535, "step": 928, "token_acc": 0.853152101400934 }, { "epoch": 2.4983164983164983, "grad_norm": 1.3997690677642822, "learning_rate": 3.595364488666353e-07, "loss": 0.4921870827674866, "step": 929, "token_acc": 0.8411151017615871 }, { "epoch": 2.501010101010101, "grad_norm": 1.481909990310669, "learning_rate": 3.5579510101381194e-07, "loss": 0.4721786677837372, "step": 930, "token_acc": 0.848770091466122 }, { "epoch": 2.5037037037037035, "grad_norm": 1.2364667654037476, "learning_rate": 3.52071829530177e-07, "loss": 0.47820422053337097, "step": 931, "token_acc": 0.8477452836804825 }, { "epoch": 2.5063973063973064, "grad_norm": 1.3156753778457642, "learning_rate": 3.4836666580416124e-07, "loss": 0.44714295864105225, "step": 932, "token_acc": 0.8514939136849871 }, { "epoch": 2.509090909090909, "grad_norm": 1.4911857843399048, "learning_rate": 3.446796410715428e-07, "loss": 0.4931856095790863, "step": 933, "token_acc": 0.8357540878779575 }, { "epoch": 2.5117845117845117, "grad_norm": 1.2285386323928833, "learning_rate": 3.4101078641518125e-07, "loss": 0.45593398809432983, "step": 934, "token_acc": 0.8514615662215806 }, { "epoch": 2.5144781144781145, "grad_norm": 1.2615535259246826, "learning_rate": 3.373601327647563e-07, "loss": 0.42197567224502563, "step": 935, "token_acc": 0.8588992137240886 }, { "epoch": 2.517171717171717, "grad_norm": 1.422208547592163, "learning_rate": 3.337277108965081e-07, "loss": 0.4457833468914032, "step": 936, "token_acc": 0.8537094216807144 }, { "epoch": 2.51986531986532, "grad_norm": 1.320516586303711, "learning_rate": 3.3011355143297534e-07, "loss": 0.44737479090690613, "step": 937, "token_acc": 0.8540515295399704 }, { "epoch": 2.5225589225589227, "grad_norm": 1.3363538980484009, "learning_rate": 3.2651768484273955e-07, "loss": 0.4628625512123108, "step": 938, "token_acc": 0.849702620157531 }, { "epoch": 2.525252525252525, "grad_norm": 1.3720386028289795, "learning_rate": 3.2294014144016616e-07, "loss": 0.4440345764160156, "step": 939, "token_acc": 0.8545242160122203 }, { "epoch": 2.527946127946128, "grad_norm": 1.2982479333877563, "learning_rate": 3.193809513851509e-07, "loss": 0.48547038435935974, "step": 940, "token_acc": 0.8484777517564402 }, { "epoch": 2.530639730639731, "grad_norm": 1.3547900915145874, "learning_rate": 3.1584014468286377e-07, "loss": 0.4259619116783142, "step": 941, "token_acc": 0.8606906932017727 }, { "epoch": 2.533333333333333, "grad_norm": 1.3823848962783813, "learning_rate": 3.1231775118349796e-07, "loss": 0.4803712069988251, "step": 942, "token_acc": 0.8476863171239699 }, { "epoch": 2.536026936026936, "grad_norm": 1.3940930366516113, "learning_rate": 3.0881380058201597e-07, "loss": 0.4608324468135834, "step": 943, "token_acc": 0.8501391389752824 }, { "epoch": 2.538720538720539, "grad_norm": 1.4138658046722412, "learning_rate": 3.053283224179013e-07, "loss": 0.4737139642238617, "step": 944, "token_acc": 0.8466550426722951 }, { "epoch": 2.5414141414141413, "grad_norm": 1.4371023178100586, "learning_rate": 3.018613460749084e-07, "loss": 0.46041059494018555, "step": 945, "token_acc": 0.8501960784313726 }, { "epoch": 2.544107744107744, "grad_norm": 1.3689640760421753, "learning_rate": 2.9841290078081453e-07, "loss": 0.44626128673553467, "step": 946, "token_acc": 0.8513525343605007 }, { "epoch": 2.546801346801347, "grad_norm": 1.2831043004989624, "learning_rate": 2.9498301560717534e-07, "loss": 0.464735209941864, "step": 947, "token_acc": 0.8484499693063229 }, { "epoch": 2.5494949494949495, "grad_norm": 1.3713418245315552, "learning_rate": 2.915717194690773e-07, "loss": 0.4426043927669525, "step": 948, "token_acc": 0.8554216867469879 }, { "epoch": 2.5521885521885523, "grad_norm": 1.5185340642929077, "learning_rate": 2.8817904112489527e-07, "loss": 0.49989578127861023, "step": 949, "token_acc": 0.840236131934033 }, { "epoch": 2.5548821548821548, "grad_norm": 1.4666484594345093, "learning_rate": 2.8480500917605015e-07, "loss": 0.4472880959510803, "step": 950, "token_acc": 0.8552631578947368 }, { "epoch": 2.5548821548821548, "eval_loss": 0.6015769839286804, "eval_runtime": 10.7401, "eval_samples_per_second": 4.19, "eval_steps_per_second": 1.397, "eval_token_acc": 0.8126153662941865, "step": 950 }, { "epoch": 2.5575757575757576, "grad_norm": 1.3766974210739136, "learning_rate": 2.814496520667667e-07, "loss": 0.4488174617290497, "step": 951, "token_acc": 0.8534402701561841 }, { "epoch": 2.56026936026936, "grad_norm": 1.3600940704345703, "learning_rate": 2.7811299808383457e-07, "loss": 0.47165271639823914, "step": 952, "token_acc": 0.8484674172076588 }, { "epoch": 2.562962962962963, "grad_norm": 1.3728609085083008, "learning_rate": 2.7479507535637135e-07, "loss": 0.45255085825920105, "step": 953, "token_acc": 0.8477996596158521 }, { "epoch": 2.5656565656565657, "grad_norm": 1.2520395517349243, "learning_rate": 2.714959118555821e-07, "loss": 0.45493072271347046, "step": 954, "token_acc": 0.851994851994852 }, { "epoch": 2.568350168350168, "grad_norm": 1.4152458906173706, "learning_rate": 2.6821553539452593e-07, "loss": 0.4609255790710449, "step": 955, "token_acc": 0.8523271731690623 }, { "epoch": 2.571043771043771, "grad_norm": 1.2889204025268555, "learning_rate": 2.6495397362788113e-07, "loss": 0.489478200674057, "step": 956, "token_acc": 0.8411973617453069 }, { "epoch": 2.573737373737374, "grad_norm": 1.264844536781311, "learning_rate": 2.617112540517108e-07, "loss": 0.43277594447135925, "step": 957, "token_acc": 0.8578951327766494 }, { "epoch": 2.5764309764309763, "grad_norm": 1.274577260017395, "learning_rate": 2.584874040032326e-07, "loss": 0.48508113622665405, "step": 958, "token_acc": 0.840975681218869 }, { "epoch": 2.579124579124579, "grad_norm": 1.4244353771209717, "learning_rate": 2.5528245066058886e-07, "loss": 0.45330023765563965, "step": 959, "token_acc": 0.8523519548142264 }, { "epoch": 2.581818181818182, "grad_norm": 1.3983197212219238, "learning_rate": 2.5209642104261494e-07, "loss": 0.49080920219421387, "step": 960, "token_acc": 0.8382996994418205 }, { "epoch": 2.5845117845117844, "grad_norm": 1.43404221534729, "learning_rate": 2.489293420086125e-07, "loss": 0.44272708892822266, "step": 961, "token_acc": 0.8511979823455234 }, { "epoch": 2.5872053872053873, "grad_norm": 1.4007556438446045, "learning_rate": 2.4578124025812447e-07, "loss": 0.48012611269950867, "step": 962, "token_acc": 0.8451106935707867 }, { "epoch": 2.58989898989899, "grad_norm": 1.4113903045654297, "learning_rate": 2.4265214233070793e-07, "loss": 0.4795421063899994, "step": 963, "token_acc": 0.841886269070735 }, { "epoch": 2.5925925925925926, "grad_norm": 1.4592832326889038, "learning_rate": 2.3954207460571283e-07, "loss": 0.4592929780483246, "step": 964, "token_acc": 0.8485231247570929 }, { "epoch": 2.5952861952861954, "grad_norm": 1.3255521059036255, "learning_rate": 2.3645106330205697e-07, "loss": 0.4258558750152588, "step": 965, "token_acc": 0.8608268862823393 }, { "epoch": 2.597979797979798, "grad_norm": 1.2752007246017456, "learning_rate": 2.3337913447800565e-07, "loss": 0.42284274101257324, "step": 966, "token_acc": 0.8603801169590644 }, { "epoch": 2.6006734006734007, "grad_norm": 1.382836937904358, "learning_rate": 2.3032631403095362e-07, "loss": 0.465859055519104, "step": 967, "token_acc": 0.8499173553719008 }, { "epoch": 2.603367003367003, "grad_norm": 1.4192781448364258, "learning_rate": 2.2729262769720512e-07, "loss": 0.46965181827545166, "step": 968, "token_acc": 0.8446586082979418 }, { "epoch": 2.606060606060606, "grad_norm": 1.310037612915039, "learning_rate": 2.242781010517567e-07, "loss": 0.466453492641449, "step": 969, "token_acc": 0.8449909924022871 }, { "epoch": 2.608754208754209, "grad_norm": 1.360783338546753, "learning_rate": 2.2128275950808354e-07, "loss": 0.4657267928123474, "step": 970, "token_acc": 0.847356723289411 }, { "epoch": 2.6114478114478112, "grad_norm": 1.4429086446762085, "learning_rate": 2.1830662831792278e-07, "loss": 0.4535481333732605, "step": 971, "token_acc": 0.8522234011768111 }, { "epoch": 2.614141414141414, "grad_norm": 1.3488675355911255, "learning_rate": 2.1534973257106266e-07, "loss": 0.4335019588470459, "step": 972, "token_acc": 0.8563281824871228 }, { "epoch": 2.616835016835017, "grad_norm": 1.3630640506744385, "learning_rate": 2.1241209719512962e-07, "loss": 0.49038824439048767, "step": 973, "token_acc": 0.8391390075583306 }, { "epoch": 2.6195286195286194, "grad_norm": 1.3925279378890991, "learning_rate": 2.0949374695537872e-07, "loss": 0.44409507513046265, "step": 974, "token_acc": 0.8547974225439139 }, { "epoch": 2.6222222222222222, "grad_norm": 1.310078740119934, "learning_rate": 2.0659470645448542e-07, "loss": 0.4526987671852112, "step": 975, "token_acc": 0.8520905030387347 }, { "epoch": 2.624915824915825, "grad_norm": 1.3399505615234375, "learning_rate": 2.037150001323371e-07, "loss": 0.4460318088531494, "step": 976, "token_acc": 0.850920296493712 }, { "epoch": 2.6276094276094275, "grad_norm": 1.3752610683441162, "learning_rate": 2.0085465226582752e-07, "loss": 0.46625351905822754, "step": 977, "token_acc": 0.8508612481606509 }, { "epoch": 2.6303030303030304, "grad_norm": 1.49448561668396, "learning_rate": 1.980136869686522e-07, "loss": 0.4680027663707733, "step": 978, "token_acc": 0.8432066018272915 }, { "epoch": 2.6329966329966332, "grad_norm": 1.3875348567962646, "learning_rate": 1.9519212819110512e-07, "loss": 0.48852360248565674, "step": 979, "token_acc": 0.8402820088780573 }, { "epoch": 2.6356902356902356, "grad_norm": 1.3737491369247437, "learning_rate": 1.9238999971987638e-07, "loss": 0.45125240087509155, "step": 980, "token_acc": 0.8512726667775744 }, { "epoch": 2.6383838383838385, "grad_norm": 1.5436158180236816, "learning_rate": 1.896073251778527e-07, "loss": 0.48681220412254333, "step": 981, "token_acc": 0.8426603325415677 }, { "epoch": 2.641077441077441, "grad_norm": 1.534818172454834, "learning_rate": 1.8684412802391694e-07, "loss": 0.49265575408935547, "step": 982, "token_acc": 0.842263759086189 }, { "epoch": 2.6437710437710438, "grad_norm": 1.3680033683776855, "learning_rate": 1.8410043155275199e-07, "loss": 0.4479619264602661, "step": 983, "token_acc": 0.8544309113817724 }, { "epoch": 2.6464646464646466, "grad_norm": 1.2756664752960205, "learning_rate": 1.8137625889464245e-07, "loss": 0.5115567445755005, "step": 984, "token_acc": 0.8377169301966834 }, { "epoch": 2.649158249158249, "grad_norm": 1.4053670167922974, "learning_rate": 1.7867163301528073e-07, "loss": 0.4294043481349945, "step": 985, "token_acc": 0.860803198255497 }, { "epoch": 2.651851851851852, "grad_norm": 1.3615531921386719, "learning_rate": 1.7598657671557356e-07, "loss": 0.43544062972068787, "step": 986, "token_acc": 0.8528731542430172 }, { "epoch": 2.6545454545454543, "grad_norm": 1.4154788255691528, "learning_rate": 1.7332111263145047e-07, "loss": 0.46690839529037476, "step": 987, "token_acc": 0.850532004728931 }, { "epoch": 2.657239057239057, "grad_norm": 1.3746193647384644, "learning_rate": 1.7067526323367057e-07, "loss": 0.45226985216140747, "step": 988, "token_acc": 0.849534418240362 }, { "epoch": 2.65993265993266, "grad_norm": 1.2770192623138428, "learning_rate": 1.6804905082763446e-07, "loss": 0.41371601819992065, "step": 989, "token_acc": 0.8616726119698771 }, { "epoch": 2.6626262626262625, "grad_norm": 1.2995195388793945, "learning_rate": 1.6544249755319762e-07, "loss": 0.4332484006881714, "step": 990, "token_acc": 0.8593591905564925 }, { "epoch": 2.6653198653198653, "grad_norm": 1.3931037187576294, "learning_rate": 1.628556253844807e-07, "loss": 0.4358077049255371, "step": 991, "token_acc": 0.8537525702535984 }, { "epoch": 2.668013468013468, "grad_norm": 1.2682255506515503, "learning_rate": 1.602884561296883e-07, "loss": 0.4223305583000183, "step": 992, "token_acc": 0.8661111560553353 }, { "epoch": 2.6707070707070706, "grad_norm": 1.2835527658462524, "learning_rate": 1.577410114309208e-07, "loss": 0.47870200872421265, "step": 993, "token_acc": 0.8458346892287667 }, { "epoch": 2.6734006734006734, "grad_norm": 1.4029755592346191, "learning_rate": 1.5521331276399486e-07, "loss": 0.4483606219291687, "step": 994, "token_acc": 0.8498284507785695 }, { "epoch": 2.6760942760942763, "grad_norm": 1.3690557479858398, "learning_rate": 1.527053814382612e-07, "loss": 0.467379093170166, "step": 995, "token_acc": 0.8475682087781732 }, { "epoch": 2.6787878787878787, "grad_norm": 1.366944432258606, "learning_rate": 1.502172385964251e-07, "loss": 0.45231491327285767, "step": 996, "token_acc": 0.8531797958649568 }, { "epoch": 2.6814814814814816, "grad_norm": 1.451582908630371, "learning_rate": 1.4774890521436824e-07, "loss": 0.43635639548301697, "step": 997, "token_acc": 0.8519475889116677 }, { "epoch": 2.6841750841750844, "grad_norm": 1.6672883033752441, "learning_rate": 1.4530040210097325e-07, "loss": 0.4488867223262787, "step": 998, "token_acc": 0.8511434849841296 }, { "epoch": 2.686868686868687, "grad_norm": 1.3649197816848755, "learning_rate": 1.42871749897944e-07, "loss": 0.4416193962097168, "step": 999, "token_acc": 0.8527022546419099 }, { "epoch": 2.6895622895622897, "grad_norm": 1.353399634361267, "learning_rate": 1.4046296907963748e-07, "loss": 0.49190592765808105, "step": 1000, "token_acc": 0.845174636009229 }, { "epoch": 2.6895622895622897, "eval_loss": 0.600351095199585, "eval_runtime": 10.3229, "eval_samples_per_second": 4.359, "eval_steps_per_second": 1.453, "eval_token_acc": 0.812734454081734, "step": 1000 }, { "epoch": 2.692255892255892, "grad_norm": 1.3580011129379272, "learning_rate": 1.3807407995288596e-07, "loss": 0.43177270889282227, "step": 1001, "token_acc": 0.8550270081895801 }, { "epoch": 2.694949494949495, "grad_norm": 1.3417445421218872, "learning_rate": 1.3570510265682956e-07, "loss": 0.4245806336402893, "step": 1002, "token_acc": 0.8600113811885213 }, { "epoch": 2.6976430976430974, "grad_norm": 1.430476427078247, "learning_rate": 1.3335605716274485e-07, "loss": 0.4416522979736328, "step": 1003, "token_acc": 0.8571978815599423 }, { "epoch": 2.7003367003367003, "grad_norm": 1.261515736579895, "learning_rate": 1.310269632738756e-07, "loss": 0.4295913875102997, "step": 1004, "token_acc": 0.8576334453154162 }, { "epoch": 2.703030303030303, "grad_norm": 1.4265917539596558, "learning_rate": 1.287178406252676e-07, "loss": 0.44337451457977295, "step": 1005, "token_acc": 0.8549524145223828 }, { "epoch": 2.7057239057239055, "grad_norm": 1.290204644203186, "learning_rate": 1.2642870868360247e-07, "loss": 0.4840629994869232, "step": 1006, "token_acc": 0.8421479558728099 }, { "epoch": 2.7084175084175084, "grad_norm": 1.3639416694641113, "learning_rate": 1.2415958674703272e-07, "loss": 0.4350355565547943, "step": 1007, "token_acc": 0.8541151536559519 }, { "epoch": 2.7111111111111112, "grad_norm": 1.2500513792037964, "learning_rate": 1.2191049394502026e-07, "loss": 0.44977259635925293, "step": 1008, "token_acc": 0.8517087216791696 }, { "epoch": 2.7138047138047137, "grad_norm": 1.3968117237091064, "learning_rate": 1.196814492381751e-07, "loss": 0.44466304779052734, "step": 1009, "token_acc": 0.8538414911350205 }, { "epoch": 2.7164983164983165, "grad_norm": 1.4242382049560547, "learning_rate": 1.1747247141809387e-07, "loss": 0.4554741382598877, "step": 1010, "token_acc": 0.8469562243502052 }, { "epoch": 2.7191919191919194, "grad_norm": 1.39249587059021, "learning_rate": 1.1528357910720378e-07, "loss": 0.44580942392349243, "step": 1011, "token_acc": 0.8532224532224533 }, { "epoch": 2.721885521885522, "grad_norm": 1.2425296306610107, "learning_rate": 1.1311479075860388e-07, "loss": 0.45732831954956055, "step": 1012, "token_acc": 0.8485934699602588 }, { "epoch": 2.7245791245791247, "grad_norm": 1.2696439027786255, "learning_rate": 1.1096612465590961e-07, "loss": 0.45906370878219604, "step": 1013, "token_acc": 0.8508583690987125 }, { "epoch": 2.7272727272727275, "grad_norm": 1.3940834999084473, "learning_rate": 1.0883759891310047e-07, "loss": 0.4689827859401703, "step": 1014, "token_acc": 0.8478819530796651 }, { "epoch": 2.72996632996633, "grad_norm": 1.4137316942214966, "learning_rate": 1.0672923147436481e-07, "loss": 0.436021625995636, "step": 1015, "token_acc": 0.8578073695731485 }, { "epoch": 2.732659932659933, "grad_norm": 1.2988362312316895, "learning_rate": 1.046410401139497e-07, "loss": 0.4141569137573242, "step": 1016, "token_acc": 0.860684952735162 }, { "epoch": 2.735353535353535, "grad_norm": 1.365103840827942, "learning_rate": 1.0257304243601162e-07, "loss": 0.5122590065002441, "step": 1017, "token_acc": 0.8375943511078646 }, { "epoch": 2.738047138047138, "grad_norm": 1.4946807622909546, "learning_rate": 1.0052525587446682e-07, "loss": 0.4527641236782074, "step": 1018, "token_acc": 0.8498027518522082 }, { "epoch": 2.7407407407407405, "grad_norm": 1.3794580698013306, "learning_rate": 9.849769769284534e-08, "loss": 0.43577855825424194, "step": 1019, "token_acc": 0.8552409841590832 }, { "epoch": 2.7434343434343433, "grad_norm": 1.2107993364334106, "learning_rate": 9.649038498414587e-08, "loss": 0.4491564631462097, "step": 1020, "token_acc": 0.8521850036920186 }, { "epoch": 2.746127946127946, "grad_norm": 1.3921531438827515, "learning_rate": 9.450333467069029e-08, "loss": 0.46638792753219604, "step": 1021, "token_acc": 0.8475328947368421 }, { "epoch": 2.7488215488215486, "grad_norm": 1.3340668678283691, "learning_rate": 9.253656350398127e-08, "loss": 0.4254743754863739, "step": 1022, "token_acc": 0.8585940674522552 }, { "epoch": 2.7515151515151515, "grad_norm": 1.5015039443969727, "learning_rate": 9.059008806456216e-08, "loss": 0.4824942946434021, "step": 1023, "token_acc": 0.849650703654956 }, { "epoch": 2.7542087542087543, "grad_norm": 1.3555876016616821, "learning_rate": 8.866392476187618e-08, "loss": 0.4268767535686493, "step": 1024, "token_acc": 0.8593853636966615 }, { "epoch": 2.7569023569023567, "grad_norm": 1.486167073249817, "learning_rate": 8.675808983412831e-08, "loss": 0.47532498836517334, "step": 1025, "token_acc": 0.8487428232033261 }, { "epoch": 2.7595959595959596, "grad_norm": 1.311797022819519, "learning_rate": 8.487259934814945e-08, "loss": 0.4283115863800049, "step": 1026, "token_acc": 0.8535128312999579 }, { "epoch": 2.7622895622895625, "grad_norm": 1.4805521965026855, "learning_rate": 8.300746919925828e-08, "loss": 0.4737658202648163, "step": 1027, "token_acc": 0.8455888560574082 }, { "epoch": 2.764983164983165, "grad_norm": 1.2686336040496826, "learning_rate": 8.116271511112994e-08, "loss": 0.444037526845932, "step": 1028, "token_acc": 0.8515808170515098 }, { "epoch": 2.7676767676767677, "grad_norm": 1.4185503721237183, "learning_rate": 7.933835263566281e-08, "loss": 0.4618375301361084, "step": 1029, "token_acc": 0.8493991989319092 }, { "epoch": 2.7703703703703706, "grad_norm": 1.2653461694717407, "learning_rate": 7.753439715284639e-08, "loss": 0.4643747806549072, "step": 1030, "token_acc": 0.8468408126814021 }, { "epoch": 2.773063973063973, "grad_norm": 1.4526880979537964, "learning_rate": 7.575086387063308e-08, "loss": 0.44832342863082886, "step": 1031, "token_acc": 0.8534892217756668 }, { "epoch": 2.775757575757576, "grad_norm": 1.3791003227233887, "learning_rate": 7.398776782480882e-08, "loss": 0.44167500734329224, "step": 1032, "token_acc": 0.8572172554465758 }, { "epoch": 2.7784511784511787, "grad_norm": 1.299343466758728, "learning_rate": 7.224512387886712e-08, "loss": 0.4822643995285034, "step": 1033, "token_acc": 0.8417318273964668 }, { "epoch": 2.781144781144781, "grad_norm": 1.3139241933822632, "learning_rate": 7.052294672388271e-08, "loss": 0.44595083594322205, "step": 1034, "token_acc": 0.8548144809974463 }, { "epoch": 2.783838383838384, "grad_norm": 1.4367371797561646, "learning_rate": 6.882125087838893e-08, "loss": 0.4639328420162201, "step": 1035, "token_acc": 0.8507450262215933 }, { "epoch": 2.7865319865319864, "grad_norm": 1.3890961408615112, "learning_rate": 6.714005068825469e-08, "loss": 0.46469229459762573, "step": 1036, "token_acc": 0.8494162497021682 }, { "epoch": 2.7892255892255893, "grad_norm": 1.338135004043579, "learning_rate": 6.547936032656354e-08, "loss": 0.46596047282218933, "step": 1037, "token_acc": 0.8466650652574266 }, { "epoch": 2.7919191919191917, "grad_norm": 1.3197437524795532, "learning_rate": 6.383919379349457e-08, "loss": 0.44545578956604004, "step": 1038, "token_acc": 0.8515505066011667 }, { "epoch": 2.7946127946127945, "grad_norm": 1.477396011352539, "learning_rate": 6.221956491620357e-08, "loss": 0.4718177318572998, "step": 1039, "token_acc": 0.845871716137837 }, { "epoch": 2.7973063973063974, "grad_norm": 1.2885854244232178, "learning_rate": 6.062048734870735e-08, "loss": 0.4504006803035736, "step": 1040, "token_acc": 0.8527679623085983 }, { "epoch": 2.8, "grad_norm": 1.2179943323135376, "learning_rate": 5.904197457176797e-08, "loss": 0.4400767982006073, "step": 1041, "token_acc": 0.852683344023947 }, { "epoch": 2.8026936026936027, "grad_norm": 1.2974787950515747, "learning_rate": 5.748403989278006e-08, "loss": 0.4255152642726898, "step": 1042, "token_acc": 0.8597424627958979 }, { "epoch": 2.8053872053872055, "grad_norm": 1.4658721685409546, "learning_rate": 5.594669644565731e-08, "loss": 0.47051024436950684, "step": 1043, "token_acc": 0.8456346423562412 }, { "epoch": 2.808080808080808, "grad_norm": 1.2956441640853882, "learning_rate": 5.442995719072225e-08, "loss": 0.4545144736766815, "step": 1044, "token_acc": 0.8519438877755511 }, { "epoch": 2.810774410774411, "grad_norm": 1.3253010511398315, "learning_rate": 5.293383491459803e-08, "loss": 0.4374930262565613, "step": 1045, "token_acc": 0.8590142982666347 }, { "epoch": 2.8134680134680137, "grad_norm": 1.259684443473816, "learning_rate": 5.1458342230098214e-08, "loss": 0.4349730908870697, "step": 1046, "token_acc": 0.8566985282961614 }, { "epoch": 2.816161616161616, "grad_norm": 1.352737307548523, "learning_rate": 5.000349157612383e-08, "loss": 0.42229461669921875, "step": 1047, "token_acc": 0.8613423959218351 }, { "epoch": 2.818855218855219, "grad_norm": 1.4329676628112793, "learning_rate": 4.85692952175551e-08, "loss": 0.4437457323074341, "step": 1048, "token_acc": 0.8535380747126436 }, { "epoch": 2.821548821548822, "grad_norm": 1.3560211658477783, "learning_rate": 4.715576524515042e-08, "loss": 0.4282883107662201, "step": 1049, "token_acc": 0.858544140423335 }, { "epoch": 2.824242424242424, "grad_norm": 1.2915691137313843, "learning_rate": 4.576291357544338e-08, "loss": 0.43202680349349976, "step": 1050, "token_acc": 0.856839121190645 }, { "epoch": 2.824242424242424, "eval_loss": 0.6005462408065796, "eval_runtime": 10.34, "eval_samples_per_second": 4.352, "eval_steps_per_second": 1.451, "eval_token_acc": 0.8127146061171426, "step": 1050 }, { "epoch": 2.826936026936027, "grad_norm": 1.3895041942596436, "learning_rate": 4.4390751950642296e-08, "loss": 0.4641355872154236, "step": 1051, "token_acc": 0.8476884378523722 }, { "epoch": 2.8296296296296295, "grad_norm": 1.42020845413208, "learning_rate": 4.303929193853168e-08, "loss": 0.4612810015678406, "step": 1052, "token_acc": 0.8486214209968187 }, { "epoch": 2.8323232323232324, "grad_norm": 1.3331258296966553, "learning_rate": 4.170854493237425e-08, "loss": 0.4456251561641693, "step": 1053, "token_acc": 0.8531029065200314 }, { "epoch": 2.8350168350168348, "grad_norm": 1.448405146598816, "learning_rate": 4.039852215081602e-08, "loss": 0.4235530495643616, "step": 1054, "token_acc": 0.8605760410933774 }, { "epoch": 2.8377104377104376, "grad_norm": 1.4553523063659668, "learning_rate": 3.910923463778971e-08, "loss": 0.4478422999382019, "step": 1055, "token_acc": 0.8509987403275149 }, { "epoch": 2.8404040404040405, "grad_norm": 1.425102949142456, "learning_rate": 3.7840693262422566e-08, "loss": 0.45926913619041443, "step": 1056, "token_acc": 0.851668169522092 }, { "epoch": 2.843097643097643, "grad_norm": 1.46293306350708, "learning_rate": 3.659290871894594e-08, "loss": 0.45675668120384216, "step": 1057, "token_acc": 0.8526781511813989 }, { "epoch": 2.8457912457912458, "grad_norm": 1.6274057626724243, "learning_rate": 3.5365891526603635e-08, "loss": 0.4518497586250305, "step": 1058, "token_acc": 0.8491165214559075 }, { "epoch": 2.8484848484848486, "grad_norm": 1.4286657571792603, "learning_rate": 3.415965202956367e-08, "loss": 0.4507647752761841, "step": 1059, "token_acc": 0.8521857923497268 }, { "epoch": 2.851178451178451, "grad_norm": 1.2332202196121216, "learning_rate": 3.297420039683086e-08, "loss": 0.469647616147995, "step": 1060, "token_acc": 0.8464457011955043 }, { "epoch": 2.853872053872054, "grad_norm": 1.379022240638733, "learning_rate": 3.1809546622162123e-08, "loss": 0.4521993100643158, "step": 1061, "token_acc": 0.8534840230913082 }, { "epoch": 2.8565656565656568, "grad_norm": 1.3139450550079346, "learning_rate": 3.066570052398049e-08, "loss": 0.47251901030540466, "step": 1062, "token_acc": 0.8452921586436573 }, { "epoch": 2.859259259259259, "grad_norm": 1.3725159168243408, "learning_rate": 2.954267174529346e-08, "loss": 0.4734669625759125, "step": 1063, "token_acc": 0.8456953078429703 }, { "epoch": 2.861952861952862, "grad_norm": 1.2339024543762207, "learning_rate": 2.8440469753612253e-08, "loss": 0.4544416666030884, "step": 1064, "token_acc": 0.8487486398258978 }, { "epoch": 2.864646464646465, "grad_norm": 1.3339927196502686, "learning_rate": 2.735910384087048e-08, "loss": 0.43935346603393555, "step": 1065, "token_acc": 0.8534369885433716 }, { "epoch": 2.8673400673400673, "grad_norm": 1.294851541519165, "learning_rate": 2.629858312334671e-08, "loss": 0.5151973962783813, "step": 1066, "token_acc": 0.8410596026490066 }, { "epoch": 2.87003367003367, "grad_norm": 1.4755780696868896, "learning_rate": 2.5258916541587574e-08, "loss": 0.4402005076408386, "step": 1067, "token_acc": 0.8572569375124776 }, { "epoch": 2.8727272727272726, "grad_norm": 1.4051295518875122, "learning_rate": 2.4240112860332843e-08, "loss": 0.4259350299835205, "step": 1068, "token_acc": 0.8608035714285714 }, { "epoch": 2.8754208754208754, "grad_norm": 1.3524303436279297, "learning_rate": 2.3242180668440195e-08, "loss": 0.4785524308681488, "step": 1069, "token_acc": 0.8414318354912415 }, { "epoch": 2.878114478114478, "grad_norm": 1.4711772203445435, "learning_rate": 2.226512837881417e-08, "loss": 0.4889402389526367, "step": 1070, "token_acc": 0.8431959656449453 }, { "epoch": 2.8808080808080807, "grad_norm": 1.2629989385604858, "learning_rate": 2.1308964228334282e-08, "loss": 0.42904341220855713, "step": 1071, "token_acc": 0.8595747900431263 }, { "epoch": 2.8835016835016836, "grad_norm": 1.349511742591858, "learning_rate": 2.037369627778646e-08, "loss": 0.4101376235485077, "step": 1072, "token_acc": 0.8628255589984088 }, { "epoch": 2.886195286195286, "grad_norm": 1.3907666206359863, "learning_rate": 1.9459332411794208e-08, "loss": 0.4565786123275757, "step": 1073, "token_acc": 0.8459187045604759 }, { "epoch": 2.888888888888889, "grad_norm": 1.5231903791427612, "learning_rate": 1.8565880338752838e-08, "loss": 0.4534092843532562, "step": 1074, "token_acc": 0.8503656849179679 }, { "epoch": 2.8915824915824917, "grad_norm": 1.2534784078598022, "learning_rate": 1.7693347590764244e-08, "loss": 0.488106369972229, "step": 1075, "token_acc": 0.8438154379377573 }, { "epoch": 2.894276094276094, "grad_norm": 1.341977834701538, "learning_rate": 1.684174152357304e-08, "loss": 0.4296245574951172, "step": 1076, "token_acc": 0.8560876421279774 }, { "epoch": 2.896969696969697, "grad_norm": 1.3972772359848022, "learning_rate": 1.6011069316505255e-08, "loss": 0.45440539717674255, "step": 1077, "token_acc": 0.8486801986235735 }, { "epoch": 2.8996632996633, "grad_norm": 1.3822901248931885, "learning_rate": 1.520133797240725e-08, "loss": 0.4369499683380127, "step": 1078, "token_acc": 0.8547075112495673 }, { "epoch": 2.9023569023569022, "grad_norm": 1.2065870761871338, "learning_rate": 1.4412554317586591e-08, "loss": 0.42626523971557617, "step": 1079, "token_acc": 0.8578007256488975 }, { "epoch": 2.905050505050505, "grad_norm": 1.4155739545822144, "learning_rate": 1.3644725001755177e-08, "loss": 0.42687201499938965, "step": 1080, "token_acc": 0.8575879765395894 }, { "epoch": 2.907744107744108, "grad_norm": 1.4314115047454834, "learning_rate": 1.2897856497972871e-08, "loss": 0.4564938545227051, "step": 1081, "token_acc": 0.8515245249668582 }, { "epoch": 2.9104377104377104, "grad_norm": 1.4439717531204224, "learning_rate": 1.2171955102592558e-08, "loss": 0.4901677370071411, "step": 1082, "token_acc": 0.8443231636396171 }, { "epoch": 2.9131313131313132, "grad_norm": 1.3583319187164307, "learning_rate": 1.1467026935207404e-08, "loss": 0.46829360723495483, "step": 1083, "token_acc": 0.849126750821373 }, { "epoch": 2.915824915824916, "grad_norm": 1.2711912393569946, "learning_rate": 1.0783077938598952e-08, "loss": 0.4471214711666107, "step": 1084, "token_acc": 0.8536527886881382 }, { "epoch": 2.9185185185185185, "grad_norm": 1.3285554647445679, "learning_rate": 1.012011387868772e-08, "loss": 0.4448176622390747, "step": 1085, "token_acc": 0.8511799646813293 }, { "epoch": 2.9212121212121214, "grad_norm": 1.306470274925232, "learning_rate": 9.478140344483522e-09, "loss": 0.4528490900993347, "step": 1086, "token_acc": 0.853187328901056 }, { "epoch": 2.923905723905724, "grad_norm": 1.3932037353515625, "learning_rate": 8.857162748039939e-09, "loss": 0.44952404499053955, "step": 1087, "token_acc": 0.8501376462491397 }, { "epoch": 2.9265993265993266, "grad_norm": 1.4067864418029785, "learning_rate": 8.257186324406863e-09, "loss": 0.47229695320129395, "step": 1088, "token_acc": 0.8481876332622601 }, { "epoch": 2.929292929292929, "grad_norm": 1.5048812627792358, "learning_rate": 7.678216131587757e-09, "loss": 0.4276365041732788, "step": 1089, "token_acc": 0.8567903138736793 }, { "epoch": 2.931986531986532, "grad_norm": 1.350053071975708, "learning_rate": 7.120257050496071e-09, "loss": 0.4378651976585388, "step": 1090, "token_acc": 0.8537161133109961 }, { "epoch": 2.9346801346801348, "grad_norm": 1.308628797531128, "learning_rate": 6.583313784914725e-09, "loss": 0.4429702162742615, "step": 1091, "token_acc": 0.8539325842696629 }, { "epoch": 2.937373737373737, "grad_norm": 1.3252204656600952, "learning_rate": 6.067390861456413e-09, "loss": 0.45683854818344116, "step": 1092, "token_acc": 0.8532543431270515 }, { "epoch": 2.94006734006734, "grad_norm": 1.3999561071395874, "learning_rate": 5.57249262952475e-09, "loss": 0.4456324577331543, "step": 1093, "token_acc": 0.8555119188967029 }, { "epoch": 2.942760942760943, "grad_norm": 1.3021738529205322, "learning_rate": 5.0986232612787455e-09, "loss": 0.4576069712638855, "step": 1094, "token_acc": 0.8528182893180922 }, { "epoch": 2.9454545454545453, "grad_norm": 1.3985567092895508, "learning_rate": 4.645786751596437e-09, "loss": 0.4843981862068176, "step": 1095, "token_acc": 0.84496699669967 }, { "epoch": 2.948148148148148, "grad_norm": 1.3914210796356201, "learning_rate": 4.2139869180424235e-09, "loss": 0.4784889221191406, "step": 1096, "token_acc": 0.8408710217755444 }, { "epoch": 2.950841750841751, "grad_norm": 1.3218568563461304, "learning_rate": 3.803227400834275e-09, "loss": 0.4423666000366211, "step": 1097, "token_acc": 0.8508930034409307 }, { "epoch": 2.9535353535353535, "grad_norm": 1.401810646057129, "learning_rate": 3.413511662813118e-09, "loss": 0.46272262930870056, "step": 1098, "token_acc": 0.8465168153740563 }, { "epoch": 2.9562289562289563, "grad_norm": 1.4249076843261719, "learning_rate": 3.0448429894142096e-09, "loss": 0.43625950813293457, "step": 1099, "token_acc": 0.8555818612274632 }, { "epoch": 2.958922558922559, "grad_norm": 1.3060100078582764, "learning_rate": 2.6972244886380726e-09, "loss": 0.4409256875514984, "step": 1100, "token_acc": 0.8537014805922369 }, { "epoch": 2.958922558922559, "eval_loss": 0.6000774502754211, "eval_runtime": 10.2189, "eval_samples_per_second": 4.404, "eval_steps_per_second": 1.468, "eval_token_acc": 0.8137070043467043, "step": 1100 }, { "epoch": 2.9616161616161616, "grad_norm": 1.272207498550415, "learning_rate": 2.370659091026073e-09, "loss": 0.45354872941970825, "step": 1101, "token_acc": 0.8495662760614823 }, { "epoch": 2.9643097643097645, "grad_norm": 1.3420530557632446, "learning_rate": 2.0651495496343265e-09, "loss": 0.4635380506515503, "step": 1102, "token_acc": 0.8528394682240836 }, { "epoch": 2.967003367003367, "grad_norm": 1.2819633483886719, "learning_rate": 1.7806984400109417e-09, "loss": 0.5061292052268982, "step": 1103, "token_acc": 0.8372504829362524 }, { "epoch": 2.9696969696969697, "grad_norm": 1.362766146659851, "learning_rate": 1.5173081601746464e-09, "loss": 0.483750581741333, "step": 1104, "token_acc": 0.8422962718574728 }, { "epoch": 2.972390572390572, "grad_norm": 1.32582426071167, "learning_rate": 1.2749809305942495e-09, "loss": 0.4669121205806732, "step": 1105, "token_acc": 0.8468155630968353 }, { "epoch": 2.975084175084175, "grad_norm": 1.393170714378357, "learning_rate": 1.0537187941700445e-09, "loss": 0.46840453147888184, "step": 1106, "token_acc": 0.8499755261869799 }, { "epoch": 2.977777777777778, "grad_norm": 1.3448066711425781, "learning_rate": 8.535236162160454e-10, "loss": 0.4484027028083801, "step": 1107, "token_acc": 0.8561471279811891 }, { "epoch": 2.9804713804713803, "grad_norm": 1.2766088247299194, "learning_rate": 6.743970844449999e-10, "loss": 0.4465380907058716, "step": 1108, "token_acc": 0.8546540160314084 }, { "epoch": 2.983164983164983, "grad_norm": 1.3982439041137695, "learning_rate": 5.163407089539552e-10, "loss": 0.4449412226676941, "step": 1109, "token_acc": 0.8537470827210649 }, { "epoch": 2.985858585858586, "grad_norm": 1.4704973697662354, "learning_rate": 3.793558222117688e-10, "loss": 0.4328109920024872, "step": 1110, "token_acc": 0.8561663617984019 }, { "epoch": 2.9885521885521884, "grad_norm": 1.3781206607818604, "learning_rate": 2.634435790463408e-10, "loss": 0.456814706325531, "step": 1111, "token_acc": 0.8516021712484678 }, { "epoch": 2.9912457912457913, "grad_norm": 1.2882847785949707, "learning_rate": 1.6860495663767463e-10, "loss": 0.45441097021102905, "step": 1112, "token_acc": 0.8535392735175411 }, { "epoch": 2.993939393939394, "grad_norm": 1.378680944442749, "learning_rate": 9.484075450677532e-11, "loss": 0.45585161447525024, "step": 1113, "token_acc": 0.8524416135881104 }, { "epoch": 2.9966329966329965, "grad_norm": 1.3800830841064453, "learning_rate": 4.2151594510653205e-11, "loss": 0.4293474853038788, "step": 1114, "token_acc": 0.8551560379918589 }, { "epoch": 2.9993265993265994, "grad_norm": 1.3244179487228394, "learning_rate": 1.0537920837327965e-11, "loss": 0.4796169400215149, "step": 1115, "token_acc": 0.8450114543012877 }, { "epoch": 3.0, "grad_norm": 2.4627857208251953, "learning_rate": 0.0, "loss": 0.4717531204223633, "step": 1116, "token_acc": 0.8425821064552661 }, { "epoch": 3.0, "eval_loss": 0.6002582311630249, "eval_runtime": 10.244, "eval_samples_per_second": 4.393, "eval_steps_per_second": 1.464, "eval_token_acc": 0.813429132842427, "step": 1116 } ], "logging_steps": 1, "max_steps": 1116, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 118444065140736.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }