{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 184, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005457025920873124, "grad_norm": 10.22213420182019, "learning_rate": 0.0, "loss": 1.1573, "mean_token_accuracy": 0.7164628058671951, "num_tokens": 523676.0, "step": 1 }, { "epoch": 0.010914051841746248, "grad_norm": 10.273668310890756, "learning_rate": 1.0526315789473685e-06, "loss": 1.1616, "mean_token_accuracy": 0.7147120386362076, "num_tokens": 1045809.0, "step": 2 }, { "epoch": 0.01637107776261937, "grad_norm": 9.78774768102665, "learning_rate": 2.105263157894737e-06, "loss": 1.1567, "mean_token_accuracy": 0.7149067521095276, "num_tokens": 1570097.0, "step": 3 }, { "epoch": 0.021828103683492497, "grad_norm": 8.94313097518904, "learning_rate": 3.157894736842105e-06, "loss": 1.1348, "mean_token_accuracy": 0.7210559844970703, "num_tokens": 2091845.0, "step": 4 }, { "epoch": 0.027285129604365622, "grad_norm": 7.9721339575335834, "learning_rate": 4.210526315789474e-06, "loss": 1.1095, "mean_token_accuracy": 0.7218570560216904, "num_tokens": 2615847.0, "step": 5 }, { "epoch": 0.03274215552523874, "grad_norm": 7.592127321700853, "learning_rate": 5.263157894736842e-06, "loss": 1.0809, "mean_token_accuracy": 0.7219354957342148, "num_tokens": 3138337.0, "step": 6 }, { "epoch": 0.03819918144611187, "grad_norm": 5.919775437809893, "learning_rate": 6.31578947368421e-06, "loss": 1.0143, "mean_token_accuracy": 0.7335961759090424, "num_tokens": 3661593.0, "step": 7 }, { "epoch": 0.04365620736698499, "grad_norm": 5.601592015786545, "learning_rate": 7.368421052631579e-06, "loss": 1.0196, "mean_token_accuracy": 0.7284622341394424, "num_tokens": 4185249.0, "step": 8 }, { "epoch": 0.04911323328785812, "grad_norm": 4.982536388951164, "learning_rate": 8.421052631578948e-06, "loss": 0.9759, "mean_token_accuracy": 0.7368100732564926, "num_tokens": 4708215.0, "step": 9 }, { "epoch": 0.054570259208731244, "grad_norm": 4.249175412307748, "learning_rate": 9.473684210526315e-06, "loss": 0.9085, "mean_token_accuracy": 0.7506365925073624, "num_tokens": 5230908.0, "step": 10 }, { "epoch": 0.06002728512960437, "grad_norm": 2.66891797206269, "learning_rate": 1.0526315789473684e-05, "loss": 0.8817, "mean_token_accuracy": 0.7550760209560394, "num_tokens": 5753624.0, "step": 11 }, { "epoch": 0.06548431105047749, "grad_norm": 1.901679552444703, "learning_rate": 1.1578947368421053e-05, "loss": 0.8638, "mean_token_accuracy": 0.7565864026546478, "num_tokens": 6276373.0, "step": 12 }, { "epoch": 0.07094133697135062, "grad_norm": 1.9750884331299126, "learning_rate": 1.263157894736842e-05, "loss": 0.8356, "mean_token_accuracy": 0.7596608996391296, "num_tokens": 6800427.0, "step": 13 }, { "epoch": 0.07639836289222374, "grad_norm": 2.0247436740822646, "learning_rate": 1.3684210526315791e-05, "loss": 0.8165, "mean_token_accuracy": 0.7632935792207718, "num_tokens": 7323928.0, "step": 14 }, { "epoch": 0.08185538881309687, "grad_norm": 1.2580071009761389, "learning_rate": 1.4736842105263159e-05, "loss": 0.7843, "mean_token_accuracy": 0.7702907770872116, "num_tokens": 7847456.0, "step": 15 }, { "epoch": 0.08731241473396999, "grad_norm": 1.1026372318409008, "learning_rate": 1.578947368421053e-05, "loss": 0.7812, "mean_token_accuracy": 0.7720244079828262, "num_tokens": 8370723.0, "step": 16 }, { "epoch": 0.0927694406548431, "grad_norm": 0.9790512143637813, "learning_rate": 1.6842105263157896e-05, "loss": 0.7609, "mean_token_accuracy": 0.7757307291030884, "num_tokens": 8894432.0, "step": 17 }, { "epoch": 0.09822646657571624, "grad_norm": 0.910145815422614, "learning_rate": 1.7894736842105264e-05, "loss": 0.7538, "mean_token_accuracy": 0.7766335010528564, "num_tokens": 9418720.0, "step": 18 }, { "epoch": 0.10368349249658936, "grad_norm": 0.8433683095975495, "learning_rate": 1.894736842105263e-05, "loss": 0.7409, "mean_token_accuracy": 0.7793708741664886, "num_tokens": 9942800.0, "step": 19 }, { "epoch": 0.10914051841746249, "grad_norm": 0.7670309303084534, "learning_rate": 2e-05, "loss": 0.7321, "mean_token_accuracy": 0.7815710753202438, "num_tokens": 10466884.0, "step": 20 }, { "epoch": 0.1145975443383356, "grad_norm": 0.706537688385123, "learning_rate": 1.999818745523526e-05, "loss": 0.697, "mean_token_accuracy": 0.7905226796865463, "num_tokens": 10990544.0, "step": 21 }, { "epoch": 0.12005457025920874, "grad_norm": 0.5527165898545957, "learning_rate": 1.999275047800474e-05, "loss": 0.7128, "mean_token_accuracy": 0.7862321138381958, "num_tokens": 11514374.0, "step": 22 }, { "epoch": 0.12551159618008187, "grad_norm": 0.571109958623919, "learning_rate": 1.9983691039261358e-05, "loss": 0.7241, "mean_token_accuracy": 0.7830730974674225, "num_tokens": 12037544.0, "step": 23 }, { "epoch": 0.13096862210095497, "grad_norm": 0.5370114426229838, "learning_rate": 1.9971012423132776e-05, "loss": 0.6982, "mean_token_accuracy": 0.7900128364562988, "num_tokens": 12561482.0, "step": 24 }, { "epoch": 0.1364256480218281, "grad_norm": 0.5547056978188413, "learning_rate": 1.9954719225730847e-05, "loss": 0.7062, "mean_token_accuracy": 0.7870719134807587, "num_tokens": 13085632.0, "step": 25 }, { "epoch": 0.14188267394270124, "grad_norm": 0.4977992915278872, "learning_rate": 1.99348173534855e-05, "loss": 0.7022, "mean_token_accuracy": 0.7884464412927628, "num_tokens": 13608889.0, "step": 26 }, { "epoch": 0.14733969986357434, "grad_norm": 0.6284161854646022, "learning_rate": 1.9911314021003614e-05, "loss": 0.6929, "mean_token_accuracy": 0.7912539541721344, "num_tokens": 14132289.0, "step": 27 }, { "epoch": 0.15279672578444747, "grad_norm": 0.5715615291456563, "learning_rate": 1.9884217748453625e-05, "loss": 0.6831, "mean_token_accuracy": 0.7934748083353043, "num_tokens": 14655501.0, "step": 28 }, { "epoch": 0.1582537517053206, "grad_norm": 0.7940147931527408, "learning_rate": 1.9853538358476933e-05, "loss": 0.6985, "mean_token_accuracy": 0.7891672253608704, "num_tokens": 15178673.0, "step": 29 }, { "epoch": 0.16371077762619374, "grad_norm": 0.42070194949907197, "learning_rate": 1.9819286972627066e-05, "loss": 0.6789, "mean_token_accuracy": 0.7936267405748367, "num_tokens": 15702134.0, "step": 30 }, { "epoch": 0.16916780354706684, "grad_norm": 0.42689025398588065, "learning_rate": 1.9781476007338058e-05, "loss": 0.6661, "mean_token_accuracy": 0.7965902388095856, "num_tokens": 16225916.0, "step": 31 }, { "epoch": 0.17462482946793997, "grad_norm": 0.4069071707695385, "learning_rate": 1.9740119169423337e-05, "loss": 0.6769, "mean_token_accuracy": 0.7948543280363083, "num_tokens": 16749401.0, "step": 32 }, { "epoch": 0.1800818553888131, "grad_norm": 0.3899451941069501, "learning_rate": 1.9695231451106914e-05, "loss": 0.6849, "mean_token_accuracy": 0.7926386445760727, "num_tokens": 17273468.0, "step": 33 }, { "epoch": 0.1855388813096862, "grad_norm": 0.38714269519831274, "learning_rate": 1.964682912458856e-05, "loss": 0.6689, "mean_token_accuracy": 0.7962382435798645, "num_tokens": 17795903.0, "step": 34 }, { "epoch": 0.19099590723055934, "grad_norm": 0.3854479116669319, "learning_rate": 1.9594929736144978e-05, "loss": 0.6828, "mean_token_accuracy": 0.792619377374649, "num_tokens": 18317513.0, "step": 35 }, { "epoch": 0.19645293315143247, "grad_norm": 0.36645962752185096, "learning_rate": 1.9539552099769128e-05, "loss": 0.6582, "mean_token_accuracy": 0.7995733767747879, "num_tokens": 18840309.0, "step": 36 }, { "epoch": 0.2019099590723056, "grad_norm": 0.4016622694853883, "learning_rate": 1.9480716290349998e-05, "loss": 0.675, "mean_token_accuracy": 0.7943409830331802, "num_tokens": 19363271.0, "step": 37 }, { "epoch": 0.2073669849931787, "grad_norm": 0.3598753892733462, "learning_rate": 1.941844363639525e-05, "loss": 0.6502, "mean_token_accuracy": 0.8010804206132889, "num_tokens": 19886861.0, "step": 38 }, { "epoch": 0.21282401091405184, "grad_norm": 0.3767009308515463, "learning_rate": 1.9352756712299467e-05, "loss": 0.6622, "mean_token_accuracy": 0.7980271279811859, "num_tokens": 20410529.0, "step": 39 }, { "epoch": 0.21828103683492497, "grad_norm": 0.36380332204946886, "learning_rate": 1.9283679330160726e-05, "loss": 0.6743, "mean_token_accuracy": 0.7954283803701401, "num_tokens": 20933244.0, "step": 40 }, { "epoch": 0.22373806275579808, "grad_norm": 0.3372596025306025, "learning_rate": 1.92112365311485e-05, "loss": 0.6603, "mean_token_accuracy": 0.7978256940841675, "num_tokens": 21456172.0, "step": 41 }, { "epoch": 0.2291950886766712, "grad_norm": 0.3431291193692692, "learning_rate": 1.913545457642601e-05, "loss": 0.6474, "mean_token_accuracy": 0.8019589632749557, "num_tokens": 21979434.0, "step": 42 }, { "epoch": 0.23465211459754434, "grad_norm": 0.35120079952111755, "learning_rate": 1.905636093763031e-05, "loss": 0.6526, "mean_token_accuracy": 0.8002910017967224, "num_tokens": 22503439.0, "step": 43 }, { "epoch": 0.24010914051841747, "grad_norm": 0.32515836450165586, "learning_rate": 1.8973984286913584e-05, "loss": 0.669, "mean_token_accuracy": 0.7958613783121109, "num_tokens": 23025997.0, "step": 44 }, { "epoch": 0.24556616643929058, "grad_norm": 0.3423697459142544, "learning_rate": 1.8888354486549238e-05, "loss": 0.6508, "mean_token_accuracy": 0.8007421046495438, "num_tokens": 23548909.0, "step": 45 }, { "epoch": 0.25102319236016374, "grad_norm": 0.34254341826311147, "learning_rate": 1.8799502578106533e-05, "loss": 0.6518, "mean_token_accuracy": 0.7999228090047836, "num_tokens": 24071561.0, "step": 46 }, { "epoch": 0.25648021828103684, "grad_norm": 0.38246389596908054, "learning_rate": 1.8707460771197773e-05, "loss": 0.6643, "mean_token_accuracy": 0.7966996878385544, "num_tokens": 24594782.0, "step": 47 }, { "epoch": 0.26193724420190995, "grad_norm": 0.36747149421521913, "learning_rate": 1.861226243180201e-05, "loss": 0.6424, "mean_token_accuracy": 0.8020810931921005, "num_tokens": 25117688.0, "step": 48 }, { "epoch": 0.2673942701227831, "grad_norm": 0.3260159545079233, "learning_rate": 1.8513942070169572e-05, "loss": 0.6496, "mean_token_accuracy": 0.8000700622797012, "num_tokens": 25641600.0, "step": 49 }, { "epoch": 0.2728512960436562, "grad_norm": 0.31918709162924563, "learning_rate": 1.8412535328311813e-05, "loss": 0.6469, "mean_token_accuracy": 0.8012609481811523, "num_tokens": 26165286.0, "step": 50 }, { "epoch": 0.2783083219645293, "grad_norm": 0.3884773043904409, "learning_rate": 1.8308078967080547e-05, "loss": 0.6453, "mean_token_accuracy": 0.802044153213501, "num_tokens": 26688026.0, "step": 51 }, { "epoch": 0.2837653478854025, "grad_norm": 0.3221640246981774, "learning_rate": 1.8200610852841913e-05, "loss": 0.6447, "mean_token_accuracy": 0.8017777651548386, "num_tokens": 27211433.0, "step": 52 }, { "epoch": 0.2892223738062756, "grad_norm": 0.35950232000512794, "learning_rate": 1.8090169943749477e-05, "loss": 0.6393, "mean_token_accuracy": 0.8036545515060425, "num_tokens": 27735386.0, "step": 53 }, { "epoch": 0.2946793997271487, "grad_norm": 0.3336227394165688, "learning_rate": 1.7976796275621556e-05, "loss": 0.6532, "mean_token_accuracy": 0.7994280308485031, "num_tokens": 28259425.0, "step": 54 }, { "epoch": 0.30013642564802184, "grad_norm": 0.3159155930753706, "learning_rate": 1.7860530947427878e-05, "loss": 0.6367, "mean_token_accuracy": 0.8045277893543243, "num_tokens": 28783214.0, "step": 55 }, { "epoch": 0.30559345156889495, "grad_norm": 0.33863627132547003, "learning_rate": 1.7741416106390828e-05, "loss": 0.6447, "mean_token_accuracy": 0.8009888529777527, "num_tokens": 29306632.0, "step": 56 }, { "epoch": 0.31105047748976805, "grad_norm": 0.32414058492274905, "learning_rate": 1.761949493270671e-05, "loss": 0.6264, "mean_token_accuracy": 0.8071126639842987, "num_tokens": 29829363.0, "step": 57 }, { "epoch": 0.3165075034106412, "grad_norm": 0.343900605873189, "learning_rate": 1.7494811623892543e-05, "loss": 0.6278, "mean_token_accuracy": 0.806527853012085, "num_tokens": 30352094.0, "step": 58 }, { "epoch": 0.3219645293315143, "grad_norm": 0.36999037823632697, "learning_rate": 1.736741137876405e-05, "loss": 0.6284, "mean_token_accuracy": 0.8057119399309158, "num_tokens": 30875328.0, "step": 59 }, { "epoch": 0.3274215552523875, "grad_norm": 0.3264916869231787, "learning_rate": 1.72373403810507e-05, "loss": 0.6335, "mean_token_accuracy": 0.8045472204685211, "num_tokens": 31399221.0, "step": 60 }, { "epoch": 0.3328785811732606, "grad_norm": 0.4782591852354962, "learning_rate": 1.710464578265369e-05, "loss": 0.649, "mean_token_accuracy": 0.8001821935176849, "num_tokens": 31923238.0, "step": 61 }, { "epoch": 0.3383356070941337, "grad_norm": 0.34512671491394814, "learning_rate": 1.696937568655294e-05, "loss": 0.6343, "mean_token_accuracy": 0.8047307133674622, "num_tokens": 32446843.0, "step": 62 }, { "epoch": 0.34379263301500684, "grad_norm": 0.3795273366749496, "learning_rate": 1.6831579129369347e-05, "loss": 0.6232, "mean_token_accuracy": 0.8063706010580063, "num_tokens": 32970843.0, "step": 63 }, { "epoch": 0.34924965893587995, "grad_norm": 0.37143998348188484, "learning_rate": 1.6691306063588583e-05, "loss": 0.6303, "mean_token_accuracy": 0.8047699332237244, "num_tokens": 33494468.0, "step": 64 }, { "epoch": 0.35470668485675305, "grad_norm": 0.399010736777414, "learning_rate": 1.6548607339452853e-05, "loss": 0.6098, "mean_token_accuracy": 0.8112862259149551, "num_tokens": 34018179.0, "step": 65 }, { "epoch": 0.3601637107776262, "grad_norm": 0.41886782643797665, "learning_rate": 1.6403534686527223e-05, "loss": 0.638, "mean_token_accuracy": 0.8040533065795898, "num_tokens": 34539809.0, "step": 66 }, { "epoch": 0.3656207366984993, "grad_norm": 0.37173400377826826, "learning_rate": 1.6256140694947217e-05, "loss": 0.6198, "mean_token_accuracy": 0.8078815788030624, "num_tokens": 35063685.0, "step": 67 }, { "epoch": 0.3710777626193724, "grad_norm": 0.37848673780253644, "learning_rate": 1.6106478796354382e-05, "loss": 0.6357, "mean_token_accuracy": 0.8034750819206238, "num_tokens": 35587259.0, "step": 68 }, { "epoch": 0.3765347885402456, "grad_norm": 0.40573394210222646, "learning_rate": 1.595460324452688e-05, "loss": 0.6291, "mean_token_accuracy": 0.8048963695764542, "num_tokens": 36110522.0, "step": 69 }, { "epoch": 0.3819918144611187, "grad_norm": 0.32059584094834276, "learning_rate": 1.5800569095711983e-05, "loss": 0.6284, "mean_token_accuracy": 0.8049021810293198, "num_tokens": 36634670.0, "step": 70 }, { "epoch": 0.3874488403819918, "grad_norm": 0.38182900761291677, "learning_rate": 1.5644432188667695e-05, "loss": 0.6406, "mean_token_accuracy": 0.8030418157577515, "num_tokens": 37157066.0, "step": 71 }, { "epoch": 0.39290586630286495, "grad_norm": 0.3792868667331441, "learning_rate": 1.5486249124420702e-05, "loss": 0.6216, "mean_token_accuracy": 0.8081106394529343, "num_tokens": 37680580.0, "step": 72 }, { "epoch": 0.39836289222373805, "grad_norm": 0.3505388571032621, "learning_rate": 1.5326077245747998e-05, "loss": 0.6417, "mean_token_accuracy": 0.8015989065170288, "num_tokens": 38204191.0, "step": 73 }, { "epoch": 0.4038199181446112, "grad_norm": 0.34629977214808816, "learning_rate": 1.5163974616389621e-05, "loss": 0.6119, "mean_token_accuracy": 0.8100632429122925, "num_tokens": 38727916.0, "step": 74 }, { "epoch": 0.4092769440654843, "grad_norm": 0.3432985248757669, "learning_rate": 1.5000000000000002e-05, "loss": 0.6189, "mean_token_accuracy": 0.8073546588420868, "num_tokens": 39251672.0, "step": 75 }, { "epoch": 0.4147339699863574, "grad_norm": 0.34896847344460213, "learning_rate": 1.4834212838845639e-05, "loss": 0.6251, "mean_token_accuracy": 0.8063764572143555, "num_tokens": 39775687.0, "step": 76 }, { "epoch": 0.4201909959072306, "grad_norm": 0.3625496780454411, "learning_rate": 1.4666673232256738e-05, "loss": 0.6328, "mean_token_accuracy": 0.8043248653411865, "num_tokens": 40298660.0, "step": 77 }, { "epoch": 0.4256480218281037, "grad_norm": 0.3532777855472362, "learning_rate": 1.449744191484066e-05, "loss": 0.6409, "mean_token_accuracy": 0.8027763664722443, "num_tokens": 40822948.0, "step": 78 }, { "epoch": 0.4311050477489768, "grad_norm": 0.35365330484423485, "learning_rate": 1.4326580234465084e-05, "loss": 0.626, "mean_token_accuracy": 0.8057489842176437, "num_tokens": 41345922.0, "step": 79 }, { "epoch": 0.43656207366984995, "grad_norm": 0.3349393977670838, "learning_rate": 1.4154150130018867e-05, "loss": 0.6175, "mean_token_accuracy": 0.808809220790863, "num_tokens": 41869207.0, "step": 80 }, { "epoch": 0.44201909959072305, "grad_norm": 0.35933628997277556, "learning_rate": 1.3980214108958626e-05, "loss": 0.6277, "mean_token_accuracy": 0.8056986331939697, "num_tokens": 42393021.0, "step": 81 }, { "epoch": 0.44747612551159616, "grad_norm": 0.33096454453402563, "learning_rate": 1.380483522464923e-05, "loss": 0.6282, "mean_token_accuracy": 0.8058310747146606, "num_tokens": 42916182.0, "step": 82 }, { "epoch": 0.4529331514324693, "grad_norm": 0.30478624197164905, "learning_rate": 1.362807705350641e-05, "loss": 0.622, "mean_token_accuracy": 0.8063958883285522, "num_tokens": 43438714.0, "step": 83 }, { "epoch": 0.4583901773533424, "grad_norm": 0.3501335389226804, "learning_rate": 1.3450003671949707e-05, "loss": 0.6316, "mean_token_accuracy": 0.8037978112697601, "num_tokens": 43963002.0, "step": 84 }, { "epoch": 0.4638472032742155, "grad_norm": 0.3050292683152368, "learning_rate": 1.3270679633174219e-05, "loss": 0.6313, "mean_token_accuracy": 0.8047272562980652, "num_tokens": 44486140.0, "step": 85 }, { "epoch": 0.4693042291950887, "grad_norm": 0.37055749603164684, "learning_rate": 1.3090169943749475e-05, "loss": 0.6207, "mean_token_accuracy": 0.8078664541244507, "num_tokens": 45009734.0, "step": 86 }, { "epoch": 0.4747612551159618, "grad_norm": 0.2984594081324396, "learning_rate": 1.2908540040053992e-05, "loss": 0.6432, "mean_token_accuracy": 0.8011371046304703, "num_tokens": 45532685.0, "step": 87 }, { "epoch": 0.48021828103683495, "grad_norm": 0.3800002310817578, "learning_rate": 1.2725855764553981e-05, "loss": 0.618, "mean_token_accuracy": 0.8085715621709824, "num_tokens": 46056685.0, "step": 88 }, { "epoch": 0.48567530695770805, "grad_norm": 0.330018519060922, "learning_rate": 1.2542183341934873e-05, "loss": 0.6319, "mean_token_accuracy": 0.8046033978462219, "num_tokens": 46580056.0, "step": 89 }, { "epoch": 0.49113233287858116, "grad_norm": 0.3375380842374477, "learning_rate": 1.2357589355094275e-05, "loss": 0.6208, "mean_token_accuracy": 0.8076395243406296, "num_tokens": 47103434.0, "step": 90 }, { "epoch": 0.4965893587994543, "grad_norm": 0.3164441239625524, "learning_rate": 1.217214072100508e-05, "loss": 0.6184, "mean_token_accuracy": 0.8088521063327789, "num_tokens": 47625849.0, "step": 91 }, { "epoch": 0.5020463847203275, "grad_norm": 0.30994257381808427, "learning_rate": 1.1985904666457455e-05, "loss": 0.6137, "mean_token_accuracy": 0.8092475831508636, "num_tokens": 48149477.0, "step": 92 }, { "epoch": 0.5075034106412005, "grad_norm": 0.32961807061620957, "learning_rate": 1.179894870368854e-05, "loss": 0.6144, "mean_token_accuracy": 0.8092338591814041, "num_tokens": 48672968.0, "step": 93 }, { "epoch": 0.5129604365620737, "grad_norm": 0.2954751932341951, "learning_rate": 1.1611340605908643e-05, "loss": 0.6293, "mean_token_accuracy": 0.804235503077507, "num_tokens": 49196609.0, "step": 94 }, { "epoch": 0.5184174624829468, "grad_norm": 0.3073584261153331, "learning_rate": 1.1423148382732854e-05, "loss": 0.6073, "mean_token_accuracy": 0.8108531385660172, "num_tokens": 49720351.0, "step": 95 }, { "epoch": 0.5238744884038199, "grad_norm": 0.31592897133822806, "learning_rate": 1.1234440255526948e-05, "loss": 0.6171, "mean_token_accuracy": 0.8081915378570557, "num_tokens": 50243733.0, "step": 96 }, { "epoch": 0.529331514324693, "grad_norm": 0.34742265287757973, "learning_rate": 1.1045284632676535e-05, "loss": 0.6318, "mean_token_accuracy": 0.8044733256101608, "num_tokens": 50767754.0, "step": 97 }, { "epoch": 0.5347885402455662, "grad_norm": 0.34110294551441445, "learning_rate": 1.08557500847884e-05, "loss": 0.6188, "mean_token_accuracy": 0.8076249808073044, "num_tokens": 51291988.0, "step": 98 }, { "epoch": 0.5402455661664393, "grad_norm": 0.32189183372762564, "learning_rate": 1.066590531983304e-05, "loss": 0.6152, "mean_token_accuracy": 0.8090378940105438, "num_tokens": 51815310.0, "step": 99 }, { "epoch": 0.5457025920873124, "grad_norm": 0.29768781279014606, "learning_rate": 1.0475819158237426e-05, "loss": 0.6247, "mean_token_accuracy": 0.806323915719986, "num_tokens": 52338013.0, "step": 100 }, { "epoch": 0.5511596180081856, "grad_norm": 0.33743263078601365, "learning_rate": 1.0285560507936962e-05, "loss": 0.6294, "mean_token_accuracy": 0.8047656267881393, "num_tokens": 52860936.0, "step": 101 }, { "epoch": 0.5566166439290586, "grad_norm": 0.30513251377145567, "learning_rate": 1.0095198339395769e-05, "loss": 0.6136, "mean_token_accuracy": 0.8092543631792068, "num_tokens": 53383863.0, "step": 102 }, { "epoch": 0.5620736698499318, "grad_norm": 0.2822146986279436, "learning_rate": 9.904801660604234e-06, "loss": 0.6096, "mean_token_accuracy": 0.8101857900619507, "num_tokens": 53906296.0, "step": 103 }, { "epoch": 0.567530695770805, "grad_norm": 0.2846732632438344, "learning_rate": 9.71443949206304e-06, "loss": 0.6145, "mean_token_accuracy": 0.8090496212244034, "num_tokens": 54429769.0, "step": 104 }, { "epoch": 0.572987721691678, "grad_norm": 0.29261503618482076, "learning_rate": 9.524180841762577e-06, "loss": 0.6077, "mean_token_accuracy": 0.8109816312789917, "num_tokens": 54953622.0, "step": 105 }, { "epoch": 0.5784447476125512, "grad_norm": 0.29743888731648277, "learning_rate": 9.334094680166962e-06, "loss": 0.6254, "mean_token_accuracy": 0.8057630807161331, "num_tokens": 55477635.0, "step": 106 }, { "epoch": 0.5839017735334243, "grad_norm": 0.313788538568538, "learning_rate": 9.144249915211605e-06, "loss": 0.5959, "mean_token_accuracy": 0.8136271983385086, "num_tokens": 56000074.0, "step": 107 }, { "epoch": 0.5893587994542974, "grad_norm": 0.28533146520527214, "learning_rate": 8.954715367323468e-06, "loss": 0.6167, "mean_token_accuracy": 0.8088293522596359, "num_tokens": 56522578.0, "step": 108 }, { "epoch": 0.5948158253751705, "grad_norm": 0.30428085575192637, "learning_rate": 8.765559744473054e-06, "loss": 0.6113, "mean_token_accuracy": 0.8102127313613892, "num_tokens": 57046143.0, "step": 109 }, { "epoch": 0.6002728512960437, "grad_norm": 0.2816148674353229, "learning_rate": 8.576851617267151e-06, "loss": 0.6024, "mean_token_accuracy": 0.8121312856674194, "num_tokens": 57569414.0, "step": 110 }, { "epoch": 0.6057298772169167, "grad_norm": 0.2745800257475884, "learning_rate": 8.388659394091362e-06, "loss": 0.604, "mean_token_accuracy": 0.8116618692874908, "num_tokens": 58093133.0, "step": 111 }, { "epoch": 0.6111869031377899, "grad_norm": 0.2852956217506078, "learning_rate": 8.201051296311462e-06, "loss": 0.6121, "mean_token_accuracy": 0.8092280626296997, "num_tokens": 58616445.0, "step": 112 }, { "epoch": 0.616643929058663, "grad_norm": 0.3028924823430585, "learning_rate": 8.014095333542548e-06, "loss": 0.6277, "mean_token_accuracy": 0.8050173074007034, "num_tokens": 59140029.0, "step": 113 }, { "epoch": 0.6221009549795361, "grad_norm": 0.2878705074568443, "learning_rate": 7.827859278994924e-06, "loss": 0.6183, "mean_token_accuracy": 0.8077353686094284, "num_tokens": 59663111.0, "step": 114 }, { "epoch": 0.6275579809004093, "grad_norm": 0.2915197799998971, "learning_rate": 7.642410644905726e-06, "loss": 0.6055, "mean_token_accuracy": 0.8112288415431976, "num_tokens": 60187141.0, "step": 115 }, { "epoch": 0.6330150068212824, "grad_norm": 0.28904450320434766, "learning_rate": 7.4578166580651335e-06, "loss": 0.5984, "mean_token_accuracy": 0.8130005896091461, "num_tokens": 60711004.0, "step": 116 }, { "epoch": 0.6384720327421555, "grad_norm": 0.28028896566316097, "learning_rate": 7.274144235446024e-06, "loss": 0.6186, "mean_token_accuracy": 0.807890847325325, "num_tokens": 61233923.0, "step": 117 }, { "epoch": 0.6439290586630286, "grad_norm": 0.2913180494222617, "learning_rate": 7.0914599599460095e-06, "loss": 0.6136, "mean_token_accuracy": 0.8088892549276352, "num_tokens": 61758088.0, "step": 118 }, { "epoch": 0.6493860845839018, "grad_norm": 0.30378873402714074, "learning_rate": 6.909830056250527e-06, "loss": 0.6079, "mean_token_accuracy": 0.810408428311348, "num_tokens": 62281964.0, "step": 119 }, { "epoch": 0.654843110504775, "grad_norm": 0.2805017521562141, "learning_rate": 6.729320366825785e-06, "loss": 0.6015, "mean_token_accuracy": 0.8118923008441925, "num_tokens": 62805421.0, "step": 120 }, { "epoch": 0.660300136425648, "grad_norm": 0.2900380570316718, "learning_rate": 6.549996328050296e-06, "loss": 0.6162, "mean_token_accuracy": 0.8080224990844727, "num_tokens": 63327512.0, "step": 121 }, { "epoch": 0.6657571623465212, "grad_norm": 0.28161953335473705, "learning_rate": 6.3719229464935915e-06, "loss": 0.611, "mean_token_accuracy": 0.8101419806480408, "num_tokens": 63851052.0, "step": 122 }, { "epoch": 0.6712141882673943, "grad_norm": 0.27911943019979396, "learning_rate": 6.19516477535077e-06, "loss": 0.6144, "mean_token_accuracy": 0.8090348690748215, "num_tokens": 64373976.0, "step": 123 }, { "epoch": 0.6766712141882674, "grad_norm": 0.28869189523406197, "learning_rate": 6.019785891041381e-06, "loss": 0.6027, "mean_token_accuracy": 0.8112694770097733, "num_tokens": 64897673.0, "step": 124 }, { "epoch": 0.6821282401091405, "grad_norm": 0.3116639999006473, "learning_rate": 5.845849869981137e-06, "loss": 0.6071, "mean_token_accuracy": 0.8102435320615768, "num_tokens": 65420396.0, "step": 125 }, { "epoch": 0.6875852660300137, "grad_norm": 0.2794198694229315, "learning_rate": 5.673419765534915e-06, "loss": 0.6139, "mean_token_accuracy": 0.8096896409988403, "num_tokens": 65944684.0, "step": 126 }, { "epoch": 0.6930422919508867, "grad_norm": 0.29207321018083127, "learning_rate": 5.502558085159344e-06, "loss": 0.6134, "mean_token_accuracy": 0.8090117424726486, "num_tokens": 66467447.0, "step": 127 }, { "epoch": 0.6984993178717599, "grad_norm": 0.29215460275378646, "learning_rate": 5.333326767743263e-06, "loss": 0.6035, "mean_token_accuracy": 0.8116814643144608, "num_tokens": 66990954.0, "step": 128 }, { "epoch": 0.703956343792633, "grad_norm": 0.30182577234260344, "learning_rate": 5.165787161154361e-06, "loss": 0.6198, "mean_token_accuracy": 0.8080471009016037, "num_tokens": 67513914.0, "step": 129 }, { "epoch": 0.7094133697135061, "grad_norm": 0.28575689674009214, "learning_rate": 5.000000000000003e-06, "loss": 0.6156, "mean_token_accuracy": 0.8082942962646484, "num_tokens": 68036824.0, "step": 130 }, { "epoch": 0.7148703956343793, "grad_norm": 0.28922672322466253, "learning_rate": 4.836025383610382e-06, "loss": 0.6003, "mean_token_accuracy": 0.8127593100070953, "num_tokens": 68559651.0, "step": 131 }, { "epoch": 0.7203274215552524, "grad_norm": 0.298101675623923, "learning_rate": 4.673922754252001e-06, "loss": 0.6168, "mean_token_accuracy": 0.8083733916282654, "num_tokens": 69082066.0, "step": 132 }, { "epoch": 0.7257844474761255, "grad_norm": 0.2744186495019478, "learning_rate": 4.513750875579303e-06, "loss": 0.6054, "mean_token_accuracy": 0.8108874261379242, "num_tokens": 69604652.0, "step": 133 }, { "epoch": 0.7312414733969986, "grad_norm": 0.2830493473939992, "learning_rate": 4.355567811332311e-06, "loss": 0.6124, "mean_token_accuracy": 0.8090694695711136, "num_tokens": 70128721.0, "step": 134 }, { "epoch": 0.7366984993178718, "grad_norm": 0.26452385051400257, "learning_rate": 4.19943090428802e-06, "loss": 0.609, "mean_token_accuracy": 0.8103781342506409, "num_tokens": 70651715.0, "step": 135 }, { "epoch": 0.7421555252387448, "grad_norm": 0.302575355186892, "learning_rate": 4.045396755473121e-06, "loss": 0.6155, "mean_token_accuracy": 0.8087055832147598, "num_tokens": 71175451.0, "step": 136 }, { "epoch": 0.747612551159618, "grad_norm": 0.29021714961394307, "learning_rate": 3.893521203645618e-06, "loss": 0.6107, "mean_token_accuracy": 0.8100896179676056, "num_tokens": 71699274.0, "step": 137 }, { "epoch": 0.7530695770804912, "grad_norm": 0.26530999183379256, "learning_rate": 3.743859305052785e-06, "loss": 0.6012, "mean_token_accuracy": 0.8125804513692856, "num_tokens": 72221747.0, "step": 138 }, { "epoch": 0.7585266030013642, "grad_norm": 0.2664134356854169, "learning_rate": 3.596465313472778e-06, "loss": 0.6049, "mean_token_accuracy": 0.8113019466400146, "num_tokens": 72745367.0, "step": 139 }, { "epoch": 0.7639836289222374, "grad_norm": 0.2726636629633549, "learning_rate": 3.4513926605471504e-06, "loss": 0.6029, "mean_token_accuracy": 0.8118140697479248, "num_tokens": 73269309.0, "step": 140 }, { "epoch": 0.7694406548431105, "grad_norm": 0.2823783285707402, "learning_rate": 3.308693936411421e-06, "loss": 0.5984, "mean_token_accuracy": 0.8139031380414963, "num_tokens": 73793330.0, "step": 141 }, { "epoch": 0.7748976807639836, "grad_norm": 0.2842494431708805, "learning_rate": 3.1684208706306572e-06, "loss": 0.6038, "mean_token_accuracy": 0.811387911438942, "num_tokens": 74315676.0, "step": 142 }, { "epoch": 0.7803547066848567, "grad_norm": 0.27140851078870026, "learning_rate": 3.0306243134470668e-06, "loss": 0.6013, "mean_token_accuracy": 0.8124092221260071, "num_tokens": 74839637.0, "step": 143 }, { "epoch": 0.7858117326057299, "grad_norm": 0.2798670919794706, "learning_rate": 2.8953542173463133e-06, "loss": 0.6106, "mean_token_accuracy": 0.8100556433200836, "num_tokens": 75363497.0, "step": 144 }, { "epoch": 0.791268758526603, "grad_norm": 0.28033902054535437, "learning_rate": 2.7626596189492983e-06, "loss": 0.6027, "mean_token_accuracy": 0.8115111291408539, "num_tokens": 75887489.0, "step": 145 }, { "epoch": 0.7967257844474761, "grad_norm": 0.27490448958024705, "learning_rate": 2.6325886212359496e-06, "loss": 0.6182, "mean_token_accuracy": 0.8073505163192749, "num_tokens": 76409584.0, "step": 146 }, { "epoch": 0.8021828103683493, "grad_norm": 0.24675144845595406, "learning_rate": 2.5051883761074613e-06, "loss": 0.6028, "mean_token_accuracy": 0.8121288418769836, "num_tokens": 76931524.0, "step": 147 }, { "epoch": 0.8076398362892224, "grad_norm": 0.25519179346777965, "learning_rate": 2.380505067293293e-06, "loss": 0.6196, "mean_token_accuracy": 0.8076845556497574, "num_tokens": 77454057.0, "step": 148 }, { "epoch": 0.8130968622100955, "grad_norm": 0.26185996250871496, "learning_rate": 2.2585838936091753e-06, "loss": 0.6017, "mean_token_accuracy": 0.8125504702329636, "num_tokens": 77977862.0, "step": 149 }, { "epoch": 0.8185538881309686, "grad_norm": 0.26113753260747125, "learning_rate": 2.1394690525721275e-06, "loss": 0.6004, "mean_token_accuracy": 0.8124841898679733, "num_tokens": 78501396.0, "step": 150 }, { "epoch": 0.8240109140518418, "grad_norm": 0.26467961910583754, "learning_rate": 2.0232037243784475e-06, "loss": 0.6119, "mean_token_accuracy": 0.8094252794981003, "num_tokens": 79024199.0, "step": 151 }, { "epoch": 0.8294679399727148, "grad_norm": 0.2644047665450295, "learning_rate": 1.9098300562505266e-06, "loss": 0.6004, "mean_token_accuracy": 0.8127636909484863, "num_tokens": 79548077.0, "step": 152 }, { "epoch": 0.834924965893588, "grad_norm": 0.2516602132329715, "learning_rate": 1.7993891471580894e-06, "loss": 0.6205, "mean_token_accuracy": 0.8068733364343643, "num_tokens": 80070613.0, "step": 153 }, { "epoch": 0.8403819918144612, "grad_norm": 0.25189418048747564, "learning_rate": 1.6919210329194535e-06, "loss": 0.5925, "mean_token_accuracy": 0.81495700776577, "num_tokens": 80593271.0, "step": 154 }, { "epoch": 0.8458390177353342, "grad_norm": 0.2541947348763928, "learning_rate": 1.587464671688187e-06, "loss": 0.6052, "mean_token_accuracy": 0.8111275136470795, "num_tokens": 81117027.0, "step": 155 }, { "epoch": 0.8512960436562074, "grad_norm": 0.2485428118091583, "learning_rate": 1.4860579298304311e-06, "loss": 0.6027, "mean_token_accuracy": 0.8118527084589005, "num_tokens": 81639970.0, "step": 156 }, { "epoch": 0.8567530695770805, "grad_norm": 0.2558642894347357, "learning_rate": 1.3877375681979944e-06, "loss": 0.6179, "mean_token_accuracy": 0.8081785440444946, "num_tokens": 82164220.0, "step": 157 }, { "epoch": 0.8622100954979536, "grad_norm": 0.2616239161200991, "learning_rate": 1.2925392288022299e-06, "loss": 0.6006, "mean_token_accuracy": 0.8130854815244675, "num_tokens": 82687126.0, "step": 158 }, { "epoch": 0.8676671214188267, "grad_norm": 0.2503849676832415, "learning_rate": 1.2004974218934695e-06, "loss": 0.6121, "mean_token_accuracy": 0.8090514242649078, "num_tokens": 83211082.0, "step": 159 }, { "epoch": 0.8731241473396999, "grad_norm": 0.25569881744843687, "learning_rate": 1.1116455134507665e-06, "loss": 0.5978, "mean_token_accuracy": 0.8128385543823242, "num_tokens": 83734481.0, "step": 160 }, { "epoch": 0.878581173260573, "grad_norm": 0.258101439807178, "learning_rate": 1.0260157130864178e-06, "loss": 0.5997, "mean_token_accuracy": 0.8123024553060532, "num_tokens": 84258601.0, "step": 161 }, { "epoch": 0.8840381991814461, "grad_norm": 0.25927293039307325, "learning_rate": 9.436390623696911e-07, "loss": 0.6124, "mean_token_accuracy": 0.8082059770822525, "num_tokens": 84781078.0, "step": 162 }, { "epoch": 0.8894952251023193, "grad_norm": 0.2526035772284644, "learning_rate": 8.645454235739903e-07, "loss": 0.5948, "mean_token_accuracy": 0.8139047026634216, "num_tokens": 85303323.0, "step": 163 }, { "epoch": 0.8949522510231923, "grad_norm": 0.24869120811837703, "learning_rate": 7.887634688515e-07, "loss": 0.6007, "mean_token_accuracy": 0.8121525943279266, "num_tokens": 85826587.0, "step": 164 }, { "epoch": 0.9004092769440655, "grad_norm": 0.2585783712565244, "learning_rate": 7.163206698392744e-07, "loss": 0.6067, "mean_token_accuracy": 0.8101864755153656, "num_tokens": 86348762.0, "step": 165 }, { "epoch": 0.9058663028649386, "grad_norm": 0.2518620468900349, "learning_rate": 6.472432877005341e-07, "loss": 0.5887, "mean_token_accuracy": 0.8158304989337921, "num_tokens": 86871300.0, "step": 166 }, { "epoch": 0.9113233287858117, "grad_norm": 0.244004632190641, "learning_rate": 5.815563636047539e-07, "loss": 0.6072, "mean_token_accuracy": 0.810369223356247, "num_tokens": 87394281.0, "step": 167 }, { "epoch": 0.9167803547066848, "grad_norm": 0.24445608373025496, "learning_rate": 5.192837096500058e-07, "loss": 0.5996, "mean_token_accuracy": 0.8120895475149155, "num_tokens": 87918100.0, "step": 168 }, { "epoch": 0.922237380627558, "grad_norm": 0.2513116743689248, "learning_rate": 4.6044790023087373e-07, "loss": 0.6019, "mean_token_accuracy": 0.812640592455864, "num_tokens": 88441158.0, "step": 169 }, { "epoch": 0.927694406548431, "grad_norm": 0.24582299262926002, "learning_rate": 4.0507026385502747e-07, "loss": 0.6141, "mean_token_accuracy": 0.8093185424804688, "num_tokens": 88964881.0, "step": 170 }, { "epoch": 0.9331514324693042, "grad_norm": 0.24125628809062805, "learning_rate": 3.531708754114438e-07, "loss": 0.5934, "mean_token_accuracy": 0.8144457191228867, "num_tokens": 89487871.0, "step": 171 }, { "epoch": 0.9386084583901774, "grad_norm": 0.24933232718734447, "learning_rate": 3.0476854889308737e-07, "loss": 0.6121, "mean_token_accuracy": 0.8091763854026794, "num_tokens": 90011304.0, "step": 172 }, { "epoch": 0.9440654843110505, "grad_norm": 0.24998307613649684, "learning_rate": 2.5988083057666534e-07, "loss": 0.6128, "mean_token_accuracy": 0.8095022439956665, "num_tokens": 90535512.0, "step": 173 }, { "epoch": 0.9495225102319236, "grad_norm": 0.2603433846994922, "learning_rate": 2.1852399266194312e-07, "loss": 0.6096, "mean_token_accuracy": 0.8097108900547028, "num_tokens": 91057921.0, "step": 174 }, { "epoch": 0.9549795361527967, "grad_norm": 0.2472084519543789, "learning_rate": 1.8071302737293294e-07, "loss": 0.6117, "mean_token_accuracy": 0.8093229830265045, "num_tokens": 91581472.0, "step": 175 }, { "epoch": 0.9604365620736699, "grad_norm": 0.24695647352252775, "learning_rate": 1.464616415230702e-07, "loss": 0.6055, "mean_token_accuracy": 0.810840904712677, "num_tokens": 92105494.0, "step": 176 }, { "epoch": 0.965893587994543, "grad_norm": 0.24599551241247092, "learning_rate": 1.1578225154637579e-07, "loss": 0.6038, "mean_token_accuracy": 0.8120936304330826, "num_tokens": 92629136.0, "step": 177 }, { "epoch": 0.9713506139154161, "grad_norm": 0.25292592350349663, "learning_rate": 8.868597899638897e-08, "loss": 0.5969, "mean_token_accuracy": 0.813206359744072, "num_tokens": 93153344.0, "step": 178 }, { "epoch": 0.9768076398362893, "grad_norm": 0.24220493608697735, "learning_rate": 6.51826465144978e-08, "loss": 0.6064, "mean_token_accuracy": 0.8107435554265976, "num_tokens": 93677179.0, "step": 179 }, { "epoch": 0.9822646657571623, "grad_norm": 0.251125383266427, "learning_rate": 4.528077426915412e-08, "loss": 0.6135, "mean_token_accuracy": 0.8094018846750259, "num_tokens": 94199742.0, "step": 180 }, { "epoch": 0.9877216916780355, "grad_norm": 0.24330075967604267, "learning_rate": 2.898757686722542e-08, "loss": 0.6076, "mean_token_accuracy": 0.8115710318088531, "num_tokens": 94723291.0, "step": 181 }, { "epoch": 0.9931787175989086, "grad_norm": 0.25502156218936445, "learning_rate": 1.630896073864352e-08, "loss": 0.6099, "mean_token_accuracy": 0.8100082129240036, "num_tokens": 95247067.0, "step": 182 }, { "epoch": 0.9986357435197817, "grad_norm": 0.24338031531558027, "learning_rate": 7.2495219952639636e-09, "loss": 0.5994, "mean_token_accuracy": 0.8127514272928238, "num_tokens": 95771253.0, "step": 183 }, { "epoch": 1.0, "grad_norm": 0.24338031531558027, "learning_rate": 1.8125447647421302e-09, "loss": 0.5996, "mean_token_accuracy": 0.8128812313079834, "num_tokens": 95902052.0, "step": 184 }, { "epoch": 1.0, "step": 184, "total_flos": 83412022984704.0, "train_loss": 0.0, "train_runtime": 0.0447, "train_samples_per_second": 2097588.378, "train_steps_per_second": 4117.613 } ], "logging_steps": 1, "max_steps": 184, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 83412022984704.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }