| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 184, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005457025920873124, | |
| "grad_norm": 10.22213420182019, | |
| "learning_rate": 0.0, | |
| "loss": 1.1573, | |
| "mean_token_accuracy": 0.7164628058671951, | |
| "num_tokens": 523676.0, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.010914051841746248, | |
| "grad_norm": 10.273668310890756, | |
| "learning_rate": 1.0526315789473685e-06, | |
| "loss": 1.1616, | |
| "mean_token_accuracy": 0.7147120386362076, | |
| "num_tokens": 1045809.0, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.01637107776261937, | |
| "grad_norm": 9.78774768102665, | |
| "learning_rate": 2.105263157894737e-06, | |
| "loss": 1.1567, | |
| "mean_token_accuracy": 0.7149067521095276, | |
| "num_tokens": 1570097.0, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.021828103683492497, | |
| "grad_norm": 8.94313097518904, | |
| "learning_rate": 3.157894736842105e-06, | |
| "loss": 1.1348, | |
| "mean_token_accuracy": 0.7210559844970703, | |
| "num_tokens": 2091845.0, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.027285129604365622, | |
| "grad_norm": 7.9721339575335834, | |
| "learning_rate": 4.210526315789474e-06, | |
| "loss": 1.1095, | |
| "mean_token_accuracy": 0.7218570560216904, | |
| "num_tokens": 2615847.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.03274215552523874, | |
| "grad_norm": 7.592127321700853, | |
| "learning_rate": 5.263157894736842e-06, | |
| "loss": 1.0809, | |
| "mean_token_accuracy": 0.7219354957342148, | |
| "num_tokens": 3138337.0, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.03819918144611187, | |
| "grad_norm": 5.919775437809893, | |
| "learning_rate": 6.31578947368421e-06, | |
| "loss": 1.0143, | |
| "mean_token_accuracy": 0.7335961759090424, | |
| "num_tokens": 3661593.0, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.04365620736698499, | |
| "grad_norm": 5.601592015786545, | |
| "learning_rate": 7.368421052631579e-06, | |
| "loss": 1.0196, | |
| "mean_token_accuracy": 0.7284622341394424, | |
| "num_tokens": 4185249.0, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.04911323328785812, | |
| "grad_norm": 4.982536388951164, | |
| "learning_rate": 8.421052631578948e-06, | |
| "loss": 0.9759, | |
| "mean_token_accuracy": 0.7368100732564926, | |
| "num_tokens": 4708215.0, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.054570259208731244, | |
| "grad_norm": 4.249175412307748, | |
| "learning_rate": 9.473684210526315e-06, | |
| "loss": 0.9085, | |
| "mean_token_accuracy": 0.7506365925073624, | |
| "num_tokens": 5230908.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.06002728512960437, | |
| "grad_norm": 2.66891797206269, | |
| "learning_rate": 1.0526315789473684e-05, | |
| "loss": 0.8817, | |
| "mean_token_accuracy": 0.7550760209560394, | |
| "num_tokens": 5753624.0, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.06548431105047749, | |
| "grad_norm": 1.901679552444703, | |
| "learning_rate": 1.1578947368421053e-05, | |
| "loss": 0.8638, | |
| "mean_token_accuracy": 0.7565864026546478, | |
| "num_tokens": 6276373.0, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.07094133697135062, | |
| "grad_norm": 1.9750884331299126, | |
| "learning_rate": 1.263157894736842e-05, | |
| "loss": 0.8356, | |
| "mean_token_accuracy": 0.7596608996391296, | |
| "num_tokens": 6800427.0, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.07639836289222374, | |
| "grad_norm": 2.0247436740822646, | |
| "learning_rate": 1.3684210526315791e-05, | |
| "loss": 0.8165, | |
| "mean_token_accuracy": 0.7632935792207718, | |
| "num_tokens": 7323928.0, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.08185538881309687, | |
| "grad_norm": 1.2580071009761389, | |
| "learning_rate": 1.4736842105263159e-05, | |
| "loss": 0.7843, | |
| "mean_token_accuracy": 0.7702907770872116, | |
| "num_tokens": 7847456.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.08731241473396999, | |
| "grad_norm": 1.1026372318409008, | |
| "learning_rate": 1.578947368421053e-05, | |
| "loss": 0.7812, | |
| "mean_token_accuracy": 0.7720244079828262, | |
| "num_tokens": 8370723.0, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.0927694406548431, | |
| "grad_norm": 0.9790512143637813, | |
| "learning_rate": 1.6842105263157896e-05, | |
| "loss": 0.7609, | |
| "mean_token_accuracy": 0.7757307291030884, | |
| "num_tokens": 8894432.0, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.09822646657571624, | |
| "grad_norm": 0.910145815422614, | |
| "learning_rate": 1.7894736842105264e-05, | |
| "loss": 0.7538, | |
| "mean_token_accuracy": 0.7766335010528564, | |
| "num_tokens": 9418720.0, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.10368349249658936, | |
| "grad_norm": 0.8433683095975495, | |
| "learning_rate": 1.894736842105263e-05, | |
| "loss": 0.7409, | |
| "mean_token_accuracy": 0.7793708741664886, | |
| "num_tokens": 9942800.0, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.10914051841746249, | |
| "grad_norm": 0.7670309303084534, | |
| "learning_rate": 2e-05, | |
| "loss": 0.7321, | |
| "mean_token_accuracy": 0.7815710753202438, | |
| "num_tokens": 10466884.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.1145975443383356, | |
| "grad_norm": 0.706537688385123, | |
| "learning_rate": 1.999818745523526e-05, | |
| "loss": 0.697, | |
| "mean_token_accuracy": 0.7905226796865463, | |
| "num_tokens": 10990544.0, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.12005457025920874, | |
| "grad_norm": 0.5527165898545957, | |
| "learning_rate": 1.999275047800474e-05, | |
| "loss": 0.7128, | |
| "mean_token_accuracy": 0.7862321138381958, | |
| "num_tokens": 11514374.0, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.12551159618008187, | |
| "grad_norm": 0.571109958623919, | |
| "learning_rate": 1.9983691039261358e-05, | |
| "loss": 0.7241, | |
| "mean_token_accuracy": 0.7830730974674225, | |
| "num_tokens": 12037544.0, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.13096862210095497, | |
| "grad_norm": 0.5370114426229838, | |
| "learning_rate": 1.9971012423132776e-05, | |
| "loss": 0.6982, | |
| "mean_token_accuracy": 0.7900128364562988, | |
| "num_tokens": 12561482.0, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.1364256480218281, | |
| "grad_norm": 0.5547056978188413, | |
| "learning_rate": 1.9954719225730847e-05, | |
| "loss": 0.7062, | |
| "mean_token_accuracy": 0.7870719134807587, | |
| "num_tokens": 13085632.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.14188267394270124, | |
| "grad_norm": 0.4977992915278872, | |
| "learning_rate": 1.99348173534855e-05, | |
| "loss": 0.7022, | |
| "mean_token_accuracy": 0.7884464412927628, | |
| "num_tokens": 13608889.0, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.14733969986357434, | |
| "grad_norm": 0.6284161854646022, | |
| "learning_rate": 1.9911314021003614e-05, | |
| "loss": 0.6929, | |
| "mean_token_accuracy": 0.7912539541721344, | |
| "num_tokens": 14132289.0, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.15279672578444747, | |
| "grad_norm": 0.5715615291456563, | |
| "learning_rate": 1.9884217748453625e-05, | |
| "loss": 0.6831, | |
| "mean_token_accuracy": 0.7934748083353043, | |
| "num_tokens": 14655501.0, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.1582537517053206, | |
| "grad_norm": 0.7940147931527408, | |
| "learning_rate": 1.9853538358476933e-05, | |
| "loss": 0.6985, | |
| "mean_token_accuracy": 0.7891672253608704, | |
| "num_tokens": 15178673.0, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.16371077762619374, | |
| "grad_norm": 0.42070194949907197, | |
| "learning_rate": 1.9819286972627066e-05, | |
| "loss": 0.6789, | |
| "mean_token_accuracy": 0.7936267405748367, | |
| "num_tokens": 15702134.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.16916780354706684, | |
| "grad_norm": 0.42689025398588065, | |
| "learning_rate": 1.9781476007338058e-05, | |
| "loss": 0.6661, | |
| "mean_token_accuracy": 0.7965902388095856, | |
| "num_tokens": 16225916.0, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.17462482946793997, | |
| "grad_norm": 0.4069071707695385, | |
| "learning_rate": 1.9740119169423337e-05, | |
| "loss": 0.6769, | |
| "mean_token_accuracy": 0.7948543280363083, | |
| "num_tokens": 16749401.0, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.1800818553888131, | |
| "grad_norm": 0.3899451941069501, | |
| "learning_rate": 1.9695231451106914e-05, | |
| "loss": 0.6849, | |
| "mean_token_accuracy": 0.7926386445760727, | |
| "num_tokens": 17273468.0, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.1855388813096862, | |
| "grad_norm": 0.38714269519831274, | |
| "learning_rate": 1.964682912458856e-05, | |
| "loss": 0.6689, | |
| "mean_token_accuracy": 0.7962382435798645, | |
| "num_tokens": 17795903.0, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.19099590723055934, | |
| "grad_norm": 0.3854479116669319, | |
| "learning_rate": 1.9594929736144978e-05, | |
| "loss": 0.6828, | |
| "mean_token_accuracy": 0.792619377374649, | |
| "num_tokens": 18317513.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.19645293315143247, | |
| "grad_norm": 0.36645962752185096, | |
| "learning_rate": 1.9539552099769128e-05, | |
| "loss": 0.6582, | |
| "mean_token_accuracy": 0.7995733767747879, | |
| "num_tokens": 18840309.0, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.2019099590723056, | |
| "grad_norm": 0.4016622694853883, | |
| "learning_rate": 1.9480716290349998e-05, | |
| "loss": 0.675, | |
| "mean_token_accuracy": 0.7943409830331802, | |
| "num_tokens": 19363271.0, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.2073669849931787, | |
| "grad_norm": 0.3598753892733462, | |
| "learning_rate": 1.941844363639525e-05, | |
| "loss": 0.6502, | |
| "mean_token_accuracy": 0.8010804206132889, | |
| "num_tokens": 19886861.0, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.21282401091405184, | |
| "grad_norm": 0.3767009308515463, | |
| "learning_rate": 1.9352756712299467e-05, | |
| "loss": 0.6622, | |
| "mean_token_accuracy": 0.7980271279811859, | |
| "num_tokens": 20410529.0, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.21828103683492497, | |
| "grad_norm": 0.36380332204946886, | |
| "learning_rate": 1.9283679330160726e-05, | |
| "loss": 0.6743, | |
| "mean_token_accuracy": 0.7954283803701401, | |
| "num_tokens": 20933244.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.22373806275579808, | |
| "grad_norm": 0.3372596025306025, | |
| "learning_rate": 1.92112365311485e-05, | |
| "loss": 0.6603, | |
| "mean_token_accuracy": 0.7978256940841675, | |
| "num_tokens": 21456172.0, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.2291950886766712, | |
| "grad_norm": 0.3431291193692692, | |
| "learning_rate": 1.913545457642601e-05, | |
| "loss": 0.6474, | |
| "mean_token_accuracy": 0.8019589632749557, | |
| "num_tokens": 21979434.0, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.23465211459754434, | |
| "grad_norm": 0.35120079952111755, | |
| "learning_rate": 1.905636093763031e-05, | |
| "loss": 0.6526, | |
| "mean_token_accuracy": 0.8002910017967224, | |
| "num_tokens": 22503439.0, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.24010914051841747, | |
| "grad_norm": 0.32515836450165586, | |
| "learning_rate": 1.8973984286913584e-05, | |
| "loss": 0.669, | |
| "mean_token_accuracy": 0.7958613783121109, | |
| "num_tokens": 23025997.0, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.24556616643929058, | |
| "grad_norm": 0.3423697459142544, | |
| "learning_rate": 1.8888354486549238e-05, | |
| "loss": 0.6508, | |
| "mean_token_accuracy": 0.8007421046495438, | |
| "num_tokens": 23548909.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.25102319236016374, | |
| "grad_norm": 0.34254341826311147, | |
| "learning_rate": 1.8799502578106533e-05, | |
| "loss": 0.6518, | |
| "mean_token_accuracy": 0.7999228090047836, | |
| "num_tokens": 24071561.0, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.25648021828103684, | |
| "grad_norm": 0.38246389596908054, | |
| "learning_rate": 1.8707460771197773e-05, | |
| "loss": 0.6643, | |
| "mean_token_accuracy": 0.7966996878385544, | |
| "num_tokens": 24594782.0, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.26193724420190995, | |
| "grad_norm": 0.36747149421521913, | |
| "learning_rate": 1.861226243180201e-05, | |
| "loss": 0.6424, | |
| "mean_token_accuracy": 0.8020810931921005, | |
| "num_tokens": 25117688.0, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.2673942701227831, | |
| "grad_norm": 0.3260159545079233, | |
| "learning_rate": 1.8513942070169572e-05, | |
| "loss": 0.6496, | |
| "mean_token_accuracy": 0.8000700622797012, | |
| "num_tokens": 25641600.0, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.2728512960436562, | |
| "grad_norm": 0.31918709162924563, | |
| "learning_rate": 1.8412535328311813e-05, | |
| "loss": 0.6469, | |
| "mean_token_accuracy": 0.8012609481811523, | |
| "num_tokens": 26165286.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.2783083219645293, | |
| "grad_norm": 0.3884773043904409, | |
| "learning_rate": 1.8308078967080547e-05, | |
| "loss": 0.6453, | |
| "mean_token_accuracy": 0.802044153213501, | |
| "num_tokens": 26688026.0, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.2837653478854025, | |
| "grad_norm": 0.3221640246981774, | |
| "learning_rate": 1.8200610852841913e-05, | |
| "loss": 0.6447, | |
| "mean_token_accuracy": 0.8017777651548386, | |
| "num_tokens": 27211433.0, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.2892223738062756, | |
| "grad_norm": 0.35950232000512794, | |
| "learning_rate": 1.8090169943749477e-05, | |
| "loss": 0.6393, | |
| "mean_token_accuracy": 0.8036545515060425, | |
| "num_tokens": 27735386.0, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.2946793997271487, | |
| "grad_norm": 0.3336227394165688, | |
| "learning_rate": 1.7976796275621556e-05, | |
| "loss": 0.6532, | |
| "mean_token_accuracy": 0.7994280308485031, | |
| "num_tokens": 28259425.0, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.30013642564802184, | |
| "grad_norm": 0.3159155930753706, | |
| "learning_rate": 1.7860530947427878e-05, | |
| "loss": 0.6367, | |
| "mean_token_accuracy": 0.8045277893543243, | |
| "num_tokens": 28783214.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.30559345156889495, | |
| "grad_norm": 0.33863627132547003, | |
| "learning_rate": 1.7741416106390828e-05, | |
| "loss": 0.6447, | |
| "mean_token_accuracy": 0.8009888529777527, | |
| "num_tokens": 29306632.0, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.31105047748976805, | |
| "grad_norm": 0.32414058492274905, | |
| "learning_rate": 1.761949493270671e-05, | |
| "loss": 0.6264, | |
| "mean_token_accuracy": 0.8071126639842987, | |
| "num_tokens": 29829363.0, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.3165075034106412, | |
| "grad_norm": 0.343900605873189, | |
| "learning_rate": 1.7494811623892543e-05, | |
| "loss": 0.6278, | |
| "mean_token_accuracy": 0.806527853012085, | |
| "num_tokens": 30352094.0, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.3219645293315143, | |
| "grad_norm": 0.36999037823632697, | |
| "learning_rate": 1.736741137876405e-05, | |
| "loss": 0.6284, | |
| "mean_token_accuracy": 0.8057119399309158, | |
| "num_tokens": 30875328.0, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.3274215552523875, | |
| "grad_norm": 0.3264916869231787, | |
| "learning_rate": 1.72373403810507e-05, | |
| "loss": 0.6335, | |
| "mean_token_accuracy": 0.8045472204685211, | |
| "num_tokens": 31399221.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.3328785811732606, | |
| "grad_norm": 0.4782591852354962, | |
| "learning_rate": 1.710464578265369e-05, | |
| "loss": 0.649, | |
| "mean_token_accuracy": 0.8001821935176849, | |
| "num_tokens": 31923238.0, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.3383356070941337, | |
| "grad_norm": 0.34512671491394814, | |
| "learning_rate": 1.696937568655294e-05, | |
| "loss": 0.6343, | |
| "mean_token_accuracy": 0.8047307133674622, | |
| "num_tokens": 32446843.0, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.34379263301500684, | |
| "grad_norm": 0.3795273366749496, | |
| "learning_rate": 1.6831579129369347e-05, | |
| "loss": 0.6232, | |
| "mean_token_accuracy": 0.8063706010580063, | |
| "num_tokens": 32970843.0, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.34924965893587995, | |
| "grad_norm": 0.37143998348188484, | |
| "learning_rate": 1.6691306063588583e-05, | |
| "loss": 0.6303, | |
| "mean_token_accuracy": 0.8047699332237244, | |
| "num_tokens": 33494468.0, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.35470668485675305, | |
| "grad_norm": 0.399010736777414, | |
| "learning_rate": 1.6548607339452853e-05, | |
| "loss": 0.6098, | |
| "mean_token_accuracy": 0.8112862259149551, | |
| "num_tokens": 34018179.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.3601637107776262, | |
| "grad_norm": 0.41886782643797665, | |
| "learning_rate": 1.6403534686527223e-05, | |
| "loss": 0.638, | |
| "mean_token_accuracy": 0.8040533065795898, | |
| "num_tokens": 34539809.0, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.3656207366984993, | |
| "grad_norm": 0.37173400377826826, | |
| "learning_rate": 1.6256140694947217e-05, | |
| "loss": 0.6198, | |
| "mean_token_accuracy": 0.8078815788030624, | |
| "num_tokens": 35063685.0, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.3710777626193724, | |
| "grad_norm": 0.37848673780253644, | |
| "learning_rate": 1.6106478796354382e-05, | |
| "loss": 0.6357, | |
| "mean_token_accuracy": 0.8034750819206238, | |
| "num_tokens": 35587259.0, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.3765347885402456, | |
| "grad_norm": 0.40573394210222646, | |
| "learning_rate": 1.595460324452688e-05, | |
| "loss": 0.6291, | |
| "mean_token_accuracy": 0.8048963695764542, | |
| "num_tokens": 36110522.0, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.3819918144611187, | |
| "grad_norm": 0.32059584094834276, | |
| "learning_rate": 1.5800569095711983e-05, | |
| "loss": 0.6284, | |
| "mean_token_accuracy": 0.8049021810293198, | |
| "num_tokens": 36634670.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.3874488403819918, | |
| "grad_norm": 0.38182900761291677, | |
| "learning_rate": 1.5644432188667695e-05, | |
| "loss": 0.6406, | |
| "mean_token_accuracy": 0.8030418157577515, | |
| "num_tokens": 37157066.0, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.39290586630286495, | |
| "grad_norm": 0.3792868667331441, | |
| "learning_rate": 1.5486249124420702e-05, | |
| "loss": 0.6216, | |
| "mean_token_accuracy": 0.8081106394529343, | |
| "num_tokens": 37680580.0, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.39836289222373805, | |
| "grad_norm": 0.3505388571032621, | |
| "learning_rate": 1.5326077245747998e-05, | |
| "loss": 0.6417, | |
| "mean_token_accuracy": 0.8015989065170288, | |
| "num_tokens": 38204191.0, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.4038199181446112, | |
| "grad_norm": 0.34629977214808816, | |
| "learning_rate": 1.5163974616389621e-05, | |
| "loss": 0.6119, | |
| "mean_token_accuracy": 0.8100632429122925, | |
| "num_tokens": 38727916.0, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.4092769440654843, | |
| "grad_norm": 0.3432985248757669, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 0.6189, | |
| "mean_token_accuracy": 0.8073546588420868, | |
| "num_tokens": 39251672.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.4147339699863574, | |
| "grad_norm": 0.34896847344460213, | |
| "learning_rate": 1.4834212838845639e-05, | |
| "loss": 0.6251, | |
| "mean_token_accuracy": 0.8063764572143555, | |
| "num_tokens": 39775687.0, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.4201909959072306, | |
| "grad_norm": 0.3625496780454411, | |
| "learning_rate": 1.4666673232256738e-05, | |
| "loss": 0.6328, | |
| "mean_token_accuracy": 0.8043248653411865, | |
| "num_tokens": 40298660.0, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.4256480218281037, | |
| "grad_norm": 0.3532777855472362, | |
| "learning_rate": 1.449744191484066e-05, | |
| "loss": 0.6409, | |
| "mean_token_accuracy": 0.8027763664722443, | |
| "num_tokens": 40822948.0, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.4311050477489768, | |
| "grad_norm": 0.35365330484423485, | |
| "learning_rate": 1.4326580234465084e-05, | |
| "loss": 0.626, | |
| "mean_token_accuracy": 0.8057489842176437, | |
| "num_tokens": 41345922.0, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.43656207366984995, | |
| "grad_norm": 0.3349393977670838, | |
| "learning_rate": 1.4154150130018867e-05, | |
| "loss": 0.6175, | |
| "mean_token_accuracy": 0.808809220790863, | |
| "num_tokens": 41869207.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.44201909959072305, | |
| "grad_norm": 0.35933628997277556, | |
| "learning_rate": 1.3980214108958626e-05, | |
| "loss": 0.6277, | |
| "mean_token_accuracy": 0.8056986331939697, | |
| "num_tokens": 42393021.0, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.44747612551159616, | |
| "grad_norm": 0.33096454453402563, | |
| "learning_rate": 1.380483522464923e-05, | |
| "loss": 0.6282, | |
| "mean_token_accuracy": 0.8058310747146606, | |
| "num_tokens": 42916182.0, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.4529331514324693, | |
| "grad_norm": 0.30478624197164905, | |
| "learning_rate": 1.362807705350641e-05, | |
| "loss": 0.622, | |
| "mean_token_accuracy": 0.8063958883285522, | |
| "num_tokens": 43438714.0, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.4583901773533424, | |
| "grad_norm": 0.3501335389226804, | |
| "learning_rate": 1.3450003671949707e-05, | |
| "loss": 0.6316, | |
| "mean_token_accuracy": 0.8037978112697601, | |
| "num_tokens": 43963002.0, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.4638472032742155, | |
| "grad_norm": 0.3050292683152368, | |
| "learning_rate": 1.3270679633174219e-05, | |
| "loss": 0.6313, | |
| "mean_token_accuracy": 0.8047272562980652, | |
| "num_tokens": 44486140.0, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.4693042291950887, | |
| "grad_norm": 0.37055749603164684, | |
| "learning_rate": 1.3090169943749475e-05, | |
| "loss": 0.6207, | |
| "mean_token_accuracy": 0.8078664541244507, | |
| "num_tokens": 45009734.0, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.4747612551159618, | |
| "grad_norm": 0.2984594081324396, | |
| "learning_rate": 1.2908540040053992e-05, | |
| "loss": 0.6432, | |
| "mean_token_accuracy": 0.8011371046304703, | |
| "num_tokens": 45532685.0, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.48021828103683495, | |
| "grad_norm": 0.3800002310817578, | |
| "learning_rate": 1.2725855764553981e-05, | |
| "loss": 0.618, | |
| "mean_token_accuracy": 0.8085715621709824, | |
| "num_tokens": 46056685.0, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.48567530695770805, | |
| "grad_norm": 0.330018519060922, | |
| "learning_rate": 1.2542183341934873e-05, | |
| "loss": 0.6319, | |
| "mean_token_accuracy": 0.8046033978462219, | |
| "num_tokens": 46580056.0, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.49113233287858116, | |
| "grad_norm": 0.3375380842374477, | |
| "learning_rate": 1.2357589355094275e-05, | |
| "loss": 0.6208, | |
| "mean_token_accuracy": 0.8076395243406296, | |
| "num_tokens": 47103434.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.4965893587994543, | |
| "grad_norm": 0.3164441239625524, | |
| "learning_rate": 1.217214072100508e-05, | |
| "loss": 0.6184, | |
| "mean_token_accuracy": 0.8088521063327789, | |
| "num_tokens": 47625849.0, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.5020463847203275, | |
| "grad_norm": 0.30994257381808427, | |
| "learning_rate": 1.1985904666457455e-05, | |
| "loss": 0.6137, | |
| "mean_token_accuracy": 0.8092475831508636, | |
| "num_tokens": 48149477.0, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.5075034106412005, | |
| "grad_norm": 0.32961807061620957, | |
| "learning_rate": 1.179894870368854e-05, | |
| "loss": 0.6144, | |
| "mean_token_accuracy": 0.8092338591814041, | |
| "num_tokens": 48672968.0, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.5129604365620737, | |
| "grad_norm": 0.2954751932341951, | |
| "learning_rate": 1.1611340605908643e-05, | |
| "loss": 0.6293, | |
| "mean_token_accuracy": 0.804235503077507, | |
| "num_tokens": 49196609.0, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.5184174624829468, | |
| "grad_norm": 0.3073584261153331, | |
| "learning_rate": 1.1423148382732854e-05, | |
| "loss": 0.6073, | |
| "mean_token_accuracy": 0.8108531385660172, | |
| "num_tokens": 49720351.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.5238744884038199, | |
| "grad_norm": 0.31592897133822806, | |
| "learning_rate": 1.1234440255526948e-05, | |
| "loss": 0.6171, | |
| "mean_token_accuracy": 0.8081915378570557, | |
| "num_tokens": 50243733.0, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.529331514324693, | |
| "grad_norm": 0.34742265287757973, | |
| "learning_rate": 1.1045284632676535e-05, | |
| "loss": 0.6318, | |
| "mean_token_accuracy": 0.8044733256101608, | |
| "num_tokens": 50767754.0, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.5347885402455662, | |
| "grad_norm": 0.34110294551441445, | |
| "learning_rate": 1.08557500847884e-05, | |
| "loss": 0.6188, | |
| "mean_token_accuracy": 0.8076249808073044, | |
| "num_tokens": 51291988.0, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.5402455661664393, | |
| "grad_norm": 0.32189183372762564, | |
| "learning_rate": 1.066590531983304e-05, | |
| "loss": 0.6152, | |
| "mean_token_accuracy": 0.8090378940105438, | |
| "num_tokens": 51815310.0, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.5457025920873124, | |
| "grad_norm": 0.29768781279014606, | |
| "learning_rate": 1.0475819158237426e-05, | |
| "loss": 0.6247, | |
| "mean_token_accuracy": 0.806323915719986, | |
| "num_tokens": 52338013.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.5511596180081856, | |
| "grad_norm": 0.33743263078601365, | |
| "learning_rate": 1.0285560507936962e-05, | |
| "loss": 0.6294, | |
| "mean_token_accuracy": 0.8047656267881393, | |
| "num_tokens": 52860936.0, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.5566166439290586, | |
| "grad_norm": 0.30513251377145567, | |
| "learning_rate": 1.0095198339395769e-05, | |
| "loss": 0.6136, | |
| "mean_token_accuracy": 0.8092543631792068, | |
| "num_tokens": 53383863.0, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.5620736698499318, | |
| "grad_norm": 0.2822146986279436, | |
| "learning_rate": 9.904801660604234e-06, | |
| "loss": 0.6096, | |
| "mean_token_accuracy": 0.8101857900619507, | |
| "num_tokens": 53906296.0, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.567530695770805, | |
| "grad_norm": 0.2846732632438344, | |
| "learning_rate": 9.71443949206304e-06, | |
| "loss": 0.6145, | |
| "mean_token_accuracy": 0.8090496212244034, | |
| "num_tokens": 54429769.0, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.572987721691678, | |
| "grad_norm": 0.29261503618482076, | |
| "learning_rate": 9.524180841762577e-06, | |
| "loss": 0.6077, | |
| "mean_token_accuracy": 0.8109816312789917, | |
| "num_tokens": 54953622.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.5784447476125512, | |
| "grad_norm": 0.29743888731648277, | |
| "learning_rate": 9.334094680166962e-06, | |
| "loss": 0.6254, | |
| "mean_token_accuracy": 0.8057630807161331, | |
| "num_tokens": 55477635.0, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.5839017735334243, | |
| "grad_norm": 0.313788538568538, | |
| "learning_rate": 9.144249915211605e-06, | |
| "loss": 0.5959, | |
| "mean_token_accuracy": 0.8136271983385086, | |
| "num_tokens": 56000074.0, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.5893587994542974, | |
| "grad_norm": 0.28533146520527214, | |
| "learning_rate": 8.954715367323468e-06, | |
| "loss": 0.6167, | |
| "mean_token_accuracy": 0.8088293522596359, | |
| "num_tokens": 56522578.0, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.5948158253751705, | |
| "grad_norm": 0.30428085575192637, | |
| "learning_rate": 8.765559744473054e-06, | |
| "loss": 0.6113, | |
| "mean_token_accuracy": 0.8102127313613892, | |
| "num_tokens": 57046143.0, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.6002728512960437, | |
| "grad_norm": 0.2816148674353229, | |
| "learning_rate": 8.576851617267151e-06, | |
| "loss": 0.6024, | |
| "mean_token_accuracy": 0.8121312856674194, | |
| "num_tokens": 57569414.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.6057298772169167, | |
| "grad_norm": 0.2745800257475884, | |
| "learning_rate": 8.388659394091362e-06, | |
| "loss": 0.604, | |
| "mean_token_accuracy": 0.8116618692874908, | |
| "num_tokens": 58093133.0, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.6111869031377899, | |
| "grad_norm": 0.2852956217506078, | |
| "learning_rate": 8.201051296311462e-06, | |
| "loss": 0.6121, | |
| "mean_token_accuracy": 0.8092280626296997, | |
| "num_tokens": 58616445.0, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.616643929058663, | |
| "grad_norm": 0.3028924823430585, | |
| "learning_rate": 8.014095333542548e-06, | |
| "loss": 0.6277, | |
| "mean_token_accuracy": 0.8050173074007034, | |
| "num_tokens": 59140029.0, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.6221009549795361, | |
| "grad_norm": 0.2878705074568443, | |
| "learning_rate": 7.827859278994924e-06, | |
| "loss": 0.6183, | |
| "mean_token_accuracy": 0.8077353686094284, | |
| "num_tokens": 59663111.0, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.6275579809004093, | |
| "grad_norm": 0.2915197799998971, | |
| "learning_rate": 7.642410644905726e-06, | |
| "loss": 0.6055, | |
| "mean_token_accuracy": 0.8112288415431976, | |
| "num_tokens": 60187141.0, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.6330150068212824, | |
| "grad_norm": 0.28904450320434766, | |
| "learning_rate": 7.4578166580651335e-06, | |
| "loss": 0.5984, | |
| "mean_token_accuracy": 0.8130005896091461, | |
| "num_tokens": 60711004.0, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.6384720327421555, | |
| "grad_norm": 0.28028896566316097, | |
| "learning_rate": 7.274144235446024e-06, | |
| "loss": 0.6186, | |
| "mean_token_accuracy": 0.807890847325325, | |
| "num_tokens": 61233923.0, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.6439290586630286, | |
| "grad_norm": 0.2913180494222617, | |
| "learning_rate": 7.0914599599460095e-06, | |
| "loss": 0.6136, | |
| "mean_token_accuracy": 0.8088892549276352, | |
| "num_tokens": 61758088.0, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.6493860845839018, | |
| "grad_norm": 0.30378873402714074, | |
| "learning_rate": 6.909830056250527e-06, | |
| "loss": 0.6079, | |
| "mean_token_accuracy": 0.810408428311348, | |
| "num_tokens": 62281964.0, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.654843110504775, | |
| "grad_norm": 0.2805017521562141, | |
| "learning_rate": 6.729320366825785e-06, | |
| "loss": 0.6015, | |
| "mean_token_accuracy": 0.8118923008441925, | |
| "num_tokens": 62805421.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.660300136425648, | |
| "grad_norm": 0.2900380570316718, | |
| "learning_rate": 6.549996328050296e-06, | |
| "loss": 0.6162, | |
| "mean_token_accuracy": 0.8080224990844727, | |
| "num_tokens": 63327512.0, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.6657571623465212, | |
| "grad_norm": 0.28161953335473705, | |
| "learning_rate": 6.3719229464935915e-06, | |
| "loss": 0.611, | |
| "mean_token_accuracy": 0.8101419806480408, | |
| "num_tokens": 63851052.0, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.6712141882673943, | |
| "grad_norm": 0.27911943019979396, | |
| "learning_rate": 6.19516477535077e-06, | |
| "loss": 0.6144, | |
| "mean_token_accuracy": 0.8090348690748215, | |
| "num_tokens": 64373976.0, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.6766712141882674, | |
| "grad_norm": 0.28869189523406197, | |
| "learning_rate": 6.019785891041381e-06, | |
| "loss": 0.6027, | |
| "mean_token_accuracy": 0.8112694770097733, | |
| "num_tokens": 64897673.0, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.6821282401091405, | |
| "grad_norm": 0.3116639999006473, | |
| "learning_rate": 5.845849869981137e-06, | |
| "loss": 0.6071, | |
| "mean_token_accuracy": 0.8102435320615768, | |
| "num_tokens": 65420396.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.6875852660300137, | |
| "grad_norm": 0.2794198694229315, | |
| "learning_rate": 5.673419765534915e-06, | |
| "loss": 0.6139, | |
| "mean_token_accuracy": 0.8096896409988403, | |
| "num_tokens": 65944684.0, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.6930422919508867, | |
| "grad_norm": 0.29207321018083127, | |
| "learning_rate": 5.502558085159344e-06, | |
| "loss": 0.6134, | |
| "mean_token_accuracy": 0.8090117424726486, | |
| "num_tokens": 66467447.0, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.6984993178717599, | |
| "grad_norm": 0.29215460275378646, | |
| "learning_rate": 5.333326767743263e-06, | |
| "loss": 0.6035, | |
| "mean_token_accuracy": 0.8116814643144608, | |
| "num_tokens": 66990954.0, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.703956343792633, | |
| "grad_norm": 0.30182577234260344, | |
| "learning_rate": 5.165787161154361e-06, | |
| "loss": 0.6198, | |
| "mean_token_accuracy": 0.8080471009016037, | |
| "num_tokens": 67513914.0, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.7094133697135061, | |
| "grad_norm": 0.28575689674009214, | |
| "learning_rate": 5.000000000000003e-06, | |
| "loss": 0.6156, | |
| "mean_token_accuracy": 0.8082942962646484, | |
| "num_tokens": 68036824.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.7148703956343793, | |
| "grad_norm": 0.28922672322466253, | |
| "learning_rate": 4.836025383610382e-06, | |
| "loss": 0.6003, | |
| "mean_token_accuracy": 0.8127593100070953, | |
| "num_tokens": 68559651.0, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.7203274215552524, | |
| "grad_norm": 0.298101675623923, | |
| "learning_rate": 4.673922754252001e-06, | |
| "loss": 0.6168, | |
| "mean_token_accuracy": 0.8083733916282654, | |
| "num_tokens": 69082066.0, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.7257844474761255, | |
| "grad_norm": 0.2744186495019478, | |
| "learning_rate": 4.513750875579303e-06, | |
| "loss": 0.6054, | |
| "mean_token_accuracy": 0.8108874261379242, | |
| "num_tokens": 69604652.0, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.7312414733969986, | |
| "grad_norm": 0.2830493473939992, | |
| "learning_rate": 4.355567811332311e-06, | |
| "loss": 0.6124, | |
| "mean_token_accuracy": 0.8090694695711136, | |
| "num_tokens": 70128721.0, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.7366984993178718, | |
| "grad_norm": 0.26452385051400257, | |
| "learning_rate": 4.19943090428802e-06, | |
| "loss": 0.609, | |
| "mean_token_accuracy": 0.8103781342506409, | |
| "num_tokens": 70651715.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.7421555252387448, | |
| "grad_norm": 0.302575355186892, | |
| "learning_rate": 4.045396755473121e-06, | |
| "loss": 0.6155, | |
| "mean_token_accuracy": 0.8087055832147598, | |
| "num_tokens": 71175451.0, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.747612551159618, | |
| "grad_norm": 0.29021714961394307, | |
| "learning_rate": 3.893521203645618e-06, | |
| "loss": 0.6107, | |
| "mean_token_accuracy": 0.8100896179676056, | |
| "num_tokens": 71699274.0, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.7530695770804912, | |
| "grad_norm": 0.26530999183379256, | |
| "learning_rate": 3.743859305052785e-06, | |
| "loss": 0.6012, | |
| "mean_token_accuracy": 0.8125804513692856, | |
| "num_tokens": 72221747.0, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.7585266030013642, | |
| "grad_norm": 0.2664134356854169, | |
| "learning_rate": 3.596465313472778e-06, | |
| "loss": 0.6049, | |
| "mean_token_accuracy": 0.8113019466400146, | |
| "num_tokens": 72745367.0, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.7639836289222374, | |
| "grad_norm": 0.2726636629633549, | |
| "learning_rate": 3.4513926605471504e-06, | |
| "loss": 0.6029, | |
| "mean_token_accuracy": 0.8118140697479248, | |
| "num_tokens": 73269309.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.7694406548431105, | |
| "grad_norm": 0.2823783285707402, | |
| "learning_rate": 3.308693936411421e-06, | |
| "loss": 0.5984, | |
| "mean_token_accuracy": 0.8139031380414963, | |
| "num_tokens": 73793330.0, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.7748976807639836, | |
| "grad_norm": 0.2842494431708805, | |
| "learning_rate": 3.1684208706306572e-06, | |
| "loss": 0.6038, | |
| "mean_token_accuracy": 0.811387911438942, | |
| "num_tokens": 74315676.0, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.7803547066848567, | |
| "grad_norm": 0.27140851078870026, | |
| "learning_rate": 3.0306243134470668e-06, | |
| "loss": 0.6013, | |
| "mean_token_accuracy": 0.8124092221260071, | |
| "num_tokens": 74839637.0, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.7858117326057299, | |
| "grad_norm": 0.2798670919794706, | |
| "learning_rate": 2.8953542173463133e-06, | |
| "loss": 0.6106, | |
| "mean_token_accuracy": 0.8100556433200836, | |
| "num_tokens": 75363497.0, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.791268758526603, | |
| "grad_norm": 0.28033902054535437, | |
| "learning_rate": 2.7626596189492983e-06, | |
| "loss": 0.6027, | |
| "mean_token_accuracy": 0.8115111291408539, | |
| "num_tokens": 75887489.0, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.7967257844474761, | |
| "grad_norm": 0.27490448958024705, | |
| "learning_rate": 2.6325886212359496e-06, | |
| "loss": 0.6182, | |
| "mean_token_accuracy": 0.8073505163192749, | |
| "num_tokens": 76409584.0, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.8021828103683493, | |
| "grad_norm": 0.24675144845595406, | |
| "learning_rate": 2.5051883761074613e-06, | |
| "loss": 0.6028, | |
| "mean_token_accuracy": 0.8121288418769836, | |
| "num_tokens": 76931524.0, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.8076398362892224, | |
| "grad_norm": 0.25519179346777965, | |
| "learning_rate": 2.380505067293293e-06, | |
| "loss": 0.6196, | |
| "mean_token_accuracy": 0.8076845556497574, | |
| "num_tokens": 77454057.0, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.8130968622100955, | |
| "grad_norm": 0.26185996250871496, | |
| "learning_rate": 2.2585838936091753e-06, | |
| "loss": 0.6017, | |
| "mean_token_accuracy": 0.8125504702329636, | |
| "num_tokens": 77977862.0, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.8185538881309686, | |
| "grad_norm": 0.26113753260747125, | |
| "learning_rate": 2.1394690525721275e-06, | |
| "loss": 0.6004, | |
| "mean_token_accuracy": 0.8124841898679733, | |
| "num_tokens": 78501396.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.8240109140518418, | |
| "grad_norm": 0.26467961910583754, | |
| "learning_rate": 2.0232037243784475e-06, | |
| "loss": 0.6119, | |
| "mean_token_accuracy": 0.8094252794981003, | |
| "num_tokens": 79024199.0, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.8294679399727148, | |
| "grad_norm": 0.2644047665450295, | |
| "learning_rate": 1.9098300562505266e-06, | |
| "loss": 0.6004, | |
| "mean_token_accuracy": 0.8127636909484863, | |
| "num_tokens": 79548077.0, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.834924965893588, | |
| "grad_norm": 0.2516602132329715, | |
| "learning_rate": 1.7993891471580894e-06, | |
| "loss": 0.6205, | |
| "mean_token_accuracy": 0.8068733364343643, | |
| "num_tokens": 80070613.0, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.8403819918144612, | |
| "grad_norm": 0.25189418048747564, | |
| "learning_rate": 1.6919210329194535e-06, | |
| "loss": 0.5925, | |
| "mean_token_accuracy": 0.81495700776577, | |
| "num_tokens": 80593271.0, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.8458390177353342, | |
| "grad_norm": 0.2541947348763928, | |
| "learning_rate": 1.587464671688187e-06, | |
| "loss": 0.6052, | |
| "mean_token_accuracy": 0.8111275136470795, | |
| "num_tokens": 81117027.0, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.8512960436562074, | |
| "grad_norm": 0.2485428118091583, | |
| "learning_rate": 1.4860579298304311e-06, | |
| "loss": 0.6027, | |
| "mean_token_accuracy": 0.8118527084589005, | |
| "num_tokens": 81639970.0, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.8567530695770805, | |
| "grad_norm": 0.2558642894347357, | |
| "learning_rate": 1.3877375681979944e-06, | |
| "loss": 0.6179, | |
| "mean_token_accuracy": 0.8081785440444946, | |
| "num_tokens": 82164220.0, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.8622100954979536, | |
| "grad_norm": 0.2616239161200991, | |
| "learning_rate": 1.2925392288022299e-06, | |
| "loss": 0.6006, | |
| "mean_token_accuracy": 0.8130854815244675, | |
| "num_tokens": 82687126.0, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.8676671214188267, | |
| "grad_norm": 0.2503849676832415, | |
| "learning_rate": 1.2004974218934695e-06, | |
| "loss": 0.6121, | |
| "mean_token_accuracy": 0.8090514242649078, | |
| "num_tokens": 83211082.0, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.8731241473396999, | |
| "grad_norm": 0.25569881744843687, | |
| "learning_rate": 1.1116455134507665e-06, | |
| "loss": 0.5978, | |
| "mean_token_accuracy": 0.8128385543823242, | |
| "num_tokens": 83734481.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.878581173260573, | |
| "grad_norm": 0.258101439807178, | |
| "learning_rate": 1.0260157130864178e-06, | |
| "loss": 0.5997, | |
| "mean_token_accuracy": 0.8123024553060532, | |
| "num_tokens": 84258601.0, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.8840381991814461, | |
| "grad_norm": 0.25927293039307325, | |
| "learning_rate": 9.436390623696911e-07, | |
| "loss": 0.6124, | |
| "mean_token_accuracy": 0.8082059770822525, | |
| "num_tokens": 84781078.0, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.8894952251023193, | |
| "grad_norm": 0.2526035772284644, | |
| "learning_rate": 8.645454235739903e-07, | |
| "loss": 0.5948, | |
| "mean_token_accuracy": 0.8139047026634216, | |
| "num_tokens": 85303323.0, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.8949522510231923, | |
| "grad_norm": 0.24869120811837703, | |
| "learning_rate": 7.887634688515e-07, | |
| "loss": 0.6007, | |
| "mean_token_accuracy": 0.8121525943279266, | |
| "num_tokens": 85826587.0, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.9004092769440655, | |
| "grad_norm": 0.2585783712565244, | |
| "learning_rate": 7.163206698392744e-07, | |
| "loss": 0.6067, | |
| "mean_token_accuracy": 0.8101864755153656, | |
| "num_tokens": 86348762.0, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.9058663028649386, | |
| "grad_norm": 0.2518620468900349, | |
| "learning_rate": 6.472432877005341e-07, | |
| "loss": 0.5887, | |
| "mean_token_accuracy": 0.8158304989337921, | |
| "num_tokens": 86871300.0, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.9113233287858117, | |
| "grad_norm": 0.244004632190641, | |
| "learning_rate": 5.815563636047539e-07, | |
| "loss": 0.6072, | |
| "mean_token_accuracy": 0.810369223356247, | |
| "num_tokens": 87394281.0, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.9167803547066848, | |
| "grad_norm": 0.24445608373025496, | |
| "learning_rate": 5.192837096500058e-07, | |
| "loss": 0.5996, | |
| "mean_token_accuracy": 0.8120895475149155, | |
| "num_tokens": 87918100.0, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.922237380627558, | |
| "grad_norm": 0.2513116743689248, | |
| "learning_rate": 4.6044790023087373e-07, | |
| "loss": 0.6019, | |
| "mean_token_accuracy": 0.812640592455864, | |
| "num_tokens": 88441158.0, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.927694406548431, | |
| "grad_norm": 0.24582299262926002, | |
| "learning_rate": 4.0507026385502747e-07, | |
| "loss": 0.6141, | |
| "mean_token_accuracy": 0.8093185424804688, | |
| "num_tokens": 88964881.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.9331514324693042, | |
| "grad_norm": 0.24125628809062805, | |
| "learning_rate": 3.531708754114438e-07, | |
| "loss": 0.5934, | |
| "mean_token_accuracy": 0.8144457191228867, | |
| "num_tokens": 89487871.0, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.9386084583901774, | |
| "grad_norm": 0.24933232718734447, | |
| "learning_rate": 3.0476854889308737e-07, | |
| "loss": 0.6121, | |
| "mean_token_accuracy": 0.8091763854026794, | |
| "num_tokens": 90011304.0, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.9440654843110505, | |
| "grad_norm": 0.24998307613649684, | |
| "learning_rate": 2.5988083057666534e-07, | |
| "loss": 0.6128, | |
| "mean_token_accuracy": 0.8095022439956665, | |
| "num_tokens": 90535512.0, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.9495225102319236, | |
| "grad_norm": 0.2603433846994922, | |
| "learning_rate": 2.1852399266194312e-07, | |
| "loss": 0.6096, | |
| "mean_token_accuracy": 0.8097108900547028, | |
| "num_tokens": 91057921.0, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.9549795361527967, | |
| "grad_norm": 0.2472084519543789, | |
| "learning_rate": 1.8071302737293294e-07, | |
| "loss": 0.6117, | |
| "mean_token_accuracy": 0.8093229830265045, | |
| "num_tokens": 91581472.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.9604365620736699, | |
| "grad_norm": 0.24695647352252775, | |
| "learning_rate": 1.464616415230702e-07, | |
| "loss": 0.6055, | |
| "mean_token_accuracy": 0.810840904712677, | |
| "num_tokens": 92105494.0, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.965893587994543, | |
| "grad_norm": 0.24599551241247092, | |
| "learning_rate": 1.1578225154637579e-07, | |
| "loss": 0.6038, | |
| "mean_token_accuracy": 0.8120936304330826, | |
| "num_tokens": 92629136.0, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.9713506139154161, | |
| "grad_norm": 0.25292592350349663, | |
| "learning_rate": 8.868597899638897e-08, | |
| "loss": 0.5969, | |
| "mean_token_accuracy": 0.813206359744072, | |
| "num_tokens": 93153344.0, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.9768076398362893, | |
| "grad_norm": 0.24220493608697735, | |
| "learning_rate": 6.51826465144978e-08, | |
| "loss": 0.6064, | |
| "mean_token_accuracy": 0.8107435554265976, | |
| "num_tokens": 93677179.0, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.9822646657571623, | |
| "grad_norm": 0.251125383266427, | |
| "learning_rate": 4.528077426915412e-08, | |
| "loss": 0.6135, | |
| "mean_token_accuracy": 0.8094018846750259, | |
| "num_tokens": 94199742.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.9877216916780355, | |
| "grad_norm": 0.24330075967604267, | |
| "learning_rate": 2.898757686722542e-08, | |
| "loss": 0.6076, | |
| "mean_token_accuracy": 0.8115710318088531, | |
| "num_tokens": 94723291.0, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.9931787175989086, | |
| "grad_norm": 0.25502156218936445, | |
| "learning_rate": 1.630896073864352e-08, | |
| "loss": 0.6099, | |
| "mean_token_accuracy": 0.8100082129240036, | |
| "num_tokens": 95247067.0, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.9986357435197817, | |
| "grad_norm": 0.24338031531558027, | |
| "learning_rate": 7.2495219952639636e-09, | |
| "loss": 0.5994, | |
| "mean_token_accuracy": 0.8127514272928238, | |
| "num_tokens": 95771253.0, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.24338031531558027, | |
| "learning_rate": 1.8125447647421302e-09, | |
| "loss": 0.5996, | |
| "mean_token_accuracy": 0.8128812313079834, | |
| "num_tokens": 95902052.0, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 184, | |
| "total_flos": 83412022984704.0, | |
| "train_loss": 0.0, | |
| "train_runtime": 0.0447, | |
| "train_samples_per_second": 2097588.378, | |
| "train_steps_per_second": 4117.613 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 184, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 83412022984704.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
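
The object above follows the layout that the Hugging Face `Trainer` writes to `trainer_state.json`: a `log_history` array of per-step records (`step`, `loss`, `learning_rate`, `grad_norm`, `mean_token_accuracy`, `num_tokens`) followed by a final summary entry and run-level fields such as `max_steps` and `train_batch_size`. Below is a minimal sketch of how one might load and plot this log; the file name `trainer_state.json` and the use of `matplotlib` are assumptions for illustration, and the field names are taken directly from the records shown above.

```python
# Minimal sketch: plot training loss and learning rate from a trainer-state log.
# Assumptions: the JSON above is saved as "trainer_state.json" (hypothetical path)
# and matplotlib is available for plotting.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only per-step records that carry a "loss" value; the final summary
# entry reports aggregate fields (train_loss, train_runtime, ...) instead.
records = [r for r in state["log_history"] if "loss" in r]

steps = [r["step"] for r in records]
loss = [r["loss"] for r in records]
lr = [r["learning_rate"] for r in records]

fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, loss)
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lr)
ax_lr.set_xlabel("step")
ax_lr.set_ylabel("learning rate")
fig.tight_layout()
plt.show()
```

The same filtering pattern extends to any other logged metric (for example `mean_token_accuracy` or `grad_norm`) by selecting records that contain that key.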