text2arch-llama / trainer_state.json
shivank21's picture
Upload folder using huggingface_hub
1c34c54 verified
{
"best_global_step": 9000,
"best_metric": 0.9195617437362671,
"best_model_checkpoint": "./results/checkpoint-9000",
"epoch": 4.757171183079974,
"eval_steps": 250,
"global_step": 9000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.026437541308658295,
"grad_norm": 43.99717712402344,
"learning_rate": 0.00034625958983852136,
"loss": 9.1282,
"mean_token_accuracy": 0.3046248774370179,
"num_tokens": 1638400.0,
"step": 50
},
{
"epoch": 0.05287508261731659,
"grad_norm": 6.144142150878906,
"learning_rate": 0.000407611186724682,
"loss": 4.215,
"mean_token_accuracy": 0.4404753165692091,
"num_tokens": 3276800.0,
"step": 100
},
{
"epoch": 0.07931262392597488,
"grad_norm": 2.289468288421631,
"learning_rate": 0.0004434995702624468,
"loss": 3.3619,
"mean_token_accuracy": 0.4969379284977913,
"num_tokens": 4915200.0,
"step": 150
},
{
"epoch": 0.10575016523463318,
"grad_norm": 1.4314253330230713,
"learning_rate": 0.0004689627836108426,
"loss": 2.3482,
"mean_token_accuracy": 0.6427779817581176,
"num_tokens": 6553600.0,
"step": 200
},
{
"epoch": 0.13218770654329148,
"grad_norm": 1.0555275678634644,
"learning_rate": 0.0004887135863147016,
"loss": 2.0093,
"step": 250
},
{
"epoch": 0.13218770654329148,
"eval_loss": 1.941351056098938,
"eval_mean_token_accuracy": 0.6896467157826616,
"eval_num_tokens": 8192000.0,
"eval_runtime": 1710.841,
"eval_samples_per_second": 4.422,
"eval_steps_per_second": 0.553,
"step": 250
},
{
"epoch": 0.15862524785194976,
"grad_norm": 1.0403132438659668,
"learning_rate": 0.0004991822047759241,
"loss": 1.8613,
"mean_token_accuracy": 0.6918438403308391,
"num_tokens": 9830400.0,
"step": 300
},
{
"epoch": 0.18506278916060806,
"grad_norm": 1.0463595390319824,
"learning_rate": 0.0004964562206956711,
"loss": 1.7529,
"mean_token_accuracy": 0.7160025656223297,
"num_tokens": 11468800.0,
"step": 350
},
{
"epoch": 0.21150033046926636,
"grad_norm": 0.8012986183166504,
"learning_rate": 0.0004937302366154182,
"loss": 1.642,
"mean_token_accuracy": 0.731451002061367,
"num_tokens": 13107200.0,
"step": 400
},
{
"epoch": 0.23793787177792466,
"grad_norm": 0.9207384586334229,
"learning_rate": 0.0004910042525351653,
"loss": 1.5717,
"mean_token_accuracy": 0.7387188410758972,
"num_tokens": 14745600.0,
"step": 450
},
{
"epoch": 0.26437541308658297,
"grad_norm": 0.6695268750190735,
"learning_rate": 0.00048827826845491225,
"loss": 1.5342,
"step": 500
},
{
"epoch": 0.26437541308658297,
"eval_loss": 1.520922064781189,
"eval_mean_token_accuracy": 0.7428315233982643,
"eval_num_tokens": 16384000.0,
"eval_runtime": 1711.3798,
"eval_samples_per_second": 4.42,
"eval_steps_per_second": 0.553,
"step": 500
},
{
"epoch": 0.29081295439524124,
"grad_norm": 0.7166313529014587,
"learning_rate": 0.0004855522843746593,
"loss": 1.5115,
"mean_token_accuracy": 0.7436290304362774,
"num_tokens": 18022400.0,
"step": 550
},
{
"epoch": 0.3172504957038995,
"grad_norm": 0.7090346217155457,
"learning_rate": 0.00048282630029440626,
"loss": 1.4558,
"mean_token_accuracy": 0.7508443316817284,
"num_tokens": 19660800.0,
"step": 600
},
{
"epoch": 0.34368803701255785,
"grad_norm": 0.8063613176345825,
"learning_rate": 0.00048010031621415335,
"loss": 1.4464,
"mean_token_accuracy": 0.7509249743819236,
"num_tokens": 21299200.0,
"step": 650
},
{
"epoch": 0.3701255783212161,
"grad_norm": 1.142205834388733,
"learning_rate": 0.0004773743321339004,
"loss": 1.3984,
"mean_token_accuracy": 0.756721040904522,
"num_tokens": 22937600.0,
"step": 700
},
{
"epoch": 0.3965631196298744,
"grad_norm": 0.8467270135879517,
"learning_rate": 0.00047464834805364736,
"loss": 1.3935,
"step": 750
},
{
"epoch": 0.3965631196298744,
"eval_loss": 1.3857780694961548,
"eval_mean_token_accuracy": 0.7577579059888142,
"eval_num_tokens": 24576000.0,
"eval_runtime": 1712.2261,
"eval_samples_per_second": 4.418,
"eval_steps_per_second": 0.552,
"step": 750
},
{
"epoch": 0.4230006609385327,
"grad_norm": 0.7939649820327759,
"learning_rate": 0.0004719223639733944,
"loss": 1.3699,
"mean_token_accuracy": 0.7578213591873646,
"num_tokens": 26214400.0,
"step": 800
},
{
"epoch": 0.449438202247191,
"grad_norm": 0.7455005645751953,
"learning_rate": 0.0004691963798931415,
"loss": 1.3693,
"mean_token_accuracy": 0.7589296215772628,
"num_tokens": 27852800.0,
"step": 850
},
{
"epoch": 0.47587574355584933,
"grad_norm": 0.7528285980224609,
"learning_rate": 0.00046647039581288846,
"loss": 1.3242,
"mean_token_accuracy": 0.7669953557848931,
"num_tokens": 29491200.0,
"step": 900
},
{
"epoch": 0.5023132848645075,
"grad_norm": 0.8056386113166809,
"learning_rate": 0.0004637444117326355,
"loss": 1.335,
"mean_token_accuracy": 0.7646432068943977,
"num_tokens": 31129600.0,
"step": 950
},
{
"epoch": 0.5287508261731659,
"grad_norm": 0.7774543166160583,
"learning_rate": 0.0004610184276523825,
"loss": 1.3286,
"step": 1000
},
{
"epoch": 0.5287508261731659,
"eval_loss": 1.3367868661880493,
"eval_mean_token_accuracy": 0.7645143720362706,
"eval_num_tokens": 32768000.0,
"eval_runtime": 1713.3241,
"eval_samples_per_second": 4.415,
"eval_steps_per_second": 0.552,
"step": 1000
},
{
"epoch": 0.5551883674818242,
"grad_norm": 1.0932977199554443,
"learning_rate": 0.00045829244357212956,
"loss": 1.3357,
"mean_token_accuracy": 0.7644849714636802,
"num_tokens": 34406400.0,
"step": 1050
},
{
"epoch": 0.5816259087904825,
"grad_norm": 0.9064853191375732,
"learning_rate": 0.0004555664594918766,
"loss": 1.3309,
"mean_token_accuracy": 0.7655669602751732,
"num_tokens": 36044800.0,
"step": 1100
},
{
"epoch": 0.6080634500991408,
"grad_norm": 1.1222511529922485,
"learning_rate": 0.0004528404754116236,
"loss": 1.3088,
"mean_token_accuracy": 0.7684097030758857,
"num_tokens": 37683200.0,
"step": 1150
},
{
"epoch": 0.634500991407799,
"grad_norm": 0.8550713658332825,
"learning_rate": 0.0004501144913313706,
"loss": 1.3121,
"mean_token_accuracy": 0.7672879993915558,
"num_tokens": 39321600.0,
"step": 1200
},
{
"epoch": 0.6609385327164574,
"grad_norm": 1.1444681882858276,
"learning_rate": 0.0004473885072511177,
"loss": 1.3124,
"step": 1250
},
{
"epoch": 0.6609385327164574,
"eval_loss": 1.3140060901641846,
"eval_mean_token_accuracy": 0.7666200130988882,
"eval_num_tokens": 40960000.0,
"eval_runtime": 1711.9779,
"eval_samples_per_second": 4.419,
"eval_steps_per_second": 0.553,
"step": 1250
},
{
"epoch": 0.6873760740251157,
"grad_norm": 0.9385507702827454,
"learning_rate": 0.0004446625231708647,
"loss": 1.299,
"mean_token_accuracy": 0.7683782380819321,
"num_tokens": 42598400.0,
"step": 1300
},
{
"epoch": 0.713813615333774,
"grad_norm": 0.9909027218818665,
"learning_rate": 0.0004419365390906117,
"loss": 1.2776,
"mean_token_accuracy": 0.7721034941077233,
"num_tokens": 44236800.0,
"step": 1350
},
{
"epoch": 0.7402511566424322,
"grad_norm": 1.0495429039001465,
"learning_rate": 0.00043921055501035873,
"loss": 1.2533,
"mean_token_accuracy": 0.7770307904481888,
"num_tokens": 45875200.0,
"step": 1400
},
{
"epoch": 0.7666886979510905,
"grad_norm": 0.9478822350502014,
"learning_rate": 0.0004364845709301058,
"loss": 1.2386,
"mean_token_accuracy": 0.7782954525947571,
"num_tokens": 47513600.0,
"step": 1450
},
{
"epoch": 0.7931262392597488,
"grad_norm": 0.9904903173446655,
"learning_rate": 0.0004337585868498528,
"loss": 1.2542,
"step": 1500
},
{
"epoch": 0.7931262392597488,
"eval_loss": 1.2644829750061035,
"eval_mean_token_accuracy": 0.7736368500027042,
"eval_num_tokens": 49152000.0,
"eval_runtime": 1711.2255,
"eval_samples_per_second": 4.421,
"eval_steps_per_second": 0.553,
"step": 1500
},
{
"epoch": 0.8195637805684072,
"grad_norm": 0.797590970993042,
"learning_rate": 0.00043103260276959983,
"loss": 1.2613,
"mean_token_accuracy": 0.7753057803213597,
"num_tokens": 50790400.0,
"step": 1550
},
{
"epoch": 0.8460013218770654,
"grad_norm": 1.1253471374511719,
"learning_rate": 0.0004283066186893468,
"loss": 1.2626,
"mean_token_accuracy": 0.7741856071352958,
"num_tokens": 52428800.0,
"step": 1600
},
{
"epoch": 0.8724388631857237,
"grad_norm": 0.9880785346031189,
"learning_rate": 0.0004255806346090939,
"loss": 1.2731,
"mean_token_accuracy": 0.7714717736840249,
"num_tokens": 54067200.0,
"step": 1650
},
{
"epoch": 0.898876404494382,
"grad_norm": 0.7777257561683655,
"learning_rate": 0.00042285465052884093,
"loss": 1.2649,
"mean_token_accuracy": 0.7737677192687988,
"num_tokens": 55705600.0,
"step": 1700
},
{
"epoch": 0.9253139458030403,
"grad_norm": 0.8958262801170349,
"learning_rate": 0.0004201286664485879,
"loss": 1.2498,
"step": 1750
},
{
"epoch": 0.9253139458030403,
"eval_loss": 1.2289273738861084,
"eval_mean_token_accuracy": 0.7793115381836639,
"eval_num_tokens": 57344000.0,
"eval_runtime": 1711.9208,
"eval_samples_per_second": 4.419,
"eval_steps_per_second": 0.553,
"step": 1750
},
{
"epoch": 0.9517514871116987,
"grad_norm": 0.9251750111579895,
"learning_rate": 0.00041740268236833495,
"loss": 1.2212,
"mean_token_accuracy": 0.7776866452395916,
"num_tokens": 58982400.0,
"step": 1800
},
{
"epoch": 0.9781890284203569,
"grad_norm": 0.8101162910461426,
"learning_rate": 0.00041467669828808203,
"loss": 1.2025,
"mean_token_accuracy": 0.7838043755292893,
"num_tokens": 60620800.0,
"step": 1850
},
{
"epoch": 1.0042300066093852,
"grad_norm": 1.5328075885772705,
"learning_rate": 0.00041200523388943414,
"loss": 1.2204,
"mean_token_accuracy": 0.7796021451804843,
"num_tokens": 62234624.0,
"step": 1900
},
{
"epoch": 1.0306675479180436,
"grad_norm": 0.9993696212768555,
"learning_rate": 0.0004092792498091811,
"loss": 1.1622,
"mean_token_accuracy": 0.7850250500440598,
"num_tokens": 63873024.0,
"step": 1950
},
{
"epoch": 1.057105089226702,
"grad_norm": 0.9176653027534485,
"learning_rate": 0.00040655326572892816,
"loss": 1.1807,
"step": 2000
},
{
"epoch": 1.057105089226702,
"eval_loss": 1.2058873176574707,
"eval_mean_token_accuracy": 0.7832688034588893,
"eval_num_tokens": 65511424.0,
"eval_runtime": 1712.3284,
"eval_samples_per_second": 4.418,
"eval_steps_per_second": 0.552,
"step": 2000
},
{
"epoch": 1.0835426305353602,
"grad_norm": 1.060483455657959,
"learning_rate": 0.00040382728164867513,
"loss": 1.1635,
"mean_token_accuracy": 0.7839937689900398,
"num_tokens": 67149824.0,
"step": 2050
},
{
"epoch": 1.1099801718440185,
"grad_norm": 1.3085092306137085,
"learning_rate": 0.0004011012975684222,
"loss": 1.1928,
"mean_token_accuracy": 0.7807007575035095,
"num_tokens": 68788224.0,
"step": 2100
},
{
"epoch": 1.1364177131526767,
"grad_norm": 0.9808939695358276,
"learning_rate": 0.00039837531348816925,
"loss": 1.1454,
"mean_token_accuracy": 0.7877211648225785,
"num_tokens": 70426624.0,
"step": 2150
},
{
"epoch": 1.162855254461335,
"grad_norm": 0.8935715556144714,
"learning_rate": 0.00039564932940791623,
"loss": 1.1514,
"mean_token_accuracy": 0.7872568437457085,
"num_tokens": 72065024.0,
"step": 2200
},
{
"epoch": 1.1892927957699935,
"grad_norm": 0.9606215357780457,
"learning_rate": 0.00039292334532766327,
"loss": 1.1569,
"step": 2250
},
{
"epoch": 1.1892927957699935,
"eval_loss": 1.1869500875473022,
"eval_mean_token_accuracy": 0.7853513486037547,
"eval_num_tokens": 73703424.0,
"eval_runtime": 1713.6219,
"eval_samples_per_second": 4.415,
"eval_steps_per_second": 0.552,
"step": 2250
},
{
"epoch": 1.2157303370786516,
"grad_norm": 1.2232052087783813,
"learning_rate": 0.00039019736124741035,
"loss": 1.1651,
"mean_token_accuracy": 0.785599644035101,
"num_tokens": 75341824.0,
"step": 2300
},
{
"epoch": 1.24216787838731,
"grad_norm": 1.2846604585647583,
"learning_rate": 0.00038747137716715733,
"loss": 1.1691,
"mean_token_accuracy": 0.7854020059108734,
"num_tokens": 76980224.0,
"step": 2350
},
{
"epoch": 1.2686054196959682,
"grad_norm": 1.0899465084075928,
"learning_rate": 0.00038474539308690437,
"loss": 1.1416,
"mean_token_accuracy": 0.7897036933898925,
"num_tokens": 78618624.0,
"step": 2400
},
{
"epoch": 1.2950429610046266,
"grad_norm": 1.0662829875946045,
"learning_rate": 0.0003820194090066514,
"loss": 1.1435,
"mean_token_accuracy": 0.7887658843398094,
"num_tokens": 80257024.0,
"step": 2450
},
{
"epoch": 1.321480502313285,
"grad_norm": 0.9844885468482971,
"learning_rate": 0.00037929342492639843,
"loss": 1.1374,
"step": 2500
},
{
"epoch": 1.321480502313285,
"eval_loss": 1.1827911138534546,
"eval_mean_token_accuracy": 0.7856577540930936,
"eval_num_tokens": 81895424.0,
"eval_runtime": 1711.6622,
"eval_samples_per_second": 4.42,
"eval_steps_per_second": 0.553,
"step": 2500
},
{
"epoch": 1.3479180436219431,
"grad_norm": 0.8380193710327148,
"learning_rate": 0.00037656744084614547,
"loss": 1.1353,
"mean_token_accuracy": 0.790248963534832,
"num_tokens": 83533824.0,
"step": 2550
},
{
"epoch": 1.3743555849306015,
"grad_norm": 0.7707766890525818,
"learning_rate": 0.0003738414567658925,
"loss": 1.1309,
"mean_token_accuracy": 0.7910679399967193,
"num_tokens": 85172224.0,
"step": 2600
},
{
"epoch": 1.4007931262392597,
"grad_norm": 1.3123962879180908,
"learning_rate": 0.0003711154726856395,
"loss": 1.119,
"mean_token_accuracy": 0.7927541556954384,
"num_tokens": 86810624.0,
"step": 2650
},
{
"epoch": 1.427230667547918,
"grad_norm": 0.9434394836425781,
"learning_rate": 0.00036838948860538656,
"loss": 1.1233,
"mean_token_accuracy": 0.7917754176259041,
"num_tokens": 88449024.0,
"step": 2700
},
{
"epoch": 1.4536682088565764,
"grad_norm": 1.0501152276992798,
"learning_rate": 0.0003656635045251336,
"loss": 1.1463,
"step": 2750
},
{
"epoch": 1.4536682088565764,
"eval_loss": 1.1497071981430054,
"eval_mean_token_accuracy": 0.7899814840981119,
"eval_num_tokens": 90087424.0,
"eval_runtime": 1712.288,
"eval_samples_per_second": 4.418,
"eval_steps_per_second": 0.552,
"step": 2750
},
{
"epoch": 1.4801057501652346,
"grad_norm": 1.1930551528930664,
"learning_rate": 0.0003629375204448806,
"loss": 1.1309,
"mean_token_accuracy": 0.7891275675594807,
"num_tokens": 91725824.0,
"step": 2800
},
{
"epoch": 1.5065432914738928,
"grad_norm": 1.3507503271102905,
"learning_rate": 0.0003602115363646276,
"loss": 1.1303,
"mean_token_accuracy": 0.7900684276223182,
"num_tokens": 93364224.0,
"step": 2850
},
{
"epoch": 1.5329808327825512,
"grad_norm": 0.935932993888855,
"learning_rate": 0.0003574855522843747,
"loss": 1.124,
"mean_token_accuracy": 0.7918371230363845,
"num_tokens": 95002624.0,
"step": 2900
},
{
"epoch": 1.5594183740912095,
"grad_norm": 1.3527334928512573,
"learning_rate": 0.0003547595682041217,
"loss": 1.1103,
"mean_token_accuracy": 0.7935648819804192,
"num_tokens": 96641024.0,
"step": 2950
},
{
"epoch": 1.585855915399868,
"grad_norm": 0.8783284425735474,
"learning_rate": 0.0003520335841238687,
"loss": 1.113,
"step": 3000
},
{
"epoch": 1.585855915399868,
"eval_loss": 1.1312052011489868,
"eval_mean_token_accuracy": 0.7928862137597913,
"eval_num_tokens": 98279424.0,
"eval_runtime": 1712.2465,
"eval_samples_per_second": 4.418,
"eval_steps_per_second": 0.552,
"step": 3000
},
{
"epoch": 1.612293456708526,
"grad_norm": 1.079725980758667,
"learning_rate": 0.00034930760004361574,
"loss": 1.107,
"mean_token_accuracy": 0.7933507452905179,
"num_tokens": 99917824.0,
"step": 3050
},
{
"epoch": 1.6387309980171842,
"grad_norm": 1.188661813735962,
"learning_rate": 0.0003465816159633628,
"loss": 1.1139,
"mean_token_accuracy": 0.7925458225607872,
"num_tokens": 101556224.0,
"step": 3100
},
{
"epoch": 1.6651685393258426,
"grad_norm": 0.983051061630249,
"learning_rate": 0.0003438556318831098,
"loss": 1.1198,
"mean_token_accuracy": 0.7924914485216141,
"num_tokens": 103194624.0,
"step": 3150
},
{
"epoch": 1.691606080634501,
"grad_norm": 0.9756836891174316,
"learning_rate": 0.00034112964780285684,
"loss": 1.1175,
"mean_token_accuracy": 0.7917662528157234,
"num_tokens": 104833024.0,
"step": 3200
},
{
"epoch": 1.7180436219431594,
"grad_norm": 1.1230757236480713,
"learning_rate": 0.0003384036637226039,
"loss": 1.0988,
"step": 3250
},
{
"epoch": 1.7180436219431594,
"eval_loss": 1.1241850852966309,
"eval_mean_token_accuracy": 0.793406566215116,
"eval_num_tokens": 106471424.0,
"eval_runtime": 1712.6064,
"eval_samples_per_second": 4.417,
"eval_steps_per_second": 0.552,
"step": 3250
},
{
"epoch": 1.7444811632518176,
"grad_norm": 1.3361942768096924,
"learning_rate": 0.0003356776796423509,
"loss": 1.0957,
"mean_token_accuracy": 0.7953433538973331,
"num_tokens": 108109824.0,
"step": 3300
},
{
"epoch": 1.7709187045604757,
"grad_norm": 0.8606221079826355,
"learning_rate": 0.00033295169556209794,
"loss": 1.072,
"mean_token_accuracy": 0.7988990727066994,
"num_tokens": 109748224.0,
"step": 3350
},
{
"epoch": 1.7973562458691341,
"grad_norm": 1.1985405683517456,
"learning_rate": 0.0003302257114818449,
"loss": 1.0964,
"mean_token_accuracy": 0.7953415229916573,
"num_tokens": 111386624.0,
"step": 3400
},
{
"epoch": 1.8237937871777925,
"grad_norm": 1.0375052690505981,
"learning_rate": 0.000327499727401592,
"loss": 1.0821,
"mean_token_accuracy": 0.7982869046926498,
"num_tokens": 113025024.0,
"step": 3450
},
{
"epoch": 1.8502313284864509,
"grad_norm": 1.037645936012268,
"learning_rate": 0.00032477374332133904,
"loss": 1.0854,
"step": 3500
},
{
"epoch": 1.8502313284864509,
"eval_loss": 1.1053054332733154,
"eval_mean_token_accuracy": 0.7964076004119051,
"eval_num_tokens": 114663424.0,
"eval_runtime": 1713.2036,
"eval_samples_per_second": 4.416,
"eval_steps_per_second": 0.552,
"step": 3500
},
{
"epoch": 1.876668869795109,
"grad_norm": 0.8609442114830017,
"learning_rate": 0.000322047759241086,
"loss": 1.0789,
"mean_token_accuracy": 0.7976255512237549,
"num_tokens": 116301824.0,
"step": 3550
},
{
"epoch": 1.9031064111037672,
"grad_norm": 0.7845131158828735,
"learning_rate": 0.00031932177516083305,
"loss": 1.1106,
"mean_token_accuracy": 0.7933571606874465,
"num_tokens": 117940224.0,
"step": 3600
},
{
"epoch": 1.9295439524124256,
"grad_norm": 0.9003056287765503,
"learning_rate": 0.00031659579108058014,
"loss": 1.0636,
"mean_token_accuracy": 0.7998753663897514,
"num_tokens": 119578624.0,
"step": 3650
},
{
"epoch": 1.955981493721084,
"grad_norm": 1.202172040939331,
"learning_rate": 0.0003138698070003271,
"loss": 1.0809,
"mean_token_accuracy": 0.7975763711333275,
"num_tokens": 121217024.0,
"step": 3700
},
{
"epoch": 1.9824190350297424,
"grad_norm": 0.6643933653831482,
"learning_rate": 0.00031114382292007415,
"loss": 1.0808,
"step": 3750
},
{
"epoch": 1.9824190350297424,
"eval_loss": 1.0950915813446045,
"eval_mean_token_accuracy": 0.7972050871833939,
"eval_num_tokens": 122855424.0,
"eval_runtime": 1713.9729,
"eval_samples_per_second": 4.414,
"eval_steps_per_second": 0.552,
"step": 3750
},
{
"epoch": 2.0084600132187704,
"grad_norm": 1.0089055299758911,
"learning_rate": 0.00030847235852142626,
"loss": 1.0434,
"mean_token_accuracy": 0.7995400524860065,
"num_tokens": 124469248.0,
"step": 3800
},
{
"epoch": 2.034897554527429,
"grad_norm": 1.8495018482208252,
"learning_rate": 0.00030574637444117324,
"loss": 1.0069,
"mean_token_accuracy": 0.8041293996572495,
"num_tokens": 126107648.0,
"step": 3850
},
{
"epoch": 2.061335095836087,
"grad_norm": 0.9792631268501282,
"learning_rate": 0.0003030203903609203,
"loss": 1.0248,
"mean_token_accuracy": 0.800488149523735,
"num_tokens": 127746048.0,
"step": 3900
},
{
"epoch": 2.0877726371447456,
"grad_norm": 1.0454398393630981,
"learning_rate": 0.00030029440628066736,
"loss": 1.0032,
"mean_token_accuracy": 0.8048930823802948,
"num_tokens": 129384448.0,
"step": 3950
},
{
"epoch": 2.114210178453404,
"grad_norm": 1.124090313911438,
"learning_rate": 0.00029756842220041434,
"loss": 1.0141,
"step": 4000
},
{
"epoch": 2.114210178453404,
"eval_loss": 1.0894951820373535,
"eval_mean_token_accuracy": 0.7976183624146604,
"eval_num_tokens": 131022848.0,
"eval_runtime": 1713.146,
"eval_samples_per_second": 4.416,
"eval_steps_per_second": 0.552,
"step": 4000
},
{
"epoch": 2.140647719762062,
"grad_norm": 1.068744421005249,
"learning_rate": 0.0002948424381201614,
"loss": 1.017,
"mean_token_accuracy": 0.8032655183970928,
"num_tokens": 132661248.0,
"step": 4050
},
{
"epoch": 2.1670852610707203,
"grad_norm": 0.6453216671943665,
"learning_rate": 0.0002921164540399084,
"loss": 1.0173,
"mean_token_accuracy": 0.8023881965875626,
"num_tokens": 134299648.0,
"step": 4100
},
{
"epoch": 2.1935228023793787,
"grad_norm": 0.8488343954086304,
"learning_rate": 0.00028939046995965544,
"loss": 1.0192,
"mean_token_accuracy": 0.8028500735759735,
"num_tokens": 135938048.0,
"step": 4150
},
{
"epoch": 2.219960343688037,
"grad_norm": 1.107086181640625,
"learning_rate": 0.00028666448587940247,
"loss": 1.0167,
"mean_token_accuracy": 0.8023680368065834,
"num_tokens": 137576448.0,
"step": 4200
},
{
"epoch": 2.2463978849966955,
"grad_norm": 0.9816263914108276,
"learning_rate": 0.0002839385017991495,
"loss": 1.026,
"step": 4250
},
{
"epoch": 2.2463978849966955,
"eval_loss": 1.070574402809143,
"eval_mean_token_accuracy": 0.8008053159738948,
"eval_num_tokens": 139214848.0,
"eval_runtime": 1713.5677,
"eval_samples_per_second": 4.415,
"eval_steps_per_second": 0.552,
"step": 4250
},
{
"epoch": 2.2728354263053534,
"grad_norm": 1.0816267728805542,
"learning_rate": 0.00028121251771889654,
"loss": 1.0271,
"mean_token_accuracy": 0.8018050470948219,
"num_tokens": 140853248.0,
"step": 4300
},
{
"epoch": 2.299272967614012,
"grad_norm": 1.1499203443527222,
"learning_rate": 0.00027848653363864357,
"loss": 1.0008,
"mean_token_accuracy": 0.8052902013063431,
"num_tokens": 142491648.0,
"step": 4350
},
{
"epoch": 2.32571050892267,
"grad_norm": 0.9058449864387512,
"learning_rate": 0.0002757605495583906,
"loss": 1.002,
"mean_token_accuracy": 0.806196848154068,
"num_tokens": 144130048.0,
"step": 4400
},
{
"epoch": 2.3521480502313286,
"grad_norm": 0.8086408972740173,
"learning_rate": 0.0002730345654781376,
"loss": 0.9968,
"mean_token_accuracy": 0.8068004646897315,
"num_tokens": 145768448.0,
"step": 4450
},
{
"epoch": 2.378585591539987,
"grad_norm": 0.9422939419746399,
"learning_rate": 0.00027030858139788467,
"loss": 0.9925,
"step": 4500
},
{
"epoch": 2.378585591539987,
"eval_loss": 1.0708719491958618,
"eval_mean_token_accuracy": 0.8005553823570872,
"eval_num_tokens": 147406848.0,
"eval_runtime": 1714.6525,
"eval_samples_per_second": 4.412,
"eval_steps_per_second": 0.552,
"step": 4500
},
{
"epoch": 2.405023132848645,
"grad_norm": 0.9562957882881165,
"learning_rate": 0.0002675825973176317,
"loss": 1.0125,
"mean_token_accuracy": 0.8054704304039478,
"num_tokens": 149045248.0,
"step": 4550
},
{
"epoch": 2.4314606741573033,
"grad_norm": 0.8408384919166565,
"learning_rate": 0.0002648566132373787,
"loss": 1.0194,
"mean_token_accuracy": 0.8027529340982437,
"num_tokens": 150683648.0,
"step": 4600
},
{
"epoch": 2.4578982154659617,
"grad_norm": 0.9756256341934204,
"learning_rate": 0.0002621306291571257,
"loss": 0.9828,
"mean_token_accuracy": 0.8083504402637481,
"num_tokens": 152322048.0,
"step": 4650
},
{
"epoch": 2.48433575677462,
"grad_norm": 1.2137442827224731,
"learning_rate": 0.0002594046450768728,
"loss": 1.0021,
"mean_token_accuracy": 0.8056860953569412,
"num_tokens": 153960448.0,
"step": 4700
},
{
"epoch": 2.5107732980832784,
"grad_norm": 0.7544079422950745,
"learning_rate": 0.0002566786609966198,
"loss": 1.0089,
"step": 4750
},
{
"epoch": 2.5107732980832784,
"eval_loss": 1.0549876689910889,
"eval_mean_token_accuracy": 0.8026504306158102,
"eval_num_tokens": 155598848.0,
"eval_runtime": 1712.6528,
"eval_samples_per_second": 4.417,
"eval_steps_per_second": 0.552,
"step": 4750
},
{
"epoch": 2.5372108393919364,
"grad_norm": 1.3571584224700928,
"learning_rate": 0.0002539526769163668,
"loss": 0.9947,
"mean_token_accuracy": 0.8055629892647267,
"num_tokens": 157237248.0,
"step": 4800
},
{
"epoch": 2.5636483807005948,
"grad_norm": 0.9661728739738464,
"learning_rate": 0.00025122669283611385,
"loss": 0.9934,
"mean_token_accuracy": 0.8067454797029495,
"num_tokens": 158875648.0,
"step": 4850
},
{
"epoch": 2.590085922009253,
"grad_norm": 0.8960219025611877,
"learning_rate": 0.0002485007087558609,
"loss": 1.0084,
"mean_token_accuracy": 0.8042240959405899,
"num_tokens": 160514048.0,
"step": 4900
},
{
"epoch": 2.6165234633179115,
"grad_norm": 0.8337807059288025,
"learning_rate": 0.0002457747246756079,
"loss": 1.0028,
"mean_token_accuracy": 0.8062023460865021,
"num_tokens": 162152448.0,
"step": 4950
},
{
"epoch": 2.64296100462657,
"grad_norm": 1.2237184047698975,
"learning_rate": 0.00024304874059535492,
"loss": 0.9907,
"step": 5000
},
{
"epoch": 2.64296100462657,
"eval_loss": 1.0443217754364014,
"eval_mean_token_accuracy": 0.8038508863706165,
"eval_num_tokens": 163790848.0,
"eval_runtime": 1716.2002,
"eval_samples_per_second": 4.408,
"eval_steps_per_second": 0.551,
"step": 5000
},
{
"epoch": 2.669398545935228,
"grad_norm": 0.7289232015609741,
"learning_rate": 0.00024032275651510195,
"loss": 1.0012,
"mean_token_accuracy": 0.8063395051658153,
"num_tokens": 165429248.0,
"step": 5050
},
{
"epoch": 2.6958360872438862,
"grad_norm": 1.0250189304351807,
"learning_rate": 0.000237596772434849,
"loss": 0.976,
"mean_token_accuracy": 0.8098240447044373,
"num_tokens": 167067648.0,
"step": 5100
},
{
"epoch": 2.7222736285525446,
"grad_norm": 1.0034643411636353,
"learning_rate": 0.00023487078835459602,
"loss": 0.9848,
"mean_token_accuracy": 0.8076087480783463,
"num_tokens": 168706048.0,
"step": 5150
},
{
"epoch": 2.748711169861203,
"grad_norm": 0.9291382431983948,
"learning_rate": 0.00023214480427434303,
"loss": 0.9757,
"mean_token_accuracy": 0.8101533487439155,
"num_tokens": 170344448.0,
"step": 5200
},
{
"epoch": 2.7751487111698614,
"grad_norm": 0.6038099527359009,
"learning_rate": 0.00022941882019409009,
"loss": 0.989,
"step": 5250
},
{
"epoch": 2.7751487111698614,
"eval_loss": 1.0298680067062378,
"eval_mean_token_accuracy": 0.8057682283584966,
"eval_num_tokens": 171982848.0,
"eval_runtime": 1715.5734,
"eval_samples_per_second": 4.41,
"eval_steps_per_second": 0.551,
"step": 5250
},
{
"epoch": 2.8015862524785193,
"grad_norm": 0.8782141804695129,
"learning_rate": 0.0002266928361138371,
"loss": 1.0017,
"mean_token_accuracy": 0.8068728642165661,
"num_tokens": 173621248.0,
"step": 5300
},
{
"epoch": 2.8280237937871777,
"grad_norm": 0.5077300667762756,
"learning_rate": 0.00022396685203358413,
"loss": 0.9759,
"mean_token_accuracy": 0.8097360721230507,
"num_tokens": 175259648.0,
"step": 5350
},
{
"epoch": 2.854461335095836,
"grad_norm": 0.571225643157959,
"learning_rate": 0.00022124086795333116,
"loss": 0.9693,
"mean_token_accuracy": 0.8103903934359551,
"num_tokens": 176898048.0,
"step": 5400
},
{
"epoch": 2.8808988764044945,
"grad_norm": 0.9907204508781433,
"learning_rate": 0.0002185148838730782,
"loss": 0.9783,
"mean_token_accuracy": 0.8095271262526512,
"num_tokens": 178536448.0,
"step": 5450
},
{
"epoch": 2.907336417713153,
"grad_norm": 1.0461844205856323,
"learning_rate": 0.0002157888997928252,
"loss": 0.9796,
"step": 5500
},
{
"epoch": 2.907336417713153,
"eval_loss": 1.0164023637771606,
"eval_mean_token_accuracy": 0.8078667395462698,
"eval_num_tokens": 180174848.0,
"eval_runtime": 1713.163,
"eval_samples_per_second": 4.416,
"eval_steps_per_second": 0.552,
"step": 5500
},
{
"epoch": 2.933773959021811,
"grad_norm": 0.5164626240730286,
"learning_rate": 0.00021306291571257226,
"loss": 0.9813,
"mean_token_accuracy": 0.8092189015448094,
"num_tokens": 181813248.0,
"step": 5550
},
{
"epoch": 2.960211500330469,
"grad_norm": 0.9014139771461487,
"learning_rate": 0.00021033693163231926,
"loss": 0.9785,
"mean_token_accuracy": 0.808426809310913,
"num_tokens": 183451648.0,
"step": 5600
},
{
"epoch": 2.9866490416391276,
"grad_norm": 1.2656482458114624,
"learning_rate": 0.0002076109475520663,
"loss": 0.9546,
"mean_token_accuracy": 0.8128830647468567,
"num_tokens": 185090048.0,
"step": 5650
},
{
"epoch": 3.012690019828156,
"grad_norm": 0.6291442513465881,
"learning_rate": 0.0002049394831534184,
"loss": 0.9425,
"mean_token_accuracy": 0.8123012103405095,
"num_tokens": 186703872.0,
"step": 5700
},
{
"epoch": 3.0391275611368145,
"grad_norm": 0.9149487614631653,
"learning_rate": 0.0002022134990731654,
"loss": 0.9036,
"step": 5750
},
{
"epoch": 3.0391275611368145,
"eval_loss": 1.010271668434143,
"eval_mean_token_accuracy": 0.8088774525463959,
"eval_num_tokens": 188342272.0,
"eval_runtime": 1714.8688,
"eval_samples_per_second": 4.411,
"eval_steps_per_second": 0.552,
"step": 5750
},
{
"epoch": 3.0655651024454724,
"grad_norm": 0.7065662741661072,
"learning_rate": 0.00019948751499291245,
"loss": 0.9032,
"mean_token_accuracy": 0.816273825019598,
"num_tokens": 189980672.0,
"step": 5800
},
{
"epoch": 3.092002643754131,
"grad_norm": 0.8670871257781982,
"learning_rate": 0.00019676153091265948,
"loss": 0.9084,
"mean_token_accuracy": 0.8148881956934929,
"num_tokens": 191619072.0,
"step": 5850
},
{
"epoch": 3.118440185062789,
"grad_norm": 0.9667902588844299,
"learning_rate": 0.0001940355468324065,
"loss": 0.8968,
"mean_token_accuracy": 0.8168530049920082,
"num_tokens": 193257472.0,
"step": 5900
},
{
"epoch": 3.1448777263714476,
"grad_norm": 0.6061888933181763,
"learning_rate": 0.00019130956275215352,
"loss": 0.9082,
"mean_token_accuracy": 0.8162536644935607,
"num_tokens": 194895872.0,
"step": 5950
},
{
"epoch": 3.1713152676801055,
"grad_norm": 0.8645080924034119,
"learning_rate": 0.00018858357867190058,
"loss": 0.9014,
"step": 6000
},
{
"epoch": 3.1713152676801055,
"eval_loss": 1.0060479640960693,
"eval_mean_token_accuracy": 0.8095184696275134,
"eval_num_tokens": 196534272.0,
"eval_runtime": 1716.7924,
"eval_samples_per_second": 4.406,
"eval_steps_per_second": 0.551,
"step": 6000
},
{
"epoch": 3.197752808988764,
"grad_norm": 0.9978011250495911,
"learning_rate": 0.00018585759459164758,
"loss": 0.8941,
"mean_token_accuracy": 0.817662510573864,
"num_tokens": 198172672.0,
"step": 6050
},
{
"epoch": 3.2241903502974223,
"grad_norm": 0.610701322555542,
"learning_rate": 0.00018313161051139462,
"loss": 0.9088,
"mean_token_accuracy": 0.8161278122663498,
"num_tokens": 199811072.0,
"step": 6100
},
{
"epoch": 3.2506278916060807,
"grad_norm": 0.592491626739502,
"learning_rate": 0.00018040562643114165,
"loss": 0.9111,
"mean_token_accuracy": 0.8152193301916122,
"num_tokens": 201449472.0,
"step": 6150
},
{
"epoch": 3.277065432914739,
"grad_norm": 0.5505239367485046,
"learning_rate": 0.00017767964235088868,
"loss": 0.894,
"mean_token_accuracy": 0.8186956241726875,
"num_tokens": 203087872.0,
"step": 6200
},
{
"epoch": 3.303502974223397,
"grad_norm": 0.6099046468734741,
"learning_rate": 0.0001749536582706357,
"loss": 0.9061,
"step": 6250
},
{
"epoch": 3.303502974223397,
"eval_loss": 0.9953573942184448,
"eval_mean_token_accuracy": 0.8111508759585294,
"eval_num_tokens": 204726272.0,
"eval_runtime": 1714.1014,
"eval_samples_per_second": 4.413,
"eval_steps_per_second": 0.552,
"step": 6250
},
{
"epoch": 3.3299405155320554,
"grad_norm": 0.6866306066513062,
"learning_rate": 0.00017222767419038275,
"loss": 0.9023,
"mean_token_accuracy": 0.8162371690571308,
"num_tokens": 206364672.0,
"step": 6300
},
{
"epoch": 3.3563780568407138,
"grad_norm": 0.6457993984222412,
"learning_rate": 0.00016950169011012976,
"loss": 0.9083,
"mean_token_accuracy": 0.8154759269952774,
"num_tokens": 208003072.0,
"step": 6350
},
{
"epoch": 3.382815598149372,
"grad_norm": 0.5925601124763489,
"learning_rate": 0.0001667757060298768,
"loss": 0.9009,
"mean_token_accuracy": 0.8165108740329743,
"num_tokens": 209641472.0,
"step": 6400
},
{
"epoch": 3.4092531394580305,
"grad_norm": 0.8631545901298523,
"learning_rate": 0.00016404972194962382,
"loss": 0.8779,
"mean_token_accuracy": 0.8210025626420975,
"num_tokens": 211279872.0,
"step": 6450
},
{
"epoch": 3.4356906807666885,
"grad_norm": 0.6113960146903992,
"learning_rate": 0.00016132373786937086,
"loss": 0.8894,
"step": 6500
},
{
"epoch": 3.4356906807666885,
"eval_loss": 0.9821568131446838,
"eval_mean_token_accuracy": 0.8126549717613809,
"eval_num_tokens": 212918272.0,
"eval_runtime": 1714.991,
"eval_samples_per_second": 4.411,
"eval_steps_per_second": 0.552,
"step": 6500
},
{
"epoch": 3.462128222075347,
"grad_norm": 0.6569721698760986,
"learning_rate": 0.00015859775378911786,
"loss": 0.8828,
"mean_token_accuracy": 0.8195891354978084,
"num_tokens": 214556672.0,
"step": 6550
},
{
"epoch": 3.4885657633840053,
"grad_norm": 0.548383891582489,
"learning_rate": 0.0001558717697088649,
"loss": 0.8922,
"mean_token_accuracy": 0.8191379508376122,
"num_tokens": 216195072.0,
"step": 6600
},
{
"epoch": 3.5150033046926636,
"grad_norm": 0.4519716799259186,
"learning_rate": 0.00015314578562861193,
"loss": 0.9047,
"mean_token_accuracy": 0.8174547863006592,
"num_tokens": 217833472.0,
"step": 6650
},
{
"epoch": 3.541440846001322,
"grad_norm": 0.4486851692199707,
"learning_rate": 0.00015041980154835896,
"loss": 0.8812,
"mean_token_accuracy": 0.8206359946727753,
"num_tokens": 219471872.0,
"step": 6700
},
{
"epoch": 3.56787838730998,
"grad_norm": 0.511616587638855,
"learning_rate": 0.00014769381746810597,
"loss": 0.8754,
"step": 6750
},
{
"epoch": 3.56787838730998,
"eval_loss": 0.9736062288284302,
"eval_mean_token_accuracy": 0.8140331042997428,
"eval_num_tokens": 221110272.0,
"eval_runtime": 1716.1615,
"eval_samples_per_second": 4.408,
"eval_steps_per_second": 0.551,
"step": 6750
},
{
"epoch": 3.5943159286186384,
"grad_norm": 0.4776919186115265,
"learning_rate": 0.00014496783338785303,
"loss": 0.8959,
"mean_token_accuracy": 0.8200723953545094,
"num_tokens": 222748672.0,
"step": 6800
},
{
"epoch": 3.6207534699272967,
"grad_norm": 0.6696462035179138,
"learning_rate": 0.00014224184930760003,
"loss": 0.8818,
"mean_token_accuracy": 0.8205584043264389,
"num_tokens": 224387072.0,
"step": 6850
},
{
"epoch": 3.647191011235955,
"grad_norm": 0.6305286884307861,
"learning_rate": 0.00013951586522734707,
"loss": 0.8918,
"mean_token_accuracy": 0.8183388301730156,
"num_tokens": 226025472.0,
"step": 6900
},
{
"epoch": 3.6736285525446135,
"grad_norm": 0.4481205344200134,
"learning_rate": 0.0001367898811470941,
"loss": 0.8689,
"mean_token_accuracy": 0.822255617082119,
"num_tokens": 227663872.0,
"step": 6950
},
{
"epoch": 3.7000660938532715,
"grad_norm": 0.5297748446464539,
"learning_rate": 0.00013406389706684113,
"loss": 0.891,
"step": 7000
},
{
"epoch": 3.7000660938532715,
"eval_loss": 0.9645546078681946,
"eval_mean_token_accuracy": 0.8154595721725681,
"eval_num_tokens": 229302272.0,
"eval_runtime": 1714.7844,
"eval_samples_per_second": 4.412,
"eval_steps_per_second": 0.552,
"step": 7000
},
{
"epoch": 3.72650363516193,
"grad_norm": 0.5066333413124084,
"learning_rate": 0.00013133791298658814,
"loss": 0.8888,
"mean_token_accuracy": 0.8194779419898987,
"num_tokens": 230940672.0,
"step": 7050
},
{
"epoch": 3.7529411764705882,
"grad_norm": 0.5374875068664551,
"learning_rate": 0.0001286119289063352,
"loss": 0.8668,
"mean_token_accuracy": 0.8228415179252625,
"num_tokens": 232579072.0,
"step": 7100
},
{
"epoch": 3.7793787177792466,
"grad_norm": 0.45081761479377747,
"learning_rate": 0.0001258859448260822,
"loss": 0.8793,
"mean_token_accuracy": 0.8205449622869492,
"num_tokens": 234217472.0,
"step": 7150
},
{
"epoch": 3.805816259087905,
"grad_norm": 0.4918268620967865,
"learning_rate": 0.00012315996074582924,
"loss": 0.8733,
"mean_token_accuracy": 0.8207551288604736,
"num_tokens": 235855872.0,
"step": 7200
},
{
"epoch": 3.832253800396563,
"grad_norm": 0.5663712024688721,
"learning_rate": 0.00012043397666557627,
"loss": 0.8701,
"step": 7250
},
{
"epoch": 3.832253800396563,
"eval_loss": 0.9548874497413635,
"eval_mean_token_accuracy": 0.8167319189418446,
"eval_num_tokens": 237494272.0,
"eval_runtime": 1714.4512,
"eval_samples_per_second": 4.412,
"eval_steps_per_second": 0.552,
"step": 7250
},
{
"epoch": 3.8586913417052213,
"grad_norm": 1.051850438117981,
"learning_rate": 0.0001177079925853233,
"loss": 0.8694,
"mean_token_accuracy": 0.8223982758820056,
"num_tokens": 239132672.0,
"step": 7300
},
{
"epoch": 3.8851288830138797,
"grad_norm": 0.4363590478897095,
"learning_rate": 0.00011498200850507034,
"loss": 0.8864,
"mean_token_accuracy": 0.8197085753083229,
"num_tokens": 240771072.0,
"step": 7350
},
{
"epoch": 3.911566424322538,
"grad_norm": 1.5718705654144287,
"learning_rate": 0.00011225602442481736,
"loss": 0.8714,
"mean_token_accuracy": 0.8220063516497612,
"num_tokens": 242409472.0,
"step": 7400
},
{
"epoch": 3.9380039656311965,
"grad_norm": 2.0182573795318604,
"learning_rate": 0.00010953004034456439,
"loss": 0.8666,
"mean_token_accuracy": 0.8229606547951698,
"num_tokens": 244047872.0,
"step": 7450
},
{
"epoch": 3.9644415069398544,
"grad_norm": 0.5944796800613403,
"learning_rate": 0.00010680405626431142,
"loss": 0.8553,
"step": 7500
},
{
"epoch": 3.9644415069398544,
"eval_loss": 0.9463370442390442,
"eval_mean_token_accuracy": 0.8175775335026594,
"eval_num_tokens": 245686272.0,
"eval_runtime": 1716.9243,
"eval_samples_per_second": 4.406,
"eval_steps_per_second": 0.551,
"step": 7500
},
{
"epoch": 3.990879048248513,
"grad_norm": 0.39286720752716064,
"learning_rate": 0.00010407807218405844,
"loss": 0.8742,
"mean_token_accuracy": 0.8231118628382683,
"num_tokens": 247324672.0,
"step": 7550
},
{
"epoch": 4.016920026437541,
"grad_norm": 0.485441118478775,
"learning_rate": 0.00010135208810380548,
"loss": 0.8161,
"mean_token_accuracy": 0.8290703360199323,
"num_tokens": 248938496.0,
"step": 7600
},
{
"epoch": 4.0433575677462,
"grad_norm": 0.6562045216560364,
"learning_rate": 9.86261040235525e-05,
"loss": 0.7963,
"mean_token_accuracy": 0.829828929901123,
"num_tokens": 250576896.0,
"step": 7650
},
{
"epoch": 4.069795109054858,
"grad_norm": 0.4619589149951935,
"learning_rate": 9.590011994329953e-05,
"loss": 0.7968,
"mean_token_accuracy": 0.829875974059105,
"num_tokens": 252215296.0,
"step": 7700
},
{
"epoch": 4.0962326503635165,
"grad_norm": 0.5542292594909668,
"learning_rate": 9.317413586304656e-05,
"loss": 0.7897,
"step": 7750
},
{
"epoch": 4.0962326503635165,
"eval_loss": 0.9498882293701172,
"eval_mean_token_accuracy": 0.8178022539136778,
"eval_num_tokens": 253853696.0,
"eval_runtime": 1713.64,
"eval_samples_per_second": 4.415,
"eval_steps_per_second": 0.552,
"step": 7750
},
{
"epoch": 4.122670191672174,
"grad_norm": 0.40135377645492554,
"learning_rate": 9.044815178279358e-05,
"loss": 0.7963,
"mean_token_accuracy": 0.8303173841536045,
"num_tokens": 255492096.0,
"step": 7800
},
{
"epoch": 4.149107732980832,
"grad_norm": 0.39157313108444214,
"learning_rate": 8.772216770254061e-05,
"loss": 0.7846,
"mean_token_accuracy": 0.8317717489600182,
"num_tokens": 257130496.0,
"step": 7850
},
{
"epoch": 4.175545274289491,
"grad_norm": 0.4963982105255127,
"learning_rate": 8.499618362228765e-05,
"loss": 0.8069,
"mean_token_accuracy": 0.8283883157372475,
"num_tokens": 258768896.0,
"step": 7900
},
{
"epoch": 4.201982815598149,
"grad_norm": 0.3942487835884094,
"learning_rate": 8.227019954203467e-05,
"loss": 0.803,
"mean_token_accuracy": 0.8284738489985466,
"num_tokens": 260407296.0,
"step": 7950
},
{
"epoch": 4.228420356906808,
"grad_norm": 0.37145310640335083,
"learning_rate": 7.95442154617817e-05,
"loss": 0.805,
"step": 8000
},
{
"epoch": 4.228420356906808,
"eval_loss": 0.9421485066413879,
"eval_mean_token_accuracy": 0.8191204303540841,
"eval_num_tokens": 262045696.0,
"eval_runtime": 1716.291,
"eval_samples_per_second": 4.408,
"eval_steps_per_second": 0.551,
"step": 8000
},
{
"epoch": 4.254857898215466,
"grad_norm": 0.3017653524875641,
"learning_rate": 7.681823138152873e-05,
"loss": 0.796,
"mean_token_accuracy": 0.8288511091470718,
"num_tokens": 263684096.0,
"step": 8050
},
{
"epoch": 4.281295439524124,
"grad_norm": 0.4065409004688263,
"learning_rate": 7.409224730127575e-05,
"loss": 0.7812,
"mean_token_accuracy": 0.8326673975586891,
"num_tokens": 265322496.0,
"step": 8100
},
{
"epoch": 4.307732980832783,
"grad_norm": 0.33838245272636414,
"learning_rate": 7.136626322102279e-05,
"loss": 0.7868,
"mean_token_accuracy": 0.8319782489538192,
"num_tokens": 266960896.0,
"step": 8150
},
{
"epoch": 4.334170522141441,
"grad_norm": 0.39351001381874084,
"learning_rate": 6.864027914076983e-05,
"loss": 0.7891,
"mean_token_accuracy": 0.8313220903277397,
"num_tokens": 268599296.0,
"step": 8200
},
{
"epoch": 4.360608063450099,
"grad_norm": 0.3555977940559387,
"learning_rate": 6.591429506051685e-05,
"loss": 0.7858,
"step": 8250
},
{
"epoch": 4.360608063450099,
"eval_loss": 0.9359485507011414,
"eval_mean_token_accuracy": 0.8200258982483983,
"eval_num_tokens": 270237696.0,
"eval_runtime": 1713.1543,
"eval_samples_per_second": 4.416,
"eval_steps_per_second": 0.552,
"step": 8250
},
{
"epoch": 4.387045604758757,
"grad_norm": 0.34437137842178345,
"learning_rate": 6.318831098026388e-05,
"loss": 0.7825,
"mean_token_accuracy": 0.8322449275851249,
"num_tokens": 271876096.0,
"step": 8300
},
{
"epoch": 4.413483146067415,
"grad_norm": 0.37046581506729126,
"learning_rate": 6.0462326900010904e-05,
"loss": 0.7936,
"mean_token_accuracy": 0.8298429843783378,
"num_tokens": 273514496.0,
"step": 8350
},
{
"epoch": 4.439920687376074,
"grad_norm": 0.44303834438323975,
"learning_rate": 5.773634281975793e-05,
"loss": 0.7891,
"mean_token_accuracy": 0.8311491903662681,
"num_tokens": 275152896.0,
"step": 8400
},
{
"epoch": 4.466358228684732,
"grad_norm": 0.382201611995697,
"learning_rate": 5.5010358739504963e-05,
"loss": 0.7781,
"mean_token_accuracy": 0.8330895602703094,
"num_tokens": 276791296.0,
"step": 8450
},
{
"epoch": 4.492795769993391,
"grad_norm": 0.39989522099494934,
"learning_rate": 5.228437465925199e-05,
"loss": 0.7877,
"step": 8500
},
{
"epoch": 4.492795769993391,
"eval_loss": 0.9300816059112549,
"eval_mean_token_accuracy": 0.8209756191890789,
"eval_num_tokens": 278429696.0,
"eval_runtime": 1713.4373,
"eval_samples_per_second": 4.415,
"eval_steps_per_second": 0.552,
"step": 8500
},
{
"epoch": 4.519233311302049,
"grad_norm": 0.3832317590713501,
"learning_rate": 4.9558390578999016e-05,
"loss": 0.7782,
"mean_token_accuracy": 0.8327266594767571,
"num_tokens": 280068096.0,
"step": 8550
},
{
"epoch": 4.545670852610707,
"grad_norm": 0.4667583703994751,
"learning_rate": 4.683240649874604e-05,
"loss": 0.7849,
"mean_token_accuracy": 0.8322770014405251,
"num_tokens": 281706496.0,
"step": 8600
},
{
"epoch": 4.572108393919366,
"grad_norm": 0.3977579176425934,
"learning_rate": 4.4106422418493076e-05,
"loss": 0.786,
"mean_token_accuracy": 0.83141067892313,
"num_tokens": 283344896.0,
"step": 8650
},
{
"epoch": 4.598545935228024,
"grad_norm": 0.38620129227638245,
"learning_rate": 4.138043833824011e-05,
"loss": 0.7865,
"mean_token_accuracy": 0.8317729702591896,
"num_tokens": 284983296.0,
"step": 8700
},
{
"epoch": 4.624983476536682,
"grad_norm": 0.3608716130256653,
"learning_rate": 3.8654454257987135e-05,
"loss": 0.7905,
"step": 8750
},
{
"epoch": 4.624983476536682,
"eval_loss": 0.9239566922187805,
"eval_mean_token_accuracy": 0.8218199411607948,
"eval_num_tokens": 286621696.0,
"eval_runtime": 1715.401,
"eval_samples_per_second": 4.41,
"eval_steps_per_second": 0.551,
"step": 8750
},
{
"epoch": 4.65142101784534,
"grad_norm": 0.30198875069618225,
"learning_rate": 3.592847017773417e-05,
"loss": 0.7672,
"mean_token_accuracy": 0.8327672865986824,
"num_tokens": 288260096.0,
"step": 8800
},
{
"epoch": 4.677858559153998,
"grad_norm": 0.3867688477039337,
"learning_rate": 3.3202486097481194e-05,
"loss": 0.7728,
"mean_token_accuracy": 0.8339509972929955,
"num_tokens": 289898496.0,
"step": 8850
},
{
"epoch": 4.704296100462657,
"grad_norm": 0.38331055641174316,
"learning_rate": 3.0476502017228217e-05,
"loss": 0.7801,
"mean_token_accuracy": 0.8332563516497612,
"num_tokens": 291536896.0,
"step": 8900
},
{
"epoch": 4.730733641771315,
"grad_norm": 0.32032325863838196,
"learning_rate": 2.775051793697525e-05,
"loss": 0.7896,
"mean_token_accuracy": 0.8309176415205002,
"num_tokens": 293175296.0,
"step": 8950
},
{
"epoch": 4.757171183079974,
"grad_norm": 0.41111549735069275,
"learning_rate": 2.502453385672228e-05,
"loss": 0.7691,
"step": 9000
},
{
"epoch": 4.757171183079974,
"eval_loss": 0.9195617437362671,
"eval_mean_token_accuracy": 0.8225174557583025,
"eval_num_tokens": 294813696.0,
"eval_runtime": 1712.3817,
"eval_samples_per_second": 4.418,
"eval_steps_per_second": 0.552,
"step": 9000
}
],
"logging_steps": 50,
"max_steps": 9455,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 470934104309760.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}