text2arch-qwen / trainer_state.json
shivank21's picture
Upload folder using huggingface_hub
89cf807 verified
{
"best_global_step": 7500,
"best_metric": 0.7491397857666016,
"best_model_checkpoint": "./results/checkpoint-7500",
"epoch": 4.997752808988764,
"eval_steps": 250,
"global_step": 9455,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.026437541308658295,
"grad_norm": 1.9868909120559692,
"learning_rate": 0.00034625958983852136,
"loss": 1.3782,
"mean_token_accuracy": 0.7697586753964424,
"num_tokens": 1638400.0,
"step": 50
},
{
"epoch": 0.05287508261731659,
"grad_norm": 0.9880499839782715,
"learning_rate": 0.000407611186724682,
"loss": 1.2966,
"mean_token_accuracy": 0.7892405907809734,
"num_tokens": 3276800.0,
"step": 100
},
{
"epoch": 0.07931262392597488,
"grad_norm": 0.6774707436561584,
"learning_rate": 0.0004434995702624468,
"loss": 0.939,
"mean_token_accuracy": 0.8182881197333336,
"num_tokens": 4915200.0,
"step": 150
},
{
"epoch": 0.10575016523463318,
"grad_norm": 0.84303218126297,
"learning_rate": 0.0004689627836108426,
"loss": 0.9239,
"mean_token_accuracy": 0.8214302301406861,
"num_tokens": 6553600.0,
"step": 200
},
{
"epoch": 0.13218770654329148,
"grad_norm": 2.8259122371673584,
"learning_rate": 0.0004887135863147016,
"loss": 1.0103,
"step": 250
},
{
"epoch": 0.13218770654329148,
"eval_loss": 1.3388922214508057,
"eval_mean_token_accuracy": 0.767919923740008,
"eval_num_tokens": 8192000.0,
"eval_runtime": 1597.0105,
"eval_samples_per_second": 4.737,
"eval_steps_per_second": 0.592,
"step": 250
},
{
"epoch": 0.15862524785194976,
"grad_norm": 6.46172571182251,
"learning_rate": 0.0004991822047759241,
"loss": 1.6239,
"mean_token_accuracy": 0.767223238646984,
"num_tokens": 9830400.0,
"step": 300
},
{
"epoch": 0.18506278916060806,
"grad_norm": 17.279727935791016,
"learning_rate": 0.0004964562206956711,
"loss": 1.3753,
"mean_token_accuracy": 0.7646566477417945,
"num_tokens": 11468800.0,
"step": 350
},
{
"epoch": 0.21150033046926636,
"grad_norm": 9.562877655029297,
"learning_rate": 0.0004937302366154182,
"loss": 1.7307,
"mean_token_accuracy": 0.7092528110742569,
"num_tokens": 13107200.0,
"step": 400
},
{
"epoch": 0.23793787177792466,
"grad_norm": 57.86962890625,
"learning_rate": 0.0004910042525351653,
"loss": 3.8636,
"mean_token_accuracy": 0.44049181263893844,
"num_tokens": 14745600.0,
"step": 450
},
{
"epoch": 0.26437541308658297,
"grad_norm": 2.1134138107299805,
"learning_rate": 0.00048827826845491225,
"loss": 3.3381,
"step": 500
},
{
"epoch": 0.26437541308658297,
"eval_loss": 3.0370047092437744,
"eval_mean_token_accuracy": 0.5421812577177052,
"eval_num_tokens": 16384000.0,
"eval_runtime": 1593.4634,
"eval_samples_per_second": 4.748,
"eval_steps_per_second": 0.594,
"step": 500
},
{
"epoch": 0.29081295439524124,
"grad_norm": 2.805110454559326,
"learning_rate": 0.0004855522843746593,
"loss": 2.8023,
"mean_token_accuracy": 0.5313697456568479,
"num_tokens": 18022400.0,
"step": 550
},
{
"epoch": 0.3172504957038995,
"grad_norm": 2.2337939739227295,
"learning_rate": 0.00048282630029440626,
"loss": 2.3643,
"mean_token_accuracy": 0.6257539093494415,
"num_tokens": 19660800.0,
"step": 600
},
{
"epoch": 0.34368803701255785,
"grad_norm": 1.3661304712295532,
"learning_rate": 0.00048010031621415335,
"loss": 2.2117,
"mean_token_accuracy": 0.6438269788026809,
"num_tokens": 21299200.0,
"step": 650
},
{
"epoch": 0.3701255783212161,
"grad_norm": 1.319533109664917,
"learning_rate": 0.0004773743321339004,
"loss": 2.0377,
"mean_token_accuracy": 0.6656103357672691,
"num_tokens": 22937600.0,
"step": 700
},
{
"epoch": 0.3965631196298744,
"grad_norm": 0.9889560341835022,
"learning_rate": 0.00047464834805364736,
"loss": 1.9612,
"step": 750
},
{
"epoch": 0.3965631196298744,
"eval_loss": 1.9166280031204224,
"eval_mean_token_accuracy": 0.6814079303570076,
"eval_num_tokens": 24576000.0,
"eval_runtime": 1597.3511,
"eval_samples_per_second": 4.736,
"eval_steps_per_second": 0.592,
"step": 750
},
{
"epoch": 0.4230006609385327,
"grad_norm": 0.9154905676841736,
"learning_rate": 0.0004719223639733944,
"loss": 1.895,
"mean_token_accuracy": 0.6779982282221317,
"num_tokens": 26214400.0,
"step": 800
},
{
"epoch": 0.449438202247191,
"grad_norm": 1.1073395013809204,
"learning_rate": 0.0004691963798931415,
"loss": 1.8532,
"mean_token_accuracy": 0.6910386118292808,
"num_tokens": 27852800.0,
"step": 850
},
{
"epoch": 0.47587574355584933,
"grad_norm": 0.7696494460105896,
"learning_rate": 0.00046647039581288846,
"loss": 1.7828,
"mean_token_accuracy": 0.7011858496069908,
"num_tokens": 29491200.0,
"step": 900
},
{
"epoch": 0.5023132848645075,
"grad_norm": 0.8735769987106323,
"learning_rate": 0.0004637444117326355,
"loss": 1.7661,
"mean_token_accuracy": 0.7030816239118576,
"num_tokens": 31129600.0,
"step": 950
},
{
"epoch": 0.5287508261731659,
"grad_norm": 0.8184662461280823,
"learning_rate": 0.0004610184276523825,
"loss": 1.7303,
"step": 1000
},
{
"epoch": 0.5287508261731659,
"eval_loss": 1.732823133468628,
"eval_mean_token_accuracy": 0.7076210416774669,
"eval_num_tokens": 32768000.0,
"eval_runtime": 1598.294,
"eval_samples_per_second": 4.733,
"eval_steps_per_second": 0.592,
"step": 1000
},
{
"epoch": 0.5551883674818242,
"grad_norm": 0.805102527141571,
"learning_rate": 0.00045829244357212956,
"loss": 1.7074,
"mean_token_accuracy": 0.7088553883135319,
"num_tokens": 34406400.0,
"step": 1050
},
{
"epoch": 0.5816259087904825,
"grad_norm": 1.1714200973510742,
"learning_rate": 0.0004555664594918766,
"loss": 1.6578,
"mean_token_accuracy": 0.7183681574463844,
"num_tokens": 36044800.0,
"step": 1100
},
{
"epoch": 0.6080634500991408,
"grad_norm": 0.801713228225708,
"learning_rate": 0.0004528404754116236,
"loss": 1.639,
"mean_token_accuracy": 0.7187829902768135,
"num_tokens": 37683200.0,
"step": 1150
},
{
"epoch": 0.634500991407799,
"grad_norm": 0.776907205581665,
"learning_rate": 0.0004501144913313706,
"loss": 1.6155,
"mean_token_accuracy": 0.7220100191235542,
"num_tokens": 39321600.0,
"step": 1200
},
{
"epoch": 0.6609385327164574,
"grad_norm": 0.5754767656326294,
"learning_rate": 0.0004473885072511177,
"loss": 1.5987,
"step": 1250
},
{
"epoch": 0.6609385327164574,
"eval_loss": 1.5740511417388916,
"eval_mean_token_accuracy": 0.7283713231001041,
"eval_num_tokens": 40960000.0,
"eval_runtime": 1599.1655,
"eval_samples_per_second": 4.731,
"eval_steps_per_second": 0.592,
"step": 1250
},
{
"epoch": 0.6873760740251157,
"grad_norm": 0.5739009976387024,
"learning_rate": 0.0004446625231708647,
"loss": 1.5626,
"mean_token_accuracy": 0.7280584080517292,
"num_tokens": 42598400.0,
"step": 1300
},
{
"epoch": 0.713813615333774,
"grad_norm": 0.56184983253479,
"learning_rate": 0.0004419365390906117,
"loss": 1.5383,
"mean_token_accuracy": 0.7336229240894317,
"num_tokens": 44236800.0,
"step": 1350
},
{
"epoch": 0.7402511566424322,
"grad_norm": 0.7078151106834412,
"learning_rate": 0.00043921055501035873,
"loss": 1.4998,
"mean_token_accuracy": 0.7408009549975395,
"num_tokens": 45875200.0,
"step": 1400
},
{
"epoch": 0.7666886979510905,
"grad_norm": 0.6344922184944153,
"learning_rate": 0.0004364845709301058,
"loss": 1.4751,
"mean_token_accuracy": 0.74332783639431,
"num_tokens": 47513600.0,
"step": 1450
},
{
"epoch": 0.7931262392597488,
"grad_norm": 0.4986041486263275,
"learning_rate": 0.0004337585868498528,
"loss": 1.4716,
"step": 1500
},
{
"epoch": 0.7931262392597488,
"eval_loss": 1.4890165328979492,
"eval_mean_token_accuracy": 0.7397930833174361,
"eval_num_tokens": 49152000.0,
"eval_runtime": 1599.8275,
"eval_samples_per_second": 4.729,
"eval_steps_per_second": 0.591,
"step": 1500
},
{
"epoch": 0.8195637805684072,
"grad_norm": 0.508725643157959,
"learning_rate": 0.00043103260276959983,
"loss": 1.4734,
"mean_token_accuracy": 0.7423943056166172,
"num_tokens": 50790400.0,
"step": 1550
},
{
"epoch": 0.8460013218770654,
"grad_norm": 0.5139680504798889,
"learning_rate": 0.0004283066186893468,
"loss": 1.4513,
"mean_token_accuracy": 0.7446151030063629,
"num_tokens": 52428800.0,
"step": 1600
},
{
"epoch": 0.8724388631857237,
"grad_norm": 0.5360570549964905,
"learning_rate": 0.0004255806346090939,
"loss": 1.4587,
"mean_token_accuracy": 0.7414125129580498,
"num_tokens": 54067200.0,
"step": 1650
},
{
"epoch": 0.898876404494382,
"grad_norm": 0.46601545810699463,
"learning_rate": 0.00042285465052884093,
"loss": 1.4468,
"mean_token_accuracy": 0.7438508039712906,
"num_tokens": 55705600.0,
"step": 1700
},
{
"epoch": 0.9253139458030403,
"grad_norm": 0.4491994380950928,
"learning_rate": 0.0004201286664485879,
"loss": 1.4234,
"step": 1750
},
{
"epoch": 0.9253139458030403,
"eval_loss": 1.405325174331665,
"eval_mean_token_accuracy": 0.7498895639975025,
"eval_num_tokens": 57344000.0,
"eval_runtime": 1600.1713,
"eval_samples_per_second": 4.728,
"eval_steps_per_second": 0.591,
"step": 1750
},
{
"epoch": 0.9517514871116987,
"grad_norm": 0.40710222721099854,
"learning_rate": 0.00041740268236833495,
"loss": 1.3835,
"mean_token_accuracy": 0.7491015987098217,
"num_tokens": 58982400.0,
"step": 1800
},
{
"epoch": 0.9781890284203569,
"grad_norm": 0.4693559408187866,
"learning_rate": 0.00041467669828808203,
"loss": 1.3498,
"mean_token_accuracy": 0.7580449655652046,
"num_tokens": 60620800.0,
"step": 1850
},
{
"epoch": 1.0042300066093852,
"grad_norm": 0.4662095606327057,
"learning_rate": 0.00041200523388943414,
"loss": 1.3744,
"mean_token_accuracy": 0.7523588169044649,
"num_tokens": 62234624.0,
"step": 1900
},
{
"epoch": 1.0306675479180436,
"grad_norm": 0.39532390236854553,
"learning_rate": 0.0004092792498091811,
"loss": 1.3057,
"mean_token_accuracy": 0.7592387574911118,
"num_tokens": 63873024.0,
"step": 1950
},
{
"epoch": 1.057105089226702,
"grad_norm": 0.4154648780822754,
"learning_rate": 0.00040655326572892816,
"loss": 1.3248,
"step": 2000
},
{
"epoch": 1.057105089226702,
"eval_loss": 1.339290976524353,
"eval_mean_token_accuracy": 0.7579027240422513,
"eval_num_tokens": 65511424.0,
"eval_runtime": 1599.5937,
"eval_samples_per_second": 4.729,
"eval_steps_per_second": 0.591,
"step": 2000
},
{
"epoch": 1.0835426305353602,
"grad_norm": 0.3895817697048187,
"learning_rate": 0.00040382728164867513,
"loss": 1.2956,
"mean_token_accuracy": 0.7590417274832726,
"num_tokens": 67149824.0,
"step": 2050
},
{
"epoch": 1.1099801718440185,
"grad_norm": 0.39260634779930115,
"learning_rate": 0.0004011012975684222,
"loss": 1.3223,
"mean_token_accuracy": 0.756996577680111,
"num_tokens": 68788224.0,
"step": 2100
},
{
"epoch": 1.1364177131526767,
"grad_norm": 0.3638737201690674,
"learning_rate": 0.00039837531348816925,
"loss": 1.268,
"mean_token_accuracy": 0.7646328181028366,
"num_tokens": 70426624.0,
"step": 2150
},
{
"epoch": 1.162855254461335,
"grad_norm": 0.3186447322368622,
"learning_rate": 0.00039564932940791623,
"loss": 1.2705,
"mean_token_accuracy": 0.7647727259993553,
"num_tokens": 72065024.0,
"step": 2200
},
{
"epoch": 1.1892927957699935,
"grad_norm": 0.37439003586769104,
"learning_rate": 0.00039292334532766327,
"loss": 1.2631,
"step": 2250
},
{
"epoch": 1.1892927957699935,
"eval_loss": 1.278181791305542,
"eval_mean_token_accuracy": 0.7655498584531578,
"eval_num_tokens": 73703424.0,
"eval_runtime": 1599.9443,
"eval_samples_per_second": 4.728,
"eval_steps_per_second": 0.591,
"step": 2250
},
{
"epoch": 1.2157303370786516,
"grad_norm": 0.36414834856987,
"learning_rate": 0.00039019736124741035,
"loss": 1.2556,
"mean_token_accuracy": 0.7657642959058285,
"num_tokens": 75341824.0,
"step": 2300
},
{
"epoch": 1.24216787838731,
"grad_norm": 0.38630911707878113,
"learning_rate": 0.00038747137716715733,
"loss": 1.2453,
"mean_token_accuracy": 0.7686192587018013,
"num_tokens": 76980224.0,
"step": 2350
},
{
"epoch": 1.2686054196959682,
"grad_norm": 0.34793412685394287,
"learning_rate": 0.00038474539308690437,
"loss": 1.2113,
"mean_token_accuracy": 0.7736504143476486,
"num_tokens": 78618624.0,
"step": 2400
},
{
"epoch": 1.2950429610046266,
"grad_norm": 0.3544578552246094,
"learning_rate": 0.0003820194090066514,
"loss": 1.1924,
"mean_token_accuracy": 0.7755889534950257,
"num_tokens": 80257024.0,
"step": 2450
},
{
"epoch": 1.321480502313285,
"grad_norm": 0.30794623494148254,
"learning_rate": 0.00037929342492639843,
"loss": 1.1748,
"step": 2500
},
{
"epoch": 1.321480502313285,
"eval_loss": 1.198885440826416,
"eval_mean_token_accuracy": 0.776488389197666,
"eval_num_tokens": 81895424.0,
"eval_runtime": 1599.7215,
"eval_samples_per_second": 4.729,
"eval_steps_per_second": 0.591,
"step": 2500
},
{
"epoch": 1.3479180436219431,
"grad_norm": 0.2978927493095398,
"learning_rate": 0.00037656744084614547,
"loss": 1.1624,
"mean_token_accuracy": 0.7798365721106529,
"num_tokens": 83533824.0,
"step": 2550
},
{
"epoch": 1.3743555849306015,
"grad_norm": 0.3153753876686096,
"learning_rate": 0.0003738414567658925,
"loss": 1.1462,
"mean_token_accuracy": 0.7827309390902519,
"num_tokens": 85172224.0,
"step": 2600
},
{
"epoch": 1.4007931262392597,
"grad_norm": 0.31813791394233704,
"learning_rate": 0.0003711154726856395,
"loss": 1.1274,
"mean_token_accuracy": 0.7860245615243912,
"num_tokens": 86810624.0,
"step": 2650
},
{
"epoch": 1.427230667547918,
"grad_norm": 0.30844855308532715,
"learning_rate": 0.00036838948860538656,
"loss": 1.118,
"mean_token_accuracy": 0.7876050838828087,
"num_tokens": 88449024.0,
"step": 2700
},
{
"epoch": 1.4536682088565764,
"grad_norm": 0.3054572343826294,
"learning_rate": 0.0003656635045251336,
"loss": 1.1336,
"step": 2750
},
{
"epoch": 1.4536682088565764,
"eval_loss": 1.1190927028656006,
"eval_mean_token_accuracy": 0.7886134508926319,
"eval_num_tokens": 90087424.0,
"eval_runtime": 1599.6779,
"eval_samples_per_second": 4.729,
"eval_steps_per_second": 0.591,
"step": 2750
},
{
"epoch": 1.4801057501652346,
"grad_norm": 0.28417208790779114,
"learning_rate": 0.0003629375204448806,
"loss": 1.1039,
"mean_token_accuracy": 0.7864858260750771,
"num_tokens": 91725824.0,
"step": 2800
},
{
"epoch": 1.5065432914738928,
"grad_norm": 0.307099312543869,
"learning_rate": 0.0003602115363646276,
"loss": 1.0909,
"mean_token_accuracy": 0.7900452110171318,
"num_tokens": 93364224.0,
"step": 2850
},
{
"epoch": 1.5329808327825512,
"grad_norm": 0.30008023977279663,
"learning_rate": 0.0003574855522843747,
"loss": 1.0824,
"mean_token_accuracy": 0.7928054749965667,
"num_tokens": 95002624.0,
"step": 2900
},
{
"epoch": 1.5594183740912095,
"grad_norm": 0.27622541785240173,
"learning_rate": 0.0003547595682041217,
"loss": 1.055,
"mean_token_accuracy": 0.7961241453886032,
"num_tokens": 96641024.0,
"step": 2950
},
{
"epoch": 1.585855915399868,
"grad_norm": 0.2670520544052124,
"learning_rate": 0.0003520335841238687,
"loss": 1.0466,
"step": 3000
},
{
"epoch": 1.585855915399868,
"eval_loss": 1.0528658628463745,
"eval_mean_token_accuracy": 0.7982593539149262,
"eval_num_tokens": 98279424.0,
"eval_runtime": 1599.5737,
"eval_samples_per_second": 4.729,
"eval_steps_per_second": 0.591,
"step": 3000
},
{
"epoch": 1.612293456708526,
"grad_norm": 0.26690635085105896,
"learning_rate": 0.00034930760004361574,
"loss": 1.0354,
"mean_token_accuracy": 0.7988346171379089,
"num_tokens": 99917824.0,
"step": 3050
},
{
"epoch": 1.6387309980171842,
"grad_norm": 0.27989307045936584,
"learning_rate": 0.0003465816159633628,
"loss": 1.0225,
"mean_token_accuracy": 0.8015902996063232,
"num_tokens": 101556224.0,
"step": 3100
},
{
"epoch": 1.6651685393258426,
"grad_norm": 0.21368129551410675,
"learning_rate": 0.0003438556318831098,
"loss": 1.0197,
"mean_token_accuracy": 0.8019238775968551,
"num_tokens": 103194624.0,
"step": 3150
},
{
"epoch": 1.691606080634501,
"grad_norm": 0.288343220949173,
"learning_rate": 0.00034112964780285684,
"loss": 1.0174,
"mean_token_accuracy": 0.8012603887915611,
"num_tokens": 104833024.0,
"step": 3200
},
{
"epoch": 1.7180436219431594,
"grad_norm": 0.245047464966774,
"learning_rate": 0.0003384036637226039,
"loss": 0.9922,
"step": 3250
},
{
"epoch": 1.7180436219431594,
"eval_loss": 1.0065803527832031,
"eval_mean_token_accuracy": 0.8048020523773943,
"eval_num_tokens": 106471424.0,
"eval_runtime": 1599.1563,
"eval_samples_per_second": 4.731,
"eval_steps_per_second": 0.592,
"step": 3250
},
{
"epoch": 1.7444811632518176,
"grad_norm": 0.23827126622200012,
"learning_rate": 0.0003356776796423509,
"loss": 0.9838,
"mean_token_accuracy": 0.8066547532379628,
"num_tokens": 108109824.0,
"step": 3300
},
{
"epoch": 1.7709187045604757,
"grad_norm": 0.22703391313552856,
"learning_rate": 0.00033295169556209794,
"loss": 0.9587,
"mean_token_accuracy": 0.8113178130984307,
"num_tokens": 109748224.0,
"step": 3350
},
{
"epoch": 1.7973562458691341,
"grad_norm": 0.25331422686576843,
"learning_rate": 0.0003302257114818449,
"loss": 0.9697,
"mean_token_accuracy": 0.8093206259608269,
"num_tokens": 111386624.0,
"step": 3400
},
{
"epoch": 1.8237937871777925,
"grad_norm": 0.264260470867157,
"learning_rate": 0.000327499727401592,
"loss": 0.956,
"mean_token_accuracy": 0.8123435971140861,
"num_tokens": 113025024.0,
"step": 3450
},
{
"epoch": 1.8502313284864509,
"grad_norm": 0.2458537220954895,
"learning_rate": 0.00032477374332133904,
"loss": 0.9539,
"step": 3500
},
{
"epoch": 1.8502313284864509,
"eval_loss": 0.9670175909996033,
"eval_mean_token_accuracy": 0.8104942284729214,
"eval_num_tokens": 114663424.0,
"eval_runtime": 1599.1718,
"eval_samples_per_second": 4.731,
"eval_steps_per_second": 0.592,
"step": 3500
},
{
"epoch": 1.876668869795109,
"grad_norm": 0.20451125502586365,
"learning_rate": 0.000322047759241086,
"loss": 0.9479,
"mean_token_accuracy": 0.8118679732084274,
"num_tokens": 116301824.0,
"step": 3550
},
{
"epoch": 1.9031064111037672,
"grad_norm": 0.22584660351276398,
"learning_rate": 0.00031932177516083305,
"loss": 0.9688,
"mean_token_accuracy": 0.8094049346446991,
"num_tokens": 117940224.0,
"step": 3600
},
{
"epoch": 1.9295439524124256,
"grad_norm": 0.2119428962469101,
"learning_rate": 0.00031659579108058014,
"loss": 0.9229,
"mean_token_accuracy": 0.816270771920681,
"num_tokens": 119578624.0,
"step": 3650
},
{
"epoch": 1.955981493721084,
"grad_norm": 0.2210853099822998,
"learning_rate": 0.0003138698070003271,
"loss": 0.9341,
"mean_token_accuracy": 0.814192325770855,
"num_tokens": 121217024.0,
"step": 3700
},
{
"epoch": 1.9824190350297424,
"grad_norm": 0.1966710090637207,
"learning_rate": 0.00031114382292007415,
"loss": 0.9283,
"step": 3750
},
{
"epoch": 1.9824190350297424,
"eval_loss": 0.9337447881698608,
"eval_mean_token_accuracy": 0.8153352634725308,
"eval_num_tokens": 122855424.0,
"eval_runtime": 1599.5384,
"eval_samples_per_second": 4.729,
"eval_steps_per_second": 0.591,
"step": 3750
},
{
"epoch": 2.0084600132187704,
"grad_norm": 0.20168109238147736,
"learning_rate": 0.00030847235852142626,
"loss": 0.8927,
"mean_token_accuracy": 0.8173428005175266,
"num_tokens": 124469248.0,
"step": 3800
},
{
"epoch": 2.034897554527429,
"grad_norm": 0.1905670017004013,
"learning_rate": 0.00030574637444117324,
"loss": 0.8427,
"mean_token_accuracy": 0.8238172018527985,
"num_tokens": 126107648.0,
"step": 3850
},
{
"epoch": 2.061335095836087,
"grad_norm": 0.19004780054092407,
"learning_rate": 0.0003030203903609203,
"loss": 0.8536,
"mean_token_accuracy": 0.8214613863825798,
"num_tokens": 127746048.0,
"step": 3900
},
{
"epoch": 2.0877726371447456,
"grad_norm": 0.21092021465301514,
"learning_rate": 0.00030029440628066736,
"loss": 0.8347,
"mean_token_accuracy": 0.8248136582970619,
"num_tokens": 129384448.0,
"step": 3950
},
{
"epoch": 2.114210178453404,
"grad_norm": 0.2002408355474472,
"learning_rate": 0.00029756842220041434,
"loss": 0.8409,
"step": 4000
},
{
"epoch": 2.114210178453404,
"eval_loss": 0.913899838924408,
"eval_mean_token_accuracy": 0.8182373826271888,
"eval_num_tokens": 131022848.0,
"eval_runtime": 1599.607,
"eval_samples_per_second": 4.729,
"eval_steps_per_second": 0.591,
"step": 4000
},
{
"epoch": 2.140647719762062,
"grad_norm": 0.22307777404785156,
"learning_rate": 0.0002948424381201614,
"loss": 0.8428,
"mean_token_accuracy": 0.8242137080430985,
"num_tokens": 132661248.0,
"step": 4050
},
{
"epoch": 2.1670852610707203,
"grad_norm": 0.1873617023229599,
"learning_rate": 0.0002921164540399084,
"loss": 0.8439,
"mean_token_accuracy": 0.8233803743124009,
"num_tokens": 134299648.0,
"step": 4100
},
{
"epoch": 2.1935228023793787,
"grad_norm": 0.1888233870267868,
"learning_rate": 0.00028939046995965544,
"loss": 0.8406,
"mean_token_accuracy": 0.8241153433918953,
"num_tokens": 135938048.0,
"step": 4150
},
{
"epoch": 2.219960343688037,
"grad_norm": 0.19996315240859985,
"learning_rate": 0.00028666448587940247,
"loss": 0.8337,
"mean_token_accuracy": 0.8248002156615257,
"num_tokens": 137576448.0,
"step": 4200
},
{
"epoch": 2.2463978849966955,
"grad_norm": 0.21117758750915527,
"learning_rate": 0.0002839385017991495,
"loss": 0.8411,
"step": 4250
},
{
"epoch": 2.2463978849966955,
"eval_loss": 0.893865704536438,
"eval_mean_token_accuracy": 0.8210062818093733,
"eval_num_tokens": 139214848.0,
"eval_runtime": 1599.858,
"eval_samples_per_second": 4.729,
"eval_steps_per_second": 0.591,
"step": 4250
},
{
"epoch": 2.2728354263053534,
"grad_norm": 0.20331983268260956,
"learning_rate": 0.00028121251771889654,
"loss": 0.8389,
"mean_token_accuracy": 0.824597994685173,
"num_tokens": 140853248.0,
"step": 4300
},
{
"epoch": 2.299272967614012,
"grad_norm": 0.19736993312835693,
"learning_rate": 0.00027848653363864357,
"loss": 0.8168,
"mean_token_accuracy": 0.8279356023669243,
"num_tokens": 142491648.0,
"step": 4350
},
{
"epoch": 2.32571050892267,
"grad_norm": 0.1942383050918579,
"learning_rate": 0.0002757605495583906,
"loss": 0.8158,
"mean_token_accuracy": 0.8288569149374961,
"num_tokens": 144130048.0,
"step": 4400
},
{
"epoch": 2.3521480502313286,
"grad_norm": 0.18327121436595917,
"learning_rate": 0.0002730345654781376,
"loss": 0.8097,
"mean_token_accuracy": 0.8300702553987503,
"num_tokens": 145768448.0,
"step": 4450
},
{
"epoch": 2.378585591539987,
"grad_norm": 0.17920152842998505,
"learning_rate": 0.00027030858139788467,
"loss": 0.8017,
"step": 4500
},
{
"epoch": 2.378585591539987,
"eval_loss": 0.874257504940033,
"eval_mean_token_accuracy": 0.823767999828996,
"eval_num_tokens": 147406848.0,
"eval_runtime": 1599.5025,
"eval_samples_per_second": 4.73,
"eval_steps_per_second": 0.591,
"step": 4500
},
{
"epoch": 2.405023132848645,
"grad_norm": 0.18811027705669403,
"learning_rate": 0.0002675825973176317,
"loss": 0.8215,
"mean_token_accuracy": 0.8293267333507538,
"num_tokens": 149045248.0,
"step": 4550
},
{
"epoch": 2.4314606741573033,
"grad_norm": 0.20340368151664734,
"learning_rate": 0.0002648566132373787,
"loss": 0.8249,
"mean_token_accuracy": 0.8268548348546028,
"num_tokens": 150683648.0,
"step": 4600
},
{
"epoch": 2.4578982154659617,
"grad_norm": 0.18492697179317474,
"learning_rate": 0.0002621306291571257,
"loss": 0.7914,
"mean_token_accuracy": 0.832571476995945,
"num_tokens": 152322048.0,
"step": 4650
},
{
"epoch": 2.48433575677462,
"grad_norm": 0.19855117797851562,
"learning_rate": 0.0002594046450768728,
"loss": 0.8077,
"mean_token_accuracy": 0.8298674210906029,
"num_tokens": 153960448.0,
"step": 4700
},
{
"epoch": 2.5107732980832784,
"grad_norm": 0.1997339129447937,
"learning_rate": 0.0002566786609966198,
"loss": 0.809,
"step": 4750
},
{
"epoch": 2.5107732980832784,
"eval_loss": 0.8553281426429749,
"eval_mean_token_accuracy": 0.8265610535729511,
"eval_num_tokens": 155598848.0,
"eval_runtime": 1599.9059,
"eval_samples_per_second": 4.728,
"eval_steps_per_second": 0.591,
"step": 4750
},
{
"epoch": 2.5372108393919364,
"grad_norm": 0.19008329510688782,
"learning_rate": 0.0002539526769163668,
"loss": 0.797,
"mean_token_accuracy": 0.8298222103714943,
"num_tokens": 157237248.0,
"step": 4800
},
{
"epoch": 2.5636483807005948,
"grad_norm": 0.18476171791553497,
"learning_rate": 0.00025122669283611385,
"loss": 0.7987,
"mean_token_accuracy": 0.8304337722063064,
"num_tokens": 158875648.0,
"step": 4850
},
{
"epoch": 2.590085922009253,
"grad_norm": 0.18693213164806366,
"learning_rate": 0.0002485007087558609,
"loss": 0.8042,
"mean_token_accuracy": 0.8297446221113205,
"num_tokens": 160514048.0,
"step": 4900
},
{
"epoch": 2.6165234633179115,
"grad_norm": 0.19470660388469696,
"learning_rate": 0.0002457747246756079,
"loss": 0.8024,
"mean_token_accuracy": 0.8308174461126328,
"num_tokens": 162152448.0,
"step": 4950
},
{
"epoch": 2.64296100462657,
"grad_norm": 0.23168876767158508,
"learning_rate": 0.00024304874059535492,
"loss": 0.7903,
"step": 5000
},
{
"epoch": 2.64296100462657,
"eval_loss": 0.8376234769821167,
"eval_mean_token_accuracy": 0.828871109394896,
"eval_num_tokens": 163790848.0,
"eval_runtime": 1600.0988,
"eval_samples_per_second": 4.728,
"eval_steps_per_second": 0.591,
"step": 5000
},
{
"epoch": 2.669398545935228,
"grad_norm": 0.15908803045749664,
"learning_rate": 0.00024032275651510195,
"loss": 0.7967,
"mean_token_accuracy": 0.8314005956053734,
"num_tokens": 165429248.0,
"step": 5050
},
{
"epoch": 2.6958360872438862,
"grad_norm": 0.1805862933397293,
"learning_rate": 0.000237596772434849,
"loss": 0.7774,
"mean_token_accuracy": 0.8344085997343064,
"num_tokens": 167067648.0,
"step": 5100
},
{
"epoch": 2.7222736285525446,
"grad_norm": 0.17997150123119354,
"learning_rate": 0.00023487078835459602,
"loss": 0.7851,
"mean_token_accuracy": 0.8325213807821273,
"num_tokens": 168706048.0,
"step": 5150
},
{
"epoch": 2.748711169861203,
"grad_norm": 0.18113110959529877,
"learning_rate": 0.00023214480427434303,
"loss": 0.776,
"mean_token_accuracy": 0.8346639758348465,
"num_tokens": 170344448.0,
"step": 5200
},
{
"epoch": 2.7751487111698614,
"grad_norm": 0.18302254378795624,
"learning_rate": 0.00022941882019409009,
"loss": 0.7854,
"step": 5250
},
{
"epoch": 2.7751487111698614,
"eval_loss": 0.8233165144920349,
"eval_mean_token_accuracy": 0.830954508725987,
"eval_num_tokens": 171982848.0,
"eval_runtime": 1599.9718,
"eval_samples_per_second": 4.728,
"eval_steps_per_second": 0.591,
"step": 5250
},
{
"epoch": 2.8015862524785193,
"grad_norm": 0.1922728568315506,
"learning_rate": 0.0002266928361138371,
"loss": 0.7936,
"mean_token_accuracy": 0.8322769993543625,
"num_tokens": 173621248.0,
"step": 5300
},
{
"epoch": 2.8280237937871777,
"grad_norm": 0.1617008000612259,
"learning_rate": 0.00022396685203358413,
"loss": 0.7738,
"mean_token_accuracy": 0.8344037118554115,
"num_tokens": 175259648.0,
"step": 5350
},
{
"epoch": 2.854461335095836,
"grad_norm": 0.17171062529087067,
"learning_rate": 0.00022124086795333116,
"loss": 0.7697,
"mean_token_accuracy": 0.8351166906952858,
"num_tokens": 176898048.0,
"step": 5400
},
{
"epoch": 2.8808988764044945,
"grad_norm": 0.1803775280714035,
"learning_rate": 0.0002185148838730782,
"loss": 0.7735,
"mean_token_accuracy": 0.8350091609358787,
"num_tokens": 178536448.0,
"step": 5450
},
{
"epoch": 2.907336417713153,
"grad_norm": 0.17305733263492584,
"learning_rate": 0.0002157888997928252,
"loss": 0.7716,
"step": 5500
},
{
"epoch": 2.907336417713153,
"eval_loss": 0.8076795339584351,
"eval_mean_token_accuracy": 0.8331229730841977,
"eval_num_tokens": 180174848.0,
"eval_runtime": 1600.6859,
"eval_samples_per_second": 4.726,
"eval_steps_per_second": 0.591,
"step": 5500
},
{
"epoch": 2.933773959021811,
"grad_norm": 0.17064611613750458,
"learning_rate": 0.00021306291571257226,
"loss": 0.7713,
"mean_token_accuracy": 0.8356136959791184,
"num_tokens": 181813248.0,
"step": 5550
},
{
"epoch": 2.960211500330469,
"grad_norm": 0.18137440085411072,
"learning_rate": 0.00021033693163231926,
"loss": 0.7667,
"mean_token_accuracy": 0.8351374611258506,
"num_tokens": 183451648.0,
"step": 5600
},
{
"epoch": 2.9866490416391276,
"grad_norm": 0.17405763268470764,
"learning_rate": 0.0002076109475520663,
"loss": 0.7495,
"mean_token_accuracy": 0.8385416662693024,
"num_tokens": 185090048.0,
"step": 5650
},
{
"epoch": 3.012690019828156,
"grad_norm": 0.17279721796512604,
"learning_rate": 0.0002049394831534184,
"loss": 0.7159,
"mean_token_accuracy": 0.8417613173499325,
"num_tokens": 186703872.0,
"step": 5700
},
{
"epoch": 3.0391275611368145,
"grad_norm": 0.19387085735797882,
"learning_rate": 0.0002022134990731654,
"loss": 0.666,
"step": 5750
},
{
"epoch": 3.0391275611368145,
"eval_loss": 0.8053749799728394,
"eval_mean_token_accuracy": 0.8340540434416959,
"eval_num_tokens": 188342272.0,
"eval_runtime": 1600.205,
"eval_samples_per_second": 4.728,
"eval_steps_per_second": 0.591,
"step": 5750
},
{
"epoch": 3.0655651024454724,
"grad_norm": 0.18193645775318146,
"learning_rate": 0.00019948751499291245,
"loss": 0.6644,
"mean_token_accuracy": 0.8480684906244278,
"num_tokens": 189980672.0,
"step": 5800
},
{
"epoch": 3.092002643754131,
"grad_norm": 0.16633963584899902,
"learning_rate": 0.00019676153091265948,
"loss": 0.6691,
"mean_token_accuracy": 0.847120603621006,
"num_tokens": 191619072.0,
"step": 5850
},
{
"epoch": 3.118440185062789,
"grad_norm": 0.17585037648677826,
"learning_rate": 0.0001940355468324065,
"loss": 0.6636,
"mean_token_accuracy": 0.84809934258461,
"num_tokens": 193257472.0,
"step": 5900
},
{
"epoch": 3.1448777263714476,
"grad_norm": 0.1676415503025055,
"learning_rate": 0.00019130956275215352,
"loss": 0.6672,
"mean_token_accuracy": 0.8475995865464211,
"num_tokens": 194895872.0,
"step": 5950
},
{
"epoch": 3.1713152676801055,
"grad_norm": 0.18070462346076965,
"learning_rate": 0.00018858357867190058,
"loss": 0.6627,
"step": 6000
},
{
"epoch": 3.1713152676801055,
"eval_loss": 0.801948070526123,
"eval_mean_token_accuracy": 0.8345137661909704,
"eval_num_tokens": 196534272.0,
"eval_runtime": 1599.8066,
"eval_samples_per_second": 4.729,
"eval_steps_per_second": 0.591,
"step": 6000
},
{
"epoch": 3.197752808988764,
"grad_norm": 0.16841137409210205,
"learning_rate": 0.00018585759459164758,
"loss": 0.6569,
"mean_token_accuracy": 0.8492378443479538,
"num_tokens": 198172672.0,
"step": 6050
},
{
"epoch": 3.2241903502974223,
"grad_norm": 0.18084491789340973,
"learning_rate": 0.00018313161051139462,
"loss": 0.6678,
"mean_token_accuracy": 0.8477779817581177,
"num_tokens": 199811072.0,
"step": 6100
},
{
"epoch": 3.2506278916060807,
"grad_norm": 0.17532089352607727,
"learning_rate": 0.00018040562643114165,
"loss": 0.6693,
"mean_token_accuracy": 0.8475476580858231,
"num_tokens": 201449472.0,
"step": 6150
},
{
"epoch": 3.277065432914739,
"grad_norm": 0.17762629687786102,
"learning_rate": 0.00017767964235088868,
"loss": 0.6568,
"mean_token_accuracy": 0.8500018376111984,
"num_tokens": 203087872.0,
"step": 6200
},
{
"epoch": 3.303502974223397,
"grad_norm": 0.17803572118282318,
"learning_rate": 0.0001749536582706357,
"loss": 0.6664,
"step": 6250
},
{
"epoch": 3.303502974223397,
"eval_loss": 0.7924287915229797,
"eval_mean_token_accuracy": 0.8360363316838384,
"eval_num_tokens": 204726272.0,
"eval_runtime": 1600.5314,
"eval_samples_per_second": 4.727,
"eval_steps_per_second": 0.591,
"step": 6250
},
{
"epoch": 3.3299405155320554,
"grad_norm": 0.1736496537923813,
"learning_rate": 0.00017222767419038275,
"loss": 0.6626,
"mean_token_accuracy": 0.8480022014677524,
"num_tokens": 206364672.0,
"step": 6300
},
{
"epoch": 3.3563780568407138,
"grad_norm": 0.1790972799062729,
"learning_rate": 0.00016950169011012976,
"loss": 0.666,
"mean_token_accuracy": 0.8478036442399025,
"num_tokens": 208003072.0,
"step": 6350
},
{
"epoch": 3.382815598149372,
"grad_norm": 0.17161910235881805,
"learning_rate": 0.0001667757060298768,
"loss": 0.6635,
"mean_token_accuracy": 0.8481677681207657,
"num_tokens": 209641472.0,
"step": 6400
},
{
"epoch": 3.4092531394580305,
"grad_norm": 0.17608526349067688,
"learning_rate": 0.00016404972194962382,
"loss": 0.6483,
"mean_token_accuracy": 0.8513996881246567,
"num_tokens": 211279872.0,
"step": 6450
},
{
"epoch": 3.4356906807666885,
"grad_norm": 0.17622597515583038,
"learning_rate": 0.00016132373786937086,
"loss": 0.6562,
"step": 6500
},
{
"epoch": 3.4356906807666885,
"eval_loss": 0.7829640507698059,
"eval_mean_token_accuracy": 0.8375042610768284,
"eval_num_tokens": 212918272.0,
"eval_runtime": 1600.7277,
"eval_samples_per_second": 4.726,
"eval_steps_per_second": 0.591,
"step": 6500
},
{
"epoch": 3.462128222075347,
"grad_norm": 0.18006405234336853,
"learning_rate": 0.00015859775378911786,
"loss": 0.6498,
"mean_token_accuracy": 0.8504380528628827,
"num_tokens": 214556672.0,
"step": 6550
},
{
"epoch": 3.4885657633840053,
"grad_norm": 0.16343793272972107,
"learning_rate": 0.0001558717697088649,
"loss": 0.6519,
"mean_token_accuracy": 0.850884655714035,
"num_tokens": 216195072.0,
"step": 6600
},
{
"epoch": 3.5150033046926636,
"grad_norm": 0.16798467934131622,
"learning_rate": 0.00015314578562861193,
"loss": 0.6648,
"mean_token_accuracy": 0.8490127098560333,
"num_tokens": 217833472.0,
"step": 6650
},
{
"epoch": 3.541440846001322,
"grad_norm": 0.15794213116168976,
"learning_rate": 0.00015041980154835896,
"loss": 0.6471,
"mean_token_accuracy": 0.8517173796892166,
"num_tokens": 219471872.0,
"step": 6700
},
{
"epoch": 3.56787838730998,
"grad_norm": 0.1636921763420105,
"learning_rate": 0.00014769381746810597,
"loss": 0.6424,
"step": 6750
},
{
"epoch": 3.56787838730998,
"eval_loss": 0.773522138595581,
"eval_mean_token_accuracy": 0.8390711046928583,
"eval_num_tokens": 221110272.0,
"eval_runtime": 1600.7216,
"eval_samples_per_second": 4.726,
"eval_steps_per_second": 0.591,
"step": 6750
},
{
"epoch": 3.5943159286186384,
"grad_norm": 0.15980064868927002,
"learning_rate": 0.00014496783338785303,
"loss": 0.6571,
"mean_token_accuracy": 0.851312015503645,
"num_tokens": 222748672.0,
"step": 6800
},
{
"epoch": 3.6207534699272967,
"grad_norm": 0.1708955615758896,
"learning_rate": 0.00014224184930760003,
"loss": 0.6484,
"mean_token_accuracy": 0.8513654717803001,
"num_tokens": 224387072.0,
"step": 6850
},
{
"epoch": 3.647191011235955,
"grad_norm": 0.16906002163887024,
"learning_rate": 0.00013951586522734707,
"loss": 0.6517,
"mean_token_accuracy": 0.8500537672638893,
"num_tokens": 226025472.0,
"step": 6900
},
{
"epoch": 3.6736285525446135,
"grad_norm": 0.16365185379981995,
"learning_rate": 0.0001367898811470941,
"loss": 0.6372,
"mean_token_accuracy": 0.8536284250020981,
"num_tokens": 227663872.0,
"step": 6950
},
{
"epoch": 3.7000660938532715,
"grad_norm": 0.17780087888240814,
"learning_rate": 0.00013406389706684113,
"loss": 0.6501,
"step": 7000
},
{
"epoch": 3.7000660938532715,
"eval_loss": 0.7657620906829834,
"eval_mean_token_accuracy": 0.8401046276848614,
"eval_num_tokens": 229302272.0,
"eval_runtime": 1600.1467,
"eval_samples_per_second": 4.728,
"eval_steps_per_second": 0.591,
"step": 7000
},
{
"epoch": 3.72650363516193,
"grad_norm": 0.17722897231578827,
"learning_rate": 0.00013133791298658814,
"loss": 0.6527,
"mean_token_accuracy": 0.8508571648597717,
"num_tokens": 230940672.0,
"step": 7050
},
{
"epoch": 3.7529411764705882,
"grad_norm": 0.16244906187057495,
"learning_rate": 0.0001286119289063352,
"loss": 0.6356,
"mean_token_accuracy": 0.8537634432315826,
"num_tokens": 232579072.0,
"step": 7100
},
{
"epoch": 3.7793787177792466,
"grad_norm": 0.15864387154579163,
"learning_rate": 0.0001258859448260822,
"loss": 0.6452,
"mean_token_accuracy": 0.8518102434277535,
"num_tokens": 234217472.0,
"step": 7150
},
{
"epoch": 3.805816259087905,
"grad_norm": 0.16620229184627533,
"learning_rate": 0.00012315996074582924,
"loss": 0.6418,
"mean_token_accuracy": 0.8521817001700401,
"num_tokens": 235855872.0,
"step": 7200
},
{
"epoch": 3.832253800396563,
"grad_norm": 0.1765565574169159,
"learning_rate": 0.00012043397666557627,
"loss": 0.6387,
"step": 7250
},
{
"epoch": 3.832253800396563,
"eval_loss": 0.7578161358833313,
"eval_mean_token_accuracy": 0.8413670561404359,
"eval_num_tokens": 237494272.0,
"eval_runtime": 1602.8152,
"eval_samples_per_second": 4.72,
"eval_steps_per_second": 0.59,
"step": 7250
},
{
"epoch": 3.8586913417052213,
"grad_norm": 0.15968503057956696,
"learning_rate": 0.0001177079925853233,
"loss": 0.6365,
"mean_token_accuracy": 0.8533528861403465,
"num_tokens": 239132672.0,
"step": 7300
},
{
"epoch": 3.8851288830138797,
"grad_norm": 0.15743543207645416,
"learning_rate": 0.00011498200850507034,
"loss": 0.6486,
"mean_token_accuracy": 0.8513813573122024,
"num_tokens": 240771072.0,
"step": 7350
},
{
"epoch": 3.911566424322538,
"grad_norm": 0.18122394382953644,
"learning_rate": 0.00011225602442481736,
"loss": 0.6384,
"mean_token_accuracy": 0.8533547213673591,
"num_tokens": 242409472.0,
"step": 7400
},
{
"epoch": 3.9380039656311965,
"grad_norm": 0.15892641246318817,
"learning_rate": 0.00010953004034456439,
"loss": 0.6338,
"mean_token_accuracy": 0.8538844108581543,
"num_tokens": 244047872.0,
"step": 7450
},
{
"epoch": 3.9644415069398544,
"grad_norm": 0.16563069820404053,
"learning_rate": 0.00010680405626431142,
"loss": 0.6256,
"step": 7500
},
{
"epoch": 3.9644415069398544,
"eval_loss": 0.7491397857666016,
"eval_mean_token_accuracy": 0.8425506683535102,
"eval_num_tokens": 245686272.0,
"eval_runtime": 1603.9187,
"eval_samples_per_second": 4.717,
"eval_steps_per_second": 0.59,
"step": 7500
},
{
"epoch": 3.990879048248513,
"grad_norm": 0.1561686098575592,
"learning_rate": 0.00010407807218405844,
"loss": 0.6398,
"mean_token_accuracy": 0.8541101579368114,
"num_tokens": 247324672.0,
"step": 7550
},
{
"epoch": 4.016920026437541,
"grad_norm": 0.17185606062412262,
"learning_rate": 0.00010135208810380548,
"loss": 0.5504,
"mean_token_accuracy": 0.8682107241625713,
"num_tokens": 248938496.0,
"step": 7600
},
{
"epoch": 4.0433575677462,
"grad_norm": 0.17470529675483704,
"learning_rate": 9.86261040235525e-05,
"loss": 0.5029,
"mean_token_accuracy": 0.8748790314793586,
"num_tokens": 250576896.0,
"step": 7650
},
{
"epoch": 4.069795109054858,
"grad_norm": 0.1801612824201584,
"learning_rate": 9.590011994329953e-05,
"loss": 0.5043,
"mean_token_accuracy": 0.8748985821008682,
"num_tokens": 252215296.0,
"step": 7700
},
{
"epoch": 4.0962326503635165,
"grad_norm": 0.16825653612613678,
"learning_rate": 9.317413586304656e-05,
"loss": 0.4967,
"step": 7750
},
{
"epoch": 4.0962326503635165,
"eval_loss": 0.7883051037788391,
"eval_mean_token_accuracy": 0.8408105385983973,
"eval_num_tokens": 253853696.0,
"eval_runtime": 1607.5856,
"eval_samples_per_second": 4.706,
"eval_steps_per_second": 0.588,
"step": 7750
},
{
"epoch": 4.122670191672174,
"grad_norm": 0.17985741794109344,
"learning_rate": 9.044815178279358e-05,
"loss": 0.5031,
"mean_token_accuracy": 0.875472262352705,
"num_tokens": 255492096.0,
"step": 7800
},
{
"epoch": 4.149107732980832,
"grad_norm": 0.17613214254379272,
"learning_rate": 8.772216770254061e-05,
"loss": 0.4969,
"mean_token_accuracy": 0.8762671053409576,
"num_tokens": 257130496.0,
"step": 7850
},
{
"epoch": 4.175545274289491,
"grad_norm": 0.17405198514461517,
"learning_rate": 8.499618362228765e-05,
"loss": 0.5095,
"mean_token_accuracy": 0.8734744620323182,
"num_tokens": 258768896.0,
"step": 7900
},
{
"epoch": 4.201982815598149,
"grad_norm": 0.17185764014720917,
"learning_rate": 8.227019954203467e-05,
"loss": 0.5074,
"mean_token_accuracy": 0.8739729967713356,
"num_tokens": 260407296.0,
"step": 7950
},
{
"epoch": 4.228420356906808,
"grad_norm": 0.17758677899837494,
"learning_rate": 7.95442154617817e-05,
"loss": 0.5085,
"step": 8000
},
{
"epoch": 4.228420356906808,
"eval_loss": 0.7870664000511169,
"eval_mean_token_accuracy": 0.8414596145929292,
"eval_num_tokens": 262045696.0,
"eval_runtime": 1607.3849,
"eval_samples_per_second": 4.706,
"eval_steps_per_second": 0.589,
"step": 8000
},
{
"epoch": 4.254857898215466,
"grad_norm": 0.16629241406917572,
"learning_rate": 7.681823138152873e-05,
"loss": 0.5032,
"mean_token_accuracy": 0.8741639178991317,
"num_tokens": 263684096.0,
"step": 8050
},
{
"epoch": 4.281295439524124,
"grad_norm": 0.173508420586586,
"learning_rate": 7.409224730127575e-05,
"loss": 0.4909,
"mean_token_accuracy": 0.8775629255175591,
"num_tokens": 265322496.0,
"step": 8100
},
{
"epoch": 4.307732980832783,
"grad_norm": 0.1713671237230301,
"learning_rate": 7.136626322102279e-05,
"loss": 0.4923,
"mean_token_accuracy": 0.8772788345813751,
"num_tokens": 266960896.0,
"step": 8150
},
{
"epoch": 4.334170522141441,
"grad_norm": 0.17122632265090942,
"learning_rate": 6.864027914076983e-05,
"loss": 0.5,
"mean_token_accuracy": 0.8755180832743644,
"num_tokens": 268599296.0,
"step": 8200
},
{
"epoch": 4.360608063450099,
"grad_norm": 0.17359545826911926,
"learning_rate": 6.591429506051685e-05,
"loss": 0.4943,
"step": 8250
},
{
"epoch": 4.360608063450099,
"eval_loss": 0.7823996543884277,
"eval_mean_token_accuracy": 0.8421699439370355,
"eval_num_tokens": 270237696.0,
"eval_runtime": 1607.7567,
"eval_samples_per_second": 4.705,
"eval_steps_per_second": 0.588,
"step": 8250
},
{
"epoch": 4.387045604758757,
"grad_norm": 0.17702388763427734,
"learning_rate": 6.318831098026388e-05,
"loss": 0.4904,
"mean_token_accuracy": 0.8775449013710022,
"num_tokens": 271876096.0,
"step": 8300
},
{
"epoch": 4.413483146067415,
"grad_norm": 0.18663644790649414,
"learning_rate": 6.0462326900010904e-05,
"loss": 0.4959,
"mean_token_accuracy": 0.8762383911013604,
"num_tokens": 273514496.0,
"step": 8350
},
{
"epoch": 4.439920687376074,
"grad_norm": 0.1880512684583664,
"learning_rate": 5.773634281975793e-05,
"loss": 0.4931,
"mean_token_accuracy": 0.8767839661240577,
"num_tokens": 275152896.0,
"step": 8400
},
{
"epoch": 4.466358228684732,
"grad_norm": 0.18527589738368988,
"learning_rate": 5.5010358739504963e-05,
"loss": 0.4877,
"mean_token_accuracy": 0.87819525629282,
"num_tokens": 276791296.0,
"step": 8450
},
{
"epoch": 4.492795769993391,
"grad_norm": 0.19010977447032928,
"learning_rate": 5.228437465925199e-05,
"loss": 0.4894,
"step": 8500
},
{
"epoch": 4.492795769993391,
"eval_loss": 0.7803131341934204,
"eval_mean_token_accuracy": 0.8430041650637008,
"eval_num_tokens": 278429696.0,
"eval_runtime": 1610.9854,
"eval_samples_per_second": 4.696,
"eval_steps_per_second": 0.587,
"step": 8500
},
{
"epoch": 4.519233311302049,
"grad_norm": 0.17016442120075226,
"learning_rate": 4.9558390578999016e-05,
"loss": 0.4847,
"mean_token_accuracy": 0.8786284182965756,
"num_tokens": 280068096.0,
"step": 8550
},
{
"epoch": 4.545670852610707,
"grad_norm": 0.1719425618648529,
"learning_rate": 4.683240649874604e-05,
"loss": 0.4875,
"mean_token_accuracy": 0.8785123375058174,
"num_tokens": 281706496.0,
"step": 8600
},
{
"epoch": 4.572108393919366,
"grad_norm": 0.17816464602947235,
"learning_rate": 4.4106422418493076e-05,
"loss": 0.4863,
"mean_token_accuracy": 0.8782337459921837,
"num_tokens": 283344896.0,
"step": 8650
},
{
"epoch": 4.598545935228024,
"grad_norm": 0.1728549599647522,
"learning_rate": 4.138043833824011e-05,
"loss": 0.4879,
"mean_token_accuracy": 0.8787457209825515,
"num_tokens": 284983296.0,
"step": 8700
},
{
"epoch": 4.624983476536682,
"grad_norm": 0.18577666580677032,
"learning_rate": 3.8654454257987135e-05,
"loss": 0.4914,
"step": 8750
},
{
"epoch": 4.624983476536682,
"eval_loss": 0.7784421443939209,
"eval_mean_token_accuracy": 0.8436387255000262,
"eval_num_tokens": 286621696.0,
"eval_runtime": 1611.4393,
"eval_samples_per_second": 4.695,
"eval_steps_per_second": 0.587,
"step": 8750
},
{
"epoch": 4.65142101784534,
"grad_norm": 0.16825436055660248,
"learning_rate": 3.592847017773417e-05,
"loss": 0.4756,
"mean_token_accuracy": 0.8792506690323353,
"num_tokens": 288260096.0,
"step": 8800
},
{
"epoch": 4.677858559153998,
"grad_norm": 0.18510740995407104,
"learning_rate": 3.3202486097481194e-05,
"loss": 0.4788,
"mean_token_accuracy": 0.8801001918315887,
"num_tokens": 289898496.0,
"step": 8850
},
{
"epoch": 4.704296100462657,
"grad_norm": 0.18907974660396576,
"learning_rate": 3.0476502017228217e-05,
"loss": 0.4837,
"mean_token_accuracy": 0.8794446450471878,
"num_tokens": 291536896.0,
"step": 8900
},
{
"epoch": 4.730733641771315,
"grad_norm": 0.1798245906829834,
"learning_rate": 2.775051793697525e-05,
"loss": 0.4883,
"mean_token_accuracy": 0.8778897827863693,
"num_tokens": 293175296.0,
"step": 8950
},
{
"epoch": 4.757171183079974,
"grad_norm": 0.17980748414993286,
"learning_rate": 2.502453385672228e-05,
"loss": 0.475,
"step": 9000
},
{
"epoch": 4.757171183079974,
"eval_loss": 0.7753015756607056,
"eval_mean_token_accuracy": 0.8443130426754659,
"eval_num_tokens": 294813696.0,
"eval_runtime": 1611.452,
"eval_samples_per_second": 4.695,
"eval_steps_per_second": 0.587,
"step": 9000
},
{
"epoch": 4.783608724388632,
"grad_norm": 0.17731408774852753,
"learning_rate": 2.2298549776469306e-05,
"loss": 0.4657,
"mean_token_accuracy": 0.8821294555068016,
"num_tokens": 296452096.0,
"step": 9050
},
{
"epoch": 4.81004626569729,
"grad_norm": 0.19258248805999756,
"learning_rate": 1.9572565696216336e-05,
"loss": 0.4779,
"mean_token_accuracy": 0.8807239699363708,
"num_tokens": 298090496.0,
"step": 9100
},
{
"epoch": 4.836483807005949,
"grad_norm": 0.17705880105495453,
"learning_rate": 1.6846581615963362e-05,
"loss": 0.476,
"mean_token_accuracy": 0.8808369943499565,
"num_tokens": 299728896.0,
"step": 9150
},
{
"epoch": 4.8629213483146065,
"grad_norm": 0.1794816255569458,
"learning_rate": 1.4120597535710392e-05,
"loss": 0.4742,
"mean_token_accuracy": 0.8813196429610253,
"num_tokens": 301367296.0,
"step": 9200
},
{
"epoch": 4.889358889623265,
"grad_norm": 0.17823387682437897,
"learning_rate": 1.139461345545742e-05,
"loss": 0.4719,
"step": 9250
},
{
"epoch": 4.889358889623265,
"eval_loss": 0.7754274010658264,
"eval_mean_token_accuracy": 0.844791491931387,
"eval_num_tokens": 303005696.0,
"eval_runtime": 1610.8654,
"eval_samples_per_second": 4.696,
"eval_steps_per_second": 0.587,
"step": 9250
},
{
"epoch": 4.915796430931923,
"grad_norm": 0.16834519803524017,
"learning_rate": 8.668629375204448e-06,
"loss": 0.4653,
"mean_token_accuracy": 0.8821077673137188,
"num_tokens": 304644096.0,
"step": 9300
},
{
"epoch": 4.942233972240581,
"grad_norm": 0.17272663116455078,
"learning_rate": 5.942645294951477e-06,
"loss": 0.4783,
"mean_token_accuracy": 0.8806390488147735,
"num_tokens": 306282496.0,
"step": 9350
},
{
"epoch": 4.96867151354924,
"grad_norm": 0.17334023118019104,
"learning_rate": 3.2166612146985063e-06,
"loss": 0.4794,
"mean_token_accuracy": 0.8807239702343941,
"num_tokens": 307920896.0,
"step": 9400
},
{
"epoch": 4.995109054857898,
"grad_norm": 0.17255398631095886,
"learning_rate": 4.906771344455349e-07,
"loss": 0.4793,
"mean_token_accuracy": 0.8803439608216286,
"num_tokens": 309559296.0,
"step": 9450
}
],
"logging_steps": 50,
"max_steps": 9455,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 619390244487168.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}