Graphite1.0-4B / last-checkpoint /trainer_state.json
Starred
Training in progress, step 2256, checkpoint
e8ac637 verified
{
"best_global_step": 2250,
"best_metric": 0.18876151740550995,
"best_model_checkpoint": "/kaggle/working/obsidian_critic_qwen35_t4x2_unsloth/runs/obsidian_critic_full_epoch/checkpoint-2250",
"epoch": 1.0,
"eval_steps": 125,
"global_step": 2256,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004434343994235353,
"grad_norm": 0.21817488968372345,
"last_batch_tokens": 257,
"learning_rate": 0.0,
"loss": 2.5221590995788574,
"lr": 2e-05,
"step": 1,
"tokens_per_second": 27.955696254559246,
"tokens_per_step": 1560.0,
"total_tokens_seen": 1560
},
{
"epoch": 0.022171719971176763,
"grad_norm": 0.49327757954597473,
"last_batch_tokens": 229,
"learning_rate": 9.990575514806563e-05,
"loss": 1.981216119260204,
"lr": 9.990142403513012e-05,
"step": 50,
"tokens_per_second": 76.9470637679645,
"tokens_per_step": 1551.7,
"total_tokens_seen": 77585
},
{
"epoch": 0.04434343994235353,
"grad_norm": 0.8501848578453064,
"last_batch_tokens": 193,
"learning_rate": 9.957034339013742e-05,
"loss": 1.1713996887207032,
"lr": 9.956116660116155e-05,
"step": 100,
"tokens_per_second": 86.53475088010235,
"tokens_per_step": 1572.7,
"total_tokens_seen": 157270
},
{
"epoch": 0.05542929992794191,
"eval_loss": 0.9524237513542175,
"eval_runtime": 104.4026,
"eval_samples_per_second": 3.477,
"eval_steps_per_second": 1.743,
"last_batch_tokens": 172,
"lr": 9.930042238269485e-05,
"step": 125,
"tokens_per_second": 133.98724044386626,
"tokens_per_step": 1853.72,
"total_tokens_seen": 231715
},
{
"epoch": 0.0665151599135303,
"grad_norm": 0.892642617225647,
"last_batch_tokens": 59,
"learning_rate": 9.899364434012273e-05,
"loss": 0.8618771362304688,
"lr": 9.897966654380171e-05,
"step": 150,
"tokens_per_second": 78.95926927075263,
"tokens_per_step": 1780.0866666666666,
"total_tokens_seen": 267013
},
{
"epoch": 0.08868687988470705,
"grad_norm": 0.8177722692489624,
"last_batch_tokens": 275,
"learning_rate": 9.817846512306061e-05,
"loss": 0.69920166015625,
"lr": 9.815975435734603e-05,
"step": 200,
"tokens_per_second": 83.13681099990983,
"tokens_per_step": 1700.74,
"total_tokens_seen": 340148
},
{
"epoch": 0.11085859985588382,
"grad_norm": 1.009503960609436,
"last_batch_tokens": 181,
"learning_rate": 9.712877368374224e-05,
"loss": 0.6449888610839843,
"lr": 9.710542102466229e-05,
"step": 250,
"tokens_per_second": 82.25185579838005,
"tokens_per_step": 1656.984,
"total_tokens_seen": 414246
},
{
"epoch": 0.11085859985588382,
"eval_loss": 0.6187728047370911,
"eval_runtime": 88.3548,
"eval_samples_per_second": 4.108,
"eval_steps_per_second": 2.06,
"last_batch_tokens": 172,
"lr": 9.710542102466229e-05,
"step": 250,
"tokens_per_second": 363.7051920895653,
"tokens_per_step": 1785.536,
"total_tokens_seen": 446384
},
{
"epoch": 0.1330303198270606,
"grad_norm": 0.5571497082710266,
"last_batch_tokens": 329,
"learning_rate": 9.584967947244769e-05,
"loss": 0.5449295806884765,
"lr": 9.582179859078793e-05,
"step": 300,
"tokens_per_second": 81.02823836816424,
"tokens_per_step": 1724.5733333333333,
"total_tokens_seen": 517372
},
{
"epoch": 0.15520203979823735,
"grad_norm": 0.7961392998695374,
"last_batch_tokens": 165,
"learning_rate": 9.434740857432105e-05,
"loss": 0.46938041687011717,
"lr": 9.431513518232342e-05,
"step": 350,
"tokens_per_second": 89.52354398651325,
"tokens_per_step": 1704.1371428571429,
"total_tokens_seen": 596448
},
{
"epoch": 0.16628789978382574,
"eval_loss": 0.4863806366920471,
"eval_runtime": 87.0251,
"eval_samples_per_second": 4.171,
"eval_steps_per_second": 2.091,
"last_batch_tokens": 172,
"lr": 9.348041345533653e-05,
"step": 375,
"tokens_per_second": 135.38091364115044,
"tokens_per_step": 1784.712,
"total_tokens_seen": 669267
},
{
"epoch": 0.1773737597694141,
"grad_norm": 0.7586395144462585,
"last_batch_tokens": 351,
"learning_rate": 9.262927340344295e-05,
"loss": 0.4675440216064453,
"lr": 9.259276459421655e-05,
"step": 400,
"tokens_per_second": 81.3096016563381,
"tokens_per_step": 1764.9875,
"total_tokens_seen": 705995
},
{
"epoch": 0.19954547974059086,
"grad_norm": 0.7313582897186279,
"last_batch_tokens": 369,
"learning_rate": 9.070363710911735e-05,
"loss": 0.3964078140258789,
"lr": 9.066307059197612e-05,
"step": 450,
"tokens_per_second": 87.86278133239196,
"tokens_per_step": 1744.9444444444443,
"total_tokens_seen": 785225
},
{
"epoch": 0.22171719971176765,
"grad_norm": 0.5969849228858948,
"last_batch_tokens": 193,
"learning_rate": 8.857987286762718e-05,
"loss": 0.3672472381591797,
"lr": 8.853544610307675e-05,
"step": 500,
"tokens_per_second": 87.74574317837812,
"tokens_per_step": 1729.026,
"total_tokens_seen": 864513
},
{
"epoch": 0.22171719971176765,
"eval_loss": 0.40328726172447205,
"eval_runtime": 87.1124,
"eval_samples_per_second": 4.167,
"eval_steps_per_second": 2.089,
"last_batch_tokens": 172,
"lr": 8.853544610307675e-05,
"step": 500,
"tokens_per_second": 368.8907701487212,
"tokens_per_step": 1793.302,
"total_tokens_seen": 896651
},
{
"epoch": 0.2438889196829444,
"grad_norm": 0.7751753330230713,
"last_batch_tokens": 273,
"learning_rate": 8.626831825760946e-05,
"loss": 0.3414393615722656,
"lr": 8.622024749619364e-05,
"step": 550,
"tokens_per_second": 82.92877874873523,
"tokens_per_step": 1766.3690909090908,
"total_tokens_seen": 971503
},
{
"epoch": 0.2660606396541212,
"grad_norm": 0.7136653065681458,
"last_batch_tokens": 305,
"learning_rate": 8.378022494113098e-05,
"loss": 0.3377827072143555,
"lr": 8.372874417081631e-05,
"step": 600,
"tokens_per_second": 90.40251231127895,
"tokens_per_step": 1748.685,
"total_tokens_seen": 1049211
},
{
"epoch": 0.27714649963970955,
"eval_loss": 0.35334891080856323,
"eval_runtime": 87.0325,
"eval_samples_per_second": 4.171,
"eval_steps_per_second": 2.091,
"last_batch_tokens": 172,
"lr": 8.24206361704162e-05,
"step": 625,
"tokens_per_second": 135.75737480096265,
"tokens_per_step": 1791.824,
"total_tokens_seen": 1119890
},
{
"epoch": 0.2882323596252979,
"grad_norm": 0.7202998399734497,
"last_batch_tokens": 211,
"learning_rate": 8.112770389539574e-05,
"loss": 0.3233934020996094,
"lr": 8.107306370261785e-05,
"step": 650,
"tokens_per_second": 84.5144051400581,
"tokens_per_step": 1779.3815384615384,
"total_tokens_seen": 1156598
},
{
"epoch": 0.3104040795964747,
"grad_norm": 0.7681185007095337,
"last_batch_tokens": 236,
"learning_rate": 7.832366646167268e-05,
"loss": 0.3125551414489746,
"lr": 7.826613281158841e-05,
"step": 700,
"tokens_per_second": 84.37944807859942,
"tokens_per_step": 1759.6771428571428,
"total_tokens_seen": 1231774
},
{
"epoch": 0.3325757995676515,
"grad_norm": 0.659271776676178,
"last_batch_tokens": 939,
"learning_rate": 7.538176149839243e-05,
"loss": 0.28798053741455076,
"lr": 7.532161444027488e-05,
"step": 750,
"tokens_per_second": 87.73140620694117,
"tokens_per_step": 1745.06,
"total_tokens_seen": 1308795
},
{
"epoch": 0.3325757995676515,
"eval_loss": 0.3200623393058777,
"eval_runtime": 87.2377,
"eval_samples_per_second": 4.161,
"eval_steps_per_second": 2.086,
"last_batch_tokens": 172,
"lr": 7.532161444027488e-05,
"step": 750,
"tokens_per_second": 368.35941630029333,
"tokens_per_step": 1787.9106666666667,
"total_tokens_seen": 1340933
},
{
"epoch": 0.3547475195388282,
"grad_norm": 0.5721789598464966,
"last_batch_tokens": 124,
"learning_rate": 7.231630894432527e-05,
"loss": 0.29953609466552733,
"lr": 7.22538412484033e-05,
"step": 800,
"tokens_per_second": 65.97096831279634,
"tokens_per_step": 98.35625,
"total_tokens_seen": 78685
},
{
"epoch": 0.376919239510005,
"grad_norm": 0.4275953471660614,
"last_batch_tokens": 266,
"learning_rate": 6.914223011522581e-05,
"loss": 0.27611801147460935,
"lr": 6.907774584760349e-05,
"step": 850,
"tokens_per_second": 76.59339331072898,
"tokens_per_step": 183.97411764705882,
"total_tokens_seen": 156378
},
{
"epoch": 0.38800509949559336,
"eval_loss": 0.28222641348838806,
"eval_runtime": 113.424,
"eval_samples_per_second": 3.2,
"eval_steps_per_second": 1.605,
"last_batch_tokens": 172,
"lr": 6.745388997609773e-05,
"step": 875,
"tokens_per_second": 114.49594753151979,
"tokens_per_step": 258.8742857142857,
"total_tokens_seen": 226515
},
{
"epoch": 0.39909095948118173,
"grad_norm": 0.5093332529067993,
"last_batch_tokens": 209,
"learning_rate": 6.587497507323132e-05,
"loss": 0.26179553985595705,
"lr": 6.580878811582379e-05,
"step": 900,
"tokens_per_second": 82.29563477689274,
"tokens_per_step": 298.55555555555554,
"total_tokens_seen": 268700
},
{
"epoch": 0.4212626794523585,
"grad_norm": 0.3912750482559204,
"last_batch_tokens": 103,
"learning_rate": 6.253044742254792e-05,
"loss": 0.25117488861083986,
"lr": 6.246287994523805e-05,
"step": 950,
"tokens_per_second": 79.79549481684828,
"tokens_per_step": 366.02947368421053,
"total_tokens_seen": 347728
},
{
"epoch": 0.4434343994235353,
"grad_norm": 0.4664643406867981,
"last_batch_tokens": 203,
"learning_rate": 5.9124926897487534e-05,
"loss": 0.25925636291503906,
"lr": 5.9056307789940357e-05,
"step": 1000,
"tokens_per_second": 76.53280228762407,
"tokens_per_step": 422.387,
"total_tokens_seen": 422387
},
{
"epoch": 0.4434343994235353,
"eval_loss": 0.26276224851608276,
"eval_runtime": 95.1275,
"eval_samples_per_second": 3.816,
"eval_steps_per_second": 1.913,
"last_batch_tokens": 172,
"lr": 5.9056307789940357e-05,
"step": 1000,
"tokens_per_second": 337.8095566509732,
"tokens_per_step": 454.525,
"total_tokens_seen": 454525
},
{
"epoch": 0.465606119394712,
"grad_norm": 0.7413877248764038,
"last_batch_tokens": 252,
"learning_rate": 5.56749901196638e-05,
"loss": 0.2307398223876953,
"lr": 5.5605653390431875e-05,
"step": 1050,
"tokens_per_second": 85.43713054173512,
"tokens_per_step": 512.1695238095238,
"total_tokens_seen": 537778
},
{
"epoch": 0.4877778393658888,
"grad_norm": 0.43335428833961487,
"last_batch_tokens": 142,
"learning_rate": 5.219742991006728e-05,
"loss": 0.24115974426269532,
"lr": 5.21277130607795e-05,
"step": 1100,
"tokens_per_second": 75.7193860694182,
"tokens_per_step": 556.2981818181818,
"total_tokens_seen": 611928
},
{
"epoch": 0.4988636993514772,
"eval_loss": 0.24811844527721405,
"eval_runtime": 94.9042,
"eval_samples_per_second": 3.825,
"eval_steps_per_second": 1.918,
"last_batch_tokens": 172,
"lr": 5.038379808781369e-05,
"step": 1125,
"tokens_per_second": 123.01878328450903,
"tokens_per_step": 607.1377777777777,
"total_tokens_seen": 683030
},
{
"epoch": 0.5099495593370655,
"grad_norm": 0.6529182195663452,
"last_batch_tokens": 102,
"learning_rate": 4.870917354877421e-05,
"loss": 0.22134504318237305,
"lr": 4.8639415931321794e-05,
"step": 1150,
"tokens_per_second": 83.41761800246071,
"tokens_per_step": 630.3573913043479,
"total_tokens_seen": 724911
},
{
"epoch": 0.5321212793082424,
"grad_norm": 0.4320646822452545,
"last_batch_tokens": 175,
"learning_rate": 4.522720038016592e-05,
"loss": 0.2152995491027832,
"lr": 4.515774154488211e-05,
"step": 1200,
"tokens_per_second": 82.13691539662977,
"tokens_per_step": 672.07,
"total_tokens_seen": 806484
},
{
"epoch": 0.5542929992794191,
"grad_norm": 0.6192132234573364,
"last_batch_tokens": 267,
"learning_rate": 4.1768459164721196e-05,
"loss": 0.20546873092651366,
"lr": 4.1699637207595034e-05,
"step": 1250,
"tokens_per_second": 83.92327455847254,
"tokens_per_step": 710.0544,
"total_tokens_seen": 887568
},
{
"epoch": 0.5542929992794191,
"eval_loss": 0.23204679787158966,
"eval_runtime": 94.3616,
"eval_samples_per_second": 3.847,
"eval_steps_per_second": 1.929,
"last_batch_tokens": 172,
"lr": 4.1699637207595034e-05,
"step": 1250,
"tokens_per_second": 340.54961477393465,
"tokens_per_step": 735.7648,
"total_tokens_seen": 919706
},
{
"epoch": 0.5764647192505958,
"grad_norm": 0.3487900495529175,
"last_batch_tokens": 134,
"learning_rate": 3.8349785579678194e-05,
"loss": 0.21177234649658203,
"lr": 3.828193549664752e-05,
"step": 1300,
"tokens_per_second": 79.01101943117263,
"tokens_per_step": 766.2323076923077,
"total_tokens_seen": 996102
},
{
"epoch": 0.5986364392217727,
"grad_norm": 0.42593374848365784,
"last_batch_tokens": 942,
"learning_rate": 3.498782027013742e-05,
"loss": 0.2180424690246582,
"lr": 3.492127232647139e-05,
"step": 1350,
"tokens_per_second": 80.48352941836103,
"tokens_per_step": 795.4074074074074,
"total_tokens_seen": 1073800
},
{
"epoch": 0.609722299207361,
"eval_loss": 0.2193347066640854,
"eval_runtime": 94.4814,
"eval_samples_per_second": 3.842,
"eval_steps_per_second": 1.926,
"last_batch_tokens": 172,
"lr": 3.326745518863976e-05,
"step": 1375,
"tokens_per_second": 124.66627567382365,
"tokens_per_step": 832.9498181818182,
"total_tokens_seen": 1145306
},
{
"epoch": 0.6208081591929494,
"grad_norm": 0.3440966010093689,
"last_batch_tokens": 176,
"learning_rate": 3.169892784949768e-05,
"loss": 0.22419458389282226,
"lr": 3.163400597220633e-05,
"step": 1400,
"tokens_per_second": 84.21467446062582,
"tokens_per_step": 847.435,
"total_tokens_seen": 1186409
},
{
"epoch": 0.6429798791641261,
"grad_norm": 0.48472294211387634,
"last_batch_tokens": 99,
"learning_rate": 2.8499117243496988e-05,
"loss": 0.20303966522216796,
"lr": 2.843613744459269e-05,
"step": 1450,
"tokens_per_second": 84.12853596803436,
"tokens_per_step": 874.0124137931034,
"total_tokens_seen": 1267318
},
{
"epoch": 0.665151599135303,
"grad_norm": 0.48055633902549744,
"last_batch_tokens": 92,
"learning_rate": 2.5403963765589118e-05,
"loss": 0.18697463989257812,
"lr": 2.5343232603874866e-05,
"step": 1500,
"tokens_per_second": 83.84733093235428,
"tokens_per_step": 900.2046666666666,
"total_tokens_seen": 1350307
},
{
"epoch": 0.665151599135303,
"eval_loss": 0.20863106846809387,
"eval_runtime": 94.5131,
"eval_samples_per_second": 3.841,
"eval_steps_per_second": 1.926,
"last_batch_tokens": 172,
"lr": 2.5343232603874866e-05,
"step": 1500,
"tokens_per_second": 340.00444794736484,
"tokens_per_step": 921.63,
"total_tokens_seen": 1382445
},
{
"epoch": 0.6873233191064797,
"grad_norm": 0.41916459798812866,
"last_batch_tokens": 426,
"learning_rate": 2.2428533302959837e-05,
"loss": 0.201729736328125,
"lr": 2.2370346391831737e-05,
"step": 1550,
"tokens_per_second": 80.49134617228279,
"tokens_per_step": 942.8058064516129,
"total_tokens_seen": 1461349
},
{
"epoch": 0.7094950390776564,
"grad_norm": 0.38731154799461365,
"last_batch_tokens": 312,
"learning_rate": 1.9587308982213076e-05,
"loss": 0.18205615997314453,
"lr": 1.953194955074038e-05,
"step": 1600,
"tokens_per_second": 79.4246014683713,
"tokens_per_step": 961.505,
"total_tokens_seen": 1538408
},
{
"epoch": 0.7205808990632449,
"eval_loss": 0.20174801349639893,
"eval_runtime": 94.2485,
"eval_samples_per_second": 3.852,
"eval_steps_per_second": 1.931,
"last_batch_tokens": 172,
"lr": 1.816752961112065e-05,
"step": 1625,
"tokens_per_second": 120.14447834109774,
"tokens_per_step": 988.5981538461539,
"total_tokens_seen": 1606472
},
{
"epoch": 0.7316667590488332,
"grad_norm": 0.42647936940193176,
"last_batch_tokens": 168,
"learning_rate": 1.6894120671686986e-05,
"loss": 0.1889303970336914,
"lr": 1.6841858185973775e-05,
"step": 1650,
"tokens_per_second": 75.64734832954207,
"tokens_per_step": 995.8060606060606,
"total_tokens_seen": 1643080
},
{
"epoch": 0.75383847902001,
"grad_norm": 0.41556963324546814,
"last_batch_tokens": 169,
"learning_rate": 1.4362077663552753e-05,
"loss": 0.1900373077392578,
"lr": 1.4313166515091864e-05,
"step": 1700,
"tokens_per_second": 76.28403702273542,
"tokens_per_step": 1009.9758823529412,
"total_tokens_seen": 1716959
},
{
"epoch": 0.7760101989911867,
"grad_norm": 0.4044085443019867,
"last_batch_tokens": 140,
"learning_rate": 1.2003504863370746e-05,
"loss": 0.1899305534362793,
"lr": 1.1958183130774469e-05,
"step": 1750,
"tokens_per_second": 84.05214560453553,
"tokens_per_step": 1027.8245714285715,
"total_tokens_seen": 1798693
},
{
"epoch": 0.7760101989911867,
"eval_loss": 0.19616812467575073,
"eval_runtime": 94.684,
"eval_samples_per_second": 3.834,
"eval_steps_per_second": 1.922,
"last_batch_tokens": 172,
"lr": 1.1958183130774469e-05,
"step": 1750,
"tokens_per_second": 339.39215730410245,
"tokens_per_step": 1046.1891428571428,
"total_tokens_seen": 1830831
},
{
"epoch": 0.7981819189623635,
"grad_norm": 0.5659682154655457,
"last_batch_tokens": 103,
"learning_rate": 9.829882797706336e-06,
"loss": 0.1962204933166504,
"lr": 9.788371087841237e-06,
"step": 1800,
"tokens_per_second": 83.61138183187425,
"tokens_per_step": 1063.1733333333334,
"total_tokens_seen": 1913712
},
{
"epoch": 0.8203536389335403,
"grad_norm": 0.3827808201313019,
"last_batch_tokens": 211,
"learning_rate": 7.85179173182246e-06,
"loss": 0.17033554077148438,
"lr": 7.814292105989308e-06,
"step": 1850,
"tokens_per_second": 82.77525601918174,
"tokens_per_step": 1078.207027027027,
"total_tokens_seen": 1994683
},
{
"epoch": 0.8314394989191286,
"eval_loss": 0.19150112569332123,
"eval_runtime": 94.6106,
"eval_samples_per_second": 3.837,
"eval_steps_per_second": 1.924,
"last_batch_tokens": 172,
"lr": 6.9036938458111764e-06,
"step": 1875,
"tokens_per_second": 129.4537055546321,
"tokens_per_step": 1103.9941333333334,
"total_tokens_seen": 2069989
},
{
"epoch": 0.842525358904717,
"grad_norm": 0.4506838917732239,
"last_batch_tokens": 132,
"learning_rate": 6.078860169460415e-06,
"loss": 0.18061737060546876,
"lr": 6.045555159845828e-06,
"step": 1900,
"tokens_per_second": 84.8577689646082,
"tokens_per_step": 1111.4515789473685,
"total_tokens_seen": 2111758
},
{
"epoch": 0.8646970788758938,
"grad_norm": 0.42664435505867004,
"last_batch_tokens": 123,
"learning_rate": 4.519717985389665e-06,
"loss": 0.18581958770751952,
"lr": 4.490769706577352e-06,
"step": 1950,
"tokens_per_second": 81.16470478657682,
"tokens_per_step": 1123.2635897435898,
"total_tokens_seen": 2190364
},
{
"epoch": 0.8868687988470706,
"grad_norm": 0.3934974670410156,
"last_batch_tokens": 291,
"learning_rate": 3.18195441885778e-06,
"loss": 0.17605453491210937,
"lr": 3.157503778723847e-06,
"step": 2000,
"tokens_per_second": 78.4029933600584,
"tokens_per_step": 1133.7585,
"total_tokens_seen": 2267517
},
{
"epoch": 0.8868687988470706,
"eval_loss": 0.1900114119052887,
"eval_runtime": 94.4943,
"eval_samples_per_second": 3.842,
"eval_steps_per_second": 1.926,
"last_batch_tokens": 172,
"lr": 3.157503778723847e-06,
"step": 2000,
"tokens_per_second": 340.07002840114217,
"tokens_per_step": 1149.8275,
"total_tokens_seen": 2299655
},
{
"epoch": 0.9090405188182473,
"grad_norm": 0.44616127014160156,
"last_batch_tokens": 151,
"learning_rate": 2.072081132410253e-06,
"loss": 0.1782122802734375,
"lr": 2.0522471462437796e-06,
"step": 2050,
"tokens_per_second": 81.0157351221381,
"tokens_per_step": 1160.878536585366,
"total_tokens_seen": 2379801
},
{
"epoch": 0.931212238789424,
"grad_norm": 0.4230777621269226,
"last_batch_tokens": 188,
"learning_rate": 1.195500515894149e-06,
"loss": 0.17306018829345704,
"lr": 1.1803797270814765e-06,
"step": 2100,
"tokens_per_second": 80.14939686559167,
"tokens_per_step": 1170.3680952380953,
"total_tokens_seen": 2457773
},
{
"epoch": 0.9422980987750125,
"eval_loss": 0.18897105753421783,
"eval_runtime": 95.0115,
"eval_samples_per_second": 3.821,
"eval_steps_per_second": 1.916,
"last_batch_tokens": 172,
"lr": 8.333381642750881e-07,
"step": 2125,
"tokens_per_second": 120.10909547338339,
"tokens_per_step": 1188.5943529411766,
"total_tokens_seen": 2525763
},
{
"epoch": 0.9533839587606009,
"grad_norm": 0.2957008183002472,
"last_batch_tokens": 305,
"learning_rate": 5.564793899281884e-07,
"loss": 0.1782497787475586,
"lr": 5.461454000209198e-07,
"step": 2150,
"tokens_per_second": 83.24645935651418,
"tokens_per_step": 1193.8697674418604,
"total_tokens_seen": 2566820
},
{
"epoch": 0.9755556787317776,
"grad_norm": 0.49967435002326965,
"last_batch_tokens": 156,
"learning_rate": 1.5812823683962197e-07,
"loss": 0.19703115463256837,
"lr": 1.5263134729363583e-07,
"step": 2200,
"tokens_per_second": 75.00656410059429,
"tokens_per_step": 1199.9336363636364,
"total_tokens_seen": 2639854
},
{
"epoch": 0.9977273987029543,
"grad_norm": 0.26038259267807007,
"last_batch_tokens": 322,
"learning_rate": 2.386060162717918e-09,
"loss": 0.17010717391967772,
"lr": 1.7530274921462308e-09,
"step": 2250,
"tokens_per_second": 78.96076733362268,
"tokens_per_step": 1208.1137777777778,
"total_tokens_seen": 2718256
},
{
"epoch": 0.9977273987029543,
"eval_loss": 0.18876151740550995,
"eval_runtime": 95.314,
"eval_samples_per_second": 3.808,
"eval_steps_per_second": 1.909,
"last_batch_tokens": 172,
"lr": 1.7530274921462308e-09,
"step": 2250,
"tokens_per_second": 337.1431434660513,
"tokens_per_step": 1222.3973333333333,
"total_tokens_seen": 2750394
}
],
"logging_steps": 50,
"max_steps": 2256,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 250,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.666058653049815e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}