LLama3.A / checkpoint-600 /trainer_state.json
Thehunter99's picture
Upload folder using huggingface_hub
78d8b97 verified
{
"best_global_step": 490,
"best_metric": 1.0370746850967407,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 10,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.1,
"grad_norm": 4.007787227630615,
"learning_rate": 5e-05,
"loss": 5.2601,
"step": 10
},
{
"epoch": 0.1,
"eval_loss": 4.682629585266113,
"eval_mean_token_accuracy": 0.37830855041742323,
"eval_num_tokens": 6333.0,
"eval_runtime": 97.2723,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 10
},
{
"epoch": 0.2,
"grad_norm": 8.059301376342773,
"learning_rate": 9.999927156177032e-05,
"loss": 3.7743,
"step": 20
},
{
"epoch": 0.2,
"eval_loss": 2.5042078495025635,
"eval_mean_token_accuracy": 0.6304361110925675,
"eval_num_tokens": 12685.0,
"eval_runtime": 97.2264,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 20
},
{
"epoch": 0.3,
"grad_norm": 2.4571871757507324,
"learning_rate": 9.991188465340766e-05,
"loss": 2.0475,
"step": 30
},
{
"epoch": 0.3,
"eval_loss": 1.8001786470413208,
"eval_mean_token_accuracy": 0.6825134682655335,
"eval_num_tokens": 19121.0,
"eval_runtime": 97.26,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 30
},
{
"epoch": 0.4,
"grad_norm": 1.8676475286483765,
"learning_rate": 9.967910180154889e-05,
"loss": 1.6727,
"step": 40
},
{
"epoch": 0.4,
"eval_loss": 1.611749529838562,
"eval_mean_token_accuracy": 0.6887511330842971,
"eval_num_tokens": 25515.0,
"eval_runtime": 97.2714,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 40
},
{
"epoch": 0.5,
"grad_norm": 1.9121489524841309,
"learning_rate": 9.930160111487716e-05,
"loss": 1.5602,
"step": 50
},
{
"epoch": 0.5,
"eval_loss": 1.519342303276062,
"eval_mean_token_accuracy": 0.6936497485637665,
"eval_num_tokens": 31902.0,
"eval_runtime": 97.2536,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 50
},
{
"epoch": 0.6,
"grad_norm": 1.4662973880767822,
"learning_rate": 9.87804822727352e-05,
"loss": 1.4916,
"step": 60
},
{
"epoch": 0.6,
"eval_loss": 1.4659887552261353,
"eval_mean_token_accuracy": 0.6948034042119979,
"eval_num_tokens": 38297.0,
"eval_runtime": 97.2339,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 60
},
{
"epoch": 0.7,
"grad_norm": 2.132735252380371,
"learning_rate": 9.811726332170153e-05,
"loss": 1.4291,
"step": 70
},
{
"epoch": 0.7,
"eval_loss": 1.422730565071106,
"eval_mean_token_accuracy": 0.6967671060562134,
"eval_num_tokens": 44694.0,
"eval_runtime": 97.2271,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 70
},
{
"epoch": 0.8,
"grad_norm": 1.3734078407287598,
"learning_rate": 9.731387625344104e-05,
"loss": 1.3832,
"step": 80
},
{
"epoch": 0.8,
"eval_loss": 1.3986200094223022,
"eval_mean_token_accuracy": 0.6979268860816955,
"eval_num_tokens": 51069.0,
"eval_runtime": 97.2072,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 80
},
{
"epoch": 0.9,
"grad_norm": 1.4604731798171997,
"learning_rate": 9.637266137671177e-05,
"loss": 1.3693,
"step": 90
},
{
"epoch": 0.9,
"eval_loss": 1.3532851934432983,
"eval_mean_token_accuracy": 0.7013956385850907,
"eval_num_tokens": 57445.0,
"eval_runtime": 97.2573,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 90
},
{
"epoch": 1.0,
"grad_norm": 1.3173726797103882,
"learning_rate": 9.529636049992234e-05,
"loss": 1.3143,
"step": 100
},
{
"epoch": 1.0,
"eval_loss": 1.2985877990722656,
"eval_mean_token_accuracy": 0.7077123075723648,
"eval_num_tokens": 63853.0,
"eval_runtime": 97.2409,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 100
},
{
"epoch": 1.1,
"grad_norm": 1.3028841018676758,
"learning_rate": 9.408810894410009e-05,
"loss": 1.2416,
"step": 110
},
{
"epoch": 1.1,
"eval_loss": 1.2666221857070923,
"eval_mean_token_accuracy": 0.7159780770540237,
"eval_num_tokens": 70269.0,
"eval_runtime": 97.2354,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 110
},
{
"epoch": 1.2,
"grad_norm": 1.479728102684021,
"learning_rate": 9.2751426409536e-05,
"loss": 1.208,
"step": 120
},
{
"epoch": 1.2,
"eval_loss": 1.2293524742126465,
"eval_mean_token_accuracy": 0.7204430556297302,
"eval_num_tokens": 76691.0,
"eval_runtime": 97.2278,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 120
},
{
"epoch": 1.3,
"grad_norm": 1.6283237934112549,
"learning_rate": 9.129020672271283e-05,
"loss": 1.1881,
"step": 130
},
{
"epoch": 1.3,
"eval_loss": 1.2158491611480713,
"eval_mean_token_accuracy": 0.7250880861282348,
"eval_num_tokens": 83083.0,
"eval_runtime": 97.221,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 130
},
{
"epoch": 1.4,
"grad_norm": 1.2515714168548584,
"learning_rate": 8.970870649338387e-05,
"loss": 1.1893,
"step": 140
},
{
"epoch": 1.4,
"eval_loss": 1.2067173719406128,
"eval_mean_token_accuracy": 0.7283873379230499,
"eval_num_tokens": 89470.0,
"eval_runtime": 97.217,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 140
},
{
"epoch": 1.5,
"grad_norm": 1.7679073810577393,
"learning_rate": 8.801153271484502e-05,
"loss": 1.1742,
"step": 150
},
{
"epoch": 1.5,
"eval_loss": 1.2018429040908813,
"eval_mean_token_accuracy": 0.7298205083608628,
"eval_num_tokens": 95793.0,
"eval_runtime": 97.2454,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 150
},
{
"epoch": 1.6,
"grad_norm": 1.4671496152877808,
"learning_rate": 8.620362934352109e-05,
"loss": 1.1713,
"step": 160
},
{
"epoch": 1.6,
"eval_loss": 1.2024760246276855,
"eval_mean_token_accuracy": 0.7248463779687881,
"eval_num_tokens": 102197.0,
"eval_runtime": 97.2434,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 160
},
{
"epoch": 1.7,
"grad_norm": 1.3356602191925049,
"learning_rate": 8.429026289696091e-05,
"loss": 1.1616,
"step": 170
},
{
"epoch": 1.7,
"eval_loss": 1.1988317966461182,
"eval_mean_token_accuracy": 0.7268992912769318,
"eval_num_tokens": 108568.0,
"eval_runtime": 97.2357,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 170
},
{
"epoch": 1.8,
"grad_norm": 1.8510276079177856,
"learning_rate": 8.227700711219493e-05,
"loss": 1.1541,
"step": 180
},
{
"epoch": 1.8,
"eval_loss": 1.1827600002288818,
"eval_mean_token_accuracy": 0.7310593771934509,
"eval_num_tokens": 114964.0,
"eval_runtime": 97.2104,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 180
},
{
"epoch": 1.9,
"grad_norm": 1.4240201711654663,
"learning_rate": 8.016972670914624e-05,
"loss": 1.1708,
"step": 190
},
{
"epoch": 1.9,
"eval_loss": 1.1701805591583252,
"eval_mean_token_accuracy": 0.7321947473287582,
"eval_num_tokens": 121333.0,
"eval_runtime": 97.2429,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 190
},
{
"epoch": 2.0,
"grad_norm": 1.8542824983596802,
"learning_rate": 7.797456030639313e-05,
"loss": 1.1509,
"step": 200
},
{
"epoch": 2.0,
"eval_loss": 1.157992959022522,
"eval_mean_token_accuracy": 0.7394243097305297,
"eval_num_tokens": 127706.0,
"eval_runtime": 97.2191,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 200
},
{
"epoch": 2.1,
"grad_norm": 1.5795457363128662,
"learning_rate": 7.569790253905059e-05,
"loss": 1.0941,
"step": 210
},
{
"epoch": 2.1,
"eval_loss": 1.148593544960022,
"eval_mean_token_accuracy": 0.7429650634527206,
"eval_num_tokens": 134095.0,
"eval_runtime": 97.2383,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 210
},
{
"epoch": 2.2,
"grad_norm": 1.4866969585418701,
"learning_rate": 7.334638543086203e-05,
"loss": 1.0931,
"step": 220
},
{
"epoch": 2.2,
"eval_loss": 1.1364799737930298,
"eval_mean_token_accuracy": 0.7470852738618851,
"eval_num_tokens": 140527.0,
"eval_runtime": 97.2225,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 220
},
{
"epoch": 2.3,
"grad_norm": 1.651310920715332,
"learning_rate": 7.092685907476558e-05,
"loss": 1.0897,
"step": 230
},
{
"epoch": 2.3,
"eval_loss": 1.1412357091903687,
"eval_mean_token_accuracy": 0.7455077153444291,
"eval_num_tokens": 146939.0,
"eval_runtime": 97.2224,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 230
},
{
"epoch": 2.4,
"grad_norm": 1.3703193664550781,
"learning_rate": 6.844637167821326e-05,
"loss": 1.0945,
"step": 240
},
{
"epoch": 2.4,
"eval_loss": 1.1335656642913818,
"eval_mean_token_accuracy": 0.7461127752065658,
"eval_num_tokens": 153360.0,
"eval_runtime": 97.2102,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 240
},
{
"epoch": 2.5,
"grad_norm": 1.7046772241592407,
"learning_rate": 6.59121490313722e-05,
"loss": 1.0803,
"step": 250
},
{
"epoch": 2.5,
"eval_loss": 1.1254699230194092,
"eval_mean_token_accuracy": 0.7490965259075165,
"eval_num_tokens": 159731.0,
"eval_runtime": 97.2222,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 250
},
{
"epoch": 2.6,
"grad_norm": 1.6481680870056152,
"learning_rate": 6.333157345801809e-05,
"loss": 1.0859,
"step": 260
},
{
"epoch": 2.6,
"eval_loss": 1.1334922313690186,
"eval_mean_token_accuracy": 0.7479477733373642,
"eval_num_tokens": 166113.0,
"eval_runtime": 97.2341,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 260
},
{
"epoch": 2.7,
"grad_norm": 1.7207422256469727,
"learning_rate": 6.071216231043799e-05,
"loss": 1.0934,
"step": 270
},
{
"epoch": 2.7,
"eval_loss": 1.1130690574645996,
"eval_mean_token_accuracy": 0.7543548595905304,
"eval_num_tokens": 172481.0,
"eval_runtime": 97.2173,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 270
},
{
"epoch": 2.8,
"grad_norm": 1.7942646741867065,
"learning_rate": 5.8061546070987994e-05,
"loss": 1.0733,
"step": 280
},
{
"epoch": 2.8,
"eval_loss": 1.1070621013641357,
"eval_mean_token_accuracy": 0.7557503712177277,
"eval_num_tokens": 178813.0,
"eval_runtime": 97.2529,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 280
},
{
"epoch": 2.9,
"grad_norm": 1.5024192333221436,
"learning_rate": 5.538744612409701e-05,
"loss": 1.0767,
"step": 290
},
{
"epoch": 2.9,
"eval_loss": 1.109175205230713,
"eval_mean_token_accuracy": 0.7545353853702546,
"eval_num_tokens": 185220.0,
"eval_runtime": 97.25,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 290
},
{
"epoch": 3.0,
"grad_norm": 1.7613085508346558,
"learning_rate": 5.2697652263468125e-05,
"loss": 1.0574,
"step": 300
},
{
"epoch": 3.0,
"eval_loss": 1.0976698398590088,
"eval_mean_token_accuracy": 0.759365046620369,
"eval_num_tokens": 191559.0,
"eval_runtime": 97.2105,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 300
},
{
"epoch": 3.1,
"grad_norm": 1.6670249700546265,
"learning_rate": 5e-05,
"loss": 1.0182,
"step": 310
},
{
"epoch": 3.1,
"eval_loss": 1.094779372215271,
"eval_mean_token_accuracy": 0.76161092877388,
"eval_num_tokens": 197915.0,
"eval_runtime": 97.2406,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 310
},
{
"epoch": 3.2,
"grad_norm": 1.7392572164535522,
"learning_rate": 4.730234773653188e-05,
"loss": 1.007,
"step": 320
},
{
"epoch": 3.2,
"eval_loss": 1.0834596157073975,
"eval_mean_token_accuracy": 0.7656834137439728,
"eval_num_tokens": 204252.0,
"eval_runtime": 97.2394,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 320
},
{
"epoch": 3.3,
"grad_norm": 2.0105206966400146,
"learning_rate": 4.461255387590299e-05,
"loss": 1.0119,
"step": 330
},
{
"epoch": 3.3,
"eval_loss": 1.0809874534606934,
"eval_mean_token_accuracy": 0.7655541002750397,
"eval_num_tokens": 210574.0,
"eval_runtime": 97.2409,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 330
},
{
"epoch": 3.4,
"grad_norm": 1.759490966796875,
"learning_rate": 4.193845392901201e-05,
"loss": 0.9974,
"step": 340
},
{
"epoch": 3.4,
"eval_loss": 1.0805025100708008,
"eval_mean_token_accuracy": 0.7660670423507691,
"eval_num_tokens": 216977.0,
"eval_runtime": 97.206,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 340
},
{
"epoch": 3.5,
"grad_norm": 1.8894627094268799,
"learning_rate": 3.9287837689562016e-05,
"loss": 1.0107,
"step": 350
},
{
"epoch": 3.5,
"eval_loss": 1.0740913152694702,
"eval_mean_token_accuracy": 0.7694036465883255,
"eval_num_tokens": 223412.0,
"eval_runtime": 97.2057,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 350
},
{
"epoch": 3.6,
"grad_norm": 2.0457475185394287,
"learning_rate": 3.666842654198191e-05,
"loss": 1.0004,
"step": 360
},
{
"epoch": 3.6,
"eval_loss": 1.0669925212860107,
"eval_mean_token_accuracy": 0.7717740494012832,
"eval_num_tokens": 229731.0,
"eval_runtime": 97.1971,
"eval_samples_per_second": 2.058,
"eval_steps_per_second": 1.029,
"step": 360
},
{
"epoch": 3.7,
"grad_norm": 1.5154274702072144,
"learning_rate": 3.408785096862782e-05,
"loss": 0.9902,
"step": 370
},
{
"epoch": 3.7,
"eval_loss": 1.0629109144210815,
"eval_mean_token_accuracy": 0.7715724587440491,
"eval_num_tokens": 236188.0,
"eval_runtime": 97.203,
"eval_samples_per_second": 2.058,
"eval_steps_per_second": 1.029,
"step": 370
},
{
"epoch": 3.8,
"grad_norm": 2.06585693359375,
"learning_rate": 3.1553628321786745e-05,
"loss": 1.0053,
"step": 380
},
{
"epoch": 3.8,
"eval_loss": 1.0563435554504395,
"eval_mean_token_accuracy": 0.7732643353939056,
"eval_num_tokens": 242558.0,
"eval_runtime": 97.2143,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 380
},
{
"epoch": 3.9,
"grad_norm": 1.730677843093872,
"learning_rate": 2.907314092523442e-05,
"loss": 0.9813,
"step": 390
},
{
"epoch": 3.9,
"eval_loss": 1.0525062084197998,
"eval_mean_token_accuracy": 0.7765388804674148,
"eval_num_tokens": 249000.0,
"eval_runtime": 97.2525,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 390
},
{
"epoch": 4.0,
"grad_norm": 1.5716614723205566,
"learning_rate": 2.6653614569137968e-05,
"loss": 0.9852,
"step": 400
},
{
"epoch": 4.0,
"eval_loss": 1.0519495010375977,
"eval_mean_token_accuracy": 0.7759864777326584,
"eval_num_tokens": 255412.0,
"eval_runtime": 97.2138,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 400
},
{
"epoch": 4.1,
"grad_norm": 1.607945203781128,
"learning_rate": 2.430209746094943e-05,
"loss": 0.9515,
"step": 410
},
{
"epoch": 4.1,
"eval_loss": 1.0613422393798828,
"eval_mean_token_accuracy": 0.7760749870538711,
"eval_num_tokens": 261847.0,
"eval_runtime": 97.2595,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 410
},
{
"epoch": 4.2,
"grad_norm": 1.7474143505096436,
"learning_rate": 2.2025439693606882e-05,
"loss": 0.9453,
"step": 420
},
{
"epoch": 4.2,
"eval_loss": 1.0487031936645508,
"eval_mean_token_accuracy": 0.7783306258916854,
"eval_num_tokens": 268220.0,
"eval_runtime": 97.2251,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.029,
"step": 420
},
{
"epoch": 4.3,
"grad_norm": 1.3773257732391357,
"learning_rate": 1.983027329085377e-05,
"loss": 0.9398,
"step": 430
},
{
"epoch": 4.3,
"eval_loss": 1.0485966205596924,
"eval_mean_token_accuracy": 0.7771646714210511,
"eval_num_tokens": 274612.0,
"eval_runtime": 97.2492,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 430
},
{
"epoch": 4.4,
"grad_norm": 1.5962660312652588,
"learning_rate": 1.772299288780508e-05,
"loss": 0.9283,
"step": 440
},
{
"epoch": 4.4,
"eval_loss": 1.049791693687439,
"eval_mean_token_accuracy": 0.778401963710785,
"eval_num_tokens": 281009.0,
"eval_runtime": 97.2759,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 440
},
{
"epoch": 4.5,
"grad_norm": 2.201416015625,
"learning_rate": 1.5709737103039103e-05,
"loss": 0.944,
"step": 450
},
{
"epoch": 4.5,
"eval_loss": 1.0448309183120728,
"eval_mean_token_accuracy": 0.7783755934238434,
"eval_num_tokens": 287341.0,
"eval_runtime": 97.2585,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 450
},
{
"epoch": 4.6,
"grad_norm": 1.6953043937683105,
"learning_rate": 1.3796370656478935e-05,
"loss": 0.9357,
"step": 460
},
{
"epoch": 4.6,
"eval_loss": 1.0441021919250488,
"eval_mean_token_accuracy": 0.7798552727699279,
"eval_num_tokens": 293667.0,
"eval_runtime": 97.2712,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 460
},
{
"epoch": 4.7,
"grad_norm": 2.495961904525757,
"learning_rate": 1.1988467285154987e-05,
"loss": 0.9543,
"step": 470
},
{
"epoch": 4.7,
"eval_loss": 1.0410436391830444,
"eval_mean_token_accuracy": 0.7808067119121551,
"eval_num_tokens": 300075.0,
"eval_runtime": 97.273,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 470
},
{
"epoch": 4.8,
"grad_norm": 2.314516067504883,
"learning_rate": 1.0291293506616133e-05,
"loss": 0.9453,
"step": 480
},
{
"epoch": 4.8,
"eval_loss": 1.040062427520752,
"eval_mean_token_accuracy": 0.7821026688814163,
"eval_num_tokens": 306434.0,
"eval_runtime": 97.2666,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 480
},
{
"epoch": 4.9,
"grad_norm": 2.006592035293579,
"learning_rate": 8.70979327728718e-06,
"loss": 0.9376,
"step": 490
},
{
"epoch": 4.9,
"eval_loss": 1.0370746850967407,
"eval_mean_token_accuracy": 0.7834529572725296,
"eval_num_tokens": 312881.0,
"eval_runtime": 97.2721,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 490
},
{
"epoch": 5.0,
"grad_norm": 1.7751625776290894,
"learning_rate": 7.248573590464014e-06,
"loss": 0.9288,
"step": 500
},
{
"epoch": 5.0,
"eval_loss": 1.0378228425979614,
"eval_mean_token_accuracy": 0.7834442704916,
"eval_num_tokens": 319265.0,
"eval_runtime": 97.2917,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 500
},
{
"epoch": 5.1,
"grad_norm": 1.7418688535690308,
"learning_rate": 5.91189105589992e-06,
"loss": 0.9145,
"step": 510
},
{
"epoch": 5.1,
"eval_loss": 1.0420323610305786,
"eval_mean_token_accuracy": 0.7838906270265579,
"eval_num_tokens": 325626.0,
"eval_runtime": 97.2737,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 510
},
{
"epoch": 5.2,
"grad_norm": 1.826409101486206,
"learning_rate": 4.703639500077656e-06,
"loss": 0.9237,
"step": 520
},
{
"epoch": 5.2,
"eval_loss": 1.0423357486724854,
"eval_mean_token_accuracy": 0.7844956815242767,
"eval_num_tokens": 331976.0,
"eval_runtime": 97.2685,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 520
},
{
"epoch": 5.3,
"grad_norm": 1.7615423202514648,
"learning_rate": 3.6273386232882343e-06,
"loss": 0.9201,
"step": 530
},
{
"epoch": 5.3,
"eval_loss": 1.0414623022079468,
"eval_mean_token_accuracy": 0.7839177978038788,
"eval_num_tokens": 338413.0,
"eval_runtime": 97.2327,
"eval_samples_per_second": 2.057,
"eval_steps_per_second": 1.028,
"step": 530
},
{
"epoch": 5.4,
"grad_norm": 1.5898215770721436,
"learning_rate": 2.686123746558961e-06,
"loss": 0.9002,
"step": 540
},
{
"epoch": 5.4,
"eval_loss": 1.0424439907073975,
"eval_mean_token_accuracy": 0.7836556518077851,
"eval_num_tokens": 344800.0,
"eval_runtime": 97.2686,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 540
},
{
"epoch": 5.5,
"grad_norm": 1.721126675605774,
"learning_rate": 1.8827366782984913e-06,
"loss": 0.9025,
"step": 550
},
{
"epoch": 5.5,
"eval_loss": 1.043124794960022,
"eval_mean_token_accuracy": 0.7836492872238159,
"eval_num_tokens": 351242.0,
"eval_runtime": 97.3002,
"eval_samples_per_second": 2.055,
"eval_steps_per_second": 1.028,
"step": 550
},
{
"epoch": 5.6,
"grad_norm": 1.763380527496338,
"learning_rate": 1.2195177272648127e-06,
"loss": 0.9187,
"step": 560
},
{
"epoch": 5.6,
"eval_loss": 1.0419065952301025,
"eval_mean_token_accuracy": 0.7836943608522415,
"eval_num_tokens": 357626.0,
"eval_runtime": 97.2575,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 560
},
{
"epoch": 5.7,
"grad_norm": 1.7301427125930786,
"learning_rate": 6.983988851228473e-07,
"loss": 0.9192,
"step": 570
},
{
"epoch": 5.7,
"eval_loss": 1.0414124727249146,
"eval_mean_token_accuracy": 0.784173795580864,
"eval_num_tokens": 363989.0,
"eval_runtime": 97.2552,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 570
},
{
"epoch": 5.8,
"grad_norm": 1.9148461818695068,
"learning_rate": 3.208981984511195e-07,
"loss": 0.9075,
"step": 580
},
{
"epoch": 5.8,
"eval_loss": 1.0412297248840332,
"eval_mean_token_accuracy": 0.7838809263706207,
"eval_num_tokens": 370407.0,
"eval_runtime": 97.2933,
"eval_samples_per_second": 2.056,
"eval_steps_per_second": 1.028,
"step": 580
},
{
"epoch": 5.9,
"grad_norm": 1.3000129461288452,
"learning_rate": 8.811534659234899e-08,
"loss": 0.9122,
"step": 590
},
{
"epoch": 5.9,
"eval_loss": 1.041398048400879,
"eval_mean_token_accuracy": 0.7838524436950683,
"eval_num_tokens": 376689.0,
"eval_runtime": 97.3246,
"eval_samples_per_second": 2.055,
"eval_steps_per_second": 1.027,
"step": 590
},
{
"epoch": 6.0,
"grad_norm": 1.879518747329712,
"learning_rate": 7.284382296801617e-10,
"loss": 0.9134,
"step": 600
},
{
"epoch": 6.0,
"eval_loss": 1.0410884618759155,
"eval_mean_token_accuracy": 0.7832156884670257,
"eval_num_tokens": 383118.0,
"eval_runtime": 97.3129,
"eval_samples_per_second": 2.055,
"eval_steps_per_second": 1.028,
"step": 600
}
],
"logging_steps": 10,
"max_steps": 600,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.777512987967488e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}