{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.3297872340425532,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013297872340425532,
"grad_norm": 0.00017850878066383302,
"learning_rate": 2.9999069195872345e-05,
"loss": 1.5526,
"num_input_tokens_seen": 22912,
"step": 5,
"train_runtime": 15.9018,
"train_tokens_per_second": 1440.846
},
{
"epoch": 0.026595744680851064,
"grad_norm": 0.00022191159951034933,
"learning_rate": 2.9995288002087968e-05,
"loss": 0.1475,
"num_input_tokens_seen": 47104,
"step": 10,
"train_runtime": 23.1749,
"train_tokens_per_second": 2032.547
},
{
"epoch": 0.0398936170212766,
"grad_norm": 0.00020294415298849344,
"learning_rate": 2.9988598976060308e-05,
"loss": 0.153,
"num_input_tokens_seen": 73920,
"step": 15,
"train_runtime": 31.9246,
"train_tokens_per_second": 2315.457
},
{
"epoch": 0.05319148936170213,
"grad_norm": 7.007523527136073e-05,
"learning_rate": 2.9979003414901197e-05,
"loss": 0.1529,
"num_input_tokens_seen": 99360,
"step": 20,
"train_runtime": 39.4019,
"train_tokens_per_second": 2521.709
},
{
"epoch": 0.06648936170212766,
"grad_norm": 0.00010616348299663514,
"learning_rate": 2.99665031793473e-05,
"loss": 0.1295,
"num_input_tokens_seen": 124192,
"step": 25,
"train_runtime": 46.8336,
"train_tokens_per_second": 2651.77
},
{
"epoch": 0.0797872340425532,
"grad_norm": 5.6807843066053465e-05,
"learning_rate": 2.995110069339927e-05,
"loss": 0.1431,
"num_input_tokens_seen": 151456,
"step": 30,
"train_runtime": 54.8766,
"train_tokens_per_second": 2759.94
},
{
"epoch": 0.09308510638297872,
"grad_norm": 8.653431723359972e-05,
"learning_rate": 2.993279894385171e-05,
"loss": 0.1003,
"num_input_tokens_seen": 177344,
"step": 35,
"train_runtime": 62.5606,
"train_tokens_per_second": 2834.754
},
{
"epoch": 0.10638297872340426,
"grad_norm": 4.548930155579001e-05,
"learning_rate": 2.9911601479713985e-05,
"loss": 0.1126,
"num_input_tokens_seen": 205952,
"step": 40,
"train_runtime": 70.8371,
"train_tokens_per_second": 2907.403
},
{
"epoch": 0.1196808510638298,
"grad_norm": 0.000141258497023955,
"learning_rate": 2.988751241152199e-05,
"loss": 0.1204,
"num_input_tokens_seen": 237920,
"step": 45,
"train_runtime": 79.888,
"train_tokens_per_second": 2978.17
},
{
"epoch": 0.13297872340425532,
"grad_norm": 4.336608981247991e-05,
"learning_rate": 2.9860536410541076e-05,
"loss": 0.069,
"num_input_tokens_seen": 264128,
"step": 50,
"train_runtime": 87.681,
"train_tokens_per_second": 3012.373
},
{
"epoch": 0.14627659574468085,
"grad_norm": 7.17395669198595e-05,
"learning_rate": 2.983067870786019e-05,
"loss": 0.0447,
"num_input_tokens_seen": 288896,
"step": 55,
"train_runtime": 95.0758,
"train_tokens_per_second": 3038.587
},
{
"epoch": 0.1595744680851064,
"grad_norm": 4.3858930439455435e-05,
"learning_rate": 2.9797945093377513e-05,
"loss": 0.07,
"num_input_tokens_seen": 311680,
"step": 60,
"train_runtime": 102.0665,
"train_tokens_per_second": 3053.696
},
{
"epoch": 0.17287234042553193,
"grad_norm": 8.529757906217128e-05,
"learning_rate": 2.976234191467767e-05,
"loss": 0.0789,
"num_input_tokens_seen": 334976,
"step": 65,
"train_runtime": 109.0924,
"train_tokens_per_second": 3070.572
},
{
"epoch": 0.18617021276595744,
"grad_norm": 6.885492621222511e-05,
"learning_rate": 2.9723876075800846e-05,
"loss": 0.083,
"num_input_tokens_seen": 360480,
"step": 70,
"train_runtime": 116.6544,
"train_tokens_per_second": 3090.154
},
{
"epoch": 0.19946808510638298,
"grad_norm": 7.440832268912345e-05,
"learning_rate": 2.968255503590398e-05,
"loss": 0.0511,
"num_input_tokens_seen": 384768,
"step": 75,
"train_runtime": 123.9537,
"train_tokens_per_second": 3104.126
},
{
"epoch": 0.2127659574468085,
"grad_norm": 7.101365190465003e-05,
"learning_rate": 2.963838680781431e-05,
"loss": 0.0788,
"num_input_tokens_seen": 410304,
"step": 80,
"train_runtime": 131.6092,
"train_tokens_per_second": 3117.593
},
{
"epoch": 0.22606382978723405,
"grad_norm": 1.87977020686958e-05,
"learning_rate": 2.959137995647556e-05,
"loss": 0.0467,
"num_input_tokens_seen": 437888,
"step": 85,
"train_runtime": 139.7728,
"train_tokens_per_second": 3132.856
},
{
"epoch": 0.2393617021276596,
"grad_norm": 0.0001294072571909055,
"learning_rate": 2.9541543597287034e-05,
"loss": 0.053,
"num_input_tokens_seen": 462976,
"step": 90,
"train_runtime": 147.3061,
"train_tokens_per_second": 3142.952
},
{
"epoch": 0.2526595744680851,
"grad_norm": 9.718466753838584e-05,
"learning_rate": 2.9488887394336025e-05,
"loss": 0.0345,
"num_input_tokens_seen": 485280,
"step": 95,
"train_runtime": 154.2543,
"train_tokens_per_second": 3145.973
},
{
"epoch": 0.26595744680851063,
"grad_norm": 0.00011859676305903122,
"learning_rate": 2.9433421558523767e-05,
"loss": 0.0716,
"num_input_tokens_seen": 509856,
"step": 100,
"train_runtime": 161.6729,
"train_tokens_per_second": 3153.627
},
{
"epoch": 0.27925531914893614,
"grad_norm": 6.369210314005613e-05,
"learning_rate": 2.9375156845585374e-05,
"loss": 0.0562,
"num_input_tokens_seen": 535264,
"step": 105,
"train_runtime": 170.0802,
"train_tokens_per_second": 3147.127
},
{
"epoch": 0.2925531914893617,
"grad_norm": 4.727758641820401e-05,
"learning_rate": 2.9314104554004137e-05,
"loss": 0.0371,
"num_input_tokens_seen": 562912,
"step": 110,
"train_runtime": 178.2035,
"train_tokens_per_second": 3158.815
},
{
"epoch": 0.3058510638297872,
"grad_norm": 0.00010592794569674879,
"learning_rate": 2.925027652282056e-05,
"loss": 0.0586,
"num_input_tokens_seen": 585280,
"step": 115,
"train_runtime": 185.0929,
"train_tokens_per_second": 3162.088
},
{
"epoch": 0.3191489361702128,
"grad_norm": 2.9270680897752754e-05,
"learning_rate": 2.918368512933657e-05,
"loss": 0.0633,
"num_input_tokens_seen": 612224,
"step": 120,
"train_runtime": 192.9777,
"train_tokens_per_second": 3172.512
},
{
"epoch": 0.3324468085106383,
"grad_norm": 0.00010040538472821936,
"learning_rate": 2.911434328671536e-05,
"loss": 0.0751,
"num_input_tokens_seen": 639264,
"step": 125,
"train_runtime": 201.0075,
"train_tokens_per_second": 3180.299
},
{
"epoch": 0.34574468085106386,
"grad_norm": 0.0001042517542373389,
"learning_rate": 2.904226444147732e-05,
"loss": 0.0677,
"num_input_tokens_seen": 665280,
"step": 130,
"train_runtime": 208.7729,
"train_tokens_per_second": 3186.621
},
{
"epoch": 0.35904255319148937,
"grad_norm": 7.185106369433925e-05,
"learning_rate": 2.896746257089251e-05,
"loss": 0.0587,
"num_input_tokens_seen": 689216,
"step": 135,
"train_runtime": 216.0407,
"train_tokens_per_second": 3190.214
},
{
"epoch": 0.3723404255319149,
"grad_norm": 5.872031761100516e-05,
"learning_rate": 2.8889952180270287e-05,
"loss": 0.0605,
"num_input_tokens_seen": 714880,
"step": 140,
"train_runtime": 223.7009,
"train_tokens_per_second": 3195.695
},
{
"epoch": 0.38563829787234044,
"grad_norm": 2.933590076281689e-05,
"learning_rate": 2.880974830014643e-05,
"loss": 0.1054,
"num_input_tokens_seen": 739904,
"step": 145,
"train_runtime": 231.1836,
"train_tokens_per_second": 3200.504
},
{
"epoch": 0.39893617021276595,
"grad_norm": 0.00012435043754521757,
"learning_rate": 2.872686648336853e-05,
"loss": 0.0479,
"num_input_tokens_seen": 765824,
"step": 150,
"train_runtime": 238.8742,
"train_tokens_per_second": 3205.972
},
{
"epoch": 0.4122340425531915,
"grad_norm": 8.882827387424186e-05,
"learning_rate": 2.8641322802079984e-05,
"loss": 0.0508,
"num_input_tokens_seen": 797952,
"step": 155,
"train_runtime": 248.022,
"train_tokens_per_second": 3217.263
},
{
"epoch": 0.425531914893617,
"grad_norm": 9.789071918930858e-05,
"learning_rate": 2.8553133844603382e-05,
"loss": 0.0399,
"num_input_tokens_seen": 823264,
"step": 160,
"train_runtime": 255.6112,
"train_tokens_per_second": 3220.766
},
{
"epoch": 0.43882978723404253,
"grad_norm": 4.716894181910902e-05,
"learning_rate": 2.846231671222374e-05,
"loss": 0.062,
"num_input_tokens_seen": 849216,
"step": 165,
"train_runtime": 263.3712,
"train_tokens_per_second": 3224.408
},
{
"epoch": 0.4521276595744681,
"grad_norm": 8.95522753125988e-05,
"learning_rate": 2.836888901587229e-05,
"loss": 0.1292,
"num_input_tokens_seen": 874208,
"step": 170,
"train_runtime": 270.894,
"train_tokens_per_second": 3227.122
},
{
"epoch": 0.4654255319148936,
"grad_norm": 3.6886351153953e-05,
"learning_rate": 2.827286887271143e-05,
"loss": 0.0558,
"num_input_tokens_seen": 898624,
"step": 175,
"train_runtime": 278.2599,
"train_tokens_per_second": 3229.441
},
{
"epoch": 0.4787234042553192,
"grad_norm": 7.180378452176228e-05,
"learning_rate": 2.8174274902621495e-05,
"loss": 0.0506,
"num_input_tokens_seen": 921728,
"step": 180,
"train_runtime": 285.3501,
"train_tokens_per_second": 3230.166
},
{
"epoch": 0.4920212765957447,
"grad_norm": 2.529071207391098e-05,
"learning_rate": 2.8073126224590073e-05,
"loss": 0.0713,
"num_input_tokens_seen": 948160,
"step": 185,
"train_runtime": 293.1898,
"train_tokens_per_second": 3233.946
},
{
"epoch": 0.5053191489361702,
"grad_norm": 2.971558387798723e-05,
"learning_rate": 2.7969442453004525e-05,
"loss": 0.0423,
"num_input_tokens_seen": 974688,
"step": 190,
"train_runtime": 301.0219,
"train_tokens_per_second": 3237.93
},
{
"epoch": 0.5186170212765957,
"grad_norm": 1.3908083019487094e-05,
"learning_rate": 2.786324369384841e-05,
"loss": 0.0376,
"num_input_tokens_seen": 999232,
"step": 195,
"train_runtime": 308.373,
"train_tokens_per_second": 3240.336
},
{
"epoch": 0.5319148936170213,
"grad_norm": 8.287108357762918e-05,
"learning_rate": 2.7754550540802632e-05,
"loss": 0.0505,
"num_input_tokens_seen": 1024352,
"step": 200,
"train_runtime": 315.9074,
"train_tokens_per_second": 3242.57
},
{
"epoch": 0.5452127659574468,
"grad_norm": 7.783296314300969e-05,
"learning_rate": 2.7643384071251957e-05,
"loss": 0.0347,
"num_input_tokens_seen": 1049088,
"step": 205,
"train_runtime": 324.1076,
"train_tokens_per_second": 3236.851
},
{
"epoch": 0.5585106382978723,
"grad_norm": 0.0001195428121718578,
"learning_rate": 2.7529765842197798e-05,
"loss": 0.0386,
"num_input_tokens_seen": 1073024,
"step": 210,
"train_runtime": 331.3284,
"train_tokens_per_second": 3238.552
},
{
"epoch": 0.5718085106382979,
"grad_norm": 4.606168658938259e-05,
"learning_rate": 2.741371788607793e-05,
"loss": 0.0616,
"num_input_tokens_seen": 1098880,
"step": 215,
"train_runtime": 339.0001,
"train_tokens_per_second": 3241.533
},
{
"epoch": 0.5851063829787234,
"grad_norm": 0.00013229926116764545,
"learning_rate": 2.729526270649405e-05,
"loss": 0.0821,
"num_input_tokens_seen": 1127328,
"step": 220,
"train_runtime": 347.2586,
"train_tokens_per_second": 3246.364
},
{
"epoch": 0.598404255319149,
"grad_norm": 8.632720710011199e-05,
"learning_rate": 2.7174423273847966e-05,
"loss": 0.0685,
"num_input_tokens_seen": 1151584,
"step": 225,
"train_runtime": 354.5073,
"train_tokens_per_second": 3248.407
},
{
"epoch": 0.6117021276595744,
"grad_norm": 4.496889596339315e-05,
"learning_rate": 2.705122302088725e-05,
"loss": 0.0667,
"num_input_tokens_seen": 1180544,
"step": 230,
"train_runtime": 363.022,
"train_tokens_per_second": 3251.991
},
{
"epoch": 0.625,
"grad_norm": 1.9521097783581354e-05,
"learning_rate": 2.6925685838161247e-05,
"loss": 0.035,
"num_input_tokens_seen": 1206080,
"step": 235,
"train_runtime": 370.6153,
"train_tokens_per_second": 3254.264
},
{
"epoch": 0.6382978723404256,
"grad_norm": 4.637776146410033e-05,
"learning_rate": 2.67978360693883e-05,
"loss": 0.0604,
"num_input_tokens_seen": 1230304,
"step": 240,
"train_runtime": 377.9559,
"train_tokens_per_second": 3255.153
},
{
"epoch": 0.651595744680851,
"grad_norm": 3.3805175917223096e-05,
"learning_rate": 2.6667698506735113e-05,
"loss": 0.0556,
"num_input_tokens_seen": 1256640,
"step": 245,
"train_runtime": 385.7509,
"train_tokens_per_second": 3257.646
},
{
"epoch": 0.6648936170212766,
"grad_norm": 0.00010089632996823639,
"learning_rate": 2.6535298386009144e-05,
"loss": 0.0487,
"num_input_tokens_seen": 1280064,
"step": 250,
"train_runtime": 392.8672,
"train_tokens_per_second": 3258.262
},
{
"epoch": 0.6781914893617021,
"grad_norm": 3.6058525438420475e-05,
"learning_rate": 2.6400661381764962e-05,
"loss": 0.0702,
"num_input_tokens_seen": 1305984,
"step": 255,
"train_runtime": 400.5999,
"train_tokens_per_second": 3260.071
},
{
"epoch": 0.6914893617021277,
"grad_norm": 1.9650842659757473e-05,
"learning_rate": 2.6263813602325525e-05,
"loss": 0.0422,
"num_input_tokens_seen": 1333088,
"step": 260,
"train_runtime": 408.608,
"train_tokens_per_second": 3262.511
},
{
"epoch": 0.7047872340425532,
"grad_norm": 2.503952600818593e-05,
"learning_rate": 2.6124781584719365e-05,
"loss": 0.0674,
"num_input_tokens_seen": 1357728,
"step": 265,
"train_runtime": 416.0446,
"train_tokens_per_second": 3263.419
},
{
"epoch": 0.7180851063829787,
"grad_norm": 3.540120815159753e-05,
"learning_rate": 2.5983592289534602e-05,
"loss": 0.0446,
"num_input_tokens_seen": 1383104,
"step": 270,
"train_runtime": 423.6735,
"train_tokens_per_second": 3264.552
},
{
"epoch": 0.7313829787234043,
"grad_norm": 5.61477463634219e-05,
"learning_rate": 2.584027309569086e-05,
"loss": 0.0382,
"num_input_tokens_seen": 1408096,
"step": 275,
"train_runtime": 431.1736,
"train_tokens_per_second": 3265.729
},
{
"epoch": 0.7446808510638298,
"grad_norm": 1.1481101864774246e-05,
"learning_rate": 2.5694851795130044e-05,
"loss": 0.0189,
"num_input_tokens_seen": 1434048,
"step": 280,
"train_runtime": 438.8402,
"train_tokens_per_second": 3267.813
},
{
"epoch": 0.7579787234042553,
"grad_norm": 0.0001053257001331076,
"learning_rate": 2.5547356587427017e-05,
"loss": 0.0246,
"num_input_tokens_seen": 1457856,
"step": 285,
"train_runtime": 446.036,
"train_tokens_per_second": 3268.471
},
{
"epoch": 0.7712765957446809,
"grad_norm": 5.1625109335873276e-05,
"learning_rate": 2.539781607432125e-05,
"loss": 0.0624,
"num_input_tokens_seen": 1481120,
"step": 290,
"train_runtime": 453.1392,
"train_tokens_per_second": 3268.576
},
{
"epoch": 0.7845744680851063,
"grad_norm": 6.952533112780657e-06,
"learning_rate": 2.5246259254170464e-05,
"loss": 0.0346,
"num_input_tokens_seen": 1506176,
"step": 295,
"train_runtime": 460.6884,
"train_tokens_per_second": 3269.403
},
{
"epoch": 0.7978723404255319,
"grad_norm": 9.527090878691524e-05,
"learning_rate": 2.5092715516327384e-05,
"loss": 0.075,
"num_input_tokens_seen": 1529824,
"step": 300,
"train_runtime": 467.9003,
"train_tokens_per_second": 3269.551
},
{
"epoch": 0.8111702127659575,
"grad_norm": 2.904631219280418e-05,
"learning_rate": 2.4937214635440665e-05,
"loss": 0.0361,
"num_input_tokens_seen": 1552384,
"step": 305,
"train_runtime": 475.6103,
"train_tokens_per_second": 3263.983
},
{
"epoch": 0.824468085106383,
"grad_norm": 4.446757884579711e-05,
"learning_rate": 2.4779786765681082e-05,
"loss": 0.0367,
"num_input_tokens_seen": 1579072,
"step": 310,
"train_runtime": 483.4588,
"train_tokens_per_second": 3266.198
},
{
"epoch": 0.8377659574468085,
"grad_norm": 6.499775918200612e-05,
"learning_rate": 2.4620462434894158e-05,
"loss": 0.0503,
"num_input_tokens_seen": 1603744,
"step": 315,
"train_runtime": 490.8348,
"train_tokens_per_second": 3267.381
},
{
"epoch": 0.851063829787234,
"grad_norm": 2.785153810691554e-05,
"learning_rate": 2.4459272538680308e-05,
"loss": 0.0371,
"num_input_tokens_seen": 1627712,
"step": 320,
"train_runtime": 498.0766,
"train_tokens_per_second": 3267.995
},
{
"epoch": 0.8643617021276596,
"grad_norm": 6.219661008799449e-05,
"learning_rate": 2.4296248334403672e-05,
"loss": 0.0635,
"num_input_tokens_seen": 1653600,
"step": 325,
"train_runtime": 505.7239,
"train_tokens_per_second": 3269.768
},
{
"epoch": 0.8776595744680851,
"grad_norm": 4.4950455048820004e-05,
"learning_rate": 2.413142143513081e-05,
"loss": 0.0597,
"num_input_tokens_seen": 1676928,
"step": 330,
"train_runtime": 512.8025,
"train_tokens_per_second": 3270.125
},
{
"epoch": 0.8909574468085106,
"grad_norm": 3.27678098983597e-05,
"learning_rate": 2.3964823803500395e-05,
"loss": 0.052,
"num_input_tokens_seen": 1707808,
"step": 335,
"train_runtime": 521.6471,
"train_tokens_per_second": 3273.876
},
{
"epoch": 0.9042553191489362,
"grad_norm": 4.62313364550937e-05,
"learning_rate": 2.3796487745525145e-05,
"loss": 0.048,
"num_input_tokens_seen": 1732576,
"step": 340,
"train_runtime": 529.0667,
"train_tokens_per_second": 3274.778
},
{
"epoch": 0.9175531914893617,
"grad_norm": 3.923915573977865e-05,
"learning_rate": 2.3626445904327155e-05,
"loss": 0.0205,
"num_input_tokens_seen": 1758016,
"step": 345,
"train_runtime": 536.6618,
"train_tokens_per_second": 3275.836
},
{
"epoch": 0.9308510638297872,
"grad_norm": 6.78059086567373e-06,
"learning_rate": 2.3454731253807862e-05,
"loss": 0.0232,
"num_input_tokens_seen": 1783872,
"step": 350,
"train_runtime": 544.3484,
"train_tokens_per_second": 3277.078
},
{
"epoch": 0.9441489361702128,
"grad_norm": 2.773117921606172e-05,
"learning_rate": 2.328137709225385e-05,
"loss": 0.0152,
"num_input_tokens_seen": 1807008,
"step": 355,
"train_runtime": 551.3669,
"train_tokens_per_second": 3277.324
},
{
"epoch": 0.9574468085106383,
"grad_norm": 9.006850450532511e-05,
"learning_rate": 2.3106417035879797e-05,
"loss": 0.0517,
"num_input_tokens_seen": 1834048,
"step": 360,
"train_runtime": 559.3162,
"train_tokens_per_second": 3279.089
},
{
"epoch": 0.9707446808510638,
"grad_norm": 5.6452212447766215e-05,
"learning_rate": 2.2929885012309697e-05,
"loss": 0.0419,
"num_input_tokens_seen": 1861728,
"step": 365,
"train_runtime": 567.4458,
"train_tokens_per_second": 3280.891
},
{
"epoch": 0.9840425531914894,
"grad_norm": 2.901201980876067e-07,
"learning_rate": 2.2751815253997783e-05,
"loss": 0.0186,
"num_input_tokens_seen": 1885376,
"step": 370,
"train_runtime": 574.6516,
"train_tokens_per_second": 3280.903
},
{
"epoch": 0.9973404255319149,
"grad_norm": 1.5018988506199094e-06,
"learning_rate": 2.2572242291590264e-05,
"loss": 0.0293,
"num_input_tokens_seen": 1908128,
"step": 375,
"train_runtime": 581.6252,
"train_tokens_per_second": 3280.683
},
{
"epoch": 1.0106382978723405,
"grad_norm": 0.00014393814490176737,
"learning_rate": 2.239120094722926e-05,
"loss": 0.2291,
"num_input_tokens_seen": 1932528,
"step": 380,
"train_runtime": 589.9529,
"train_tokens_per_second": 3275.733
},
{
"epoch": 1.023936170212766,
"grad_norm": 0.00014409016876015812,
"learning_rate": 2.2208726327800257e-05,
"loss": 1.028,
"num_input_tokens_seen": 1957648,
"step": 385,
"train_runtime": 597.4479,
"train_tokens_per_second": 3276.684
},
{
"epoch": 1.0372340425531914,
"grad_norm": 0.00011712688865372911,
"learning_rate": 2.202485381812426e-05,
"loss": 1.012,
"num_input_tokens_seen": 1985392,
"step": 390,
"train_runtime": 605.572,
"train_tokens_per_second": 3278.54
},
{
"epoch": 1.050531914893617,
"grad_norm": 9.847906039794907e-05,
"learning_rate": 2.1839619074096117e-05,
"loss": 1.1113,
"num_input_tokens_seen": 2014320,
"step": 395,
"train_runtime": 614.0367,
"train_tokens_per_second": 3280.455
},
{
"epoch": 1.0638297872340425,
"grad_norm": 0.00011392939632060006,
"learning_rate": 2.1653058015770262e-05,
"loss": 1.0173,
"num_input_tokens_seen": 2041328,
"step": 400,
"train_runtime": 622.0201,
"train_tokens_per_second": 3281.772
},
{
"epoch": 1.077127659574468,
"grad_norm": 8.545993478037417e-05,
"learning_rate": 2.146520682039522e-05,
"loss": 0.7919,
"num_input_tokens_seen": 2068848,
"step": 405,
"train_runtime": 630.9237,
"train_tokens_per_second": 3279.078
},
{
"epoch": 1.0904255319148937,
"grad_norm": 8.802056254353374e-05,
"learning_rate": 2.127610191539825e-05,
"loss": 0.696,
"num_input_tokens_seen": 2094352,
"step": 410,
"train_runtime": 638.5178,
"train_tokens_per_second": 3280.022
},
{
"epoch": 1.1037234042553192,
"grad_norm": 6.343067070702091e-05,
"learning_rate": 2.1085779971321456e-05,
"loss": 0.5359,
"num_input_tokens_seen": 2120592,
"step": 415,
"train_runtime": 646.3244,
"train_tokens_per_second": 3281.003
},
{
"epoch": 1.1170212765957448,
"grad_norm": 0.0001089554643840529,
"learning_rate": 2.089427789471078e-05,
"loss": 0.4819,
"num_input_tokens_seen": 2147376,
"step": 420,
"train_runtime": 654.2112,
"train_tokens_per_second": 3282.39
},
{
"epoch": 1.1303191489361701,
"grad_norm": 5.21246729476843e-05,
"learning_rate": 2.0701632820959223e-05,
"loss": 0.3732,
"num_input_tokens_seen": 2170992,
"step": 425,
"train_runtime": 661.38,
"train_tokens_per_second": 3282.518
},
{
"epoch": 1.1436170212765957,
"grad_norm": 6.755034701200202e-05,
"learning_rate": 2.0507882107105664e-05,
"loss": 0.3435,
"num_input_tokens_seen": 2199216,
"step": 430,
"train_runtime": 669.64,
"train_tokens_per_second": 3284.177
},
{
"epoch": 1.1569148936170213,
"grad_norm": 9.302370017394423e-05,
"learning_rate": 2.0313063324590736e-05,
"loss": 0.2404,
"num_input_tokens_seen": 2223120,
"step": 435,
"train_runtime": 676.8858,
"train_tokens_per_second": 3284.335
},
{
"epoch": 1.1702127659574468,
"grad_norm": 7.59345421101898e-05,
"learning_rate": 2.0117214251971088e-05,
"loss": 0.2588,
"num_input_tokens_seen": 2246128,
"step": 440,
"train_runtime": 683.9358,
"train_tokens_per_second": 3284.121
},
{
"epoch": 1.1835106382978724,
"grad_norm": 0.00014472956536337733,
"learning_rate": 1.9920372867593537e-05,
"loss": 0.1375,
"num_input_tokens_seen": 2274448,
"step": 445,
"train_runtime": 692.2182,
"train_tokens_per_second": 3285.738
},
{
"epoch": 1.196808510638298,
"grad_norm": 6.864719762234017e-05,
"learning_rate": 1.9722577342230408e-05,
"loss": 0.1394,
"num_input_tokens_seen": 2298736,
"step": 450,
"train_runtime": 699.5691,
"train_tokens_per_second": 3285.931
},
{
"epoch": 1.2101063829787235,
"grad_norm": 7.647907477803528e-05,
"learning_rate": 1.9523866031677607e-05,
"loss": 0.1386,
"num_input_tokens_seen": 2326192,
"step": 455,
"train_runtime": 707.7177,
"train_tokens_per_second": 3286.893
},
{
"epoch": 1.2234042553191489,
"grad_norm": 5.32688463863451e-05,
"learning_rate": 1.9324277469316807e-05,
"loss": 0.1507,
"num_input_tokens_seen": 2354992,
"step": 460,
"train_runtime": 716.0771,
"train_tokens_per_second": 3288.741
},
{
"epoch": 1.2367021276595744,
"grad_norm": 6.799784750910476e-05,
"learning_rate": 1.9123850358643208e-05,
"loss": 0.1407,
"num_input_tokens_seen": 2378736,
"step": 465,
"train_runtime": 723.2828,
"train_tokens_per_second": 3288.805
},
{
"epoch": 1.25,
"grad_norm": 6.61658777971752e-05,
"learning_rate": 1.8922623565760255e-05,
"loss": 0.1241,
"num_input_tokens_seen": 2402928,
"step": 470,
"train_runtime": 730.5847,
"train_tokens_per_second": 3289.048
},
{
"epoch": 1.2632978723404256,
"grad_norm": 6.660693179583177e-05,
"learning_rate": 1.87206361118429e-05,
"loss": 0.1393,
"num_input_tokens_seen": 2427536,
"step": 475,
"train_runtime": 737.9838,
"train_tokens_per_second": 3289.417
},
{
"epoch": 1.2765957446808511,
"grad_norm": 5.5432989029213786e-05,
"learning_rate": 1.8517927165570745e-05,
"loss": 0.1068,
"num_input_tokens_seen": 2451952,
"step": 480,
"train_runtime": 745.3344,
"train_tokens_per_second": 3289.734
},
{
"epoch": 1.2898936170212765,
"grad_norm": 5.3888677939539775e-05,
"learning_rate": 1.831453603553259e-05,
"loss": 0.1255,
"num_input_tokens_seen": 2480912,
"step": 485,
"train_runtime": 753.8109,
"train_tokens_per_second": 3291.16
},
{
"epoch": 1.3031914893617023,
"grad_norm": 6.483653123723343e-05,
"learning_rate": 1.811050216260385e-05,
"loss": 0.0855,
"num_input_tokens_seen": 2505744,
"step": 490,
"train_runtime": 761.3174,
"train_tokens_per_second": 3291.326
},
{
"epoch": 1.3164893617021276,
"grad_norm": 5.622122625936754e-05,
"learning_rate": 1.790586511229832e-05,
"loss": 0.1123,
"num_input_tokens_seen": 2528720,
"step": 495,
"train_runtime": 768.3834,
"train_tokens_per_second": 3290.961
},
{
"epoch": 1.3297872340425532,
"grad_norm": 4.249440462444909e-05,
"learning_rate": 1.7700664567095788e-05,
"loss": 0.0643,
"num_input_tokens_seen": 2551760,
"step": 500,
"train_runtime": 775.3677,
"train_tokens_per_second": 3291.032
}
],
"logging_steps": 5,
"max_steps": 1128,
"num_input_tokens_seen": 2551760,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0923743661195264e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}