ttm4hvac-source-default / trainer_state.json
Ferran Aran
initial commit
da372b7 unverified
{
"best_global_step": 2128,
"best_metric": 0.08128391951322556,
"best_model_checkpoint": "tmp/out/1536-96-r2_common_channel_fcmCtx4_fcmLayers6_fcmChMixingTrue_stride24_bs512_lr0.001_a7e3/checkpoint-2128",
"epoch": 76.0,
"eval_steps": 500,
"global_step": 2128,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"grad_norm": 0.18818920850753784,
"learning_rate": 0.000999997705696395,
"loss": 0.1949,
"step": 28
},
{
"epoch": 1.0,
"eval_loss": 0.13799113035202026,
"eval_runtime": 13.9945,
"eval_samples_per_second": 388.011,
"eval_steps_per_second": 0.786,
"step": 28
},
{
"epoch": 2.0,
"grad_norm": 0.17697054147720337,
"learning_rate": 0.0009999904797644875,
"loss": 0.1397,
"step": 56
},
{
"epoch": 2.0,
"eval_loss": 0.1200891062617302,
"eval_runtime": 13.7764,
"eval_samples_per_second": 394.151,
"eval_steps_per_second": 0.798,
"step": 56
},
{
"epoch": 3.0,
"grad_norm": 0.12785276770591736,
"learning_rate": 0.0009999783191283983,
"loss": 0.1239,
"step": 84
},
{
"epoch": 3.0,
"eval_loss": 0.11045144498348236,
"eval_runtime": 13.7689,
"eval_samples_per_second": 394.367,
"eval_steps_per_second": 0.799,
"step": 84
},
{
"epoch": 4.0,
"grad_norm": 0.15085896849632263,
"learning_rate": 0.0009999612239081498,
"loss": 0.1156,
"step": 112
},
{
"epoch": 4.0,
"eval_loss": 0.10991678386926651,
"eval_runtime": 14.5903,
"eval_samples_per_second": 372.166,
"eval_steps_per_second": 0.754,
"step": 112
},
{
"epoch": 5.0,
"grad_norm": 0.1445561647415161,
"learning_rate": 0.0009999391942724632,
"loss": 0.1106,
"step": 140
},
{
"epoch": 5.0,
"eval_loss": 0.1034143716096878,
"eval_runtime": 12.8666,
"eval_samples_per_second": 422.022,
"eval_steps_per_second": 0.855,
"step": 140
},
{
"epoch": 6.0,
"grad_norm": 0.13405078649520874,
"learning_rate": 0.000999912230438763,
"loss": 0.1072,
"step": 168
},
{
"epoch": 6.0,
"eval_loss": 0.10323869436979294,
"eval_runtime": 13.5764,
"eval_samples_per_second": 399.957,
"eval_steps_per_second": 0.81,
"step": 168
},
{
"epoch": 7.0,
"grad_norm": 0.13994024693965912,
"learning_rate": 0.0009998803326731703,
"loss": 0.1042,
"step": 196
},
{
"epoch": 7.0,
"eval_loss": 0.10014788061380386,
"eval_runtime": 14.0652,
"eval_samples_per_second": 386.058,
"eval_steps_per_second": 0.782,
"step": 196
},
{
"epoch": 8.0,
"grad_norm": 0.13133035600185394,
"learning_rate": 0.0009998435012905044,
"loss": 0.1025,
"step": 224
},
{
"epoch": 8.0,
"eval_loss": 0.1004691943526268,
"eval_runtime": 13.771,
"eval_samples_per_second": 394.307,
"eval_steps_per_second": 0.799,
"step": 224
},
{
"epoch": 9.0,
"grad_norm": 0.1343812197446823,
"learning_rate": 0.0009998017366542756,
"loss": 0.1008,
"step": 252
},
{
"epoch": 9.0,
"eval_loss": 0.09979160130023956,
"eval_runtime": 14.0034,
"eval_samples_per_second": 387.762,
"eval_steps_per_second": 0.786,
"step": 252
},
{
"epoch": 10.0,
"grad_norm": 0.1553795337677002,
"learning_rate": 0.0009997550391766844,
"loss": 0.0991,
"step": 280
},
{
"epoch": 10.0,
"eval_loss": 0.098526231944561,
"eval_runtime": 13.3534,
"eval_samples_per_second": 406.638,
"eval_steps_per_second": 0.824,
"step": 280
},
{
"epoch": 11.0,
"grad_norm": 0.1313014179468155,
"learning_rate": 0.000999703409318616,
"loss": 0.0977,
"step": 308
},
{
"epoch": 11.0,
"eval_loss": 0.09693964570760727,
"eval_runtime": 14.1264,
"eval_samples_per_second": 384.385,
"eval_steps_per_second": 0.779,
"step": 308
},
{
"epoch": 12.0,
"grad_norm": 0.14203286170959473,
"learning_rate": 0.0009996468475896355,
"loss": 0.0964,
"step": 336
},
{
"epoch": 12.0,
"eval_loss": 0.0984538123011589,
"eval_runtime": 13.8783,
"eval_samples_per_second": 391.259,
"eval_steps_per_second": 0.793,
"step": 336
},
{
"epoch": 13.0,
"grad_norm": 0.16649910807609558,
"learning_rate": 0.0009995853545479853,
"loss": 0.0955,
"step": 364
},
{
"epoch": 13.0,
"eval_loss": 0.09544174373149872,
"eval_runtime": 13.7858,
"eval_samples_per_second": 393.884,
"eval_steps_per_second": 0.798,
"step": 364
},
{
"epoch": 14.0,
"grad_norm": 0.11850055307149887,
"learning_rate": 0.0009995189308005762,
"loss": 0.0945,
"step": 392
},
{
"epoch": 14.0,
"eval_loss": 0.09443824738264084,
"eval_runtime": 14.0327,
"eval_samples_per_second": 386.953,
"eval_steps_per_second": 0.784,
"step": 392
},
{
"epoch": 15.0,
"grad_norm": 0.1464478075504303,
"learning_rate": 0.0009994475770029841,
"loss": 0.0926,
"step": 420
},
{
"epoch": 15.0,
"eval_loss": 0.09755795449018478,
"eval_runtime": 13.5982,
"eval_samples_per_second": 399.319,
"eval_steps_per_second": 0.809,
"step": 420
},
{
"epoch": 16.0,
"grad_norm": 0.13713239133358002,
"learning_rate": 0.0009993712938594424,
"loss": 0.0922,
"step": 448
},
{
"epoch": 16.0,
"eval_loss": 0.09337516874074936,
"eval_runtime": 13.7553,
"eval_samples_per_second": 394.757,
"eval_steps_per_second": 0.8,
"step": 448
},
{
"epoch": 17.0,
"grad_norm": 0.17032985389232635,
"learning_rate": 0.0009992900821228345,
"loss": 0.0914,
"step": 476
},
{
"epoch": 17.0,
"eval_loss": 0.09800300747156143,
"eval_runtime": 13.4874,
"eval_samples_per_second": 402.597,
"eval_steps_per_second": 0.816,
"step": 476
},
{
"epoch": 18.0,
"grad_norm": 0.11109838634729385,
"learning_rate": 0.000999203942594687,
"loss": 0.091,
"step": 504
},
{
"epoch": 18.0,
"eval_loss": 0.09393668174743652,
"eval_runtime": 13.6194,
"eval_samples_per_second": 398.697,
"eval_steps_per_second": 0.808,
"step": 504
},
{
"epoch": 19.0,
"grad_norm": 0.13168472051620483,
"learning_rate": 0.0009991128761251632,
"loss": 0.0894,
"step": 532
},
{
"epoch": 19.0,
"eval_loss": 0.09444674849510193,
"eval_runtime": 13.817,
"eval_samples_per_second": 392.994,
"eval_steps_per_second": 0.796,
"step": 532
},
{
"epoch": 20.0,
"grad_norm": 0.12797316908836365,
"learning_rate": 0.0009990168836130527,
"loss": 0.0892,
"step": 560
},
{
"epoch": 20.0,
"eval_loss": 0.09130553901195526,
"eval_runtime": 12.8149,
"eval_samples_per_second": 423.724,
"eval_steps_per_second": 0.858,
"step": 560
},
{
"epoch": 21.0,
"grad_norm": 0.12064854055643082,
"learning_rate": 0.0009989159660057615,
"loss": 0.0878,
"step": 588
},
{
"epoch": 21.0,
"eval_loss": 0.09315093606710434,
"eval_runtime": 13.0593,
"eval_samples_per_second": 415.796,
"eval_steps_per_second": 0.842,
"step": 588
},
{
"epoch": 22.0,
"grad_norm": 0.11039382964372635,
"learning_rate": 0.0009988101242993065,
"loss": 0.0873,
"step": 616
},
{
"epoch": 22.0,
"eval_loss": 0.09174513071775436,
"eval_runtime": 13.6345,
"eval_samples_per_second": 398.254,
"eval_steps_per_second": 0.807,
"step": 616
},
{
"epoch": 23.0,
"grad_norm": 0.11381122469902039,
"learning_rate": 0.000998699359538303,
"loss": 0.0862,
"step": 644
},
{
"epoch": 23.0,
"eval_loss": 0.09421718120574951,
"eval_runtime": 13.1437,
"eval_samples_per_second": 413.126,
"eval_steps_per_second": 0.837,
"step": 644
},
{
"epoch": 24.0,
"grad_norm": 0.13068965077400208,
"learning_rate": 0.0009985836728159524,
"loss": 0.0862,
"step": 672
},
{
"epoch": 24.0,
"eval_loss": 0.0908147320151329,
"eval_runtime": 13.6756,
"eval_samples_per_second": 397.059,
"eval_steps_per_second": 0.804,
"step": 672
},
{
"epoch": 25.0,
"grad_norm": 0.19063422083854675,
"learning_rate": 0.0009984630652740383,
"loss": 0.0861,
"step": 700
},
{
"epoch": 25.0,
"eval_loss": 0.09025771915912628,
"eval_runtime": 12.5305,
"eval_samples_per_second": 433.342,
"eval_steps_per_second": 0.878,
"step": 700
},
{
"epoch": 26.0,
"grad_norm": 0.1407005786895752,
"learning_rate": 0.0009983375381029088,
"loss": 0.0854,
"step": 728
},
{
"epoch": 26.0,
"eval_loss": 0.08990131318569183,
"eval_runtime": 13.5474,
"eval_samples_per_second": 400.814,
"eval_steps_per_second": 0.812,
"step": 728
},
{
"epoch": 27.0,
"grad_norm": 0.1301293671131134,
"learning_rate": 0.0009982070925414637,
"loss": 0.0847,
"step": 756
},
{
"epoch": 27.0,
"eval_loss": 0.08932916820049286,
"eval_runtime": 13.7031,
"eval_samples_per_second": 396.259,
"eval_steps_per_second": 0.803,
"step": 756
},
{
"epoch": 28.0,
"grad_norm": 0.15950527787208557,
"learning_rate": 0.0009980717298771495,
"loss": 0.0836,
"step": 784
},
{
"epoch": 28.0,
"eval_loss": 0.08872799575328827,
"eval_runtime": 12.7264,
"eval_samples_per_second": 426.672,
"eval_steps_per_second": 0.864,
"step": 784
},
{
"epoch": 29.0,
"grad_norm": 0.14679257571697235,
"learning_rate": 0.000997931451445941,
"loss": 0.0831,
"step": 812
},
{
"epoch": 29.0,
"eval_loss": 0.09064222127199173,
"eval_runtime": 13.2879,
"eval_samples_per_second": 408.642,
"eval_steps_per_second": 0.828,
"step": 812
},
{
"epoch": 30.0,
"grad_norm": 0.1313679814338684,
"learning_rate": 0.0009977862586323298,
"loss": 0.0827,
"step": 840
},
{
"epoch": 30.0,
"eval_loss": 0.09007434546947479,
"eval_runtime": 13.714,
"eval_samples_per_second": 395.947,
"eval_steps_per_second": 0.802,
"step": 840
},
{
"epoch": 31.0,
"grad_norm": 0.1265498697757721,
"learning_rate": 0.0009976361528693104,
"loss": 0.0819,
"step": 868
},
{
"epoch": 31.0,
"eval_loss": 0.08951247483491898,
"eval_runtime": 13.4697,
"eval_samples_per_second": 403.128,
"eval_steps_per_second": 0.817,
"step": 868
},
{
"epoch": 32.0,
"grad_norm": 0.1129317358136177,
"learning_rate": 0.0009974811356383668,
"loss": 0.0819,
"step": 896
},
{
"epoch": 32.0,
"eval_loss": 0.09060715138912201,
"eval_runtime": 13.0035,
"eval_samples_per_second": 417.579,
"eval_steps_per_second": 0.846,
"step": 896
},
{
"epoch": 33.0,
"grad_norm": 0.12748871743679047,
"learning_rate": 0.0009973212084694557,
"loss": 0.0812,
"step": 924
},
{
"epoch": 33.0,
"eval_loss": 0.0907522663474083,
"eval_runtime": 14.0967,
"eval_samples_per_second": 385.196,
"eval_steps_per_second": 0.78,
"step": 924
},
{
"epoch": 34.0,
"grad_norm": 0.10055958479642868,
"learning_rate": 0.000997156372940993,
"loss": 0.0815,
"step": 952
},
{
"epoch": 34.0,
"eval_loss": 0.08895213901996613,
"eval_runtime": 13.4637,
"eval_samples_per_second": 403.308,
"eval_steps_per_second": 0.817,
"step": 952
},
{
"epoch": 35.0,
"grad_norm": 0.12697456777095795,
"learning_rate": 0.00099698663067984,
"loss": 0.0805,
"step": 980
},
{
"epoch": 35.0,
"eval_loss": 0.08886592090129852,
"eval_runtime": 13.6328,
"eval_samples_per_second": 398.305,
"eval_steps_per_second": 0.807,
"step": 980
},
{
"epoch": 36.0,
"grad_norm": 0.20513515174388885,
"learning_rate": 0.0009968119833612843,
"loss": 0.0804,
"step": 1008
},
{
"epoch": 36.0,
"eval_loss": 0.08658694475889206,
"eval_runtime": 13.2186,
"eval_samples_per_second": 410.785,
"eval_steps_per_second": 0.832,
"step": 1008
},
{
"epoch": 37.0,
"grad_norm": 0.1370176076889038,
"learning_rate": 0.000996632432709024,
"loss": 0.0803,
"step": 1036
},
{
"epoch": 37.0,
"eval_loss": 0.08886294066905975,
"eval_runtime": 13.4889,
"eval_samples_per_second": 402.554,
"eval_steps_per_second": 0.815,
"step": 1036
},
{
"epoch": 38.0,
"grad_norm": 0.13838233053684235,
"learning_rate": 0.0009964479804951505,
"loss": 0.0795,
"step": 1064
},
{
"epoch": 38.0,
"eval_loss": 0.0883188471198082,
"eval_runtime": 13.4375,
"eval_samples_per_second": 404.093,
"eval_steps_per_second": 0.819,
"step": 1064
},
{
"epoch": 39.0,
"grad_norm": 0.1312042474746704,
"learning_rate": 0.000996258628540135,
"loss": 0.0786,
"step": 1092
},
{
"epoch": 39.0,
"eval_loss": 0.0872192457318306,
"eval_runtime": 13.5729,
"eval_samples_per_second": 400.063,
"eval_steps_per_second": 0.81,
"step": 1092
},
{
"epoch": 40.0,
"grad_norm": 0.1502840369939804,
"learning_rate": 0.0009960643787128027,
"loss": 0.0783,
"step": 1120
},
{
"epoch": 40.0,
"eval_loss": 0.08811386674642563,
"eval_runtime": 13.7911,
"eval_samples_per_second": 393.732,
"eval_steps_per_second": 0.798,
"step": 1120
},
{
"epoch": 41.0,
"grad_norm": 0.10295199602842331,
"learning_rate": 0.0009958652329303218,
"loss": 0.0779,
"step": 1148
},
{
"epoch": 41.0,
"eval_loss": 0.08700462430715561,
"eval_runtime": 12.9613,
"eval_samples_per_second": 418.94,
"eval_steps_per_second": 0.849,
"step": 1148
},
{
"epoch": 42.0,
"grad_norm": 0.14093568921089172,
"learning_rate": 0.0009956611931581812,
"loss": 0.0779,
"step": 1176
},
{
"epoch": 42.0,
"eval_loss": 0.08765130490064621,
"eval_runtime": 13.6238,
"eval_samples_per_second": 398.567,
"eval_steps_per_second": 0.807,
"step": 1176
},
{
"epoch": 43.0,
"grad_norm": 0.12385249137878418,
"learning_rate": 0.00099545226141017,
"loss": 0.0771,
"step": 1204
},
{
"epoch": 43.0,
"eval_loss": 0.08630397915840149,
"eval_runtime": 13.9701,
"eval_samples_per_second": 388.687,
"eval_steps_per_second": 0.787,
"step": 1204
},
{
"epoch": 44.0,
"grad_norm": 0.12912018597126007,
"learning_rate": 0.000995238439748361,
"loss": 0.0765,
"step": 1232
},
{
"epoch": 44.0,
"eval_loss": 0.08651801943778992,
"eval_runtime": 14.0705,
"eval_samples_per_second": 385.913,
"eval_steps_per_second": 0.782,
"step": 1232
},
{
"epoch": 45.0,
"grad_norm": 0.13436101377010345,
"learning_rate": 0.000995019730283088,
"loss": 0.0768,
"step": 1260
},
{
"epoch": 45.0,
"eval_loss": 0.08759938925504684,
"eval_runtime": 13.8608,
"eval_samples_per_second": 391.754,
"eval_steps_per_second": 0.794,
"step": 1260
},
{
"epoch": 46.0,
"grad_norm": 0.1179327666759491,
"learning_rate": 0.000994796135172924,
"loss": 0.076,
"step": 1288
},
{
"epoch": 46.0,
"eval_loss": 0.08753702789545059,
"eval_runtime": 13.1322,
"eval_samples_per_second": 413.488,
"eval_steps_per_second": 0.838,
"step": 1288
},
{
"epoch": 47.0,
"grad_norm": 0.12396474927663803,
"learning_rate": 0.0009945676566246633,
"loss": 0.0758,
"step": 1316
},
{
"epoch": 47.0,
"eval_loss": 0.0872747004032135,
"eval_runtime": 13.9301,
"eval_samples_per_second": 389.803,
"eval_steps_per_second": 0.79,
"step": 1316
},
{
"epoch": 48.0,
"grad_norm": 0.13964654505252838,
"learning_rate": 0.0009943342968932972,
"loss": 0.0753,
"step": 1344
},
{
"epoch": 48.0,
"eval_loss": 0.08765023946762085,
"eval_runtime": 14.1213,
"eval_samples_per_second": 384.525,
"eval_steps_per_second": 0.779,
"step": 1344
},
{
"epoch": 49.0,
"grad_norm": 0.14741012454032898,
"learning_rate": 0.0009940960582819915,
"loss": 0.0751,
"step": 1372
},
{
"epoch": 49.0,
"eval_loss": 0.08702078461647034,
"eval_runtime": 14.1874,
"eval_samples_per_second": 382.734,
"eval_steps_per_second": 0.775,
"step": 1372
},
{
"epoch": 50.0,
"grad_norm": 0.1287020742893219,
"learning_rate": 0.0009938529431420646,
"loss": 0.075,
"step": 1400
},
{
"epoch": 50.0,
"eval_loss": 0.08901604264974594,
"eval_runtime": 13.9742,
"eval_samples_per_second": 388.575,
"eval_steps_per_second": 0.787,
"step": 1400
},
{
"epoch": 51.0,
"grad_norm": 0.12099709361791611,
"learning_rate": 0.0009936049538729656,
"loss": 0.0748,
"step": 1428
},
{
"epoch": 51.0,
"eval_loss": 0.08469326049089432,
"eval_runtime": 13.9824,
"eval_samples_per_second": 388.345,
"eval_steps_per_second": 0.787,
"step": 1428
},
{
"epoch": 52.0,
"grad_norm": 0.10210338979959488,
"learning_rate": 0.0009933520929222485,
"loss": 0.0742,
"step": 1456
},
{
"epoch": 52.0,
"eval_loss": 0.08571095019578934,
"eval_runtime": 14.305,
"eval_samples_per_second": 379.588,
"eval_steps_per_second": 0.769,
"step": 1456
},
{
"epoch": 53.0,
"grad_norm": 0.21403531730175018,
"learning_rate": 0.0009930943627855485,
"loss": 0.0738,
"step": 1484
},
{
"epoch": 53.0,
"eval_loss": 0.08666189014911652,
"eval_runtime": 13.2505,
"eval_samples_per_second": 409.795,
"eval_steps_per_second": 0.83,
"step": 1484
},
{
"epoch": 54.0,
"grad_norm": 0.15750914812088013,
"learning_rate": 0.0009928317660065577,
"loss": 0.073,
"step": 1512
},
{
"epoch": 54.0,
"eval_loss": 0.08511006087064743,
"eval_runtime": 13.0651,
"eval_samples_per_second": 415.611,
"eval_steps_per_second": 0.842,
"step": 1512
},
{
"epoch": 55.0,
"grad_norm": 0.13378620147705078,
"learning_rate": 0.000992564305177001,
"loss": 0.0731,
"step": 1540
},
{
"epoch": 55.0,
"eval_loss": 0.08461768925189972,
"eval_runtime": 13.8593,
"eval_samples_per_second": 391.796,
"eval_steps_per_second": 0.794,
"step": 1540
},
{
"epoch": 56.0,
"grad_norm": 0.10955790430307388,
"learning_rate": 0.0009922919829366086,
"loss": 0.0731,
"step": 1568
},
{
"epoch": 56.0,
"eval_loss": 0.0852380245923996,
"eval_runtime": 13.9377,
"eval_samples_per_second": 389.59,
"eval_steps_per_second": 0.789,
"step": 1568
},
{
"epoch": 57.0,
"grad_norm": 0.13638441264629364,
"learning_rate": 0.0009920148019730913,
"loss": 0.072,
"step": 1596
},
{
"epoch": 57.0,
"eval_loss": 0.08537213504314423,
"eval_runtime": 13.8736,
"eval_samples_per_second": 391.392,
"eval_steps_per_second": 0.793,
"step": 1596
},
{
"epoch": 58.0,
"grad_norm": 0.1874396950006485,
"learning_rate": 0.0009917327650221124,
"loss": 0.0719,
"step": 1624
},
{
"epoch": 58.0,
"eval_loss": 0.0842534676194191,
"eval_runtime": 14.039,
"eval_samples_per_second": 386.779,
"eval_steps_per_second": 0.784,
"step": 1624
},
{
"epoch": 59.0,
"grad_norm": 0.12774085998535156,
"learning_rate": 0.0009914458748672634,
"loss": 0.0717,
"step": 1652
},
{
"epoch": 59.0,
"eval_loss": 0.08502174913883209,
"eval_runtime": 14.0351,
"eval_samples_per_second": 386.886,
"eval_steps_per_second": 0.784,
"step": 1652
},
{
"epoch": 60.0,
"grad_norm": 0.13431188464164734,
"learning_rate": 0.000991154134340034,
"loss": 0.071,
"step": 1680
},
{
"epoch": 60.0,
"eval_loss": 0.08464298397302628,
"eval_runtime": 14.5396,
"eval_samples_per_second": 373.462,
"eval_steps_per_second": 0.757,
"step": 1680
},
{
"epoch": 61.0,
"grad_norm": 0.12022250145673752,
"learning_rate": 0.0009908575463197854,
"loss": 0.071,
"step": 1708
},
{
"epoch": 61.0,
"eval_loss": 0.0845947265625,
"eval_runtime": 13.51,
"eval_samples_per_second": 401.924,
"eval_steps_per_second": 0.814,
"step": 1708
},
{
"epoch": 62.0,
"grad_norm": 0.14051498472690582,
"learning_rate": 0.0009905561137337224,
"loss": 0.0706,
"step": 1736
},
{
"epoch": 62.0,
"eval_loss": 0.08576343953609467,
"eval_runtime": 13.5918,
"eval_samples_per_second": 399.506,
"eval_steps_per_second": 0.809,
"step": 1736
},
{
"epoch": 63.0,
"grad_norm": 0.12124724686145782,
"learning_rate": 0.0009902498395568619,
"loss": 0.0701,
"step": 1764
},
{
"epoch": 63.0,
"eval_loss": 0.08432195335626602,
"eval_runtime": 13.7606,
"eval_samples_per_second": 394.605,
"eval_steps_per_second": 0.799,
"step": 1764
},
{
"epoch": 64.0,
"grad_norm": 0.15731370449066162,
"learning_rate": 0.0009899387268120072,
"loss": 0.0701,
"step": 1792
},
{
"epoch": 64.0,
"eval_loss": 0.08528588712215424,
"eval_runtime": 13.8086,
"eval_samples_per_second": 393.233,
"eval_steps_per_second": 0.797,
"step": 1792
},
{
"epoch": 65.0,
"grad_norm": 0.12616483867168427,
"learning_rate": 0.0009896227785697153,
"loss": 0.0697,
"step": 1820
},
{
"epoch": 65.0,
"eval_loss": 0.08514665067195892,
"eval_runtime": 13.7015,
"eval_samples_per_second": 396.308,
"eval_steps_per_second": 0.803,
"step": 1820
},
{
"epoch": 66.0,
"grad_norm": 0.12217582017183304,
"learning_rate": 0.0009893019979482674,
"loss": 0.0697,
"step": 1848
},
{
"epoch": 66.0,
"eval_loss": 0.08603604882955551,
"eval_runtime": 13.8552,
"eval_samples_per_second": 391.91,
"eval_steps_per_second": 0.794,
"step": 1848
},
{
"epoch": 67.0,
"grad_norm": 0.1277308613061905,
"learning_rate": 0.0009889763881136386,
"loss": 0.0693,
"step": 1876
},
{
"epoch": 67.0,
"eval_loss": 0.08344025909900665,
"eval_runtime": 13.4274,
"eval_samples_per_second": 404.396,
"eval_steps_per_second": 0.819,
"step": 1876
},
{
"epoch": 68.0,
"grad_norm": 0.18230511248111725,
"learning_rate": 0.0009886459522794678,
"loss": 0.0692,
"step": 1904
},
{
"epoch": 68.0,
"eval_loss": 0.08314071595668793,
"eval_runtime": 13.346,
"eval_samples_per_second": 406.863,
"eval_steps_per_second": 0.824,
"step": 1904
},
{
"epoch": 69.0,
"grad_norm": 0.17872166633605957,
"learning_rate": 0.0009883106937070216,
"loss": 0.0694,
"step": 1932
},
{
"epoch": 69.0,
"eval_loss": 0.08490563184022903,
"eval_runtime": 14.1692,
"eval_samples_per_second": 383.224,
"eval_steps_per_second": 0.776,
"step": 1932
},
{
"epoch": 70.0,
"grad_norm": 0.1801847666501999,
"learning_rate": 0.000987970615705167,
"loss": 0.0691,
"step": 1960
},
{
"epoch": 70.0,
"eval_loss": 0.08387839794158936,
"eval_runtime": 13.4426,
"eval_samples_per_second": 403.94,
"eval_steps_per_second": 0.818,
"step": 1960
},
{
"epoch": 71.0,
"grad_norm": 0.1893129050731659,
"learning_rate": 0.0009876257216303382,
"loss": 0.0693,
"step": 1988
},
{
"epoch": 71.0,
"eval_loss": 0.08392627537250519,
"eval_runtime": 13.6535,
"eval_samples_per_second": 397.7,
"eval_steps_per_second": 0.806,
"step": 1988
},
{
"epoch": 72.0,
"grad_norm": 0.10252843797206879,
"learning_rate": 0.0009872760148864983,
"loss": 0.0678,
"step": 2016
},
{
"epoch": 72.0,
"eval_loss": 0.08312556147575378,
"eval_runtime": 13.2392,
"eval_samples_per_second": 410.145,
"eval_steps_per_second": 0.831,
"step": 2016
},
{
"epoch": 73.0,
"grad_norm": 0.10509613156318665,
"learning_rate": 0.0009869214989251126,
"loss": 0.0675,
"step": 2044
},
{
"epoch": 73.0,
"eval_loss": 0.08445987850427628,
"eval_runtime": 12.9266,
"eval_samples_per_second": 420.065,
"eval_steps_per_second": 0.851,
"step": 2044
},
{
"epoch": 74.0,
"grad_norm": 0.12325132638216019,
"learning_rate": 0.0009865621772451112,
"loss": 0.0676,
"step": 2072
},
{
"epoch": 74.0,
"eval_loss": 0.08372888714075089,
"eval_runtime": 13.3229,
"eval_samples_per_second": 407.57,
"eval_steps_per_second": 0.826,
"step": 2072
},
{
"epoch": 75.0,
"grad_norm": 0.1851738691329956,
"learning_rate": 0.000986198053392854,
"loss": 0.0675,
"step": 2100
},
{
"epoch": 75.0,
"eval_loss": 0.08268258720636368,
"eval_runtime": 13.5981,
"eval_samples_per_second": 399.322,
"eval_steps_per_second": 0.809,
"step": 2100
},
{
"epoch": 76.0,
"grad_norm": 0.16348977386951447,
"learning_rate": 0.0009858291309620953,
"loss": 0.0673,
"step": 2128
},
{
"epoch": 76.0,
"eval_loss": 0.08128391951322556,
"eval_runtime": 13.6413,
"eval_samples_per_second": 398.055,
"eval_steps_per_second": 0.806,
"step": 2128
}
],
"logging_steps": 500,
"max_steps": 28000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1000,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 10,
"early_stopping_threshold": 1e-05
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.500471276978176e+17,
"train_batch_size": 512,
"trial_name": null,
"trial_params": null
}