Time Series Forecasting
Safetensors
granite_tsfm
tinytimemixer
ttm4hvac
tsfm
digital twin
hvac
energy
ttm4hvac / trainer_state.json
Ferran Aran
initial commit
71fbc29 unverified
{
"best_global_step": 9408,
"best_metric": 0.1410149782896042,
"best_model_checkpoint": "tmp/out/1536-96-r2_mix_channel_fcmCtx3_fcmLayers3_fcmChMixingTrue_stride24_bs512_lrf_deb3/checkpoint-9408",
"epoch": 168.0,
"eval_steps": 500,
"global_step": 9408,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"grad_norm": 0.376089870929718,
"learning_rate": 0.00029836401390103334,
"loss": 0.3643,
"step": 56
},
{
"epoch": 1.0,
"eval_loss": 0.25079935789108276,
"eval_runtime": 12.3705,
"eval_samples_per_second": 877.898,
"eval_steps_per_second": 1.778,
"step": 56
},
{
"epoch": 2.0,
"grad_norm": 0.25105392932891846,
"learning_rate": 0.00029836183164580883,
"loss": 0.3058,
"step": 112
},
{
"epoch": 2.0,
"eval_loss": 0.23216894268989563,
"eval_runtime": 12.2194,
"eval_samples_per_second": 888.753,
"eval_steps_per_second": 1.8,
"step": 112
},
{
"epoch": 3.0,
"grad_norm": 0.17020165920257568,
"learning_rate": 0.00029835817704944523,
"loss": 0.2683,
"step": 168
},
{
"epoch": 3.0,
"eval_loss": 0.20991244912147522,
"eval_runtime": 10.9934,
"eval_samples_per_second": 987.863,
"eval_steps_per_second": 2.001,
"step": 168
},
{
"epoch": 4.0,
"grad_norm": 0.13130681216716766,
"learning_rate": 0.00029835305014801184,
"loss": 0.2395,
"step": 224
},
{
"epoch": 4.0,
"eval_loss": 0.19736029207706451,
"eval_runtime": 11.7226,
"eval_samples_per_second": 926.414,
"eval_steps_per_second": 1.877,
"step": 224
},
{
"epoch": 5.0,
"grad_norm": 0.12686163187026978,
"learning_rate": 0.0002983464509921093,
"loss": 0.2241,
"step": 280
},
{
"epoch": 5.0,
"eval_loss": 0.18977424502372742,
"eval_runtime": 11.8479,
"eval_samples_per_second": 916.618,
"eval_steps_per_second": 1.857,
"step": 280
},
{
"epoch": 6.0,
"grad_norm": 0.11746390908956528,
"learning_rate": 0.00029833837964686835,
"loss": 0.2148,
"step": 336
},
{
"epoch": 6.0,
"eval_loss": 0.1851092129945755,
"eval_runtime": 11.7556,
"eval_samples_per_second": 923.812,
"eval_steps_per_second": 1.871,
"step": 336
},
{
"epoch": 7.0,
"grad_norm": 0.13627897202968597,
"learning_rate": 0.0002983288361919503,
"loss": 0.2078,
"step": 392
},
{
"epoch": 7.0,
"eval_loss": 0.18129761517047882,
"eval_runtime": 11.7487,
"eval_samples_per_second": 924.357,
"eval_steps_per_second": 1.873,
"step": 392
},
{
"epoch": 8.0,
"grad_norm": 0.1497841328382492,
"learning_rate": 0.00029831782072154485,
"loss": 0.2025,
"step": 448
},
{
"epoch": 8.0,
"eval_loss": 0.17769944667816162,
"eval_runtime": 12.1141,
"eval_samples_per_second": 896.477,
"eval_steps_per_second": 1.816,
"step": 448
},
{
"epoch": 9.0,
"grad_norm": 0.19643521308898926,
"learning_rate": 0.0002983053333443701,
"loss": 0.1976,
"step": 504
},
{
"epoch": 9.0,
"eval_loss": 0.17583897709846497,
"eval_runtime": 12.5558,
"eval_samples_per_second": 864.936,
"eval_steps_per_second": 1.752,
"step": 504
},
{
"epoch": 10.0,
"grad_norm": 0.1033664122223854,
"learning_rate": 0.0002982913741836719,
"loss": 0.1936,
"step": 560
},
{
"epoch": 10.0,
"eval_loss": 0.1739388257265091,
"eval_runtime": 12.449,
"eval_samples_per_second": 872.358,
"eval_steps_per_second": 1.767,
"step": 560
},
{
"epoch": 11.0,
"grad_norm": 0.1361815184354782,
"learning_rate": 0.00029827594337722164,
"loss": 0.1902,
"step": 616
},
{
"epoch": 11.0,
"eval_loss": 0.17110829055309296,
"eval_runtime": 12.7701,
"eval_samples_per_second": 850.423,
"eval_steps_per_second": 1.723,
"step": 616
},
{
"epoch": 12.0,
"grad_norm": 0.12385320663452148,
"learning_rate": 0.0002982590410773146,
"loss": 0.1867,
"step": 672
},
{
"epoch": 12.0,
"eval_loss": 0.16852673888206482,
"eval_runtime": 11.8972,
"eval_samples_per_second": 912.817,
"eval_steps_per_second": 1.849,
"step": 672
},
{
"epoch": 13.0,
"grad_norm": 0.13126742839813232,
"learning_rate": 0.0002982406674507699,
"loss": 0.1837,
"step": 728
},
{
"epoch": 13.0,
"eval_loss": 0.1675039380788803,
"eval_runtime": 11.8951,
"eval_samples_per_second": 912.98,
"eval_steps_per_second": 1.85,
"step": 728
},
{
"epoch": 14.0,
"grad_norm": 0.14581529796123505,
"learning_rate": 0.00029822082267892794,
"loss": 0.1818,
"step": 784
},
{
"epoch": 14.0,
"eval_loss": 0.16522179543972015,
"eval_runtime": 12.951,
"eval_samples_per_second": 838.545,
"eval_steps_per_second": 1.699,
"step": 784
},
{
"epoch": 15.0,
"grad_norm": 0.12710689008235931,
"learning_rate": 0.0002981995069576483,
"loss": 0.1787,
"step": 840
},
{
"epoch": 15.0,
"eval_loss": 0.1651495099067688,
"eval_runtime": 12.4369,
"eval_samples_per_second": 873.211,
"eval_steps_per_second": 1.769,
"step": 840
},
{
"epoch": 16.0,
"grad_norm": 0.1914917379617691,
"learning_rate": 0.0002981767204973089,
"loss": 0.177,
"step": 896
},
{
"epoch": 16.0,
"eval_loss": 0.1639031320810318,
"eval_runtime": 12.7112,
"eval_samples_per_second": 854.365,
"eval_steps_per_second": 1.731,
"step": 896
},
{
"epoch": 17.0,
"grad_norm": 0.15502069890499115,
"learning_rate": 0.00029815246352280276,
"loss": 0.1751,
"step": 952
},
{
"epoch": 17.0,
"eval_loss": 0.16176268458366394,
"eval_runtime": 12.1031,
"eval_samples_per_second": 897.291,
"eval_steps_per_second": 1.818,
"step": 952
},
{
"epoch": 18.0,
"grad_norm": 0.11603855341672897,
"learning_rate": 0.0002981267362735362,
"loss": 0.1734,
"step": 1008
},
{
"epoch": 18.0,
"eval_loss": 0.1614038050174713,
"eval_runtime": 11.893,
"eval_samples_per_second": 913.139,
"eval_steps_per_second": 1.85,
"step": 1008
},
{
"epoch": 19.0,
"grad_norm": 0.11780980974435806,
"learning_rate": 0.0002980995390034271,
"loss": 0.172,
"step": 1064
},
{
"epoch": 19.0,
"eval_loss": 0.16114258766174316,
"eval_runtime": 12.6404,
"eval_samples_per_second": 859.152,
"eval_steps_per_second": 1.74,
"step": 1064
},
{
"epoch": 20.0,
"grad_norm": 0.14823858439922333,
"learning_rate": 0.00029807087198090116,
"loss": 0.1702,
"step": 1120
},
{
"epoch": 20.0,
"eval_loss": 0.15980996191501617,
"eval_runtime": 12.5631,
"eval_samples_per_second": 864.434,
"eval_steps_per_second": 1.751,
"step": 1120
},
{
"epoch": 21.0,
"grad_norm": 0.1246936172246933,
"learning_rate": 0.0002980407354888907,
"loss": 0.1688,
"step": 1176
},
{
"epoch": 21.0,
"eval_loss": 0.15955598652362823,
"eval_runtime": 12.315,
"eval_samples_per_second": 881.853,
"eval_steps_per_second": 1.786,
"step": 1176
},
{
"epoch": 22.0,
"grad_norm": 0.11726798117160797,
"learning_rate": 0.0002980091298248309,
"loss": 0.1675,
"step": 1232
},
{
"epoch": 22.0,
"eval_loss": 0.15864743292331696,
"eval_runtime": 12.3526,
"eval_samples_per_second": 879.166,
"eval_steps_per_second": 1.781,
"step": 1232
},
{
"epoch": 23.0,
"grad_norm": 0.13960805535316467,
"learning_rate": 0.0002979760553006564,
"loss": 0.1666,
"step": 1288
},
{
"epoch": 23.0,
"eval_loss": 0.15781378746032715,
"eval_runtime": 12.187,
"eval_samples_per_second": 891.116,
"eval_steps_per_second": 1.805,
"step": 1288
},
{
"epoch": 24.0,
"grad_norm": 0.11856065690517426,
"learning_rate": 0.00029794151224279964,
"loss": 0.1652,
"step": 1344
},
{
"epoch": 24.0,
"eval_loss": 0.15776978433132172,
"eval_runtime": 12.435,
"eval_samples_per_second": 873.344,
"eval_steps_per_second": 1.769,
"step": 1344
},
{
"epoch": 25.0,
"grad_norm": 0.12466388940811157,
"learning_rate": 0.00029790550099218654,
"loss": 0.1643,
"step": 1400
},
{
"epoch": 25.0,
"eval_loss": 0.15815725922584534,
"eval_runtime": 13.1792,
"eval_samples_per_second": 824.023,
"eval_steps_per_second": 1.669,
"step": 1400
},
{
"epoch": 26.0,
"grad_norm": 0.12369589507579803,
"learning_rate": 0.0002978680219042336,
"loss": 0.1633,
"step": 1456
},
{
"epoch": 26.0,
"eval_loss": 0.1567024141550064,
"eval_runtime": 12.484,
"eval_samples_per_second": 869.916,
"eval_steps_per_second": 1.762,
"step": 1456
},
{
"epoch": 27.0,
"grad_norm": 0.14197547733783722,
"learning_rate": 0.0002978290753488448,
"loss": 0.1624,
"step": 1512
},
{
"epoch": 27.0,
"eval_loss": 0.15676391124725342,
"eval_runtime": 12.738,
"eval_samples_per_second": 852.567,
"eval_steps_per_second": 1.727,
"step": 1512
},
{
"epoch": 28.0,
"grad_norm": 0.13262535631656647,
"learning_rate": 0.0002977886617104062,
"loss": 0.1613,
"step": 1568
},
{
"epoch": 28.0,
"eval_loss": 0.1567520797252655,
"eval_runtime": 12.6529,
"eval_samples_per_second": 858.304,
"eval_steps_per_second": 1.739,
"step": 1568
},
{
"epoch": 29.0,
"grad_norm": 0.15622882544994354,
"learning_rate": 0.0002977467813877842,
"loss": 0.1604,
"step": 1624
},
{
"epoch": 29.0,
"eval_loss": 0.15647795796394348,
"eval_runtime": 12.6006,
"eval_samples_per_second": 861.863,
"eval_steps_per_second": 1.746,
"step": 1624
},
{
"epoch": 30.0,
"grad_norm": 0.15161629021167755,
"learning_rate": 0.00029770343479432095,
"loss": 0.1598,
"step": 1680
},
{
"epoch": 30.0,
"eval_loss": 0.15717600286006927,
"eval_runtime": 12.8165,
"eval_samples_per_second": 847.348,
"eval_steps_per_second": 1.717,
"step": 1680
},
{
"epoch": 31.0,
"grad_norm": 0.12715986371040344,
"learning_rate": 0.0002976586223578297,
"loss": 0.1591,
"step": 1736
},
{
"epoch": 31.0,
"eval_loss": 0.1557074338197708,
"eval_runtime": 12.6403,
"eval_samples_per_second": 859.156,
"eval_steps_per_second": 1.74,
"step": 1736
},
{
"epoch": 32.0,
"grad_norm": 0.1595166027545929,
"learning_rate": 0.00029761234452059136,
"loss": 0.1584,
"step": 1792
},
{
"epoch": 32.0,
"eval_loss": 0.15540747344493866,
"eval_runtime": 13.3084,
"eval_samples_per_second": 816.027,
"eval_steps_per_second": 1.653,
"step": 1792
},
{
"epoch": 33.0,
"grad_norm": 0.16593649983406067,
"learning_rate": 0.0002975646017393494,
"loss": 0.1576,
"step": 1848
},
{
"epoch": 33.0,
"eval_loss": 0.15468333661556244,
"eval_runtime": 13.1483,
"eval_samples_per_second": 825.961,
"eval_steps_per_second": 1.673,
"step": 1848
},
{
"epoch": 34.0,
"grad_norm": 0.14555956423282623,
"learning_rate": 0.0002975153944853054,
"loss": 0.1567,
"step": 1904
},
{
"epoch": 34.0,
"eval_loss": 0.1553257554769516,
"eval_runtime": 12.853,
"eval_samples_per_second": 844.936,
"eval_steps_per_second": 1.712,
"step": 1904
},
{
"epoch": 35.0,
"grad_norm": 0.23194457590579987,
"learning_rate": 0.00029746472324411547,
"loss": 0.156,
"step": 1960
},
{
"epoch": 35.0,
"eval_loss": 0.1549767106771469,
"eval_runtime": 11.49,
"eval_samples_per_second": 945.169,
"eval_steps_per_second": 1.915,
"step": 1960
},
{
"epoch": 36.0,
"grad_norm": 0.17572428286075592,
"learning_rate": 0.0002974125885158844,
"loss": 0.1559,
"step": 2016
},
{
"epoch": 36.0,
"eval_loss": 0.15631072223186493,
"eval_runtime": 12.6465,
"eval_samples_per_second": 858.739,
"eval_steps_per_second": 1.74,
"step": 2016
},
{
"epoch": 37.0,
"grad_norm": 0.1315496563911438,
"learning_rate": 0.0002973589908151604,
"loss": 0.1547,
"step": 2072
},
{
"epoch": 37.0,
"eval_loss": 0.1540231704711914,
"eval_runtime": 13.3162,
"eval_samples_per_second": 815.548,
"eval_steps_per_second": 1.652,
"step": 2072
},
{
"epoch": 38.0,
"grad_norm": 0.17212693393230438,
"learning_rate": 0.0002973039306709319,
"loss": 0.1539,
"step": 2128
},
{
"epoch": 38.0,
"eval_loss": 0.15414279699325562,
"eval_runtime": 13.2364,
"eval_samples_per_second": 820.466,
"eval_steps_per_second": 1.662,
"step": 2128
},
{
"epoch": 39.0,
"grad_norm": 0.12589286267757416,
"learning_rate": 0.0002972474086266193,
"loss": 0.1538,
"step": 2184
},
{
"epoch": 39.0,
"eval_loss": 0.15399765968322754,
"eval_runtime": 12.5952,
"eval_samples_per_second": 862.236,
"eval_steps_per_second": 1.747,
"step": 2184
},
{
"epoch": 40.0,
"grad_norm": 0.1479528248310089,
"learning_rate": 0.0002971894252400732,
"loss": 0.1529,
"step": 2240
},
{
"epoch": 40.0,
"eval_loss": 0.1546306610107422,
"eval_runtime": 12.4569,
"eval_samples_per_second": 871.809,
"eval_steps_per_second": 1.766,
"step": 2240
},
{
"epoch": 41.0,
"grad_norm": 0.140830859541893,
"learning_rate": 0.00029712998108356566,
"loss": 0.1521,
"step": 2296
},
{
"epoch": 41.0,
"eval_loss": 0.15411749482154846,
"eval_runtime": 12.8911,
"eval_samples_per_second": 842.441,
"eval_steps_per_second": 1.707,
"step": 2296
},
{
"epoch": 42.0,
"grad_norm": 0.14429251849651337,
"learning_rate": 0.0002970690767437871,
"loss": 0.1521,
"step": 2352
},
{
"epoch": 42.0,
"eval_loss": 0.1535186916589737,
"eval_runtime": 12.7037,
"eval_samples_per_second": 854.87,
"eval_steps_per_second": 1.732,
"step": 2352
},
{
"epoch": 43.0,
"grad_norm": 0.1678067147731781,
"learning_rate": 0.00029700671282183844,
"loss": 0.1516,
"step": 2408
},
{
"epoch": 43.0,
"eval_loss": 0.15345174074172974,
"eval_runtime": 12.8622,
"eval_samples_per_second": 844.337,
"eval_steps_per_second": 1.71,
"step": 2408
},
{
"epoch": 44.0,
"grad_norm": 0.16715741157531738,
"learning_rate": 0.00029694288993322636,
"loss": 0.1506,
"step": 2464
},
{
"epoch": 44.0,
"eval_loss": 0.1528453379869461,
"eval_runtime": 12.394,
"eval_samples_per_second": 876.23,
"eval_steps_per_second": 1.775,
"step": 2464
},
{
"epoch": 45.0,
"grad_norm": 0.1476888358592987,
"learning_rate": 0.00029687760870785704,
"loss": 0.1502,
"step": 2520
},
{
"epoch": 45.0,
"eval_loss": 0.15371684730052948,
"eval_runtime": 12.8504,
"eval_samples_per_second": 845.113,
"eval_steps_per_second": 1.712,
"step": 2520
},
{
"epoch": 46.0,
"grad_norm": 0.16268473863601685,
"learning_rate": 0.00029681086979003,
"loss": 0.1497,
"step": 2576
},
{
"epoch": 46.0,
"eval_loss": 0.15216761827468872,
"eval_runtime": 12.9049,
"eval_samples_per_second": 841.539,
"eval_steps_per_second": 1.705,
"step": 2576
},
{
"epoch": 47.0,
"grad_norm": 0.17756158113479614,
"learning_rate": 0.0002967426738384313,
"loss": 0.1493,
"step": 2632
},
{
"epoch": 47.0,
"eval_loss": 0.15324676036834717,
"eval_runtime": 13.0526,
"eval_samples_per_second": 832.021,
"eval_steps_per_second": 1.685,
"step": 2632
},
{
"epoch": 48.0,
"grad_norm": 0.13994063436985016,
"learning_rate": 0.0002966730215261271,
"loss": 0.1487,
"step": 2688
},
{
"epoch": 48.0,
"eval_loss": 0.15221010148525238,
"eval_runtime": 12.6334,
"eval_samples_per_second": 859.628,
"eval_steps_per_second": 1.741,
"step": 2688
},
{
"epoch": 49.0,
"grad_norm": 0.18394885957241058,
"learning_rate": 0.0002966019135405581,
"loss": 0.1483,
"step": 2744
},
{
"epoch": 49.0,
"eval_loss": 0.15254603326320648,
"eval_runtime": 12.296,
"eval_samples_per_second": 883.214,
"eval_steps_per_second": 1.789,
"step": 2744
},
{
"epoch": 50.0,
"grad_norm": 0.14756232500076294,
"learning_rate": 0.000296529350583531,
"loss": 0.1479,
"step": 2800
},
{
"epoch": 50.0,
"eval_loss": 0.15157358348369598,
"eval_runtime": 12.7067,
"eval_samples_per_second": 854.666,
"eval_steps_per_second": 1.731,
"step": 2800
},
{
"epoch": 51.0,
"grad_norm": 0.18675681948661804,
"learning_rate": 0.00029645533337121344,
"loss": 0.1476,
"step": 2856
},
{
"epoch": 51.0,
"eval_loss": 0.15315961837768555,
"eval_runtime": 12.914,
"eval_samples_per_second": 840.949,
"eval_steps_per_second": 1.704,
"step": 2856
},
{
"epoch": 52.0,
"grad_norm": 0.21148425340652466,
"learning_rate": 0.0002963798626341248,
"loss": 0.1467,
"step": 2912
},
{
"epoch": 52.0,
"eval_loss": 0.151397705078125,
"eval_runtime": 12.6083,
"eval_samples_per_second": 861.336,
"eval_steps_per_second": 1.745,
"step": 2912
},
{
"epoch": 53.0,
"grad_norm": 0.14957012236118317,
"learning_rate": 0.00029630293911713125,
"loss": 0.1463,
"step": 2968
},
{
"epoch": 53.0,
"eval_loss": 0.152817040681839,
"eval_runtime": 12.3988,
"eval_samples_per_second": 875.89,
"eval_steps_per_second": 1.774,
"step": 2968
},
{
"epoch": 54.0,
"grad_norm": 0.18841682374477386,
"learning_rate": 0.0002962245635794367,
"loss": 0.1457,
"step": 3024
},
{
"epoch": 54.0,
"eval_loss": 0.1509653627872467,
"eval_runtime": 12.9201,
"eval_samples_per_second": 840.553,
"eval_steps_per_second": 1.703,
"step": 3024
},
{
"epoch": 55.0,
"grad_norm": 0.19782641530036926,
"learning_rate": 0.00029614473679457606,
"loss": 0.1457,
"step": 3080
},
{
"epoch": 55.0,
"eval_loss": 0.15204061567783356,
"eval_runtime": 13.0172,
"eval_samples_per_second": 834.282,
"eval_steps_per_second": 1.69,
"step": 3080
},
{
"epoch": 56.0,
"grad_norm": 0.15806534886360168,
"learning_rate": 0.0002960634595504073,
"loss": 0.145,
"step": 3136
},
{
"epoch": 56.0,
"eval_loss": 0.15144167840480804,
"eval_runtime": 12.3723,
"eval_samples_per_second": 877.767,
"eval_steps_per_second": 1.778,
"step": 3136
},
{
"epoch": 57.0,
"grad_norm": 0.1470707207918167,
"learning_rate": 0.00029598073264910414,
"loss": 0.1446,
"step": 3192
},
{
"epoch": 57.0,
"eval_loss": 0.15259326994419098,
"eval_runtime": 11.8486,
"eval_samples_per_second": 916.567,
"eval_steps_per_second": 1.857,
"step": 3192
},
{
"epoch": 58.0,
"grad_norm": 0.12880393862724304,
"learning_rate": 0.00029589655690714776,
"loss": 0.1444,
"step": 3248
},
{
"epoch": 58.0,
"eval_loss": 0.1521604359149933,
"eval_runtime": 12.3711,
"eval_samples_per_second": 877.851,
"eval_steps_per_second": 1.778,
"step": 3248
},
{
"epoch": 59.0,
"grad_norm": 0.20687344670295715,
"learning_rate": 0.00029581093315531867,
"loss": 0.1439,
"step": 3304
},
{
"epoch": 59.0,
"eval_loss": 0.1506902128458023,
"eval_runtime": 12.2839,
"eval_samples_per_second": 884.082,
"eval_steps_per_second": 1.791,
"step": 3304
},
{
"epoch": 60.0,
"grad_norm": 0.31674283742904663,
"learning_rate": 0.00029572386223868856,
"loss": 0.1434,
"step": 3360
},
{
"epoch": 60.0,
"eval_loss": 0.1497628092765808,
"eval_runtime": 12.2602,
"eval_samples_per_second": 885.791,
"eval_steps_per_second": 1.794,
"step": 3360
},
{
"epoch": 61.0,
"grad_norm": 0.1524023711681366,
"learning_rate": 0.0002956353450166127,
"loss": 0.1428,
"step": 3416
},
{
"epoch": 61.0,
"eval_loss": 0.15104272961616516,
"eval_runtime": 11.4854,
"eval_samples_per_second": 945.545,
"eval_steps_per_second": 1.915,
"step": 3416
},
{
"epoch": 62.0,
"grad_norm": 0.1333588808774948,
"learning_rate": 0.00029554538236271986,
"loss": 0.1427,
"step": 3472
},
{
"epoch": 62.0,
"eval_loss": 0.15125687420368195,
"eval_runtime": 11.619,
"eval_samples_per_second": 934.673,
"eval_steps_per_second": 1.893,
"step": 3472
},
{
"epoch": 63.0,
"grad_norm": 0.14987458288669586,
"learning_rate": 0.0002954539751649054,
"loss": 0.1427,
"step": 3528
},
{
"epoch": 63.0,
"eval_loss": 0.15022161602973938,
"eval_runtime": 11.7178,
"eval_samples_per_second": 926.795,
"eval_steps_per_second": 1.877,
"step": 3528
},
{
"epoch": 64.0,
"grad_norm": 0.19036932289600372,
"learning_rate": 0.00029536112432532164,
"loss": 0.1418,
"step": 3584
},
{
"epoch": 64.0,
"eval_loss": 0.15002530813217163,
"eval_runtime": 12.0423,
"eval_samples_per_second": 901.82,
"eval_steps_per_second": 1.827,
"step": 3584
},
{
"epoch": 65.0,
"grad_norm": 0.15858310461044312,
"learning_rate": 0.00029526683076036824,
"loss": 0.1416,
"step": 3640
},
{
"epoch": 65.0,
"eval_loss": 0.15072880685329437,
"eval_runtime": 11.4427,
"eval_samples_per_second": 949.077,
"eval_steps_per_second": 1.923,
"step": 3640
},
{
"epoch": 66.0,
"grad_norm": 0.1411045342683792,
"learning_rate": 0.0002951710954006851,
"loss": 0.1415,
"step": 3696
},
{
"epoch": 66.0,
"eval_loss": 0.150208979845047,
"eval_runtime": 11.7843,
"eval_samples_per_second": 921.567,
"eval_steps_per_second": 1.867,
"step": 3696
},
{
"epoch": 67.0,
"grad_norm": 0.18127693235874176,
"learning_rate": 0.00029507391919114174,
"loss": 0.1407,
"step": 3752
},
{
"epoch": 67.0,
"eval_loss": 0.15111134946346283,
"eval_runtime": 11.7998,
"eval_samples_per_second": 920.352,
"eval_steps_per_second": 1.864,
"step": 3752
},
{
"epoch": 68.0,
"grad_norm": 0.20954985916614532,
"learning_rate": 0.0002949753030908276,
"loss": 0.1404,
"step": 3808
},
{
"epoch": 68.0,
"eval_loss": 0.15048466622829437,
"eval_runtime": 11.8536,
"eval_samples_per_second": 916.178,
"eval_steps_per_second": 1.856,
"step": 3808
},
{
"epoch": 69.0,
"grad_norm": 0.1799214780330658,
"learning_rate": 0.0002948752480730442,
"loss": 0.1401,
"step": 3864
},
{
"epoch": 69.0,
"eval_loss": 0.14996136724948883,
"eval_runtime": 11.8425,
"eval_samples_per_second": 917.04,
"eval_steps_per_second": 1.858,
"step": 3864
},
{
"epoch": 70.0,
"grad_norm": 0.14687888324260712,
"learning_rate": 0.0002947737551252938,
"loss": 0.1399,
"step": 3920
},
{
"epoch": 70.0,
"eval_loss": 0.1494998186826706,
"eval_runtime": 11.8446,
"eval_samples_per_second": 916.877,
"eval_steps_per_second": 1.857,
"step": 3920
},
{
"epoch": 71.0,
"grad_norm": 0.2250983864068985,
"learning_rate": 0.000294670825249271,
"loss": 0.1397,
"step": 3976
},
{
"epoch": 71.0,
"eval_loss": 0.14974181354045868,
"eval_runtime": 10.3667,
"eval_samples_per_second": 1047.585,
"eval_steps_per_second": 2.122,
"step": 3976
},
{
"epoch": 72.0,
"grad_norm": 0.14977572858333588,
"learning_rate": 0.00029456645946085235,
"loss": 0.1393,
"step": 4032
},
{
"epoch": 72.0,
"eval_loss": 0.1504337042570114,
"eval_runtime": 11.0031,
"eval_samples_per_second": 986.994,
"eval_steps_per_second": 1.999,
"step": 4032
},
{
"epoch": 73.0,
"grad_norm": 0.2215435802936554,
"learning_rate": 0.00029446065879008577,
"loss": 0.1389,
"step": 4088
},
{
"epoch": 73.0,
"eval_loss": 0.14960449934005737,
"eval_runtime": 10.5211,
"eval_samples_per_second": 1032.216,
"eval_steps_per_second": 2.091,
"step": 4088
},
{
"epoch": 74.0,
"grad_norm": 0.14885684847831726,
"learning_rate": 0.00029435342428118117,
"loss": 0.1384,
"step": 4144
},
{
"epoch": 74.0,
"eval_loss": 0.14882370829582214,
"eval_runtime": 11.6942,
"eval_samples_per_second": 928.669,
"eval_steps_per_second": 1.881,
"step": 4144
},
{
"epoch": 75.0,
"grad_norm": 0.20596224069595337,
"learning_rate": 0.0002942447569924998,
"loss": 0.1384,
"step": 4200
},
{
"epoch": 75.0,
"eval_loss": 0.14847591519355774,
"eval_runtime": 11.911,
"eval_samples_per_second": 911.765,
"eval_steps_per_second": 1.847,
"step": 4200
},
{
"epoch": 76.0,
"grad_norm": 0.1551866978406906,
"learning_rate": 0.0002941346579965444,
"loss": 0.1379,
"step": 4256
},
{
"epoch": 76.0,
"eval_loss": 0.1497822105884552,
"eval_runtime": 11.0615,
"eval_samples_per_second": 981.782,
"eval_steps_per_second": 1.989,
"step": 4256
},
{
"epoch": 77.0,
"grad_norm": 0.19567330181598663,
"learning_rate": 0.00029402312837994727,
"loss": 0.138,
"step": 4312
},
{
"epoch": 77.0,
"eval_loss": 0.14890199899673462,
"eval_runtime": 11.5065,
"eval_samples_per_second": 943.812,
"eval_steps_per_second": 1.912,
"step": 4312
},
{
"epoch": 78.0,
"grad_norm": 0.1951490044593811,
"learning_rate": 0.0002939101692434606,
"loss": 0.1372,
"step": 4368
},
{
"epoch": 78.0,
"eval_loss": 0.14929604530334473,
"eval_runtime": 11.7303,
"eval_samples_per_second": 925.806,
"eval_steps_per_second": 1.875,
"step": 4368
},
{
"epoch": 79.0,
"grad_norm": 0.15116438269615173,
"learning_rate": 0.00029379578170194554,
"loss": 0.1371,
"step": 4424
},
{
"epoch": 79.0,
"eval_loss": 0.14909496903419495,
"eval_runtime": 11.5142,
"eval_samples_per_second": 943.184,
"eval_steps_per_second": 1.911,
"step": 4424
},
{
"epoch": 80.0,
"grad_norm": 0.24799354374408722,
"learning_rate": 0.00029367996688436096,
"loss": 0.1369,
"step": 4480
},
{
"epoch": 80.0,
"eval_loss": 0.14952804148197174,
"eval_runtime": 10.7014,
"eval_samples_per_second": 1014.824,
"eval_steps_per_second": 2.056,
"step": 4480
},
{
"epoch": 81.0,
"grad_norm": 0.16792896389961243,
"learning_rate": 0.00029356272593375216,
"loss": 0.1368,
"step": 4536
},
{
"epoch": 81.0,
"eval_loss": 0.1491686999797821,
"eval_runtime": 11.5601,
"eval_samples_per_second": 939.442,
"eval_steps_per_second": 1.903,
"step": 4536
},
{
"epoch": 82.0,
"grad_norm": 0.21115855872631073,
"learning_rate": 0.00029344406000724046,
"loss": 0.1363,
"step": 4592
},
{
"epoch": 82.0,
"eval_loss": 0.14837497472763062,
"eval_runtime": 11.7754,
"eval_samples_per_second": 922.263,
"eval_steps_per_second": 1.868,
"step": 4592
},
{
"epoch": 83.0,
"grad_norm": 0.15595555305480957,
"learning_rate": 0.0002933239702760101,
"loss": 0.1361,
"step": 4648
},
{
"epoch": 83.0,
"eval_loss": 0.14758282899856567,
"eval_runtime": 11.5424,
"eval_samples_per_second": 940.879,
"eval_steps_per_second": 1.906,
"step": 4648
},
{
"epoch": 84.0,
"grad_norm": 0.14343903958797455,
"learning_rate": 0.00029320245792529843,
"loss": 0.1355,
"step": 4704
},
{
"epoch": 84.0,
"eval_loss": 0.1478155553340912,
"eval_runtime": 11.4968,
"eval_samples_per_second": 944.61,
"eval_steps_per_second": 1.914,
"step": 4704
},
{
"epoch": 85.0,
"grad_norm": 0.2670864462852478,
"learning_rate": 0.00029307952415438376,
"loss": 0.1353,
"step": 4760
},
{
"epoch": 85.0,
"eval_loss": 0.14811985194683075,
"eval_runtime": 11.0295,
"eval_samples_per_second": 984.636,
"eval_steps_per_second": 1.995,
"step": 4760
},
{
"epoch": 86.0,
"grad_norm": 0.19388346374034882,
"learning_rate": 0.00029295517017657207,
"loss": 0.1353,
"step": 4816
},
{
"epoch": 86.0,
"eval_loss": 0.14837351441383362,
"eval_runtime": 11.4695,
"eval_samples_per_second": 946.859,
"eval_steps_per_second": 1.918,
"step": 4816
},
{
"epoch": 87.0,
"grad_norm": 0.15899422764778137,
"learning_rate": 0.00029282939721918743,
"loss": 0.1351,
"step": 4872
},
{
"epoch": 87.0,
"eval_loss": 0.14791646599769592,
"eval_runtime": 11.4789,
"eval_samples_per_second": 946.087,
"eval_steps_per_second": 1.917,
"step": 4872
},
{
"epoch": 88.0,
"grad_norm": 0.25924888253211975,
"learning_rate": 0.00029270220652355785,
"loss": 0.1345,
"step": 4928
},
{
"epoch": 88.0,
"eval_loss": 0.1483958214521408,
"eval_runtime": 11.0986,
"eval_samples_per_second": 978.501,
"eval_steps_per_second": 1.982,
"step": 4928
},
{
"epoch": 89.0,
"grad_norm": 0.197585791349411,
"learning_rate": 0.0002925735993450043,
"loss": 0.1342,
"step": 4984
},
{
"epoch": 89.0,
"eval_loss": 0.14841538667678833,
"eval_runtime": 11.2913,
"eval_samples_per_second": 961.799,
"eval_steps_per_second": 1.948,
"step": 4984
},
{
"epoch": 90.0,
"grad_norm": 0.18903715908527374,
"learning_rate": 0.0002924435769528278,
"loss": 0.1343,
"step": 5040
},
{
"epoch": 90.0,
"eval_loss": 0.14745239913463593,
"eval_runtime": 12.07,
"eval_samples_per_second": 899.752,
"eval_steps_per_second": 1.823,
"step": 5040
},
{
"epoch": 91.0,
"grad_norm": 0.1610485017299652,
"learning_rate": 0.00029231214063029666,
"loss": 0.1336,
"step": 5096
},
{
"epoch": 91.0,
"eval_loss": 0.1469384878873825,
"eval_runtime": 12.1199,
"eval_samples_per_second": 896.05,
"eval_steps_per_second": 1.815,
"step": 5096
},
{
"epoch": 92.0,
"grad_norm": 0.20112423598766327,
"learning_rate": 0.00029217929167463404,
"loss": 0.1337,
"step": 5152
},
{
"epoch": 92.0,
"eval_loss": 0.14764182269573212,
"eval_runtime": 10.2692,
"eval_samples_per_second": 1057.536,
"eval_steps_per_second": 2.142,
"step": 5152
},
{
"epoch": 93.0,
"grad_norm": 0.28488588333129883,
"learning_rate": 0.00029204503139700625,
"loss": 0.1335,
"step": 5208
},
{
"epoch": 93.0,
"eval_loss": 0.1479685753583908,
"eval_runtime": 11.6849,
"eval_samples_per_second": 929.407,
"eval_steps_per_second": 1.883,
"step": 5208
},
{
"epoch": 94.0,
"grad_norm": 0.2028261125087738,
"learning_rate": 0.0002919093611225077,
"loss": 0.1333,
"step": 5264
},
{
"epoch": 94.0,
"eval_loss": 0.14725789427757263,
"eval_runtime": 11.2025,
"eval_samples_per_second": 969.429,
"eval_steps_per_second": 1.964,
"step": 5264
},
{
"epoch": 95.0,
"grad_norm": 0.20275919139385223,
"learning_rate": 0.0002917722821901492,
"loss": 0.1334,
"step": 5320
},
{
"epoch": 95.0,
"eval_loss": 0.14767614006996155,
"eval_runtime": 10.8005,
"eval_samples_per_second": 1005.513,
"eval_steps_per_second": 2.037,
"step": 5320
},
{
"epoch": 96.0,
"grad_norm": 0.2053348869085312,
"learning_rate": 0.0002916337959528444,
"loss": 0.1325,
"step": 5376
},
{
"epoch": 96.0,
"eval_loss": 0.14707864820957184,
"eval_runtime": 11.1238,
"eval_samples_per_second": 976.287,
"eval_steps_per_second": 1.978,
"step": 5376
},
{
"epoch": 97.0,
"grad_norm": 0.23510950803756714,
"learning_rate": 0.0002914939037773966,
"loss": 0.1321,
"step": 5432
},
{
"epoch": 97.0,
"eval_loss": 0.1476944088935852,
"eval_runtime": 10.9362,
"eval_samples_per_second": 993.028,
"eval_steps_per_second": 2.012,
"step": 5432
},
{
"epoch": 98.0,
"grad_norm": 0.2703108787536621,
"learning_rate": 0.000291352607044485,
"loss": 0.1327,
"step": 5488
},
{
"epoch": 98.0,
"eval_loss": 0.1466565579175949,
"eval_runtime": 10.8189,
"eval_samples_per_second": 1003.802,
"eval_steps_per_second": 2.033,
"step": 5488
},
{
"epoch": 99.0,
"grad_norm": 0.22386641800403595,
"learning_rate": 0.0002912099071486513,
"loss": 0.1318,
"step": 5544
},
{
"epoch": 99.0,
"eval_loss": 0.1469065397977829,
"eval_runtime": 10.9677,
"eval_samples_per_second": 990.181,
"eval_steps_per_second": 2.006,
"step": 5544
},
{
"epoch": 100.0,
"grad_norm": 0.18684013187885284,
"learning_rate": 0.0002910658054982861,
"loss": 0.1319,
"step": 5600
},
{
"epoch": 100.0,
"eval_loss": 0.1462097316980362,
"eval_runtime": 11.5801,
"eval_samples_per_second": 937.82,
"eval_steps_per_second": 1.9,
"step": 5600
},
{
"epoch": 101.0,
"grad_norm": 0.1831580400466919,
"learning_rate": 0.00029092030351561435,
"loss": 0.1318,
"step": 5656
},
{
"epoch": 101.0,
"eval_loss": 0.1467864215373993,
"eval_runtime": 11.2551,
"eval_samples_per_second": 964.899,
"eval_steps_per_second": 1.955,
"step": 5656
},
{
"epoch": 102.0,
"grad_norm": 0.20423631370067596,
"learning_rate": 0.00029077340263668184,
"loss": 0.1315,
"step": 5712
},
{
"epoch": 102.0,
"eval_loss": 0.1470629870891571,
"eval_runtime": 10.0185,
"eval_samples_per_second": 1083.994,
"eval_steps_per_second": 2.196,
"step": 5712
},
{
"epoch": 103.0,
"grad_norm": 0.20669810473918915,
"learning_rate": 0.0002906251043113414,
"loss": 0.1312,
"step": 5768
},
{
"epoch": 103.0,
"eval_loss": 0.14603030681610107,
"eval_runtime": 11.5962,
"eval_samples_per_second": 936.51,
"eval_steps_per_second": 1.897,
"step": 5768
},
{
"epoch": 104.0,
"grad_norm": 0.18566496670246124,
"learning_rate": 0.0002904754100032369,
"loss": 0.1308,
"step": 5824
},
{
"epoch": 104.0,
"eval_loss": 0.146591916680336,
"eval_runtime": 11.8139,
"eval_samples_per_second": 919.255,
"eval_steps_per_second": 1.862,
"step": 5824
},
{
"epoch": 105.0,
"grad_norm": 0.32265496253967285,
"learning_rate": 0.000290324321189791,
"loss": 0.1311,
"step": 5880
},
{
"epoch": 105.0,
"eval_loss": 0.1458718478679657,
"eval_runtime": 11.9546,
"eval_samples_per_second": 908.438,
"eval_steps_per_second": 1.84,
"step": 5880
},
{
"epoch": 106.0,
"grad_norm": 0.17987699806690216,
"learning_rate": 0.00029017183936218906,
"loss": 0.1302,
"step": 5936
},
{
"epoch": 106.0,
"eval_loss": 0.1459737867116928,
"eval_runtime": 12.1694,
"eval_samples_per_second": 892.4,
"eval_steps_per_second": 1.808,
"step": 5936
},
{
"epoch": 107.0,
"grad_norm": 0.18314820528030396,
"learning_rate": 0.0002900179660253659,
"loss": 0.1303,
"step": 5992
},
{
"epoch": 107.0,
"eval_loss": 0.14506617188453674,
"eval_runtime": 11.0204,
"eval_samples_per_second": 985.446,
"eval_steps_per_second": 1.996,
"step": 5992
},
{
"epoch": 108.0,
"grad_norm": 0.1967027485370636,
"learning_rate": 0.00028986270269798893,
"loss": 0.13,
"step": 6048
},
{
"epoch": 108.0,
"eval_loss": 0.1448826640844345,
"eval_runtime": 11.2115,
"eval_samples_per_second": 968.651,
"eval_steps_per_second": 1.962,
"step": 6048
},
{
"epoch": 109.0,
"grad_norm": 0.17848514020442963,
"learning_rate": 0.00028970605091244395,
"loss": 0.13,
"step": 6104
},
{
"epoch": 109.0,
"eval_loss": 0.14577716588974,
"eval_runtime": 12.0159,
"eval_samples_per_second": 903.806,
"eval_steps_per_second": 1.831,
"step": 6104
},
{
"epoch": 110.0,
"grad_norm": 0.1681281179189682,
"learning_rate": 0.00028954801221482137,
"loss": 0.13,
"step": 6160
},
{
"epoch": 110.0,
"eval_loss": 0.1459922343492508,
"eval_runtime": 11.657,
"eval_samples_per_second": 931.628,
"eval_steps_per_second": 1.887,
"step": 6160
},
{
"epoch": 111.0,
"grad_norm": 0.19543369114398956,
"learning_rate": 0.00028938858816489945,
"loss": 0.1294,
"step": 6216
},
{
"epoch": 111.0,
"eval_loss": 0.14557458460330963,
"eval_runtime": 11.502,
"eval_samples_per_second": 944.183,
"eval_steps_per_second": 1.913,
"step": 6216
},
{
"epoch": 112.0,
"grad_norm": 0.19514279067516327,
"learning_rate": 0.0002892277803361288,
"loss": 0.1294,
"step": 6272
},
{
"epoch": 112.0,
"eval_loss": 0.14542081952095032,
"eval_runtime": 11.3675,
"eval_samples_per_second": 955.353,
"eval_steps_per_second": 1.935,
"step": 6272
},
{
"epoch": 113.0,
"grad_norm": 0.19245897233486176,
"learning_rate": 0.00028906559031561803,
"loss": 0.1294,
"step": 6328
},
{
"epoch": 113.0,
"eval_loss": 0.14575673639774323,
"eval_runtime": 12.0854,
"eval_samples_per_second": 898.603,
"eval_steps_per_second": 1.82,
"step": 6328
},
{
"epoch": 114.0,
"grad_norm": 0.2559398412704468,
"learning_rate": 0.0002889020197041172,
"loss": 0.129,
"step": 6384
},
{
"epoch": 114.0,
"eval_loss": 0.14476452767848969,
"eval_runtime": 11.4747,
"eval_samples_per_second": 946.432,
"eval_steps_per_second": 1.917,
"step": 6384
},
{
"epoch": 115.0,
"grad_norm": 0.1581374853849411,
"learning_rate": 0.0002887370701160019,
"loss": 0.129,
"step": 6440
},
{
"epoch": 115.0,
"eval_loss": 0.14649543166160583,
"eval_runtime": 11.7792,
"eval_samples_per_second": 921.961,
"eval_steps_per_second": 1.868,
"step": 6440
},
{
"epoch": 116.0,
"grad_norm": 0.17189738154411316,
"learning_rate": 0.0002885707431792581,
"loss": 0.1282,
"step": 6496
},
{
"epoch": 116.0,
"eval_loss": 0.14660660922527313,
"eval_runtime": 11.9186,
"eval_samples_per_second": 911.183,
"eval_steps_per_second": 1.846,
"step": 6496
},
{
"epoch": 117.0,
"grad_norm": 0.2357121855020523,
"learning_rate": 0.0002884030405354656,
"loss": 0.129,
"step": 6552
},
{
"epoch": 117.0,
"eval_loss": 0.146439790725708,
"eval_runtime": 11.5156,
"eval_samples_per_second": 943.071,
"eval_steps_per_second": 1.91,
"step": 6552
},
{
"epoch": 118.0,
"grad_norm": 0.1968863159418106,
"learning_rate": 0.00028823396383978163,
"loss": 0.1279,
"step": 6608
},
{
"epoch": 118.0,
"eval_loss": 0.1450948715209961,
"eval_runtime": 11.6204,
"eval_samples_per_second": 934.567,
"eval_steps_per_second": 1.893,
"step": 6608
},
{
"epoch": 119.0,
"grad_norm": 0.16850939393043518,
"learning_rate": 0.0002880635147609254,
"loss": 0.1279,
"step": 6664
},
{
"epoch": 119.0,
"eval_loss": 0.1456771343946457,
"eval_runtime": 11.4295,
"eval_samples_per_second": 950.17,
"eval_steps_per_second": 1.925,
"step": 6664
},
{
"epoch": 120.0,
"grad_norm": 0.20816339552402496,
"learning_rate": 0.0002878916949811601,
"loss": 0.1277,
"step": 6720
},
{
"epoch": 120.0,
"eval_loss": 0.1461264193058014,
"eval_runtime": 11.9161,
"eval_samples_per_second": 911.372,
"eval_steps_per_second": 1.846,
"step": 6720
},
{
"epoch": 121.0,
"grad_norm": 0.19195137917995453,
"learning_rate": 0.0002877185061962775,
"loss": 0.1279,
"step": 6776
},
{
"epoch": 121.0,
"eval_loss": 0.14506319165229797,
"eval_runtime": 10.7769,
"eval_samples_per_second": 1007.715,
"eval_steps_per_second": 2.041,
"step": 6776
},
{
"epoch": 122.0,
"grad_norm": 0.1636265516281128,
"learning_rate": 0.0002875439501155812,
"loss": 0.1277,
"step": 6832
},
{
"epoch": 122.0,
"eval_loss": 0.1454634666442871,
"eval_runtime": 11.7121,
"eval_samples_per_second": 927.245,
"eval_steps_per_second": 1.878,
"step": 6832
},
{
"epoch": 123.0,
"grad_norm": 0.17660963535308838,
"learning_rate": 0.00028736802846186907,
"loss": 0.1273,
"step": 6888
},
{
"epoch": 123.0,
"eval_loss": 0.1449379324913025,
"eval_runtime": 12.0977,
"eval_samples_per_second": 897.695,
"eval_steps_per_second": 1.819,
"step": 6888
},
{
"epoch": 124.0,
"grad_norm": 0.20895443856716156,
"learning_rate": 0.00028719074297141686,
"loss": 0.127,
"step": 6944
},
{
"epoch": 124.0,
"eval_loss": 0.14427852630615234,
"eval_runtime": 11.8774,
"eval_samples_per_second": 914.341,
"eval_steps_per_second": 1.852,
"step": 6944
},
{
"epoch": 125.0,
"grad_norm": 0.1895224153995514,
"learning_rate": 0.0002870120953939609,
"loss": 0.1269,
"step": 7000
},
{
"epoch": 125.0,
"eval_loss": 0.1446518748998642,
"eval_runtime": 11.7658,
"eval_samples_per_second": 923.015,
"eval_steps_per_second": 1.87,
"step": 7000
},
{
"epoch": 126.0,
"grad_norm": 0.191587895154953,
"learning_rate": 0.0002868320874926807,
"loss": 0.1269,
"step": 7056
},
{
"epoch": 126.0,
"eval_loss": 0.14533261954784393,
"eval_runtime": 11.2533,
"eval_samples_per_second": 965.053,
"eval_steps_per_second": 1.955,
"step": 7056
},
{
"epoch": 127.0,
"grad_norm": 0.20511987805366516,
"learning_rate": 0.00028665072104418107,
"loss": 0.1263,
"step": 7112
},
{
"epoch": 127.0,
"eval_loss": 0.1444355994462967,
"eval_runtime": 11.3297,
"eval_samples_per_second": 958.545,
"eval_steps_per_second": 1.942,
"step": 7112
},
{
"epoch": 128.0,
"grad_norm": 0.19347704946994781,
"learning_rate": 0.0002864679978384761,
"loss": 0.1266,
"step": 7168
},
{
"epoch": 128.0,
"eval_loss": 0.14528335630893707,
"eval_runtime": 11.7467,
"eval_samples_per_second": 924.517,
"eval_steps_per_second": 1.873,
"step": 7168
},
{
"epoch": 129.0,
"grad_norm": 0.1948786824941635,
"learning_rate": 0.00028628391967896994,
"loss": 0.1267,
"step": 7224
},
{
"epoch": 129.0,
"eval_loss": 0.1452852487564087,
"eval_runtime": 10.7249,
"eval_samples_per_second": 1012.6,
"eval_steps_per_second": 2.051,
"step": 7224
},
{
"epoch": 130.0,
"grad_norm": 0.2143562138080597,
"learning_rate": 0.00028609848838243983,
"loss": 0.1263,
"step": 7280
},
{
"epoch": 130.0,
"eval_loss": 0.14422422647476196,
"eval_runtime": 12.1111,
"eval_samples_per_second": 896.699,
"eval_steps_per_second": 1.817,
"step": 7280
},
{
"epoch": 131.0,
"grad_norm": 0.17198456823825836,
"learning_rate": 0.0002859117057790177,
"loss": 0.1258,
"step": 7336
},
{
"epoch": 131.0,
"eval_loss": 0.14419187605381012,
"eval_runtime": 11.2161,
"eval_samples_per_second": 968.25,
"eval_steps_per_second": 1.961,
"step": 7336
},
{
"epoch": 132.0,
"grad_norm": 0.2027718871831894,
"learning_rate": 0.0002857235737121728,
"loss": 0.1257,
"step": 7392
},
{
"epoch": 132.0,
"eval_loss": 0.14398382604122162,
"eval_runtime": 11.7549,
"eval_samples_per_second": 923.871,
"eval_steps_per_second": 1.872,
"step": 7392
},
{
"epoch": 133.0,
"grad_norm": 0.18598471581935883,
"learning_rate": 0.00028553409403869214,
"loss": 0.1256,
"step": 7448
},
{
"epoch": 133.0,
"eval_loss": 0.144750714302063,
"eval_runtime": 10.9992,
"eval_samples_per_second": 987.344,
"eval_steps_per_second": 2.0,
"step": 7448
},
{
"epoch": 134.0,
"grad_norm": 0.18290792405605316,
"learning_rate": 0.0002853432686286638,
"loss": 0.1255,
"step": 7504
},
{
"epoch": 134.0,
"eval_loss": 0.14384572207927704,
"eval_runtime": 11.23,
"eval_samples_per_second": 967.05,
"eval_steps_per_second": 1.959,
"step": 7504
},
{
"epoch": 135.0,
"grad_norm": 0.22160011529922485,
"learning_rate": 0.0002851510993654578,
"loss": 0.1254,
"step": 7560
},
{
"epoch": 135.0,
"eval_loss": 0.1437937319278717,
"eval_runtime": 11.9673,
"eval_samples_per_second": 907.472,
"eval_steps_per_second": 1.838,
"step": 7560
},
{
"epoch": 136.0,
"grad_norm": 0.18182989954948425,
"learning_rate": 0.0002849575881457068,
"loss": 0.1252,
"step": 7616
},
{
"epoch": 136.0,
"eval_loss": 0.14378975331783295,
"eval_runtime": 11.8117,
"eval_samples_per_second": 919.426,
"eval_steps_per_second": 1.863,
"step": 7616
},
{
"epoch": 137.0,
"grad_norm": 0.16500607132911682,
"learning_rate": 0.0002847627368792885,
"loss": 0.125,
"step": 7672
},
{
"epoch": 137.0,
"eval_loss": 0.1436585932970047,
"eval_runtime": 12.4256,
"eval_samples_per_second": 874.0,
"eval_steps_per_second": 1.771,
"step": 7672
},
{
"epoch": 138.0,
"grad_norm": 0.22664882242679596,
"learning_rate": 0.0002845665474893062,
"loss": 0.125,
"step": 7728
},
{
"epoch": 138.0,
"eval_loss": 0.14313535392284393,
"eval_runtime": 12.1895,
"eval_samples_per_second": 890.932,
"eval_steps_per_second": 1.805,
"step": 7728
},
{
"epoch": 139.0,
"grad_norm": 0.1606769859790802,
"learning_rate": 0.0002843690219120703,
"loss": 0.1242,
"step": 7784
},
{
"epoch": 139.0,
"eval_loss": 0.14361213147640228,
"eval_runtime": 12.1036,
"eval_samples_per_second": 897.251,
"eval_steps_per_second": 1.818,
"step": 7784
},
{
"epoch": 140.0,
"grad_norm": 0.20197436213493347,
"learning_rate": 0.0002841701620970783,
"loss": 0.1244,
"step": 7840
},
{
"epoch": 140.0,
"eval_loss": 0.142960324883461,
"eval_runtime": 11.6316,
"eval_samples_per_second": 933.665,
"eval_steps_per_second": 1.891,
"step": 7840
},
{
"epoch": 141.0,
"grad_norm": 0.18616272509098053,
"learning_rate": 0.000283969970006996,
"loss": 0.1243,
"step": 7896
},
{
"epoch": 141.0,
"eval_loss": 0.1441134661436081,
"eval_runtime": 11.589,
"eval_samples_per_second": 937.094,
"eval_steps_per_second": 1.898,
"step": 7896
},
{
"epoch": 142.0,
"grad_norm": 0.20340923964977264,
"learning_rate": 0.0002837684476176391,
"loss": 0.1239,
"step": 7952
},
{
"epoch": 142.0,
"eval_loss": 0.1434699296951294,
"eval_runtime": 12.3235,
"eval_samples_per_second": 881.241,
"eval_steps_per_second": 1.785,
"step": 7952
},
{
"epoch": 143.0,
"grad_norm": 0.18145394325256348,
"learning_rate": 0.0002835655969179518,
"loss": 0.1241,
"step": 8008
},
{
"epoch": 143.0,
"eval_loss": 0.14338643848896027,
"eval_runtime": 12.3449,
"eval_samples_per_second": 879.717,
"eval_steps_per_second": 1.782,
"step": 8008
},
{
"epoch": 144.0,
"grad_norm": 0.1755165159702301,
"learning_rate": 0.0002833614199099885,
"loss": 0.1241,
"step": 8064
},
{
"epoch": 144.0,
"eval_loss": 0.14308682084083557,
"eval_runtime": 12.0765,
"eval_samples_per_second": 899.268,
"eval_steps_per_second": 1.822,
"step": 8064
},
{
"epoch": 145.0,
"grad_norm": 0.18520286679267883,
"learning_rate": 0.00028315591860889397,
"loss": 0.1238,
"step": 8120
},
{
"epoch": 145.0,
"eval_loss": 0.14301612973213196,
"eval_runtime": 11.4026,
"eval_samples_per_second": 952.414,
"eval_steps_per_second": 1.929,
"step": 8120
},
{
"epoch": 146.0,
"grad_norm": 0.2836858630180359,
"learning_rate": 0.0002829490950428833,
"loss": 0.1237,
"step": 8176
},
{
"epoch": 146.0,
"eval_loss": 0.1432274430990219,
"eval_runtime": 10.5295,
"eval_samples_per_second": 1031.389,
"eval_steps_per_second": 2.089,
"step": 8176
},
{
"epoch": 147.0,
"grad_norm": 0.18382933735847473,
"learning_rate": 0.0002827409512532215,
"loss": 0.1233,
"step": 8232
},
{
"epoch": 147.0,
"eval_loss": 0.14315703511238098,
"eval_runtime": 11.7841,
"eval_samples_per_second": 921.584,
"eval_steps_per_second": 1.867,
"step": 8232
},
{
"epoch": 148.0,
"grad_norm": 0.16152502596378326,
"learning_rate": 0.00028253148929420393,
"loss": 0.1236,
"step": 8288
},
{
"epoch": 148.0,
"eval_loss": 0.14190851151943207,
"eval_runtime": 12.2311,
"eval_samples_per_second": 887.903,
"eval_steps_per_second": 1.799,
"step": 8288
},
{
"epoch": 149.0,
"grad_norm": 0.23382407426834106,
"learning_rate": 0.0002823207112331354,
"loss": 0.1232,
"step": 8344
},
{
"epoch": 149.0,
"eval_loss": 0.14270788431167603,
"eval_runtime": 12.109,
"eval_samples_per_second": 896.855,
"eval_steps_per_second": 1.817,
"step": 8344
},
{
"epoch": 150.0,
"grad_norm": 0.1615588366985321,
"learning_rate": 0.00028210861915030973,
"loss": 0.1232,
"step": 8400
},
{
"epoch": 150.0,
"eval_loss": 0.14285807311534882,
"eval_runtime": 12.5884,
"eval_samples_per_second": 862.702,
"eval_steps_per_second": 1.748,
"step": 8400
},
{
"epoch": 151.0,
"grad_norm": 0.2795417308807373,
"learning_rate": 0.0002818952151389907,
"loss": 0.1227,
"step": 8456
},
{
"epoch": 151.0,
"eval_loss": 0.14255040884017944,
"eval_runtime": 12.5025,
"eval_samples_per_second": 868.624,
"eval_steps_per_second": 1.76,
"step": 8456
},
{
"epoch": 152.0,
"grad_norm": 0.2292180061340332,
"learning_rate": 0.00028168050130538953,
"loss": 0.1231,
"step": 8512
},
{
"epoch": 152.0,
"eval_loss": 0.14337477087974548,
"eval_runtime": 12.1529,
"eval_samples_per_second": 893.611,
"eval_steps_per_second": 1.81,
"step": 8512
},
{
"epoch": 153.0,
"grad_norm": 0.17736776173114777,
"learning_rate": 0.00028146447976864553,
"loss": 0.1224,
"step": 8568
},
{
"epoch": 153.0,
"eval_loss": 0.14352336525917053,
"eval_runtime": 12.3539,
"eval_samples_per_second": 879.073,
"eval_steps_per_second": 1.781,
"step": 8568
},
{
"epoch": 154.0,
"grad_norm": 0.36273321509361267,
"learning_rate": 0.0002812471526608039,
"loss": 0.1227,
"step": 8624
},
{
"epoch": 154.0,
"eval_loss": 0.142772376537323,
"eval_runtime": 12.0892,
"eval_samples_per_second": 898.323,
"eval_steps_per_second": 1.82,
"step": 8624
},
{
"epoch": 155.0,
"grad_norm": 0.19883078336715698,
"learning_rate": 0.00028102852212679526,
"loss": 0.1228,
"step": 8680
},
{
"epoch": 155.0,
"eval_loss": 0.14210332930088043,
"eval_runtime": 12.2389,
"eval_samples_per_second": 887.336,
"eval_steps_per_second": 1.798,
"step": 8680
},
{
"epoch": 156.0,
"grad_norm": 0.2114337682723999,
"learning_rate": 0.00028080859032441463,
"loss": 0.1223,
"step": 8736
},
{
"epoch": 156.0,
"eval_loss": 0.14258325099945068,
"eval_runtime": 12.5038,
"eval_samples_per_second": 868.534,
"eval_steps_per_second": 1.759,
"step": 8736
},
{
"epoch": 157.0,
"grad_norm": 0.193147674202919,
"learning_rate": 0.0002805873594243001,
"loss": 0.1223,
"step": 8792
},
{
"epoch": 157.0,
"eval_loss": 0.1423390656709671,
"eval_runtime": 11.2533,
"eval_samples_per_second": 965.047,
"eval_steps_per_second": 1.955,
"step": 8792
},
{
"epoch": 158.0,
"grad_norm": 0.15751470625400543,
"learning_rate": 0.0002803648316099116,
"loss": 0.1222,
"step": 8848
},
{
"epoch": 158.0,
"eval_loss": 0.1417943835258484,
"eval_runtime": 11.5797,
"eval_samples_per_second": 937.847,
"eval_steps_per_second": 1.9,
"step": 8848
},
{
"epoch": 159.0,
"grad_norm": 0.27395108342170715,
"learning_rate": 0.00028014100907750874,
"loss": 0.1219,
"step": 8904
},
{
"epoch": 159.0,
"eval_loss": 0.14257293939590454,
"eval_runtime": 12.328,
"eval_samples_per_second": 880.923,
"eval_steps_per_second": 1.785,
"step": 8904
},
{
"epoch": 160.0,
"grad_norm": 0.22418324649333954,
"learning_rate": 0.0002799158940361295,
"loss": 0.1217,
"step": 8960
},
{
"epoch": 160.0,
"eval_loss": 0.1431107521057129,
"eval_runtime": 12.2423,
"eval_samples_per_second": 887.09,
"eval_steps_per_second": 1.797,
"step": 8960
},
{
"epoch": 161.0,
"grad_norm": 0.2003849744796753,
"learning_rate": 0.0002796894887075685,
"loss": 0.1218,
"step": 9016
},
{
"epoch": 161.0,
"eval_loss": 0.14198802411556244,
"eval_runtime": 11.4923,
"eval_samples_per_second": 944.981,
"eval_steps_per_second": 1.914,
"step": 9016
},
{
"epoch": 162.0,
"grad_norm": 0.21222490072250366,
"learning_rate": 0.00027946179532635447,
"loss": 0.1215,
"step": 9072
},
{
"epoch": 162.0,
"eval_loss": 0.14226287603378296,
"eval_runtime": 12.6489,
"eval_samples_per_second": 858.572,
"eval_steps_per_second": 1.739,
"step": 9072
},
{
"epoch": 163.0,
"grad_norm": 0.3284847140312195,
"learning_rate": 0.0002792328161397301,
"loss": 0.1214,
"step": 9128
},
{
"epoch": 163.0,
"eval_loss": 0.14255832135677338,
"eval_runtime": 11.8749,
"eval_samples_per_second": 914.536,
"eval_steps_per_second": 1.853,
"step": 9128
},
{
"epoch": 164.0,
"grad_norm": 0.17873606085777283,
"learning_rate": 0.0002790025534076267,
"loss": 0.1209,
"step": 9184
},
{
"epoch": 164.0,
"eval_loss": 0.14214134216308594,
"eval_runtime": 11.7349,
"eval_samples_per_second": 925.446,
"eval_steps_per_second": 1.875,
"step": 9184
},
{
"epoch": 165.0,
"grad_norm": 0.29637348651885986,
"learning_rate": 0.00027877100940264476,
"loss": 0.1214,
"step": 9240
},
{
"epoch": 165.0,
"eval_loss": 0.14148862659931183,
"eval_runtime": 11.2369,
"eval_samples_per_second": 966.457,
"eval_steps_per_second": 1.958,
"step": 9240
},
{
"epoch": 166.0,
"grad_norm": 0.19445298612117767,
"learning_rate": 0.0002785381864100304,
"loss": 0.1211,
"step": 9296
},
{
"epoch": 166.0,
"eval_loss": 0.14366163313388824,
"eval_runtime": 11.7897,
"eval_samples_per_second": 921.146,
"eval_steps_per_second": 1.866,
"step": 9296
},
{
"epoch": 167.0,
"grad_norm": 0.2037288248538971,
"learning_rate": 0.0002783040867276523,
"loss": 0.1209,
"step": 9352
},
{
"epoch": 167.0,
"eval_loss": 0.14206562936306,
"eval_runtime": 11.4292,
"eval_samples_per_second": 950.199,
"eval_steps_per_second": 1.925,
"step": 9352
},
{
"epoch": 168.0,
"grad_norm": 0.21530179679393768,
"learning_rate": 0.0002780687126659796,
"loss": 0.1208,
"step": 9408
},
{
"epoch": 168.0,
"eval_loss": 0.1410149782896042,
"eval_runtime": 11.7288,
"eval_samples_per_second": 925.923,
"eval_steps_per_second": 1.876,
"step": 9408
}
],
"logging_steps": 500,
"max_steps": 56000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1000,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 10,
"early_stopping_threshold": 1e-05
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1001513367240704e+18,
"train_batch_size": 512,
"trial_name": null,
"trial_params": null
}