{
"best_metric": 0.022735346358792183,
"best_model_checkpoint": "./results/checkpoint-10634",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 10634,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"grad_norm": 0.7369279265403748,
"learning_rate": 4.952981004325748e-05,
"loss": 4.7032,
"step": 100
},
{
"epoch": 0.04,
"grad_norm": 0.5547892451286316,
"learning_rate": 4.905962008651496e-05,
"loss": 4.7033,
"step": 200
},
{
"epoch": 0.06,
"grad_norm": 0.5557618737220764,
"learning_rate": 4.858943012977243e-05,
"loss": 4.7015,
"step": 300
},
{
"epoch": 0.08,
"grad_norm": 0.5398121476173401,
"learning_rate": 4.8119240173029906e-05,
"loss": 4.7038,
"step": 400
},
{
"epoch": 0.09,
"grad_norm": 0.5128409266471863,
"learning_rate": 4.7649050216287384e-05,
"loss": 4.7003,
"step": 500
},
{
"epoch": 0.11,
"grad_norm": 0.57951420545578,
"learning_rate": 4.717886025954486e-05,
"loss": 4.7018,
"step": 600
},
{
"epoch": 0.13,
"grad_norm": 0.48418277502059937,
"learning_rate": 4.670867030280233e-05,
"loss": 4.7005,
"step": 700
},
{
"epoch": 0.15,
"grad_norm": 0.5264779329299927,
"learning_rate": 4.623848034605981e-05,
"loss": 4.7034,
"step": 800
},
{
"epoch": 0.17,
"grad_norm": 0.4631044268608093,
"learning_rate": 4.576829038931729e-05,
"loss": 4.7018,
"step": 900
},
{
"epoch": 0.19,
"grad_norm": 0.49229735136032104,
"learning_rate": 4.5298100432574765e-05,
"loss": 4.6994,
"step": 1000
},
{
"epoch": 0.21,
"grad_norm": 0.5025375485420227,
"learning_rate": 4.482791047583224e-05,
"loss": 4.7019,
"step": 1100
},
{
"epoch": 0.23,
"grad_norm": 0.5106560587882996,
"learning_rate": 4.4357720519089714e-05,
"loss": 4.7017,
"step": 1200
},
{
"epoch": 0.24,
"grad_norm": 0.4811244308948517,
"learning_rate": 4.388753056234719e-05,
"loss": 4.7015,
"step": 1300
},
{
"epoch": 0.26,
"grad_norm": 0.5047605037689209,
"learning_rate": 4.341734060560467e-05,
"loss": 4.7011,
"step": 1400
},
{
"epoch": 0.28,
"grad_norm": 0.4720272123813629,
"learning_rate": 4.2947150648862146e-05,
"loss": 4.7004,
"step": 1500
},
{
"epoch": 0.3,
"grad_norm": 0.3932195007801056,
"learning_rate": 4.247696069211962e-05,
"loss": 4.7028,
"step": 1600
},
{
"epoch": 0.32,
"grad_norm": 0.39717328548431396,
"learning_rate": 4.2006770735377095e-05,
"loss": 4.702,
"step": 1700
},
{
"epoch": 0.34,
"grad_norm": 0.46395114064216614,
"learning_rate": 4.153658077863457e-05,
"loss": 4.7005,
"step": 1800
},
{
"epoch": 0.36,
"grad_norm": 0.4270722270011902,
"learning_rate": 4.106639082189205e-05,
"loss": 4.7,
"step": 1900
},
{
"epoch": 0.38,
"grad_norm": 0.4068189561367035,
"learning_rate": 4.059620086514952e-05,
"loss": 4.7016,
"step": 2000
},
{
"epoch": 0.39,
"grad_norm": 0.4495134651660919,
"learning_rate": 4.0126010908407e-05,
"loss": 4.7,
"step": 2100
},
{
"epoch": 0.41,
"grad_norm": 0.4774065613746643,
"learning_rate": 3.9655820951664476e-05,
"loss": 4.7022,
"step": 2200
},
{
"epoch": 0.43,
"grad_norm": 0.4217697083950043,
"learning_rate": 3.9185630994921954e-05,
"loss": 4.7016,
"step": 2300
},
{
"epoch": 0.45,
"grad_norm": 0.4176025092601776,
"learning_rate": 3.871544103817943e-05,
"loss": 4.7005,
"step": 2400
},
{
"epoch": 0.47,
"grad_norm": 0.43538960814476013,
"learning_rate": 3.82452510814369e-05,
"loss": 4.7002,
"step": 2500
},
{
"epoch": 0.49,
"grad_norm": 0.3962569236755371,
"learning_rate": 3.777506112469438e-05,
"loss": 4.7015,
"step": 2600
},
{
"epoch": 0.51,
"grad_norm": 0.41195255517959595,
"learning_rate": 3.730487116795186e-05,
"loss": 4.7014,
"step": 2700
},
{
"epoch": 0.53,
"grad_norm": 0.3937325179576874,
"learning_rate": 3.6834681211209335e-05,
"loss": 4.7015,
"step": 2800
},
{
"epoch": 0.55,
"grad_norm": 0.39071130752563477,
"learning_rate": 3.6364491254466806e-05,
"loss": 4.6994,
"step": 2900
},
{
"epoch": 0.56,
"grad_norm": 0.40909039974212646,
"learning_rate": 3.5894301297724284e-05,
"loss": 4.7019,
"step": 3000
},
{
"epoch": 0.58,
"grad_norm": 0.3900412321090698,
"learning_rate": 3.542411134098176e-05,
"loss": 4.6996,
"step": 3100
},
{
"epoch": 0.6,
"grad_norm": 0.5122581720352173,
"learning_rate": 3.495392138423923e-05,
"loss": 4.6992,
"step": 3200
},
{
"epoch": 0.62,
"grad_norm": 0.38495704531669617,
"learning_rate": 3.448373142749671e-05,
"loss": 4.7024,
"step": 3300
},
{
"epoch": 0.64,
"grad_norm": 0.40592727065086365,
"learning_rate": 3.401354147075419e-05,
"loss": 4.7002,
"step": 3400
},
{
"epoch": 0.66,
"grad_norm": 0.4175421893596649,
"learning_rate": 3.354335151401166e-05,
"loss": 4.6993,
"step": 3500
},
{
"epoch": 0.68,
"grad_norm": 0.43954411149024963,
"learning_rate": 3.3073161557269136e-05,
"loss": 4.7016,
"step": 3600
},
{
"epoch": 0.7,
"grad_norm": 0.501674473285675,
"learning_rate": 3.2602971600526614e-05,
"loss": 4.6995,
"step": 3700
},
{
"epoch": 0.71,
"grad_norm": 0.3901698589324951,
"learning_rate": 3.2132781643784084e-05,
"loss": 4.7006,
"step": 3800
},
{
"epoch": 0.73,
"grad_norm": 0.3993096649646759,
"learning_rate": 3.166259168704156e-05,
"loss": 4.699,
"step": 3900
},
{
"epoch": 0.75,
"grad_norm": 0.43237388134002686,
"learning_rate": 3.119240173029904e-05,
"loss": 4.7001,
"step": 4000
},
{
"epoch": 0.77,
"grad_norm": 2.5795607566833496,
"learning_rate": 3.072221177355652e-05,
"loss": 4.6928,
"step": 4100
},
{
"epoch": 0.79,
"grad_norm": 0.9452407956123352,
"learning_rate": 3.0252021816814e-05,
"loss": 4.7014,
"step": 4200
},
{
"epoch": 0.81,
"grad_norm": 0.7060949802398682,
"learning_rate": 2.978183186007147e-05,
"loss": 4.707,
"step": 4300
},
{
"epoch": 0.83,
"grad_norm": 1.052825689315796,
"learning_rate": 2.9311641903328947e-05,
"loss": 4.7014,
"step": 4400
},
{
"epoch": 0.85,
"grad_norm": 0.7108451128005981,
"learning_rate": 2.8841451946586424e-05,
"loss": 4.7027,
"step": 4500
},
{
"epoch": 0.87,
"grad_norm": 0.6421079635620117,
"learning_rate": 2.83712619898439e-05,
"loss": 4.7033,
"step": 4600
},
{
"epoch": 0.88,
"grad_norm": 0.5927799940109253,
"learning_rate": 2.7901072033101373e-05,
"loss": 4.7044,
"step": 4700
},
{
"epoch": 0.9,
"grad_norm": 0.5713520050048828,
"learning_rate": 2.743088207635885e-05,
"loss": 4.7022,
"step": 4800
},
{
"epoch": 0.92,
"grad_norm": 0.5557407140731812,
"learning_rate": 2.6960692119616325e-05,
"loss": 4.7036,
"step": 4900
},
{
"epoch": 0.94,
"grad_norm": 0.37561553716659546,
"learning_rate": 2.6490502162873802e-05,
"loss": 4.7011,
"step": 5000
},
{
"epoch": 0.96,
"grad_norm": 0.404910683631897,
"learning_rate": 2.602031220613128e-05,
"loss": 4.7007,
"step": 5100
},
{
"epoch": 0.98,
"grad_norm": 0.39087265729904175,
"learning_rate": 2.5550122249388754e-05,
"loss": 4.7021,
"step": 5200
},
{
"epoch": 1.0,
"grad_norm": 0.3689398169517517,
"learning_rate": 2.507993229264623e-05,
"loss": 4.7004,
"step": 5300
},
{
"epoch": 1.0,
"eval_accuracy": 0.008644168146832444,
"eval_f1": 0.00014816254395838658,
"eval_loss": 4.700786113739014,
"eval_precision": 7.472164295071267e-05,
"eval_recall": 0.008644168146832444,
"eval_runtime": 96.2388,
"eval_samples_per_second": 87.75,
"eval_steps_per_second": 4.884,
"step": 5317
},
{
"epoch": 1.02,
"grad_norm": 0.3822825849056244,
"learning_rate": 2.4609742335903706e-05,
"loss": 4.7002,
"step": 5400
},
{
"epoch": 1.03,
"grad_norm": 0.421373575925827,
"learning_rate": 2.413955237916118e-05,
"loss": 4.6991,
"step": 5500
},
{
"epoch": 1.05,
"grad_norm": 0.4438491463661194,
"learning_rate": 2.3669362422418658e-05,
"loss": 4.7001,
"step": 5600
},
{
"epoch": 1.07,
"grad_norm": 0.812016487121582,
"learning_rate": 2.3199172465676132e-05,
"loss": 4.6941,
"step": 5700
},
{
"epoch": 1.09,
"grad_norm": 1.119163155555725,
"learning_rate": 2.272898250893361e-05,
"loss": 4.6807,
"step": 5800
},
{
"epoch": 1.11,
"grad_norm": 1.1431705951690674,
"learning_rate": 2.2258792552191084e-05,
"loss": 4.6842,
"step": 5900
},
{
"epoch": 1.13,
"grad_norm": 1.4402434825897217,
"learning_rate": 2.1788602595448562e-05,
"loss": 4.6653,
"step": 6000
},
{
"epoch": 1.15,
"grad_norm": 2.073824644088745,
"learning_rate": 2.131841263870604e-05,
"loss": 4.6543,
"step": 6100
},
{
"epoch": 1.17,
"grad_norm": 1.6333357095718384,
"learning_rate": 2.0848222681963514e-05,
"loss": 4.6394,
"step": 6200
},
{
"epoch": 1.18,
"grad_norm": 1.206061840057373,
"learning_rate": 2.037803272522099e-05,
"loss": 4.6213,
"step": 6300
},
{
"epoch": 1.2,
"grad_norm": 0.5230748653411865,
"learning_rate": 1.9907842768478466e-05,
"loss": 4.7044,
"step": 6400
},
{
"epoch": 1.22,
"grad_norm": 0.4634922444820404,
"learning_rate": 1.9437652811735943e-05,
"loss": 4.7031,
"step": 6500
},
{
"epoch": 1.24,
"grad_norm": 0.47288939356803894,
"learning_rate": 1.8967462854993417e-05,
"loss": 4.7027,
"step": 6600
},
{
"epoch": 1.26,
"grad_norm": 0.4731047749519348,
"learning_rate": 1.8497272898250895e-05,
"loss": 4.7014,
"step": 6700
},
{
"epoch": 1.28,
"grad_norm": 0.5110020637512207,
"learning_rate": 1.802708294150837e-05,
"loss": 4.6995,
"step": 6800
},
{
"epoch": 1.3,
"grad_norm": 0.5348946452140808,
"learning_rate": 1.7556892984765847e-05,
"loss": 4.7013,
"step": 6900
},
{
"epoch": 1.32,
"grad_norm": 0.5182259678840637,
"learning_rate": 1.708670302802332e-05,
"loss": 4.6991,
"step": 7000
},
{
"epoch": 1.34,
"grad_norm": 0.6097771525382996,
"learning_rate": 1.66165130712808e-05,
"loss": 4.7022,
"step": 7100
},
{
"epoch": 1.35,
"grad_norm": 0.49007490277290344,
"learning_rate": 1.6146323114538276e-05,
"loss": 4.7012,
"step": 7200
},
{
"epoch": 1.37,
"grad_norm": 0.5180237293243408,
"learning_rate": 1.567613315779575e-05,
"loss": 4.6998,
"step": 7300
},
{
"epoch": 1.39,
"grad_norm": 0.4303135275840759,
"learning_rate": 1.5205943201053227e-05,
"loss": 4.7025,
"step": 7400
},
{
"epoch": 1.41,
"grad_norm": 0.6103301644325256,
"learning_rate": 1.47357532443107e-05,
"loss": 4.7027,
"step": 7500
},
{
"epoch": 1.43,
"grad_norm": 0.5991978049278259,
"learning_rate": 1.4265563287568178e-05,
"loss": 4.7016,
"step": 7600
},
{
"epoch": 1.45,
"grad_norm": 0.590713381767273,
"learning_rate": 1.3795373330825653e-05,
"loss": 4.6992,
"step": 7700
},
{
"epoch": 1.47,
"grad_norm": 0.6496405601501465,
"learning_rate": 1.332518337408313e-05,
"loss": 4.7005,
"step": 7800
},
{
"epoch": 1.49,
"grad_norm": 0.5405462980270386,
"learning_rate": 1.2854993417340605e-05,
"loss": 4.704,
"step": 7900
},
{
"epoch": 1.5,
"grad_norm": 0.5655048489570618,
"learning_rate": 1.2384803460598082e-05,
"loss": 4.701,
"step": 8000
},
{
"epoch": 1.52,
"grad_norm": 1.1468925476074219,
"learning_rate": 1.1914613503855558e-05,
"loss": 4.6547,
"step": 8100
},
{
"epoch": 1.54,
"grad_norm": 3.2609410285949707,
"learning_rate": 1.1444423547113034e-05,
"loss": 4.6054,
"step": 8200
},
{
"epoch": 1.56,
"grad_norm": 3.0458478927612305,
"learning_rate": 1.097423359037051e-05,
"loss": 4.5575,
"step": 8300
},
{
"epoch": 1.58,
"grad_norm": 4.3110527992248535,
"learning_rate": 1.0504043633627986e-05,
"loss": 4.5948,
"step": 8400
},
{
"epoch": 1.6,
"grad_norm": 1.7174781560897827,
"learning_rate": 1.0033853676885462e-05,
"loss": 4.5585,
"step": 8500
},
{
"epoch": 1.62,
"grad_norm": 3.2902700901031494,
"learning_rate": 9.56366372014294e-06,
"loss": 4.5886,
"step": 8600
},
{
"epoch": 1.64,
"grad_norm": 1.1743800640106201,
"learning_rate": 9.093473763400415e-06,
"loss": 4.5534,
"step": 8700
},
{
"epoch": 1.66,
"grad_norm": 1.5308388471603394,
"learning_rate": 8.623283806657891e-06,
"loss": 4.547,
"step": 8800
},
{
"epoch": 1.67,
"grad_norm": 2.839127540588379,
"learning_rate": 8.153093849915366e-06,
"loss": 4.5339,
"step": 8900
},
{
"epoch": 1.69,
"grad_norm": 1.9120197296142578,
"learning_rate": 7.682903893172842e-06,
"loss": 4.5439,
"step": 9000
},
{
"epoch": 1.71,
"grad_norm": 1.549185872077942,
"learning_rate": 7.212713936430318e-06,
"loss": 4.5276,
"step": 9100
},
{
"epoch": 1.73,
"grad_norm": 2.3087658882141113,
"learning_rate": 6.742523979687794e-06,
"loss": 4.5081,
"step": 9200
},
{
"epoch": 1.75,
"grad_norm": 1.4830306768417358,
"learning_rate": 6.27233402294527e-06,
"loss": 4.494,
"step": 9300
},
{
"epoch": 1.77,
"grad_norm": 2.1838178634643555,
"learning_rate": 5.802144066202746e-06,
"loss": 4.4904,
"step": 9400
},
{
"epoch": 1.79,
"grad_norm": 3.3637304306030273,
"learning_rate": 5.331954109460222e-06,
"loss": 4.4807,
"step": 9500
},
{
"epoch": 1.81,
"grad_norm": 1.637593388557434,
"learning_rate": 4.861764152717698e-06,
"loss": 4.474,
"step": 9600
},
{
"epoch": 1.82,
"grad_norm": 1.833465576171875,
"learning_rate": 4.391574195975174e-06,
"loss": 4.4719,
"step": 9700
},
{
"epoch": 1.84,
"grad_norm": 2.6578447818756104,
"learning_rate": 3.921384239232651e-06,
"loss": 4.4593,
"step": 9800
},
{
"epoch": 1.86,
"grad_norm": 2.121244192123413,
"learning_rate": 3.451194282490126e-06,
"loss": 4.4514,
"step": 9900
},
{
"epoch": 1.88,
"grad_norm": 2.080278158187866,
"learning_rate": 2.981004325747602e-06,
"loss": 4.4407,
"step": 10000
},
{
"epoch": 1.9,
"grad_norm": 2.2840731143951416,
"learning_rate": 2.510814369005078e-06,
"loss": 4.4289,
"step": 10100
},
{
"epoch": 1.92,
"grad_norm": 2.203077554702759,
"learning_rate": 2.0406244122625544e-06,
"loss": 4.4071,
"step": 10200
},
{
"epoch": 1.94,
"grad_norm": 2.4338624477386475,
"learning_rate": 1.5704344555200301e-06,
"loss": 4.4166,
"step": 10300
},
{
"epoch": 1.96,
"grad_norm": 1.855125069618225,
"learning_rate": 1.1002444987775063e-06,
"loss": 4.4079,
"step": 10400
},
{
"epoch": 1.97,
"grad_norm": 2.9419732093811035,
"learning_rate": 6.300545420349821e-07,
"loss": 4.4227,
"step": 10500
},
{
"epoch": 1.99,
"grad_norm": 3.1695730686187744,
"learning_rate": 1.5986458529245815e-07,
"loss": 4.4244,
"step": 10600
},
{
"epoch": 2.0,
"eval_accuracy": 0.022735346358792183,
"eval_f1": 0.00212270922930165,
"eval_loss": 4.410290241241455,
"eval_precision": 0.0011595953132254675,
"eval_recall": 0.022735346358792183,
"eval_runtime": 96.0024,
"eval_samples_per_second": 87.967,
"eval_steps_per_second": 4.896,
"step": 10634
}
],
"logging_steps": 100,
"max_steps": 10634,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"total_flos": 5.041092263921664e+16,
"train_batch_size": 18,
"trial_name": null,
"trial_params": null
}