evolve_sft / sft /checkpoint-5000 /trainer_state.json
xinchen9's picture
Upload folder using huggingface_hub
8c3a32b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0782918149466192,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010783996549121105,
"grad_norm": 0.2076384574174881,
"learning_rate": 5.861244019138756e-06,
"loss": 1.0803,
"step": 50
},
{
"epoch": 0.02156799309824221,
"grad_norm": 0.25434058904647827,
"learning_rate": 1.1842105263157895e-05,
"loss": 1.0521,
"step": 100
},
{
"epoch": 0.03235198964736331,
"grad_norm": 0.24684220552444458,
"learning_rate": 1.7822966507177032e-05,
"loss": 1.0288,
"step": 150
},
{
"epoch": 0.04313598619648442,
"grad_norm": 0.3034498691558838,
"learning_rate": 2.380382775119617e-05,
"loss": 0.9972,
"step": 200
},
{
"epoch": 0.05391998274560552,
"grad_norm": 0.2725016176700592,
"learning_rate": 2.9784688995215314e-05,
"loss": 0.9555,
"step": 250
},
{
"epoch": 0.06470397929472663,
"grad_norm": 0.27356916666030884,
"learning_rate": 3.576555023923445e-05,
"loss": 0.9688,
"step": 300
},
{
"epoch": 0.07548797584384773,
"grad_norm": 0.2624454200267792,
"learning_rate": 4.174641148325359e-05,
"loss": 0.9699,
"step": 350
},
{
"epoch": 0.08627197239296884,
"grad_norm": 0.2676330506801605,
"learning_rate": 4.772727272727273e-05,
"loss": 0.9739,
"step": 400
},
{
"epoch": 0.09705596894208994,
"grad_norm": 0.24369767308235168,
"learning_rate": 4.999934880025785e-05,
"loss": 0.9833,
"step": 450
},
{
"epoch": 0.10783996549121104,
"grad_norm": 0.26960158348083496,
"learning_rate": 4.9995554200393156e-05,
"loss": 0.9677,
"step": 500
},
{
"epoch": 0.11862396204033215,
"grad_norm": 0.2564559578895569,
"learning_rate": 4.998837209058379e-05,
"loss": 0.9493,
"step": 550
},
{
"epoch": 0.12940795858945325,
"grad_norm": 0.23627087473869324,
"learning_rate": 4.9977803444181587e-05,
"loss": 0.9726,
"step": 600
},
{
"epoch": 0.14019195513857435,
"grad_norm": 0.22857290506362915,
"learning_rate": 4.996384969349704e-05,
"loss": 0.9653,
"step": 650
},
{
"epoch": 0.15097595168769545,
"grad_norm": 0.25175178050994873,
"learning_rate": 4.9946512729605226e-05,
"loss": 0.9725,
"step": 700
},
{
"epoch": 0.16175994823681655,
"grad_norm": 0.20284195244312286,
"learning_rate": 4.992579490208947e-05,
"loss": 0.968,
"step": 750
},
{
"epoch": 0.17254394478593768,
"grad_norm": 0.228809654712677,
"learning_rate": 4.990169901872295e-05,
"loss": 0.9338,
"step": 800
},
{
"epoch": 0.18332794133505878,
"grad_norm": 0.2436237633228302,
"learning_rate": 4.987422834508818e-05,
"loss": 0.9581,
"step": 850
},
{
"epoch": 0.19411193788417988,
"grad_norm": 0.2001142054796219,
"learning_rate": 4.9843386604134425e-05,
"loss": 0.9512,
"step": 900
},
{
"epoch": 0.20489593443330098,
"grad_norm": 0.20406965911388397,
"learning_rate": 4.980917797567315e-05,
"loss": 0.9479,
"step": 950
},
{
"epoch": 0.21567993098242208,
"grad_norm": 0.20756883919239044,
"learning_rate": 4.9771607095811565e-05,
"loss": 0.9552,
"step": 1000
},
{
"epoch": 0.22646392753154318,
"grad_norm": 0.23893098533153534,
"learning_rate": 4.9730679056324334e-05,
"loss": 0.9732,
"step": 1050
},
{
"epoch": 0.2372479240806643,
"grad_norm": 0.20374947786331177,
"learning_rate": 4.968639940396346e-05,
"loss": 0.961,
"step": 1100
},
{
"epoch": 0.2480319206297854,
"grad_norm": 0.20845109224319458,
"learning_rate": 4.963877413970663e-05,
"loss": 0.9481,
"step": 1150
},
{
"epoch": 0.2588159171789065,
"grad_norm": 0.23683245480060577,
"learning_rate": 4.958780971794388e-05,
"loss": 0.9558,
"step": 1200
},
{
"epoch": 0.2695999137280276,
"grad_norm": 0.18015944957733154,
"learning_rate": 4.953351304560292e-05,
"loss": 0.9367,
"step": 1250
},
{
"epoch": 0.2803839102771487,
"grad_norm": 0.21432434022426605,
"learning_rate": 4.947589148121301e-05,
"loss": 0.9289,
"step": 1300
},
{
"epoch": 0.2911679068262698,
"grad_norm": 0.217897430062294,
"learning_rate": 4.941495283390778e-05,
"loss": 0.9663,
"step": 1350
},
{
"epoch": 0.3019519033753909,
"grad_norm": 0.23911495506763458,
"learning_rate": 4.9350705362366836e-05,
"loss": 0.9534,
"step": 1400
},
{
"epoch": 0.312735899924512,
"grad_norm": 0.21729810535907745,
"learning_rate": 4.928315777369652e-05,
"loss": 0.9663,
"step": 1450
},
{
"epoch": 0.3235198964736331,
"grad_norm": 0.19448955357074738,
"learning_rate": 4.9212319222249914e-05,
"loss": 0.9203,
"step": 1500
},
{
"epoch": 0.3343038930227542,
"grad_norm": 0.20799997448921204,
"learning_rate": 4.913819930838616e-05,
"loss": 0.9426,
"step": 1550
},
{
"epoch": 0.34508788957187536,
"grad_norm": 0.1989525556564331,
"learning_rate": 4.906080807716941e-05,
"loss": 0.9544,
"step": 1600
},
{
"epoch": 0.35587188612099646,
"grad_norm": 0.21680687367916107,
"learning_rate": 4.898015601700745e-05,
"loss": 0.9666,
"step": 1650
},
{
"epoch": 0.36665588267011756,
"grad_norm": 0.2180759161710739,
"learning_rate": 4.889625405823027e-05,
"loss": 0.9441,
"step": 1700
},
{
"epoch": 0.37743987921923866,
"grad_norm": 0.19334350526332855,
"learning_rate": 4.880911357160877e-05,
"loss": 0.9415,
"step": 1750
},
{
"epoch": 0.38822387576835976,
"grad_norm": 0.19350044429302216,
"learning_rate": 4.871874636681366e-05,
"loss": 0.9534,
"step": 1800
},
{
"epoch": 0.39900787231748086,
"grad_norm": 0.23279784619808197,
"learning_rate": 4.862516469081505e-05,
"loss": 0.9578,
"step": 1850
},
{
"epoch": 0.40979186886660196,
"grad_norm": 0.2038542479276657,
"learning_rate": 4.852838122622264e-05,
"loss": 0.9416,
"step": 1900
},
{
"epoch": 0.42057586541572306,
"grad_norm": 0.21980704367160797,
"learning_rate": 4.842840908956692e-05,
"loss": 0.9359,
"step": 1950
},
{
"epoch": 0.43135986196484416,
"grad_norm": 0.20842380821704865,
"learning_rate": 4.832526182952156e-05,
"loss": 0.9495,
"step": 2000
},
{
"epoch": 0.44214385851396526,
"grad_norm": 0.2161971479654312,
"learning_rate": 4.821895342506724e-05,
"loss": 0.9388,
"step": 2050
},
{
"epoch": 0.45292785506308636,
"grad_norm": 0.2119661122560501,
"learning_rate": 4.8109498283597146e-05,
"loss": 0.9618,
"step": 2100
},
{
"epoch": 0.46371185161220746,
"grad_norm": 0.17877915501594543,
"learning_rate": 4.799691123896441e-05,
"loss": 0.9498,
"step": 2150
},
{
"epoch": 0.4744958481613286,
"grad_norm": 0.2198779135942459,
"learning_rate": 4.788120754947179e-05,
"loss": 0.9464,
"step": 2200
},
{
"epoch": 0.4852798447104497,
"grad_norm": 0.20385344326496124,
"learning_rate": 4.7762402895803763e-05,
"loss": 0.9423,
"step": 2250
},
{
"epoch": 0.4960638412595708,
"grad_norm": 0.21472816169261932,
"learning_rate": 4.764051337890143e-05,
"loss": 0.9295,
"step": 2300
},
{
"epoch": 0.5068478378086919,
"grad_norm": 0.21423693001270294,
"learning_rate": 4.7515555517780405e-05,
"loss": 0.9557,
"step": 2350
},
{
"epoch": 0.517631834357813,
"grad_norm": 0.2088768184185028,
"learning_rate": 4.7387546247292156e-05,
"loss": 0.9392,
"step": 2400
},
{
"epoch": 0.5284158309069341,
"grad_norm": 0.18323567509651184,
"learning_rate": 4.725650291582885e-05,
"loss": 0.9418,
"step": 2450
},
{
"epoch": 0.5391998274560552,
"grad_norm": 0.22341737151145935,
"learning_rate": 4.712244328297224e-05,
"loss": 0.9207,
"step": 2500
},
{
"epoch": 0.5499838240051763,
"grad_norm": 0.2024504542350769,
"learning_rate": 4.698538551708682e-05,
"loss": 0.9337,
"step": 2550
},
{
"epoch": 0.5607678205542974,
"grad_norm": 0.20455148816108704,
"learning_rate": 4.684534819285758e-05,
"loss": 0.9451,
"step": 2600
},
{
"epoch": 0.5715518171034185,
"grad_norm": 0.19093358516693115,
"learning_rate": 4.6702350288772626e-05,
"loss": 0.9468,
"step": 2650
},
{
"epoch": 0.5823358136525396,
"grad_norm": 0.1995963305234909,
"learning_rate": 4.6556411184551176e-05,
"loss": 0.9373,
"step": 2700
},
{
"epoch": 0.5931198102016607,
"grad_norm": 0.19664354622364044,
"learning_rate": 4.640755065851712e-05,
"loss": 0.9609,
"step": 2750
},
{
"epoch": 0.6039038067507818,
"grad_norm": 0.20155999064445496,
"learning_rate": 4.6255788884918595e-05,
"loss": 0.9221,
"step": 2800
},
{
"epoch": 0.6146878032999029,
"grad_norm": 0.2094108611345291,
"learning_rate": 4.610114643119382e-05,
"loss": 0.9665,
"step": 2850
},
{
"epoch": 0.625471799849024,
"grad_norm": 0.23038670420646667,
"learning_rate": 4.5943644255183785e-05,
"loss": 0.9223,
"step": 2900
},
{
"epoch": 0.6362557963981451,
"grad_norm": 0.22103433310985565,
"learning_rate": 4.5783303702291856e-05,
"loss": 0.9271,
"step": 2950
},
{
"epoch": 0.6470397929472662,
"grad_norm": 0.21444232761859894,
"learning_rate": 4.5620146502591065e-05,
"loss": 0.9553,
"step": 3000
},
{
"epoch": 0.6578237894963873,
"grad_norm": 0.20402322709560394,
"learning_rate": 4.5454194767879046e-05,
"loss": 0.9342,
"step": 3050
},
{
"epoch": 0.6686077860455084,
"grad_norm": 0.17598140239715576,
"learning_rate": 4.52854709886814e-05,
"loss": 0.9343,
"step": 3100
},
{
"epoch": 0.6793917825946296,
"grad_norm": 0.2235531210899353,
"learning_rate": 4.511399803120367e-05,
"loss": 0.9325,
"step": 3150
},
{
"epoch": 0.6901757791437507,
"grad_norm": 0.1978316605091095,
"learning_rate": 4.49397991342324e-05,
"loss": 0.9175,
"step": 3200
},
{
"epoch": 0.7009597756928718,
"grad_norm": 0.20724375545978546,
"learning_rate": 4.476289790598571e-05,
"loss": 0.9509,
"step": 3250
},
{
"epoch": 0.7117437722419929,
"grad_norm": 0.19276615977287292,
"learning_rate": 4.458331832091385e-05,
"loss": 0.9247,
"step": 3300
},
{
"epoch": 0.722527768791114,
"grad_norm": 0.2208387851715088,
"learning_rate": 4.440108471644997e-05,
"loss": 0.9409,
"step": 3350
},
{
"epoch": 0.7333117653402351,
"grad_norm": 0.21308571100234985,
"learning_rate": 4.421622178971193e-05,
"loss": 0.9267,
"step": 3400
},
{
"epoch": 0.7440957618893562,
"grad_norm": 0.2115100473165512,
"learning_rate": 4.4028754594155125e-05,
"loss": 0.933,
"step": 3450
},
{
"epoch": 0.7548797584384773,
"grad_norm": 0.21246980130672455,
"learning_rate": 4.383870853617721e-05,
"loss": 0.9422,
"step": 3500
},
{
"epoch": 0.7656637549875984,
"grad_norm": 0.2082446962594986,
"learning_rate": 4.364610937167485e-05,
"loss": 0.9204,
"step": 3550
},
{
"epoch": 0.7764477515367195,
"grad_norm": 0.22102369368076324,
"learning_rate": 4.345098320255321e-05,
"loss": 0.9226,
"step": 3600
},
{
"epoch": 0.7872317480858406,
"grad_norm": 0.19831791520118713,
"learning_rate": 4.325335647318848e-05,
"loss": 0.9327,
"step": 3650
},
{
"epoch": 0.7980157446349617,
"grad_norm": 0.2220238745212555,
"learning_rate": 4.3053255966844016e-05,
"loss": 0.9318,
"step": 3700
},
{
"epoch": 0.8087997411840828,
"grad_norm": 0.20910035073757172,
"learning_rate": 4.285070880204057e-05,
"loss": 0.9306,
"step": 3750
},
{
"epoch": 0.8195837377332039,
"grad_norm": 0.21745839715003967,
"learning_rate": 4.264574242888105e-05,
"loss": 0.9304,
"step": 3800
},
{
"epoch": 0.830367734282325,
"grad_norm": 0.24437028169631958,
"learning_rate": 4.2438384625330374e-05,
"loss": 0.9433,
"step": 3850
},
{
"epoch": 0.8411517308314461,
"grad_norm": 0.2319614738225937,
"learning_rate": 4.222866349345083e-05,
"loss": 0.9536,
"step": 3900
},
{
"epoch": 0.8519357273805672,
"grad_norm": 0.2375030517578125,
"learning_rate": 4.2016607455593624e-05,
"loss": 0.9421,
"step": 3950
},
{
"epoch": 0.8627197239296883,
"grad_norm": 0.2176317423582077,
"learning_rate": 4.1802245250546926e-05,
"loss": 0.9268,
"step": 4000
},
{
"epoch": 0.8735037204788094,
"grad_norm": 0.2226661890745163,
"learning_rate": 4.158560592964104e-05,
"loss": 0.925,
"step": 4050
},
{
"epoch": 0.8842877170279305,
"grad_norm": 0.2202196568250656,
"learning_rate": 4.136671885281124e-05,
"loss": 0.9465,
"step": 4100
},
{
"epoch": 0.8950717135770516,
"grad_norm": 0.20654049515724182,
"learning_rate": 4.114561368461884e-05,
"loss": 0.9251,
"step": 4150
},
{
"epoch": 0.9058557101261727,
"grad_norm": 0.23357035219669342,
"learning_rate": 4.092232039023084e-05,
"loss": 0.9417,
"step": 4200
},
{
"epoch": 0.9166397066752938,
"grad_norm": 0.20816297829151154,
"learning_rate": 4.069686923135896e-05,
"loss": 0.9225,
"step": 4250
},
{
"epoch": 0.9274237032244149,
"grad_norm": 0.20184196531772614,
"learning_rate": 4.04692907621584e-05,
"loss": 0.9212,
"step": 4300
},
{
"epoch": 0.938207699773536,
"grad_norm": 0.1984609067440033,
"learning_rate": 4.023961582508704e-05,
"loss": 0.9261,
"step": 4350
},
{
"epoch": 0.9489916963226572,
"grad_norm": 0.22444488108158112,
"learning_rate": 4.000787554672553e-05,
"loss": 0.9291,
"step": 4400
},
{
"epoch": 0.9597756928717783,
"grad_norm": 0.21115441620349884,
"learning_rate": 3.977410133355884e-05,
"loss": 0.9349,
"step": 4450
},
{
"epoch": 0.9705596894208994,
"grad_norm": 0.19569146633148193,
"learning_rate": 3.953832486771996e-05,
"loss": 0.9049,
"step": 4500
},
{
"epoch": 0.9813436859700205,
"grad_norm": 0.22996151447296143,
"learning_rate": 3.930057810269612e-05,
"loss": 0.894,
"step": 4550
},
{
"epoch": 0.9921276825191416,
"grad_norm": 0.19879557192325592,
"learning_rate": 3.906089325899841e-05,
"loss": 0.955,
"step": 4600
},
{
"epoch": 1.0028038391027714,
"grad_norm": 0.207550510764122,
"learning_rate": 3.8819302819795046e-05,
"loss": 0.9362,
"step": 4650
},
{
"epoch": 1.0135878356518926,
"grad_norm": 0.20435990393161774,
"learning_rate": 3.8575839526509105e-05,
"loss": 0.9217,
"step": 4700
},
{
"epoch": 1.0243718322010138,
"grad_norm": 0.22362500429153442,
"learning_rate": 3.833053637438128e-05,
"loss": 0.9342,
"step": 4750
},
{
"epoch": 1.0351558287501348,
"grad_norm": 0.18318387866020203,
"learning_rate": 3.8083426607998216e-05,
"loss": 0.8937,
"step": 4800
},
{
"epoch": 1.045939825299256,
"grad_norm": 0.20834890007972717,
"learning_rate": 3.783454371678705e-05,
"loss": 0.9103,
"step": 4850
},
{
"epoch": 1.056723821848377,
"grad_norm": 0.2138434648513794,
"learning_rate": 3.758392143047677e-05,
"loss": 0.9003,
"step": 4900
},
{
"epoch": 1.0675078183974982,
"grad_norm": 0.21266281604766846,
"learning_rate": 3.733159371452701e-05,
"loss": 0.9142,
"step": 4950
},
{
"epoch": 1.0782918149466192,
"grad_norm": 0.25879135727882385,
"learning_rate": 3.707759476552489e-05,
"loss": 0.8976,
"step": 5000
}
],
"logging_steps": 50,
"max_steps": 13911,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.237807547251268e+19,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}