lr2.0e-06_data-mix_assistant_only / trainer_state.json
Gabe-Thomp's picture
Model save
cfdce29 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 366,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0412796697626419,
"grad_norm": 16.710033810394073,
"learning_rate": 2.1621621621621622e-07,
"loss": 1.0133,
"mean_token_accuracy": 0.7553978890180588,
"num_tokens": 474561.0,
"step": 5
},
{
"epoch": 0.0825593395252838,
"grad_norm": 14.528834570178129,
"learning_rate": 4.864864864864865e-07,
"loss": 0.9896,
"mean_token_accuracy": 0.7556369215250015,
"num_tokens": 947546.0,
"step": 10
},
{
"epoch": 0.1238390092879257,
"grad_norm": 8.780161837177058,
"learning_rate": 7.567567567567568e-07,
"loss": 0.8241,
"mean_token_accuracy": 0.7811301812529564,
"num_tokens": 1418691.0,
"step": 15
},
{
"epoch": 0.1651186790505676,
"grad_norm": 2.7100520665485894,
"learning_rate": 1.0270270270270269e-06,
"loss": 0.7011,
"mean_token_accuracy": 0.794181127846241,
"num_tokens": 1892249.0,
"step": 20
},
{
"epoch": 0.20639834881320948,
"grad_norm": 1.7196639977337644,
"learning_rate": 1.2972972972972972e-06,
"loss": 0.6437,
"mean_token_accuracy": 0.800345453619957,
"num_tokens": 2366197.0,
"step": 25
},
{
"epoch": 0.2476780185758514,
"grad_norm": 1.680207863114678,
"learning_rate": 1.5675675675675676e-06,
"loss": 0.5926,
"mean_token_accuracy": 0.8132463812828064,
"num_tokens": 2838384.0,
"step": 30
},
{
"epoch": 0.2889576883384933,
"grad_norm": 1.164814654682159,
"learning_rate": 1.837837837837838e-06,
"loss": 0.5987,
"mean_token_accuracy": 0.8100811287760734,
"num_tokens": 3309793.0,
"step": 35
},
{
"epoch": 0.3302373581011352,
"grad_norm": 1.198281328185356,
"learning_rate": 1.9998176420316e-06,
"loss": 0.5949,
"mean_token_accuracy": 0.809085787832737,
"num_tokens": 3784285.0,
"step": 40
},
{
"epoch": 0.3715170278637771,
"grad_norm": 1.284722675046685,
"learning_rate": 1.9977668786231533e-06,
"loss": 0.6033,
"mean_token_accuracy": 0.8060941636562348,
"num_tokens": 4259188.0,
"step": 45
},
{
"epoch": 0.41279669762641896,
"grad_norm": 1.155248536480801,
"learning_rate": 1.993442093851331e-06,
"loss": 0.5767,
"mean_token_accuracy": 0.8136543348431587,
"num_tokens": 4733149.0,
"step": 50
},
{
"epoch": 0.4540763673890609,
"grad_norm": 1.1147204245893136,
"learning_rate": 1.986853144380224e-06,
"loss": 0.5667,
"mean_token_accuracy": 0.8162353426218033,
"num_tokens": 5207210.0,
"step": 55
},
{
"epoch": 0.4953560371517028,
"grad_norm": 1.112840928320902,
"learning_rate": 1.9780150471563555e-06,
"loss": 0.5869,
"mean_token_accuracy": 0.810538823902607,
"num_tokens": 5683326.0,
"step": 60
},
{
"epoch": 0.5366357069143447,
"grad_norm": 1.0402844533961697,
"learning_rate": 1.9669479451833974e-06,
"loss": 0.5756,
"mean_token_accuracy": 0.8130813702940941,
"num_tokens": 6156332.0,
"step": 65
},
{
"epoch": 0.5779153766769866,
"grad_norm": 1.0779121215536756,
"learning_rate": 1.9536770616140275e-06,
"loss": 0.5658,
"mean_token_accuracy": 0.8173741087317467,
"num_tokens": 6628529.0,
"step": 70
},
{
"epoch": 0.6191950464396285,
"grad_norm": 1.0854741991980508,
"learning_rate": 1.9382326422635704e-06,
"loss": 0.5613,
"mean_token_accuracy": 0.8183763369917869,
"num_tokens": 7104675.0,
"step": 75
},
{
"epoch": 0.6604747162022704,
"grad_norm": 1.03078206062344,
"learning_rate": 1.920649886676429e-06,
"loss": 0.5246,
"mean_token_accuracy": 0.8290216967463493,
"num_tokens": 7575339.0,
"step": 80
},
{
"epoch": 0.7017543859649122,
"grad_norm": 1.0790498772904935,
"learning_rate": 1.9009688679024189e-06,
"loss": 0.5674,
"mean_token_accuracy": 0.8155619785189628,
"num_tokens": 8049585.0,
"step": 85
},
{
"epoch": 0.7430340557275542,
"grad_norm": 1.0644009479188448,
"learning_rate": 1.8792344411658468e-06,
"loss": 0.5484,
"mean_token_accuracy": 0.8202491089701652,
"num_tokens": 8523864.0,
"step": 90
},
{
"epoch": 0.7843137254901961,
"grad_norm": 1.0597366929661158,
"learning_rate": 1.8554961416354758e-06,
"loss": 0.5495,
"mean_token_accuracy": 0.8222089603543281,
"num_tokens": 8993860.0,
"step": 95
},
{
"epoch": 0.8255933952528379,
"grad_norm": 1.0650070028025487,
"learning_rate": 1.8298080715283857e-06,
"loss": 0.5536,
"mean_token_accuracy": 0.8196913883090019,
"num_tokens": 9467969.0,
"step": 100
},
{
"epoch": 0.8668730650154799,
"grad_norm": 1.0912797921529367,
"learning_rate": 1.80222877680502e-06,
"loss": 0.5694,
"mean_token_accuracy": 0.8143000498414039,
"num_tokens": 9943019.0,
"step": 105
},
{
"epoch": 0.9081527347781218,
"grad_norm": 1.099585144155188,
"learning_rate": 1.7728211137364486e-06,
"loss": 0.5469,
"mean_token_accuracy": 0.820717391371727,
"num_tokens": 10416469.0,
"step": 110
},
{
"epoch": 0.9494324045407637,
"grad_norm": 1.1065390692517416,
"learning_rate": 1.7416521056479575e-06,
"loss": 0.5654,
"mean_token_accuracy": 0.8159454494714737,
"num_tokens": 10891463.0,
"step": 115
},
{
"epoch": 0.9907120743034056,
"grad_norm": 1.0600231806965752,
"learning_rate": 1.7087927901654556e-06,
"loss": 0.5343,
"mean_token_accuracy": 0.8255413874983788,
"num_tokens": 11362461.0,
"step": 120
},
{
"epoch": 1.0247678018575852,
"grad_norm": 1.075516597064286,
"learning_rate": 1.6743180573128493e-06,
"loss": 0.5489,
"mean_token_accuracy": 0.8342628081639608,
"num_tokens": 11754365.0,
"step": 125
},
{
"epoch": 1.066047471620227,
"grad_norm": 1.1024958520463706,
"learning_rate": 1.6383064788293728e-06,
"loss": 0.4588,
"mean_token_accuracy": 0.8454525545239449,
"num_tokens": 12226786.0,
"step": 130
},
{
"epoch": 1.107327141382869,
"grad_norm": 1.1227030716421933,
"learning_rate": 1.6008401290958805e-06,
"loss": 0.4695,
"mean_token_accuracy": 0.8423517674207688,
"num_tokens": 12699204.0,
"step": 135
},
{
"epoch": 1.1486068111455108,
"grad_norm": 1.1177693050643365,
"learning_rate": 1.5620043980782325e-06,
"loss": 0.4656,
"mean_token_accuracy": 0.8432464152574539,
"num_tokens": 13176184.0,
"step": 140
},
{
"epoch": 1.1898864809081529,
"grad_norm": 1.1746918069579293,
"learning_rate": 1.521887796714092e-06,
"loss": 0.4549,
"mean_token_accuracy": 0.845870116353035,
"num_tokens": 13647750.0,
"step": 145
},
{
"epoch": 1.2311661506707947,
"grad_norm": 1.1505216678817896,
"learning_rate": 1.4805817551866838e-06,
"loss": 0.4701,
"mean_token_accuracy": 0.8416765749454498,
"num_tokens": 14123743.0,
"step": 150
},
{
"epoch": 1.2724458204334366,
"grad_norm": 1.1436761762759031,
"learning_rate": 1.438180414545267e-06,
"loss": 0.438,
"mean_token_accuracy": 0.8506157398223877,
"num_tokens": 14596023.0,
"step": 155
},
{
"epoch": 1.3137254901960784,
"grad_norm": 1.1647013953489513,
"learning_rate": 1.394780412147245e-06,
"loss": 0.4522,
"mean_token_accuracy": 0.8473987281322479,
"num_tokens": 15071031.0,
"step": 160
},
{
"epoch": 1.3550051599587203,
"grad_norm": 1.1333033999243753,
"learning_rate": 1.3504806614109097e-06,
"loss": 0.437,
"mean_token_accuracy": 0.8531909629702568,
"num_tokens": 15543244.0,
"step": 165
},
{
"epoch": 1.3962848297213624,
"grad_norm": 1.1120619686788833,
"learning_rate": 1.3053821263807945e-06,
"loss": 0.4634,
"mean_token_accuracy": 0.8448714122176171,
"num_tokens": 16017065.0,
"step": 170
},
{
"epoch": 1.437564499484004,
"grad_norm": 1.111129800110557,
"learning_rate": 1.2595875916194184e-06,
"loss": 0.4417,
"mean_token_accuracy": 0.849587918817997,
"num_tokens": 16491281.0,
"step": 175
},
{
"epoch": 1.478844169246646,
"grad_norm": 1.150776254338656,
"learning_rate": 1.21320142794987e-06,
"loss": 0.4562,
"mean_token_accuracy": 0.8464704275131225,
"num_tokens": 16968313.0,
"step": 180
},
{
"epoch": 1.520123839009288,
"grad_norm": 1.2131274987610103,
"learning_rate": 1.16632935458313e-06,
"loss": 0.45,
"mean_token_accuracy": 0.8496235758066177,
"num_tokens": 17441110.0,
"step": 185
},
{
"epoch": 1.5614035087719298,
"grad_norm": 1.2050312039209545,
"learning_rate": 1.119078198172262e-06,
"loss": 0.4607,
"mean_token_accuracy": 0.8451412498950959,
"num_tokens": 17916232.0,
"step": 190
},
{
"epoch": 1.6026831785345719,
"grad_norm": 1.1254347771320186,
"learning_rate": 1.071555649342626e-06,
"loss": 0.4633,
"mean_token_accuracy": 0.8458669915795326,
"num_tokens": 18390551.0,
"step": 195
},
{
"epoch": 1.6439628482972135,
"grad_norm": 1.1206711136434166,
"learning_rate": 1.0238700172530007e-06,
"loss": 0.4512,
"mean_token_accuracy": 0.8467774465680122,
"num_tokens": 18864159.0,
"step": 200
},
{
"epoch": 1.6852425180598556,
"grad_norm": 1.1206183234114782,
"learning_rate": 9.761299827469992e-07,
"loss": 0.4546,
"mean_token_accuracy": 0.8466342076659202,
"num_tokens": 19335203.0,
"step": 205
},
{
"epoch": 1.7265221878224974,
"grad_norm": 1.1225182789231243,
"learning_rate": 9.284443506573739e-07,
"loss": 0.453,
"mean_token_accuracy": 0.8467301100492477,
"num_tokens": 19807808.0,
"step": 210
},
{
"epoch": 1.7678018575851393,
"grad_norm": 1.0862991936704458,
"learning_rate": 8.809218018277377e-07,
"loss": 0.4345,
"mean_token_accuracy": 0.852784389257431,
"num_tokens": 20281710.0,
"step": 215
},
{
"epoch": 1.8090815273477814,
"grad_norm": 1.048025349811373,
"learning_rate": 8.336706454168699e-07,
"loss": 0.4465,
"mean_token_accuracy": 0.8481804400682449,
"num_tokens": 20751414.0,
"step": 220
},
{
"epoch": 1.850361197110423,
"grad_norm": 1.106143000786653,
"learning_rate": 7.8679857205013e-07,
"loss": 0.4481,
"mean_token_accuracy": 0.848122601211071,
"num_tokens": 21223522.0,
"step": 225
},
{
"epoch": 1.891640866873065,
"grad_norm": 1.143127191810739,
"learning_rate": 7.404124083805818e-07,
"loss": 0.4531,
"mean_token_accuracy": 0.8467795923352242,
"num_tokens": 21697455.0,
"step": 230
},
{
"epoch": 1.932920536635707,
"grad_norm": 1.2019232454749589,
"learning_rate": 6.946178736192052e-07,
"loss": 0.463,
"mean_token_accuracy": 0.8440817475318909,
"num_tokens": 22171241.0,
"step": 235
},
{
"epoch": 1.9742002063983488,
"grad_norm": 1.1622964289091862,
"learning_rate": 6.495193385890901e-07,
"loss": 0.45,
"mean_token_accuracy": 0.8478259801864624,
"num_tokens": 22643133.0,
"step": 240
},
{
"epoch": 2.0082559339525283,
"grad_norm": 1.3310228664644796,
"learning_rate": 6.052195878527549e-07,
"loss": 0.4939,
"mean_token_accuracy": 0.8443521983695753,
"num_tokens": 23036728.0,
"step": 245
},
{
"epoch": 2.0495356037151704,
"grad_norm": 1.1421506298255963,
"learning_rate": 5.618195854547332e-07,
"loss": 0.401,
"mean_token_accuracy": 0.8647464781999588,
"num_tokens": 23512046.0,
"step": 250
},
{
"epoch": 2.090815273477812,
"grad_norm": 1.0987074279262417,
"learning_rate": 5.194182448133162e-07,
"loss": 0.3902,
"mean_token_accuracy": 0.8662704512476921,
"num_tokens": 23988482.0,
"step": 255
},
{
"epoch": 2.132094943240454,
"grad_norm": 1.135056862447882,
"learning_rate": 4.781122032859079e-07,
"loss": 0.3869,
"mean_token_accuracy": 0.8672365352511406,
"num_tokens": 24462226.0,
"step": 260
},
{
"epoch": 2.173374613003096,
"grad_norm": 1.144024286056472,
"learning_rate": 4.379956019217674e-07,
"loss": 0.3816,
"mean_token_accuracy": 0.868879072368145,
"num_tokens": 24935522.0,
"step": 265
},
{
"epoch": 2.214654282765738,
"grad_norm": 1.095313893833739,
"learning_rate": 3.991598709041195e-07,
"loss": 0.398,
"mean_token_accuracy": 0.8641743138432503,
"num_tokens": 25412409.0,
"step": 270
},
{
"epoch": 2.25593395252838,
"grad_norm": 1.146038956350082,
"learning_rate": 3.6169352117062745e-07,
"loss": 0.3963,
"mean_token_accuracy": 0.8645552083849907,
"num_tokens": 25887799.0,
"step": 275
},
{
"epoch": 2.2972136222910216,
"grad_norm": 1.1485726690588176,
"learning_rate": 3.2568194268715065e-07,
"loss": 0.3639,
"mean_token_accuracy": 0.873411850631237,
"num_tokens": 26358464.0,
"step": 280
},
{
"epoch": 2.3384932920536636,
"grad_norm": 1.1969513148119446,
"learning_rate": 2.912072098345446e-07,
"loss": 0.3641,
"mean_token_accuracy": 0.8735954254865647,
"num_tokens": 26830555.0,
"step": 285
},
{
"epoch": 2.3797729618163057,
"grad_norm": 1.1692613851184464,
"learning_rate": 2.583478943520424e-07,
"loss": 0.3813,
"mean_token_accuracy": 0.8684006243944168,
"num_tokens": 27301887.0,
"step": 290
},
{
"epoch": 2.4210526315789473,
"grad_norm": 1.1756633565591612,
"learning_rate": 2.271788862635513e-07,
"loss": 0.3689,
"mean_token_accuracy": 0.8724981382489204,
"num_tokens": 27770674.0,
"step": 295
},
{
"epoch": 2.4623323013415894,
"grad_norm": 1.1723451227641641,
"learning_rate": 1.9777122319497986e-07,
"loss": 0.4079,
"mean_token_accuracy": 0.8619420573115348,
"num_tokens": 28245209.0,
"step": 300
},
{
"epoch": 2.503611971104231,
"grad_norm": 1.1497630056766186,
"learning_rate": 1.7019192847161423e-07,
"loss": 0.3652,
"mean_token_accuracy": 0.872562825679779,
"num_tokens": 28717418.0,
"step": 305
},
{
"epoch": 2.544891640866873,
"grad_norm": 1.1389041755051739,
"learning_rate": 1.4450385836452428e-07,
"loss": 0.3915,
"mean_token_accuracy": 0.8656164303421974,
"num_tokens": 29191351.0,
"step": 310
},
{
"epoch": 2.586171310629515,
"grad_norm": 1.1966708809689266,
"learning_rate": 1.207655588341534e-07,
"loss": 0.3851,
"mean_token_accuracy": 0.8667084857821464,
"num_tokens": 29664862.0,
"step": 315
},
{
"epoch": 2.627450980392157,
"grad_norm": 1.1811552059060793,
"learning_rate": 9.903113209758096e-08,
"loss": 0.3768,
"mean_token_accuracy": 0.8696465089917182,
"num_tokens": 30139751.0,
"step": 320
},
{
"epoch": 2.6687306501547985,
"grad_norm": 1.185142290815498,
"learning_rate": 7.93501133235711e-08,
"loss": 0.3823,
"mean_token_accuracy": 0.8686287999153137,
"num_tokens": 30609687.0,
"step": 325
},
{
"epoch": 2.7100103199174406,
"grad_norm": 1.1477492199007593,
"learning_rate": 6.17673577364296e-08,
"loss": 0.3779,
"mean_token_accuracy": 0.8701810359954834,
"num_tokens": 31082327.0,
"step": 330
},
{
"epoch": 2.7512899896800826,
"grad_norm": 1.1198458468611587,
"learning_rate": 4.632293838597246e-08,
"loss": 0.3726,
"mean_token_accuracy": 0.8715211614966393,
"num_tokens": 31558388.0,
"step": 335
},
{
"epoch": 2.7925696594427247,
"grad_norm": 1.153660136950001,
"learning_rate": 3.305205481660245e-08,
"loss": 0.3933,
"mean_token_accuracy": 0.8645625025033951,
"num_tokens": 32034515.0,
"step": 340
},
{
"epoch": 2.8338493292053664,
"grad_norm": 1.1426440702388894,
"learning_rate": 2.19849528436441e-08,
"loss": 0.3854,
"mean_token_accuracy": 0.8678223595023156,
"num_tokens": 32507410.0,
"step": 345
},
{
"epoch": 2.875128998968008,
"grad_norm": 1.1521506874308294,
"learning_rate": 1.3146855619776132e-08,
"loss": 0.3604,
"mean_token_accuracy": 0.8755264401435852,
"num_tokens": 32978318.0,
"step": 350
},
{
"epoch": 2.91640866873065,
"grad_norm": 1.1215252031109357,
"learning_rate": 6.557906148669023e-09,
"loss": 0.3876,
"mean_token_accuracy": 0.8669898718595505,
"num_tokens": 33451754.0,
"step": 355
},
{
"epoch": 2.957688338493292,
"grad_norm": 1.1622153487674391,
"learning_rate": 2.233121376846836e-09,
"loss": 0.3952,
"mean_token_accuracy": 0.8642757371068001,
"num_tokens": 33925172.0,
"step": 360
},
{
"epoch": 2.998968008255934,
"grad_norm": 1.1751426215214322,
"learning_rate": 1.8235796839982664e-10,
"loss": 0.3871,
"mean_token_accuracy": 0.8679782792925834,
"num_tokens": 34399819.0,
"step": 365
},
{
"epoch": 3.0,
"mean_token_accuracy": 0.8686857223510742,
"num_tokens": 34411845.0,
"step": 366,
"total_flos": 131252961591296.0,
"train_loss": 0.4884359234017753,
"train_runtime": 11375.5467,
"train_samples_per_second": 4.088,
"train_steps_per_second": 0.032
}
],
"logging_steps": 5,
"max_steps": 366,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 131252961591296.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}