{
"best_global_step": 60006,
"best_metric": 0.7708992224677207,
"best_model_checkpoint": "./nvidia_domain_model_multilingual-e5-small/checkpoint-60006",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 60006,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.024997500249975,
"grad_norm": 7.94265604019165,
"learning_rate": 4.9584208245842084e-05,
"loss": 2.602,
"step": 500
},
{
"epoch": 0.04999500049995,
"grad_norm": 5.469175815582275,
"learning_rate": 4.9167583241675834e-05,
"loss": 1.8965,
"step": 1000
},
{
"epoch": 0.074992500749925,
"grad_norm": 7.713351726531982,
"learning_rate": 4.875095823750958e-05,
"loss": 1.604,
"step": 1500
},
{
"epoch": 0.0999900009999,
"grad_norm": 11.510587692260742,
"learning_rate": 4.833433323334334e-05,
"loss": 1.3957,
"step": 2000
},
{
"epoch": 0.12498750124987501,
"grad_norm": 12.626413345336914,
"learning_rate": 4.791770822917708e-05,
"loss": 1.322,
"step": 2500
},
{
"epoch": 0.14998500149985,
"grad_norm": 7.150252342224121,
"learning_rate": 4.750108322501083e-05,
"loss": 1.2218,
"step": 3000
},
{
"epoch": 0.17498250174982502,
"grad_norm": 16.782085418701172,
"learning_rate": 4.708445822084459e-05,
"loss": 1.195,
"step": 3500
},
{
"epoch": 0.1999800019998,
"grad_norm": 13.529509544372559,
"learning_rate": 4.666783321667834e-05,
"loss": 1.1313,
"step": 4000
},
{
"epoch": 0.22497750224977503,
"grad_norm": 19.662353515625,
"learning_rate": 4.625120821251208e-05,
"loss": 1.0902,
"step": 4500
},
{
"epoch": 0.24997500249975002,
"grad_norm": 11.194819450378418,
"learning_rate": 4.5834583208345836e-05,
"loss": 1.0637,
"step": 5000
},
{
"epoch": 0.274972502749725,
"grad_norm": 13.16511058807373,
"learning_rate": 4.5417958204179585e-05,
"loss": 1.0626,
"step": 5500
},
{
"epoch": 0.2999700029997,
"grad_norm": 9.290426254272461,
"learning_rate": 4.5001333200013335e-05,
"loss": 1.0054,
"step": 6000
},
{
"epoch": 0.32496750324967505,
"grad_norm": 17.698017120361328,
"learning_rate": 4.4584708195847084e-05,
"loss": 1.0253,
"step": 6500
},
{
"epoch": 0.34996500349965004,
"grad_norm": 15.605792999267578,
"learning_rate": 4.4168083191680834e-05,
"loss": 1.0127,
"step": 7000
},
{
"epoch": 0.37496250374962503,
"grad_norm": 33.41305923461914,
"learning_rate": 4.375145818751458e-05,
"loss": 0.9714,
"step": 7500
},
{
"epoch": 0.3999600039996,
"grad_norm": 18.213973999023438,
"learning_rate": 4.333483318334833e-05,
"loss": 0.9589,
"step": 8000
},
{
"epoch": 0.42495750424957507,
"grad_norm": 11.406991958618164,
"learning_rate": 4.291820817918208e-05,
"loss": 0.9808,
"step": 8500
},
{
"epoch": 0.44995500449955006,
"grad_norm": 15.420747756958008,
"learning_rate": 4.250158317501584e-05,
"loss": 0.9392,
"step": 9000
},
{
"epoch": 0.47495250474952505,
"grad_norm": 19.129817962646484,
"learning_rate": 4.208495817084958e-05,
"loss": 0.9304,
"step": 9500
},
{
"epoch": 0.49995000499950004,
"grad_norm": 9.371217727661133,
"learning_rate": 4.166833316668333e-05,
"loss": 0.9369,
"step": 10000
},
{
"epoch": 0.5249475052494751,
"grad_norm": 11.86233901977539,
"learning_rate": 4.1251708162517086e-05,
"loss": 0.9181,
"step": 10500
},
{
"epoch": 0.54994500549945,
"grad_norm": 16.078561782836914,
"learning_rate": 4.0835083158350836e-05,
"loss": 0.8996,
"step": 11000
},
{
"epoch": 0.5749425057494251,
"grad_norm": 8.514225006103516,
"learning_rate": 4.0418458154184585e-05,
"loss": 0.9111,
"step": 11500
},
{
"epoch": 0.5999400059994,
"grad_norm": 7.778424263000488,
"learning_rate": 4.000183315001833e-05,
"loss": 0.9033,
"step": 12000
},
{
"epoch": 0.624937506249375,
"grad_norm": 10.383719444274902,
"learning_rate": 3.9585208145852084e-05,
"loss": 0.917,
"step": 12500
},
{
"epoch": 0.6499350064993501,
"grad_norm": 12.048624992370605,
"learning_rate": 3.9168583141685834e-05,
"loss": 0.8872,
"step": 13000
},
{
"epoch": 0.674932506749325,
"grad_norm": 14.255531311035156,
"learning_rate": 3.875195813751958e-05,
"loss": 0.8604,
"step": 13500
},
{
"epoch": 0.6999300069993001,
"grad_norm": 15.18703556060791,
"learning_rate": 3.833533313335333e-05,
"loss": 0.8628,
"step": 14000
},
{
"epoch": 0.7249275072492751,
"grad_norm": 12.154521942138672,
"learning_rate": 3.791870812918708e-05,
"loss": 0.8929,
"step": 14500
},
{
"epoch": 0.7499250074992501,
"grad_norm": 14.692411422729492,
"learning_rate": 3.750208312502083e-05,
"loss": 0.8585,
"step": 15000
},
{
"epoch": 0.7749225077492251,
"grad_norm": 8.900308609008789,
"learning_rate": 3.708545812085458e-05,
"loss": 0.9014,
"step": 15500
},
{
"epoch": 0.7999200079992,
"grad_norm": 18.15697479248047,
"learning_rate": 3.666883311668834e-05,
"loss": 0.8581,
"step": 16000
},
{
"epoch": 0.8249175082491751,
"grad_norm": 14.366026878356934,
"learning_rate": 3.6252208112522086e-05,
"loss": 0.8622,
"step": 16500
},
{
"epoch": 0.8499150084991501,
"grad_norm": 14.673120498657227,
"learning_rate": 3.583558310835583e-05,
"loss": 0.873,
"step": 17000
},
{
"epoch": 0.8749125087491251,
"grad_norm": 9.87514877319336,
"learning_rate": 3.541895810418958e-05,
"loss": 0.8446,
"step": 17500
},
{
"epoch": 0.8999100089991001,
"grad_norm": 20.0493221282959,
"learning_rate": 3.5002333100023335e-05,
"loss": 0.819,
"step": 18000
},
{
"epoch": 0.924907509249075,
"grad_norm": 18.50018882751465,
"learning_rate": 3.4585708095857084e-05,
"loss": 0.8458,
"step": 18500
},
{
"epoch": 0.9499050094990501,
"grad_norm": 16.332889556884766,
"learning_rate": 3.4169083091690833e-05,
"loss": 0.8458,
"step": 19000
},
{
"epoch": 0.9749025097490251,
"grad_norm": 11.074434280395508,
"learning_rate": 3.375245808752458e-05,
"loss": 0.8497,
"step": 19500
},
{
"epoch": 0.9999000099990001,
"grad_norm": 8.59486198425293,
"learning_rate": 3.333583308335833e-05,
"loss": 0.7989,
"step": 20000
},
{
"epoch": 1.0,
"eval_f1": 0.7451762918283228,
"eval_loss": 0.8514304757118225,
"eval_runtime": 10.7214,
"eval_samples_per_second": 1865.614,
"eval_steps_per_second": 233.272,
"step": 20002
},
{
"epoch": 1.024897510248975,
"grad_norm": 11.283440589904785,
"learning_rate": 3.291920807919208e-05,
"loss": 0.6034,
"step": 20500
},
{
"epoch": 1.0498950104989502,
"grad_norm": 14.751864433288574,
"learning_rate": 3.250258307502583e-05,
"loss": 0.6148,
"step": 21000
},
{
"epoch": 1.0748925107489251,
"grad_norm": 20.8693790435791,
"learning_rate": 3.208595807085959e-05,
"loss": 0.614,
"step": 21500
},
{
"epoch": 1.0998900109989,
"grad_norm": 15.057612419128418,
"learning_rate": 3.166933306669333e-05,
"loss": 0.5895,
"step": 22000
},
{
"epoch": 1.1248875112488752,
"grad_norm": 10.95419979095459,
"learning_rate": 3.125270806252708e-05,
"loss": 0.6483,
"step": 22500
},
{
"epoch": 1.1498850114988501,
"grad_norm": 17.469892501831055,
"learning_rate": 3.083608305836083e-05,
"loss": 0.6331,
"step": 23000
},
{
"epoch": 1.174882511748825,
"grad_norm": 20.316282272338867,
"learning_rate": 3.041945805419458e-05,
"loss": 0.5885,
"step": 23500
},
{
"epoch": 1.1998800119988,
"grad_norm": 5.562185764312744,
"learning_rate": 3.0002833050028334e-05,
"loss": 0.6082,
"step": 24000
},
{
"epoch": 1.2248775122487752,
"grad_norm": 17.523334503173828,
"learning_rate": 2.958620804586208e-05,
"loss": 0.6312,
"step": 24500
},
{
"epoch": 1.24987501249875,
"grad_norm": 20.40757179260254,
"learning_rate": 2.916958304169583e-05,
"loss": 0.6033,
"step": 25000
},
{
"epoch": 1.274872512748725,
"grad_norm": 18.183963775634766,
"learning_rate": 2.8752958037529583e-05,
"loss": 0.6006,
"step": 25500
},
{
"epoch": 1.2998700129987002,
"grad_norm": 4.399472236633301,
"learning_rate": 2.8336333033363332e-05,
"loss": 0.6283,
"step": 26000
},
{
"epoch": 1.3248675132486751,
"grad_norm": 17.38117027282715,
"learning_rate": 2.7919708029197085e-05,
"loss": 0.6319,
"step": 26500
},
{
"epoch": 1.34986501349865,
"grad_norm": 9.839600563049316,
"learning_rate": 2.7503083025030828e-05,
"loss": 0.5913,
"step": 27000
},
{
"epoch": 1.3748625137486252,
"grad_norm": 3.2011570930480957,
"learning_rate": 2.708645802086458e-05,
"loss": 0.6037,
"step": 27500
},
{
"epoch": 1.3998600139986002,
"grad_norm": 9.335294723510742,
"learning_rate": 2.666983301669833e-05,
"loss": 0.6025,
"step": 28000
},
{
"epoch": 1.424857514248575,
"grad_norm": 26.70831298828125,
"learning_rate": 2.6253208012532083e-05,
"loss": 0.6067,
"step": 28500
},
{
"epoch": 1.4498550144985503,
"grad_norm": 16.662883758544922,
"learning_rate": 2.5836583008365832e-05,
"loss": 0.6075,
"step": 29000
},
{
"epoch": 1.4748525147485252,
"grad_norm": 18.168540954589844,
"learning_rate": 2.5419958004199578e-05,
"loss": 0.6035,
"step": 29500
},
{
"epoch": 1.4998500149985001,
"grad_norm": 49.09202575683594,
"learning_rate": 2.500333300003333e-05,
"loss": 0.5826,
"step": 30000
},
{
"epoch": 1.5248475152484753,
"grad_norm": 18.314056396484375,
"learning_rate": 2.458670799586708e-05,
"loss": 0.5905,
"step": 30500
},
{
"epoch": 1.54984501549845,
"grad_norm": 4.7171406745910645,
"learning_rate": 2.4170082991700833e-05,
"loss": 0.563,
"step": 31000
},
{
"epoch": 1.5748425157484252,
"grad_norm": 17.988279342651367,
"learning_rate": 2.375345798753458e-05,
"loss": 0.5795,
"step": 31500
},
{
"epoch": 1.5998400159984003,
"grad_norm": 15.996960639953613,
"learning_rate": 2.3336832983368332e-05,
"loss": 0.603,
"step": 32000
},
{
"epoch": 1.624837516248375,
"grad_norm": 15.832610130310059,
"learning_rate": 2.2920207979202078e-05,
"loss": 0.5805,
"step": 32500
},
{
"epoch": 1.6498350164983502,
"grad_norm": 33.191444396972656,
"learning_rate": 2.250358297503583e-05,
"loss": 0.6108,
"step": 33000
},
{
"epoch": 1.6748325167483251,
"grad_norm": 8.741061210632324,
"learning_rate": 2.208695797086958e-05,
"loss": 0.6077,
"step": 33500
},
{
"epoch": 1.6998300169983,
"grad_norm": 14.29039192199707,
"learning_rate": 2.167033296670333e-05,
"loss": 0.5751,
"step": 34000
},
{
"epoch": 1.7248275172482752,
"grad_norm": 21.69901466369629,
"learning_rate": 2.1253707962537083e-05,
"loss": 0.5833,
"step": 34500
},
{
"epoch": 1.7498250174982501,
"grad_norm": 32.595794677734375,
"learning_rate": 2.083708295837083e-05,
"loss": 0.5895,
"step": 35000
},
{
"epoch": 1.774822517748225,
"grad_norm": 42.687721252441406,
"learning_rate": 2.042045795420458e-05,
"loss": 0.5541,
"step": 35500
},
{
"epoch": 1.7998200179982002,
"grad_norm": 16.474918365478516,
"learning_rate": 2.000383295003833e-05,
"loss": 0.5423,
"step": 36000
},
{
"epoch": 1.8248175182481752,
"grad_norm": 13.296688079833984,
"learning_rate": 1.958720794587208e-05,
"loss": 0.5566,
"step": 36500
},
{
"epoch": 1.84981501849815,
"grad_norm": 18.645790100097656,
"learning_rate": 1.917058294170583e-05,
"loss": 0.5493,
"step": 37000
},
{
"epoch": 1.8748125187481253,
"grad_norm": 12.576258659362793,
"learning_rate": 1.875395793753958e-05,
"loss": 0.5602,
"step": 37500
},
{
"epoch": 1.8998100189981002,
"grad_norm": 21.95449447631836,
"learning_rate": 1.833733293337333e-05,
"loss": 0.5878,
"step": 38000
},
{
"epoch": 1.9248075192480751,
"grad_norm": 9.17590618133545,
"learning_rate": 1.792070792920708e-05,
"loss": 0.5681,
"step": 38500
},
{
"epoch": 1.9498050194980503,
"grad_norm": 12.517435073852539,
"learning_rate": 1.750408292504083e-05,
"loss": 0.5464,
"step": 39000
},
{
"epoch": 1.9748025197480252,
"grad_norm": 15.346318244934082,
"learning_rate": 1.708745792087458e-05,
"loss": 0.5917,
"step": 39500
},
{
"epoch": 1.9998000199980002,
"grad_norm": 29.321331024169922,
"learning_rate": 1.667083291670833e-05,
"loss": 0.5443,
"step": 40000
},
{
"epoch": 2.0,
"eval_f1": 0.7651638193675152,
"eval_loss": 0.8535689115524292,
"eval_runtime": 10.748,
"eval_samples_per_second": 1860.991,
"eval_steps_per_second": 232.694,
"step": 40004
},
{
"epoch": 2.0247975202479753,
"grad_norm": 24.93914794921875,
"learning_rate": 1.625420791254208e-05,
"loss": 0.3501,
"step": 40500
},
{
"epoch": 2.04979502049795,
"grad_norm": 50.30072784423828,
"learning_rate": 1.5837582908375832e-05,
"loss": 0.3785,
"step": 41000
},
{
"epoch": 2.074792520747925,
"grad_norm": 24.169206619262695,
"learning_rate": 1.5420957904209578e-05,
"loss": 0.4034,
"step": 41500
},
{
"epoch": 2.0997900209979004,
"grad_norm": 43.043338775634766,
"learning_rate": 1.500433290004333e-05,
"loss": 0.385,
"step": 42000
},
{
"epoch": 2.124787521247875,
"grad_norm": 1.601791262626648,
"learning_rate": 1.4587707895877079e-05,
"loss": 0.3758,
"step": 42500
},
{
"epoch": 2.1497850214978502,
"grad_norm": 1.0921714305877686,
"learning_rate": 1.417108289171083e-05,
"loss": 0.3713,
"step": 43000
},
{
"epoch": 2.1747825217478254,
"grad_norm": 23.122596740722656,
"learning_rate": 1.375445788754458e-05,
"loss": 0.413,
"step": 43500
},
{
"epoch": 2.1997800219978,
"grad_norm": 7.090549468994141,
"learning_rate": 1.3337832883378329e-05,
"loss": 0.3787,
"step": 44000
},
{
"epoch": 2.2247775222477753,
"grad_norm": 17.668933868408203,
"learning_rate": 1.292120787921208e-05,
"loss": 0.3805,
"step": 44500
},
{
"epoch": 2.2497750224977504,
"grad_norm": 15.878674507141113,
"learning_rate": 1.2504582875045829e-05,
"loss": 0.3757,
"step": 45000
},
{
"epoch": 2.274772522747725,
"grad_norm": 39.11751937866211,
"learning_rate": 1.2087957870879578e-05,
"loss": 0.3887,
"step": 45500
},
{
"epoch": 2.2997700229977003,
"grad_norm": 4.333780288696289,
"learning_rate": 1.167133286671333e-05,
"loss": 0.3789,
"step": 46000
},
{
"epoch": 2.324767523247675,
"grad_norm": 21.4094295501709,
"learning_rate": 1.1254707862547079e-05,
"loss": 0.3742,
"step": 46500
},
{
"epoch": 2.34976502349765,
"grad_norm": 14.586631774902344,
"learning_rate": 1.083808285838083e-05,
"loss": 0.3805,
"step": 47000
},
{
"epoch": 2.3747625237476253,
"grad_norm": 1.1548786163330078,
"learning_rate": 1.042145785421458e-05,
"loss": 0.3936,
"step": 47500
},
{
"epoch": 2.3997600239976,
"grad_norm": 0.03682245686650276,
"learning_rate": 1.0004832850048329e-05,
"loss": 0.38,
"step": 48000
},
{
"epoch": 2.424757524247575,
"grad_norm": 35.44232940673828,
"learning_rate": 9.588207845882078e-06,
"loss": 0.3941,
"step": 48500
},
{
"epoch": 2.4497550244975503,
"grad_norm": 8.77474594116211,
"learning_rate": 9.171582841715828e-06,
"loss": 0.4054,
"step": 49000
},
{
"epoch": 2.474752524747525,
"grad_norm": 13.013947486877441,
"learning_rate": 8.754957837549579e-06,
"loss": 0.3659,
"step": 49500
},
{
"epoch": 2.4997500249975,
"grad_norm": 14.281270980834961,
"learning_rate": 8.338332833383328e-06,
"loss": 0.3917,
"step": 50000
},
{
"epoch": 2.5247475252474754,
"grad_norm": 24.0106258392334,
"learning_rate": 7.92170782921708e-06,
"loss": 0.3876,
"step": 50500
},
{
"epoch": 2.54974502549745,
"grad_norm": 20.56169319152832,
"learning_rate": 7.505082825050828e-06,
"loss": 0.3628,
"step": 51000
},
{
"epoch": 2.5747425257474252,
"grad_norm": 0.7545715570449829,
"learning_rate": 7.088457820884578e-06,
"loss": 0.3918,
"step": 51500
},
{
"epoch": 2.5997400259974004,
"grad_norm": 17.21295928955078,
"learning_rate": 6.6718328167183295e-06,
"loss": 0.359,
"step": 52000
},
{
"epoch": 2.624737526247375,
"grad_norm": 9.85009479522705,
"learning_rate": 6.255207812552079e-06,
"loss": 0.3634,
"step": 52500
},
{
"epoch": 2.6497350264973503,
"grad_norm": 21.24859046936035,
"learning_rate": 5.838582808385828e-06,
"loss": 0.3737,
"step": 53000
},
{
"epoch": 2.6747325267473254,
"grad_norm": 13.614805221557617,
"learning_rate": 5.421957804219578e-06,
"loss": 0.4022,
"step": 53500
},
{
"epoch": 2.6997300269973,
"grad_norm": 5.028663158416748,
"learning_rate": 5.005332800053329e-06,
"loss": 0.3562,
"step": 54000
},
{
"epoch": 2.7247275272472753,
"grad_norm": 22.341398239135742,
"learning_rate": 4.588707795887078e-06,
"loss": 0.349,
"step": 54500
},
{
"epoch": 2.7497250274972505,
"grad_norm": 9.66286849975586,
"learning_rate": 4.172082791720828e-06,
"loss": 0.3573,
"step": 55000
},
{
"epoch": 2.774722527747225,
"grad_norm": 4.927464962005615,
"learning_rate": 3.755457787554578e-06,
"loss": 0.335,
"step": 55500
},
{
"epoch": 2.7997200279972003,
"grad_norm": 4.33281135559082,
"learning_rate": 3.338832783388328e-06,
"loss": 0.3679,
"step": 56000
},
{
"epoch": 2.8247175282471755,
"grad_norm": 0.29482612013816833,
"learning_rate": 2.9222077792220777e-06,
"loss": 0.3266,
"step": 56500
},
{
"epoch": 2.84971502849715,
"grad_norm": 21.363672256469727,
"learning_rate": 2.505582775055828e-06,
"loss": 0.3453,
"step": 57000
},
{
"epoch": 2.8747125287471254,
"grad_norm": 2.6021454334259033,
"learning_rate": 2.088957770889578e-06,
"loss": 0.3682,
"step": 57500
},
{
"epoch": 2.8997100289971005,
"grad_norm": 4.7911577224731445,
"learning_rate": 1.6723327667233275e-06,
"loss": 0.3417,
"step": 58000
},
{
"epoch": 2.924707529247075,
"grad_norm": 0.21428282558918,
"learning_rate": 1.2557077625570776e-06,
"loss": 0.3192,
"step": 58500
},
{
"epoch": 2.9497050294970504,
"grad_norm": 1.2091667652130127,
"learning_rate": 8.390827583908276e-07,
"loss": 0.3375,
"step": 59000
},
{
"epoch": 2.9747025297470255,
"grad_norm": 24.199045181274414,
"learning_rate": 4.2245775422457754e-07,
"loss": 0.3669,
"step": 59500
},
{
"epoch": 2.9997000299970003,
"grad_norm": 5.163976669311523,
"learning_rate": 5.832750058327501e-09,
"loss": 0.332,
"step": 60000
},
{
"epoch": 3.0,
"eval_f1": 0.7708992224677207,
"eval_loss": 0.9973717331886292,
"eval_runtime": 11.045,
"eval_samples_per_second": 1810.963,
"eval_steps_per_second": 226.438,
"step": 60006
},
{
"epoch": 3.0,
"step": 60006,
"total_flos": 7908628105405440.0,
"train_loss": 0.6686365460569141,
"train_runtime": 1639.5865,
"train_samples_per_second": 292.775,
"train_steps_per_second": 36.598
}
],
"logging_steps": 500,
"max_steps": 60006,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7908628105405440.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}