{ "best_global_step": 60006, "best_metric": 0.7708992224677207, "best_model_checkpoint": "./nvidia_domain_model_multilingual-e5-small/checkpoint-60006", "epoch": 3.0, "eval_steps": 500, "global_step": 60006, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024997500249975, "grad_norm": 7.94265604019165, "learning_rate": 4.9584208245842084e-05, "loss": 2.602, "step": 500 }, { "epoch": 0.04999500049995, "grad_norm": 5.469175815582275, "learning_rate": 4.9167583241675834e-05, "loss": 1.8965, "step": 1000 }, { "epoch": 0.074992500749925, "grad_norm": 7.713351726531982, "learning_rate": 4.875095823750958e-05, "loss": 1.604, "step": 1500 }, { "epoch": 0.0999900009999, "grad_norm": 11.510587692260742, "learning_rate": 4.833433323334334e-05, "loss": 1.3957, "step": 2000 }, { "epoch": 0.12498750124987501, "grad_norm": 12.626413345336914, "learning_rate": 4.791770822917708e-05, "loss": 1.322, "step": 2500 }, { "epoch": 0.14998500149985, "grad_norm": 7.150252342224121, "learning_rate": 4.750108322501083e-05, "loss": 1.2218, "step": 3000 }, { "epoch": 0.17498250174982502, "grad_norm": 16.782085418701172, "learning_rate": 4.708445822084459e-05, "loss": 1.195, "step": 3500 }, { "epoch": 0.1999800019998, "grad_norm": 13.529509544372559, "learning_rate": 4.666783321667834e-05, "loss": 1.1313, "step": 4000 }, { "epoch": 0.22497750224977503, "grad_norm": 19.662353515625, "learning_rate": 4.625120821251208e-05, "loss": 1.0902, "step": 4500 }, { "epoch": 0.24997500249975002, "grad_norm": 11.194819450378418, "learning_rate": 4.5834583208345836e-05, "loss": 1.0637, "step": 5000 }, { "epoch": 0.274972502749725, "grad_norm": 13.16511058807373, "learning_rate": 4.5417958204179585e-05, "loss": 1.0626, "step": 5500 }, { "epoch": 0.2999700029997, "grad_norm": 9.290426254272461, "learning_rate": 4.5001333200013335e-05, "loss": 1.0054, "step": 6000 }, { "epoch": 0.32496750324967505, "grad_norm": 17.698017120361328, "learning_rate": 4.4584708195847084e-05, "loss": 1.0253, "step": 6500 }, { "epoch": 0.34996500349965004, "grad_norm": 15.605792999267578, "learning_rate": 4.4168083191680834e-05, "loss": 1.0127, "step": 7000 }, { "epoch": 0.37496250374962503, "grad_norm": 33.41305923461914, "learning_rate": 4.375145818751458e-05, "loss": 0.9714, "step": 7500 }, { "epoch": 0.3999600039996, "grad_norm": 18.213973999023438, "learning_rate": 4.333483318334833e-05, "loss": 0.9589, "step": 8000 }, { "epoch": 0.42495750424957507, "grad_norm": 11.406991958618164, "learning_rate": 4.291820817918208e-05, "loss": 0.9808, "step": 8500 }, { "epoch": 0.44995500449955006, "grad_norm": 15.420747756958008, "learning_rate": 4.250158317501584e-05, "loss": 0.9392, "step": 9000 }, { "epoch": 0.47495250474952505, "grad_norm": 19.129817962646484, "learning_rate": 4.208495817084958e-05, "loss": 0.9304, "step": 9500 }, { "epoch": 0.49995000499950004, "grad_norm": 9.371217727661133, "learning_rate": 4.166833316668333e-05, "loss": 0.9369, "step": 10000 }, { "epoch": 0.5249475052494751, "grad_norm": 11.86233901977539, "learning_rate": 4.1251708162517086e-05, "loss": 0.9181, "step": 10500 }, { "epoch": 0.54994500549945, "grad_norm": 16.078561782836914, "learning_rate": 4.0835083158350836e-05, "loss": 0.8996, "step": 11000 }, { "epoch": 0.5749425057494251, "grad_norm": 8.514225006103516, "learning_rate": 4.0418458154184585e-05, "loss": 0.9111, "step": 11500 }, { "epoch": 0.5999400059994, "grad_norm": 7.778424263000488, "learning_rate": 4.000183315001833e-05, "loss": 0.9033, "step": 12000 }, { "epoch": 0.624937506249375, "grad_norm": 10.383719444274902, "learning_rate": 3.9585208145852084e-05, "loss": 0.917, "step": 12500 }, { "epoch": 0.6499350064993501, "grad_norm": 12.048624992370605, "learning_rate": 3.9168583141685834e-05, "loss": 0.8872, "step": 13000 }, { "epoch": 0.674932506749325, "grad_norm": 14.255531311035156, "learning_rate": 3.875195813751958e-05, "loss": 0.8604, "step": 13500 }, { "epoch": 0.6999300069993001, "grad_norm": 15.18703556060791, "learning_rate": 3.833533313335333e-05, "loss": 0.8628, "step": 14000 }, { "epoch": 0.7249275072492751, "grad_norm": 12.154521942138672, "learning_rate": 3.791870812918708e-05, "loss": 0.8929, "step": 14500 }, { "epoch": 0.7499250074992501, "grad_norm": 14.692411422729492, "learning_rate": 3.750208312502083e-05, "loss": 0.8585, "step": 15000 }, { "epoch": 0.7749225077492251, "grad_norm": 8.900308609008789, "learning_rate": 3.708545812085458e-05, "loss": 0.9014, "step": 15500 }, { "epoch": 0.7999200079992, "grad_norm": 18.15697479248047, "learning_rate": 3.666883311668834e-05, "loss": 0.8581, "step": 16000 }, { "epoch": 0.8249175082491751, "grad_norm": 14.366026878356934, "learning_rate": 3.6252208112522086e-05, "loss": 0.8622, "step": 16500 }, { "epoch": 0.8499150084991501, "grad_norm": 14.673120498657227, "learning_rate": 3.583558310835583e-05, "loss": 0.873, "step": 17000 }, { "epoch": 0.8749125087491251, "grad_norm": 9.87514877319336, "learning_rate": 3.541895810418958e-05, "loss": 0.8446, "step": 17500 }, { "epoch": 0.8999100089991001, "grad_norm": 20.0493221282959, "learning_rate": 3.5002333100023335e-05, "loss": 0.819, "step": 18000 }, { "epoch": 0.924907509249075, "grad_norm": 18.50018882751465, "learning_rate": 3.4585708095857084e-05, "loss": 0.8458, "step": 18500 }, { "epoch": 0.9499050094990501, "grad_norm": 16.332889556884766, "learning_rate": 3.4169083091690833e-05, "loss": 0.8458, "step": 19000 }, { "epoch": 0.9749025097490251, "grad_norm": 11.074434280395508, "learning_rate": 3.375245808752458e-05, "loss": 0.8497, "step": 19500 }, { "epoch": 0.9999000099990001, "grad_norm": 8.59486198425293, "learning_rate": 3.333583308335833e-05, "loss": 0.7989, "step": 20000 }, { "epoch": 1.0, "eval_f1": 0.7451762918283228, "eval_loss": 0.8514304757118225, "eval_runtime": 10.7214, "eval_samples_per_second": 1865.614, "eval_steps_per_second": 233.272, "step": 20002 }, { "epoch": 1.024897510248975, "grad_norm": 11.283440589904785, "learning_rate": 3.291920807919208e-05, "loss": 0.6034, "step": 20500 }, { "epoch": 1.0498950104989502, "grad_norm": 14.751864433288574, "learning_rate": 3.250258307502583e-05, "loss": 0.6148, "step": 21000 }, { "epoch": 1.0748925107489251, "grad_norm": 20.8693790435791, "learning_rate": 3.208595807085959e-05, "loss": 0.614, "step": 21500 }, { "epoch": 1.0998900109989, "grad_norm": 15.057612419128418, "learning_rate": 3.166933306669333e-05, "loss": 0.5895, "step": 22000 }, { "epoch": 1.1248875112488752, "grad_norm": 10.95419979095459, "learning_rate": 3.125270806252708e-05, "loss": 0.6483, "step": 22500 }, { "epoch": 1.1498850114988501, "grad_norm": 17.469892501831055, "learning_rate": 3.083608305836083e-05, "loss": 0.6331, "step": 23000 }, { "epoch": 1.174882511748825, "grad_norm": 20.316282272338867, "learning_rate": 3.041945805419458e-05, "loss": 0.5885, "step": 23500 }, { "epoch": 1.1998800119988, "grad_norm": 5.562185764312744, "learning_rate": 3.0002833050028334e-05, "loss": 0.6082, "step": 24000 }, { "epoch": 1.2248775122487752, "grad_norm": 17.523334503173828, "learning_rate": 2.958620804586208e-05, "loss": 0.6312, "step": 24500 }, { "epoch": 1.24987501249875, "grad_norm": 20.40757179260254, "learning_rate": 2.916958304169583e-05, "loss": 0.6033, "step": 25000 }, { "epoch": 1.274872512748725, "grad_norm": 18.183963775634766, "learning_rate": 2.8752958037529583e-05, "loss": 0.6006, "step": 25500 }, { "epoch": 1.2998700129987002, "grad_norm": 4.399472236633301, "learning_rate": 2.8336333033363332e-05, "loss": 0.6283, "step": 26000 }, { "epoch": 1.3248675132486751, "grad_norm": 17.38117027282715, "learning_rate": 2.7919708029197085e-05, "loss": 0.6319, "step": 26500 }, { "epoch": 1.34986501349865, "grad_norm": 9.839600563049316, "learning_rate": 2.7503083025030828e-05, "loss": 0.5913, "step": 27000 }, { "epoch": 1.3748625137486252, "grad_norm": 3.2011570930480957, "learning_rate": 2.708645802086458e-05, "loss": 0.6037, "step": 27500 }, { "epoch": 1.3998600139986002, "grad_norm": 9.335294723510742, "learning_rate": 2.666983301669833e-05, "loss": 0.6025, "step": 28000 }, { "epoch": 1.424857514248575, "grad_norm": 26.70831298828125, "learning_rate": 2.6253208012532083e-05, "loss": 0.6067, "step": 28500 }, { "epoch": 1.4498550144985503, "grad_norm": 16.662883758544922, "learning_rate": 2.5836583008365832e-05, "loss": 0.6075, "step": 29000 }, { "epoch": 1.4748525147485252, "grad_norm": 18.168540954589844, "learning_rate": 2.5419958004199578e-05, "loss": 0.6035, "step": 29500 }, { "epoch": 1.4998500149985001, "grad_norm": 49.09202575683594, "learning_rate": 2.500333300003333e-05, "loss": 0.5826, "step": 30000 }, { "epoch": 1.5248475152484753, "grad_norm": 18.314056396484375, "learning_rate": 2.458670799586708e-05, "loss": 0.5905, "step": 30500 }, { "epoch": 1.54984501549845, "grad_norm": 4.7171406745910645, "learning_rate": 2.4170082991700833e-05, "loss": 0.563, "step": 31000 }, { "epoch": 1.5748425157484252, "grad_norm": 17.988279342651367, "learning_rate": 2.375345798753458e-05, "loss": 0.5795, "step": 31500 }, { "epoch": 1.5998400159984003, "grad_norm": 15.996960639953613, "learning_rate": 2.3336832983368332e-05, "loss": 0.603, "step": 32000 }, { "epoch": 1.624837516248375, "grad_norm": 15.832610130310059, "learning_rate": 2.2920207979202078e-05, "loss": 0.5805, "step": 32500 }, { "epoch": 1.6498350164983502, "grad_norm": 33.191444396972656, "learning_rate": 2.250358297503583e-05, "loss": 0.6108, "step": 33000 }, { "epoch": 1.6748325167483251, "grad_norm": 8.741061210632324, "learning_rate": 2.208695797086958e-05, "loss": 0.6077, "step": 33500 }, { "epoch": 1.6998300169983, "grad_norm": 14.29039192199707, "learning_rate": 2.167033296670333e-05, "loss": 0.5751, "step": 34000 }, { "epoch": 1.7248275172482752, "grad_norm": 21.69901466369629, "learning_rate": 2.1253707962537083e-05, "loss": 0.5833, "step": 34500 }, { "epoch": 1.7498250174982501, "grad_norm": 32.595794677734375, "learning_rate": 2.083708295837083e-05, "loss": 0.5895, "step": 35000 }, { "epoch": 1.774822517748225, "grad_norm": 42.687721252441406, "learning_rate": 2.042045795420458e-05, "loss": 0.5541, "step": 35500 }, { "epoch": 1.7998200179982002, "grad_norm": 16.474918365478516, "learning_rate": 2.000383295003833e-05, "loss": 0.5423, "step": 36000 }, { "epoch": 1.8248175182481752, "grad_norm": 13.296688079833984, "learning_rate": 1.958720794587208e-05, "loss": 0.5566, "step": 36500 }, { "epoch": 1.84981501849815, "grad_norm": 18.645790100097656, "learning_rate": 1.917058294170583e-05, "loss": 0.5493, "step": 37000 }, { "epoch": 1.8748125187481253, "grad_norm": 12.576258659362793, "learning_rate": 1.875395793753958e-05, "loss": 0.5602, "step": 37500 }, { "epoch": 1.8998100189981002, "grad_norm": 21.95449447631836, "learning_rate": 1.833733293337333e-05, "loss": 0.5878, "step": 38000 }, { "epoch": 1.9248075192480751, "grad_norm": 9.17590618133545, "learning_rate": 1.792070792920708e-05, "loss": 0.5681, "step": 38500 }, { "epoch": 1.9498050194980503, "grad_norm": 12.517435073852539, "learning_rate": 1.750408292504083e-05, "loss": 0.5464, "step": 39000 }, { "epoch": 1.9748025197480252, "grad_norm": 15.346318244934082, "learning_rate": 1.708745792087458e-05, "loss": 0.5917, "step": 39500 }, { "epoch": 1.9998000199980002, "grad_norm": 29.321331024169922, "learning_rate": 1.667083291670833e-05, "loss": 0.5443, "step": 40000 }, { "epoch": 2.0, "eval_f1": 0.7651638193675152, "eval_loss": 0.8535689115524292, "eval_runtime": 10.748, "eval_samples_per_second": 1860.991, "eval_steps_per_second": 232.694, "step": 40004 }, { "epoch": 2.0247975202479753, "grad_norm": 24.93914794921875, "learning_rate": 1.625420791254208e-05, "loss": 0.3501, "step": 40500 }, { "epoch": 2.04979502049795, "grad_norm": 50.30072784423828, "learning_rate": 1.5837582908375832e-05, "loss": 0.3785, "step": 41000 }, { "epoch": 2.074792520747925, "grad_norm": 24.169206619262695, "learning_rate": 1.5420957904209578e-05, "loss": 0.4034, "step": 41500 }, { "epoch": 2.0997900209979004, "grad_norm": 43.043338775634766, "learning_rate": 1.500433290004333e-05, "loss": 0.385, "step": 42000 }, { "epoch": 2.124787521247875, "grad_norm": 1.601791262626648, "learning_rate": 1.4587707895877079e-05, "loss": 0.3758, "step": 42500 }, { "epoch": 2.1497850214978502, "grad_norm": 1.0921714305877686, "learning_rate": 1.417108289171083e-05, "loss": 0.3713, "step": 43000 }, { "epoch": 2.1747825217478254, "grad_norm": 23.122596740722656, "learning_rate": 1.375445788754458e-05, "loss": 0.413, "step": 43500 }, { "epoch": 2.1997800219978, "grad_norm": 7.090549468994141, "learning_rate": 1.3337832883378329e-05, "loss": 0.3787, "step": 44000 }, { "epoch": 2.2247775222477753, "grad_norm": 17.668933868408203, "learning_rate": 1.292120787921208e-05, "loss": 0.3805, "step": 44500 }, { "epoch": 2.2497750224977504, "grad_norm": 15.878674507141113, "learning_rate": 1.2504582875045829e-05, "loss": 0.3757, "step": 45000 }, { "epoch": 2.274772522747725, "grad_norm": 39.11751937866211, "learning_rate": 1.2087957870879578e-05, "loss": 0.3887, "step": 45500 }, { "epoch": 2.2997700229977003, "grad_norm": 4.333780288696289, "learning_rate": 1.167133286671333e-05, "loss": 0.3789, "step": 46000 }, { "epoch": 2.324767523247675, "grad_norm": 21.4094295501709, "learning_rate": 1.1254707862547079e-05, "loss": 0.3742, "step": 46500 }, { "epoch": 2.34976502349765, "grad_norm": 14.586631774902344, "learning_rate": 1.083808285838083e-05, "loss": 0.3805, "step": 47000 }, { "epoch": 2.3747625237476253, "grad_norm": 1.1548786163330078, "learning_rate": 1.042145785421458e-05, "loss": 0.3936, "step": 47500 }, { "epoch": 2.3997600239976, "grad_norm": 0.03682245686650276, "learning_rate": 1.0004832850048329e-05, "loss": 0.38, "step": 48000 }, { "epoch": 2.424757524247575, "grad_norm": 35.44232940673828, "learning_rate": 9.588207845882078e-06, "loss": 0.3941, "step": 48500 }, { "epoch": 2.4497550244975503, "grad_norm": 8.77474594116211, "learning_rate": 9.171582841715828e-06, "loss": 0.4054, "step": 49000 }, { "epoch": 2.474752524747525, "grad_norm": 13.013947486877441, "learning_rate": 8.754957837549579e-06, "loss": 0.3659, "step": 49500 }, { "epoch": 2.4997500249975, "grad_norm": 14.281270980834961, "learning_rate": 8.338332833383328e-06, "loss": 0.3917, "step": 50000 }, { "epoch": 2.5247475252474754, "grad_norm": 24.0106258392334, "learning_rate": 7.92170782921708e-06, "loss": 0.3876, "step": 50500 }, { "epoch": 2.54974502549745, "grad_norm": 20.56169319152832, "learning_rate": 7.505082825050828e-06, "loss": 0.3628, "step": 51000 }, { "epoch": 2.5747425257474252, "grad_norm": 0.7545715570449829, "learning_rate": 7.088457820884578e-06, "loss": 0.3918, "step": 51500 }, { "epoch": 2.5997400259974004, "grad_norm": 17.21295928955078, "learning_rate": 6.6718328167183295e-06, "loss": 0.359, "step": 52000 }, { "epoch": 2.624737526247375, "grad_norm": 9.85009479522705, "learning_rate": 6.255207812552079e-06, "loss": 0.3634, "step": 52500 }, { "epoch": 2.6497350264973503, "grad_norm": 21.24859046936035, "learning_rate": 5.838582808385828e-06, "loss": 0.3737, "step": 53000 }, { "epoch": 2.6747325267473254, "grad_norm": 13.614805221557617, "learning_rate": 5.421957804219578e-06, "loss": 0.4022, "step": 53500 }, { "epoch": 2.6997300269973, "grad_norm": 5.028663158416748, "learning_rate": 5.005332800053329e-06, "loss": 0.3562, "step": 54000 }, { "epoch": 2.7247275272472753, "grad_norm": 22.341398239135742, "learning_rate": 4.588707795887078e-06, "loss": 0.349, "step": 54500 }, { "epoch": 2.7497250274972505, "grad_norm": 9.66286849975586, "learning_rate": 4.172082791720828e-06, "loss": 0.3573, "step": 55000 }, { "epoch": 2.774722527747225, "grad_norm": 4.927464962005615, "learning_rate": 3.755457787554578e-06, "loss": 0.335, "step": 55500 }, { "epoch": 2.7997200279972003, "grad_norm": 4.33281135559082, "learning_rate": 3.338832783388328e-06, "loss": 0.3679, "step": 56000 }, { "epoch": 2.8247175282471755, "grad_norm": 0.29482612013816833, "learning_rate": 2.9222077792220777e-06, "loss": 0.3266, "step": 56500 }, { "epoch": 2.84971502849715, "grad_norm": 21.363672256469727, "learning_rate": 2.505582775055828e-06, "loss": 0.3453, "step": 57000 }, { "epoch": 2.8747125287471254, "grad_norm": 2.6021454334259033, "learning_rate": 2.088957770889578e-06, "loss": 0.3682, "step": 57500 }, { "epoch": 2.8997100289971005, "grad_norm": 4.7911577224731445, "learning_rate": 1.6723327667233275e-06, "loss": 0.3417, "step": 58000 }, { "epoch": 2.924707529247075, "grad_norm": 0.21428282558918, "learning_rate": 1.2557077625570776e-06, "loss": 0.3192, "step": 58500 }, { "epoch": 2.9497050294970504, "grad_norm": 1.2091667652130127, "learning_rate": 8.390827583908276e-07, "loss": 0.3375, "step": 59000 }, { "epoch": 2.9747025297470255, "grad_norm": 24.199045181274414, "learning_rate": 4.2245775422457754e-07, "loss": 0.3669, "step": 59500 }, { "epoch": 2.9997000299970003, "grad_norm": 5.163976669311523, "learning_rate": 5.832750058327501e-09, "loss": 0.332, "step": 60000 }, { "epoch": 3.0, "eval_f1": 0.7708992224677207, "eval_loss": 0.9973717331886292, "eval_runtime": 11.045, "eval_samples_per_second": 1810.963, "eval_steps_per_second": 226.438, "step": 60006 }, { "epoch": 3.0, "step": 60006, "total_flos": 7908628105405440.0, "train_loss": 0.6686365460569141, "train_runtime": 1639.5865, "train_samples_per_second": 292.775, "train_steps_per_second": 36.598 } ], "logging_steps": 500, "max_steps": 60006, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7908628105405440.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }