{ "best_global_step": 1800, "best_metric": 0.00229549, "best_model_checkpoint": "/mnt/beegfs3/liying/zhangfanhao/output1125/v1-20251125-231025/checkpoint-1800", "epoch": 3.0354280894137493, "eval_steps": 100, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001687051876845213, "grad_norm": 0.5938383277366354, "learning_rate": 5.999998316002012e-06, "loss": 0.380859375, "step": 1 }, { "epoch": 0.008435259384226065, "grad_norm": 0.432332139447319, "learning_rate": 5.999957900144816e-06, "loss": 0.3326416015625, "step": 5 }, { "epoch": 0.01687051876845213, "grad_norm": 0.24245712798777588, "learning_rate": 5.99983160176086e-06, "loss": 0.2187744140625, "step": 10 }, { "epoch": 0.025305778152678194, "grad_norm": 0.14967602144842607, "learning_rate": 5.999621108392896e-06, "loss": 0.1771240234375, "step": 15 }, { "epoch": 0.03374103753690426, "grad_norm": 0.11965916268612647, "learning_rate": 5.9993264259487505e-06, "loss": 0.14423828125, "step": 20 }, { "epoch": 0.04217629692113033, "grad_norm": 0.09039362542123534, "learning_rate": 5.998947562699149e-06, "loss": 0.1184326171875, "step": 25 }, { "epoch": 0.05061155630535639, "grad_norm": 0.0761794885482189, "learning_rate": 5.998484529277483e-06, "loss": 0.108642578125, "step": 30 }, { "epoch": 0.059046815689582456, "grad_norm": 0.07436752367684027, "learning_rate": 5.997937338679513e-06, "loss": 0.09638671875, "step": 35 }, { "epoch": 0.06748207507380852, "grad_norm": 0.06054003854062884, "learning_rate": 5.997306006263003e-06, "loss": 0.1025146484375, "step": 40 }, { "epoch": 0.07591733445803459, "grad_norm": 0.05715450839425674, "learning_rate": 5.996590549747288e-06, "loss": 0.0909912109375, "step": 45 }, { "epoch": 0.08435259384226065, "grad_norm": 0.0562159873926997, "learning_rate": 5.995790989212777e-06, "loss": 0.0900390625, "step": 50 }, { "epoch": 0.09278785322648671, "grad_norm": 0.054733644360014155, "learning_rate": 5.994907347100393e-06, "loss": 0.08599853515625, "step": 55 }, { "epoch": 0.10122311261071278, "grad_norm": 0.04945430208391664, "learning_rate": 5.99393964821094e-06, "loss": 0.08861083984375, "step": 60 }, { "epoch": 0.10965837199493884, "grad_norm": 0.060375343186170424, "learning_rate": 5.992887919704406e-06, "loss": 0.08037109375, "step": 65 }, { "epoch": 0.11809363137916491, "grad_norm": 0.05113371142226039, "learning_rate": 5.991752191099203e-06, "loss": 0.07867431640625, "step": 70 }, { "epoch": 0.12652889076339097, "grad_norm": 0.060073186423122656, "learning_rate": 5.990532494271337e-06, "loss": 0.07816162109375, "step": 75 }, { "epoch": 0.13496415014761703, "grad_norm": 0.058832653609599356, "learning_rate": 5.989228863453515e-06, "loss": 0.08001708984375, "step": 80 }, { "epoch": 0.1433994095318431, "grad_norm": 0.06587176624760811, "learning_rate": 5.987841335234184e-06, "loss": 0.074359130859375, "step": 85 }, { "epoch": 0.15183466891606917, "grad_norm": 0.06491166432460505, "learning_rate": 5.9863699485565e-06, "loss": 0.0674072265625, "step": 90 }, { "epoch": 0.16026992830029524, "grad_norm": 0.056198676389375694, "learning_rate": 5.984814744717241e-06, "loss": 0.0659912109375, "step": 95 }, { "epoch": 0.1687051876845213, "grad_norm": 0.0673764252680421, "learning_rate": 5.983175767365646e-06, "loss": 0.063623046875, "step": 100 }, { "epoch": 0.1687051876845213, "eval_loss": 0.0162808820605278, "eval_margin": -0.020074697267714766, "eval_mean_neg": 0.6548054814338684, "eval_mean_pos": 0.8441178202629089, "eval_runtime": 367.938, "eval_samples_per_second": 21.713, "eval_steps_per_second": 0.34, "step": 100 }, { "epoch": 0.17714044706874738, "grad_norm": 0.06055978762872105, "learning_rate": 5.981453062502185e-06, "loss": 0.060498046875, "step": 105 }, { "epoch": 0.18557570645297342, "grad_norm": 0.06213709235940642, "learning_rate": 5.979646678477277e-06, "loss": 0.056640625, "step": 110 }, { "epoch": 0.19401096583719948, "grad_norm": 0.0659729457413995, "learning_rate": 5.977756665989925e-06, "loss": 0.05919189453125, "step": 115 }, { "epoch": 0.20244622522142555, "grad_norm": 0.05897713608413389, "learning_rate": 5.9757830780862985e-06, "loss": 0.0628662109375, "step": 120 }, { "epoch": 0.21088148460565162, "grad_norm": 0.05642517065149083, "learning_rate": 5.973725970158239e-06, "loss": 0.05245361328125, "step": 125 }, { "epoch": 0.2193167439898777, "grad_norm": 0.06579611075607034, "learning_rate": 5.9715853999417115e-06, "loss": 0.05848388671875, "step": 130 }, { "epoch": 0.22775200337410376, "grad_norm": 0.07448489445734133, "learning_rate": 5.969361427515179e-06, "loss": 0.0573974609375, "step": 135 }, { "epoch": 0.23618726275832982, "grad_norm": 0.05975086799089143, "learning_rate": 5.9670541152979215e-06, "loss": 0.05091552734375, "step": 140 }, { "epoch": 0.2446225221425559, "grad_norm": 0.06504159374670346, "learning_rate": 5.964663528048276e-06, "loss": 0.047943115234375, "step": 145 }, { "epoch": 0.25305778152678193, "grad_norm": 0.060584307441235295, "learning_rate": 5.96218973286183e-06, "loss": 0.0493896484375, "step": 150 }, { "epoch": 0.261493040911008, "grad_norm": 0.06234398910972033, "learning_rate": 5.959632799169529e-06, "loss": 0.04854736328125, "step": 155 }, { "epoch": 0.26992830029523407, "grad_norm": 0.07326440644425879, "learning_rate": 5.9569927987357305e-06, "loss": 0.0443359375, "step": 160 }, { "epoch": 0.27836355967946014, "grad_norm": 0.059873291821439245, "learning_rate": 5.954269805656194e-06, "loss": 0.04698486328125, "step": 165 }, { "epoch": 0.2867988190636862, "grad_norm": 0.07214278446872342, "learning_rate": 5.951463896355993e-06, "loss": 0.0474639892578125, "step": 170 }, { "epoch": 0.2952340784479123, "grad_norm": 0.06530184393433881, "learning_rate": 5.94857514958738e-06, "loss": 0.043914794921875, "step": 175 }, { "epoch": 0.30366933783213834, "grad_norm": 0.06214586771199744, "learning_rate": 5.945603646427567e-06, "loss": 0.043475341796875, "step": 180 }, { "epoch": 0.3121045972163644, "grad_norm": 0.06764874450241058, "learning_rate": 5.9425494702764575e-06, "loss": 0.04755859375, "step": 185 }, { "epoch": 0.3205398566005905, "grad_norm": 0.06523200399348678, "learning_rate": 5.939412706854299e-06, "loss": 0.044635009765625, "step": 190 }, { "epoch": 0.32897511598481655, "grad_norm": 0.060102318432770876, "learning_rate": 5.9361934441992835e-06, "loss": 0.042364501953125, "step": 195 }, { "epoch": 0.3374103753690426, "grad_norm": 0.06678207500644712, "learning_rate": 5.9328917726650706e-06, "loss": 0.04183349609375, "step": 200 }, { "epoch": 0.3374103753690426, "eval_loss": 0.010425936430692673, "eval_margin": -0.016463442112229044, "eval_mean_neg": 0.5932909250259399, "eval_mean_pos": 0.8196097016334534, "eval_runtime": 365.6666, "eval_samples_per_second": 21.848, "eval_steps_per_second": 0.342, "step": 200 }, { "epoch": 0.3458456347532687, "grad_norm": 0.05626492604909855, "learning_rate": 5.929507784918257e-06, "loss": 0.040447998046875, "step": 205 }, { "epoch": 0.35428089413749475, "grad_norm": 0.054176681030320105, "learning_rate": 5.926041575935772e-06, "loss": 0.037396240234375, "step": 210 }, { "epoch": 0.3627161535217208, "grad_norm": 0.06953999336709471, "learning_rate": 5.922493243002212e-06, "loss": 0.042828369140625, "step": 215 }, { "epoch": 0.37115141290594683, "grad_norm": 0.05589129508252642, "learning_rate": 5.918862885707113e-06, "loss": 0.034979248046875, "step": 220 }, { "epoch": 0.3795866722901729, "grad_norm": 0.07078214617147234, "learning_rate": 5.915150605942153e-06, "loss": 0.035723876953125, "step": 225 }, { "epoch": 0.38802193167439897, "grad_norm": 0.06815732953530805, "learning_rate": 5.911356507898291e-06, "loss": 0.041973876953125, "step": 230 }, { "epoch": 0.39645719105862504, "grad_norm": 0.062472935047014386, "learning_rate": 5.907480698062848e-06, "loss": 0.0356689453125, "step": 235 }, { "epoch": 0.4048924504428511, "grad_norm": 0.06665756879409568, "learning_rate": 5.90352328521651e-06, "loss": 0.036456298828125, "step": 240 }, { "epoch": 0.41332770982707717, "grad_norm": 0.06681598226193439, "learning_rate": 5.899484380430284e-06, "loss": 0.0343994140625, "step": 245 }, { "epoch": 0.42176296921130324, "grad_norm": 0.060740413400477374, "learning_rate": 5.895364097062374e-06, "loss": 0.0318511962890625, "step": 250 }, { "epoch": 0.4301982285955293, "grad_norm": 0.06545743307605277, "learning_rate": 5.8911625507550015e-06, "loss": 0.034765625, "step": 255 }, { "epoch": 0.4386334879797554, "grad_norm": 0.0673664786591912, "learning_rate": 5.88687985943116e-06, "loss": 0.03580322265625, "step": 260 }, { "epoch": 0.44706874736398144, "grad_norm": 0.06937993663032453, "learning_rate": 5.882516143291308e-06, "loss": 0.036236572265625, "step": 265 }, { "epoch": 0.4555040067482075, "grad_norm": 0.0639250177544625, "learning_rate": 5.878071524809988e-06, "loss": 0.0317962646484375, "step": 270 }, { "epoch": 0.4639392661324336, "grad_norm": 0.06037822600018219, "learning_rate": 5.873546128732399e-06, "loss": 0.0323699951171875, "step": 275 }, { "epoch": 0.47237452551665965, "grad_norm": 0.060357976056049485, "learning_rate": 5.868940082070885e-06, "loss": 0.033660888671875, "step": 280 }, { "epoch": 0.4808097849008857, "grad_norm": 0.061037172126093234, "learning_rate": 5.8642535141013785e-06, "loss": 0.0297515869140625, "step": 285 }, { "epoch": 0.4892450442851118, "grad_norm": 0.0524126813526148, "learning_rate": 5.859486556359768e-06, "loss": 0.028472900390625, "step": 290 }, { "epoch": 0.49768030366933785, "grad_norm": 0.062026009465912704, "learning_rate": 5.854639342638208e-06, "loss": 0.030718994140625, "step": 295 }, { "epoch": 0.5061155630535639, "grad_norm": 0.05866098788599579, "learning_rate": 5.849712008981361e-06, "loss": 0.032916259765625, "step": 300 }, { "epoch": 0.5061155630535639, "eval_loss": 0.007868120446801186, "eval_margin": -0.013977996595654517, "eval_mean_neg": 0.5548827648162842, "eval_mean_pos": 0.793705403804779, "eval_runtime": 364.6437, "eval_samples_per_second": 21.909, "eval_steps_per_second": 0.343, "step": 300 }, { "epoch": 0.5145508224377899, "grad_norm": 0.05968132039231295, "learning_rate": 5.844704693682583e-06, "loss": 0.0292724609375, "step": 305 }, { "epoch": 0.522986081822016, "grad_norm": 0.06038138238675174, "learning_rate": 5.8396175372800405e-06, "loss": 0.030743408203125, "step": 310 }, { "epoch": 0.5314213412062421, "grad_norm": 0.06052295196543659, "learning_rate": 5.834450682552765e-06, "loss": 0.030194091796875, "step": 315 }, { "epoch": 0.5398566005904681, "grad_norm": 0.05539528727202974, "learning_rate": 5.829204274516648e-06, "loss": 0.0312774658203125, "step": 320 }, { "epoch": 0.5482918599746942, "grad_norm": 0.052508369724972796, "learning_rate": 5.823878460420366e-06, "loss": 0.0295318603515625, "step": 325 }, { "epoch": 0.5567271193589203, "grad_norm": 0.05151880865825463, "learning_rate": 5.8184733897412565e-06, "loss": 0.028912353515625, "step": 330 }, { "epoch": 0.5651623787431463, "grad_norm": 0.0624220665428448, "learning_rate": 5.812989214181113e-06, "loss": 0.027313232421875, "step": 335 }, { "epoch": 0.5735976381273724, "grad_norm": 0.06481057308539884, "learning_rate": 5.807426087661934e-06, "loss": 0.02608642578125, "step": 340 }, { "epoch": 0.5820328975115985, "grad_norm": 0.06109467057046473, "learning_rate": 5.8017841663216e-06, "loss": 0.0282989501953125, "step": 345 }, { "epoch": 0.5904681568958245, "grad_norm": 0.062107444796084835, "learning_rate": 5.796063608509493e-06, "loss": 0.0277069091796875, "step": 350 }, { "epoch": 0.5989034162800506, "grad_norm": 0.0552072139581444, "learning_rate": 5.7902645747820485e-06, "loss": 0.028399658203125, "step": 355 }, { "epoch": 0.6073386756642767, "grad_norm": 0.06047980839414296, "learning_rate": 5.784387227898254e-06, "loss": 0.0281524658203125, "step": 360 }, { "epoch": 0.6157739350485028, "grad_norm": 0.05336288606895412, "learning_rate": 5.778431732815078e-06, "loss": 0.02484130859375, "step": 365 }, { "epoch": 0.6242091944327288, "grad_norm": 0.060745200996401724, "learning_rate": 5.77239825668284e-06, "loss": 0.02640380859375, "step": 370 }, { "epoch": 0.6326444538169549, "grad_norm": 0.048268694566304324, "learning_rate": 5.766286968840522e-06, "loss": 0.0278717041015625, "step": 375 }, { "epoch": 0.641079713201181, "grad_norm": 0.05424806603710711, "learning_rate": 5.760098040811012e-06, "loss": 0.0271453857421875, "step": 380 }, { "epoch": 0.649514972585407, "grad_norm": 0.054535443289609395, "learning_rate": 5.7538316462962935e-06, "loss": 0.026611328125, "step": 385 }, { "epoch": 0.6579502319696331, "grad_norm": 0.06967389025087475, "learning_rate": 5.7474879611725655e-06, "loss": 0.02589111328125, "step": 390 }, { "epoch": 0.6663854913538592, "grad_norm": 0.06024092137696802, "learning_rate": 5.741067163485314e-06, "loss": 0.0193756103515625, "step": 395 }, { "epoch": 0.6748207507380852, "grad_norm": 0.05981804001044263, "learning_rate": 5.7345694334443066e-06, "loss": 0.0205718994140625, "step": 400 }, { "epoch": 0.6748207507380852, "eval_loss": 0.006414474919438362, "eval_margin": -0.013447051244457402, "eval_mean_neg": 0.5324161052703857, "eval_mean_pos": 0.7909372448921204, "eval_runtime": 365.0145, "eval_samples_per_second": 21.887, "eval_steps_per_second": 0.342, "step": 400 }, { "epoch": 0.6832560101223113, "grad_norm": 0.06692561927901217, "learning_rate": 5.727994953418538e-06, "loss": 0.022021484375, "step": 405 }, { "epoch": 0.6916912695065374, "grad_norm": 0.06609269963808409, "learning_rate": 5.721343907931114e-06, "loss": 0.02950592041015625, "step": 410 }, { "epoch": 0.7001265288907634, "grad_norm": 0.052617111357424175, "learning_rate": 5.71461648365407e-06, "loss": 0.025189208984375, "step": 415 }, { "epoch": 0.7085617882749895, "grad_norm": 0.04860971480260525, "learning_rate": 5.707812869403128e-06, "loss": 0.022052001953125, "step": 420 }, { "epoch": 0.7169970476592156, "grad_norm": 0.06030454097987917, "learning_rate": 5.7009332561324085e-06, "loss": 0.0219390869140625, "step": 425 }, { "epoch": 0.7254323070434416, "grad_norm": 0.06837586048390999, "learning_rate": 5.693977836929057e-06, "loss": 0.0270172119140625, "step": 430 }, { "epoch": 0.7338675664276677, "grad_norm": 0.05197492190608033, "learning_rate": 5.686946807007834e-06, "loss": 0.02206878662109375, "step": 435 }, { "epoch": 0.7423028258118937, "grad_norm": 0.06128713786873146, "learning_rate": 5.679840363705637e-06, "loss": 0.0244720458984375, "step": 440 }, { "epoch": 0.7507380851961197, "grad_norm": 0.0629198604819534, "learning_rate": 5.672658706475953e-06, "loss": 0.0194488525390625, "step": 445 }, { "epoch": 0.7591733445803458, "grad_norm": 0.05502172045134509, "learning_rate": 5.665402036883267e-06, "loss": 0.0225250244140625, "step": 450 }, { "epoch": 0.7676086039645719, "grad_norm": 0.06119000768724386, "learning_rate": 5.658070558597408e-06, "loss": 0.01928558349609375, "step": 455 }, { "epoch": 0.7760438633487979, "grad_norm": 0.058834092769235756, "learning_rate": 5.650664477387824e-06, "loss": 0.02149658203125, "step": 460 }, { "epoch": 0.784479122733024, "grad_norm": 0.06942758384696321, "learning_rate": 5.643184001117811e-06, "loss": 0.0266326904296875, "step": 465 }, { "epoch": 0.7929143821172501, "grad_norm": 0.05395397336586372, "learning_rate": 5.6356293397386836e-06, "loss": 0.0206085205078125, "step": 470 }, { "epoch": 0.8013496415014761, "grad_norm": 0.057301086470950384, "learning_rate": 5.628000705283873e-06, "loss": 0.021770477294921875, "step": 475 }, { "epoch": 0.8097849008857022, "grad_norm": 0.058618795566843934, "learning_rate": 5.620298311862985e-06, "loss": 0.0174072265625, "step": 480 }, { "epoch": 0.8182201602699283, "grad_norm": 0.053997897902853975, "learning_rate": 5.612522375655783e-06, "loss": 0.0246124267578125, "step": 485 }, { "epoch": 0.8266554196541543, "grad_norm": 0.058293384553658546, "learning_rate": 5.604673114906126e-06, "loss": 0.0239288330078125, "step": 490 }, { "epoch": 0.8350906790383804, "grad_norm": 0.062099166751088966, "learning_rate": 5.596750749915842e-06, "loss": 0.023724365234375, "step": 495 }, { "epoch": 0.8435259384226065, "grad_norm": 0.0518337334475497, "learning_rate": 5.588755503038543e-06, "loss": 0.01995849609375, "step": 500 }, { "epoch": 0.8435259384226065, "eval_loss": 0.00542406877502799, "eval_margin": -0.010786364688688228, "eval_mean_neg": 0.5346763134002686, "eval_mean_pos": 0.7906754016876221, "eval_runtime": 367.5462, "eval_samples_per_second": 21.736, "eval_steps_per_second": 0.34, "step": 500 }, { "epoch": 0.8519611978068325, "grad_norm": 0.0579368996460804, "learning_rate": 5.580687598673387e-06, "loss": 0.02121734619140625, "step": 505 }, { "epoch": 0.8603964571910586, "grad_norm": 0.05214070956939639, "learning_rate": 5.572547263258776e-06, "loss": 0.0197113037109375, "step": 510 }, { "epoch": 0.8688317165752847, "grad_norm": 0.0687906199565583, "learning_rate": 5.564334725266006e-06, "loss": 0.0217254638671875, "step": 515 }, { "epoch": 0.8772669759595108, "grad_norm": 0.051621267659708626, "learning_rate": 5.55605021519285e-06, "loss": 0.019158935546875, "step": 520 }, { "epoch": 0.8857022353437368, "grad_norm": 0.05599957001213385, "learning_rate": 5.547693965557092e-06, "loss": 0.0195770263671875, "step": 525 }, { "epoch": 0.8941374947279629, "grad_norm": 0.06034671456944424, "learning_rate": 5.539266210889997e-06, "loss": 0.0231231689453125, "step": 530 }, { "epoch": 0.902572754112189, "grad_norm": 0.04518349407201743, "learning_rate": 5.5307671877297326e-06, "loss": 0.0208709716796875, "step": 535 }, { "epoch": 0.911008013496415, "grad_norm": 0.0503478793140038, "learning_rate": 5.522197134614728e-06, "loss": 0.0209930419921875, "step": 540 }, { "epoch": 0.9194432728806411, "grad_norm": 0.046047217532892024, "learning_rate": 5.513556292076981e-06, "loss": 0.0175750732421875, "step": 545 }, { "epoch": 0.9278785322648672, "grad_norm": 0.05575253759567789, "learning_rate": 5.504844902635303e-06, "loss": 0.0171112060546875, "step": 550 }, { "epoch": 0.9363137916490932, "grad_norm": 0.04687503220455111, "learning_rate": 5.496063210788519e-06, "loss": 0.0167633056640625, "step": 555 }, { "epoch": 0.9447490510333193, "grad_norm": 0.04891593875536363, "learning_rate": 5.487211463008597e-06, "loss": 0.019036865234375, "step": 560 }, { "epoch": 0.9531843104175454, "grad_norm": 0.04841249311058062, "learning_rate": 5.478289907733738e-06, "loss": 0.01807098388671875, "step": 565 }, { "epoch": 0.9616195698017714, "grad_norm": 0.060373651634708765, "learning_rate": 5.469298795361397e-06, "loss": 0.015673828125, "step": 570 }, { "epoch": 0.9700548291859975, "grad_norm": 0.051868174671481436, "learning_rate": 5.460238378241262e-06, "loss": 0.01802978515625, "step": 575 }, { "epoch": 0.9784900885702236, "grad_norm": 0.051146316151485995, "learning_rate": 5.451108910668163e-06, "loss": 0.01664581298828125, "step": 580 }, { "epoch": 0.9869253479544496, "grad_norm": 0.04017649470362814, "learning_rate": 5.441910648874945e-06, "loss": 0.016483306884765625, "step": 585 }, { "epoch": 0.9953606073386757, "grad_norm": 0.04457228909606784, "learning_rate": 5.4326438510252655e-06, "loss": 0.0192718505859375, "step": 590 }, { "epoch": 1.0033741037536905, "grad_norm": 0.053005736672298354, "learning_rate": 5.423308777206357e-06, "loss": 0.015604400634765625, "step": 595 }, { "epoch": 1.0118093631379166, "grad_norm": 0.05754347568157857, "learning_rate": 5.413905689421722e-06, "loss": 0.0159515380859375, "step": 600 }, { "epoch": 1.0118093631379166, "eval_loss": 0.004593910649418831, "eval_margin": -0.011103880922159842, "eval_mean_neg": 0.5064941644668579, "eval_mean_pos": 0.7831713557243347, "eval_runtime": 364.5429, "eval_samples_per_second": 21.915, "eval_steps_per_second": 0.343, "step": 600 }, { "epoch": 1.0202446225221427, "grad_norm": 0.04962686662442784, "learning_rate": 5.404434851583785e-06, "loss": 0.01360015869140625, "step": 605 }, { "epoch": 1.0286798819063687, "grad_norm": 0.046926535788142015, "learning_rate": 5.394896529506479e-06, "loss": 0.01566925048828125, "step": 610 }, { "epoch": 1.0371151412905948, "grad_norm": 0.043042108440633, "learning_rate": 5.38529099089779e-06, "loss": 0.0128326416015625, "step": 615 }, { "epoch": 1.0455504006748209, "grad_norm": 0.049749099013614635, "learning_rate": 5.375618505352241e-06, "loss": 0.0136383056640625, "step": 620 }, { "epoch": 1.053985660059047, "grad_norm": 0.04945151693616336, "learning_rate": 5.365879344343326e-06, "loss": 0.01544036865234375, "step": 625 }, { "epoch": 1.062420919443273, "grad_norm": 0.04626935309793636, "learning_rate": 5.35607378121589e-06, "loss": 0.0143829345703125, "step": 630 }, { "epoch": 1.070856178827499, "grad_norm": 0.04580735975264899, "learning_rate": 5.346202091178459e-06, "loss": 0.014122772216796874, "step": 635 }, { "epoch": 1.079291438211725, "grad_norm": 0.046216725385350446, "learning_rate": 5.336264551295512e-06, "loss": 0.014672088623046874, "step": 640 }, { "epoch": 1.087726697595951, "grad_norm": 0.04564303944680029, "learning_rate": 5.326261440479709e-06, "loss": 0.0136993408203125, "step": 645 }, { "epoch": 1.096161956980177, "grad_norm": 0.05114495970312972, "learning_rate": 5.316193039484063e-06, "loss": 0.0147705078125, "step": 650 }, { "epoch": 1.1045972163644031, "grad_norm": 0.044105955284847585, "learning_rate": 5.306059630894056e-06, "loss": 0.015480804443359374, "step": 655 }, { "epoch": 1.1130324757486292, "grad_norm": 0.045182200484827885, "learning_rate": 5.295861499119711e-06, "loss": 0.013404083251953126, "step": 660 }, { "epoch": 1.1214677351328552, "grad_norm": 0.04324759296793784, "learning_rate": 5.2855989303876065e-06, "loss": 0.01672821044921875, "step": 665 }, { "epoch": 1.1299029945170813, "grad_norm": 0.03328038907845692, "learning_rate": 5.275272212732849e-06, "loss": 0.01335906982421875, "step": 670 }, { "epoch": 1.1383382539013074, "grad_norm": 0.044225327184826406, "learning_rate": 5.264881635990984e-06, "loss": 0.012935638427734375, "step": 675 }, { "epoch": 1.1467735132855335, "grad_norm": 0.04645591264342837, "learning_rate": 5.2544274917898615e-06, "loss": 0.01385498046875, "step": 680 }, { "epoch": 1.1552087726697595, "grad_norm": 0.05453216622664439, "learning_rate": 5.243910073541454e-06, "loss": 0.016290283203125, "step": 685 }, { "epoch": 1.1636440320539856, "grad_norm": 0.057731965028177075, "learning_rate": 5.233329676433617e-06, "loss": 0.0145355224609375, "step": 690 }, { "epoch": 1.1720792914382117, "grad_norm": 0.05145183297720149, "learning_rate": 5.222686597421808e-06, "loss": 0.01390838623046875, "step": 695 }, { "epoch": 1.1805145508224377, "grad_norm": 0.04021056012812571, "learning_rate": 5.211981135220751e-06, "loss": 0.01344757080078125, "step": 700 }, { "epoch": 1.1805145508224377, "eval_loss": 0.004191060084849596, "eval_margin": -0.010819014589933137, "eval_mean_neg": 0.4851545989513397, "eval_mean_pos": 0.7733471989631653, "eval_runtime": 359.0481, "eval_samples_per_second": 22.251, "eval_steps_per_second": 0.348, "step": 700 }, { "epoch": 1.1889498102066638, "grad_norm": 0.0416204676277527, "learning_rate": 5.201213590296052e-06, "loss": 0.014748382568359374, "step": 705 }, { "epoch": 1.1973850695908899, "grad_norm": 0.05633713089091016, "learning_rate": 5.190384264855764e-06, "loss": 0.014013671875, "step": 710 }, { "epoch": 1.205820328975116, "grad_norm": 0.05143948467095745, "learning_rate": 5.1794934628419104e-06, "loss": 0.015460205078125, "step": 715 }, { "epoch": 1.214255588359342, "grad_norm": 0.05227911954680101, "learning_rate": 5.168541489921949e-06, "loss": 0.01507415771484375, "step": 720 }, { "epoch": 1.222690847743568, "grad_norm": 0.058608960783147375, "learning_rate": 5.1575286534801955e-06, "loss": 0.01417236328125, "step": 725 }, { "epoch": 1.2311261071277941, "grad_norm": 0.04818858161693878, "learning_rate": 5.146455262609197e-06, "loss": 0.013425445556640625, "step": 730 }, { "epoch": 1.2395613665120202, "grad_norm": 0.05406749848988645, "learning_rate": 5.1353216281010535e-06, "loss": 0.013022613525390626, "step": 735 }, { "epoch": 1.2479966258962463, "grad_norm": 0.044408669007062154, "learning_rate": 5.1241280624387e-06, "loss": 0.01393585205078125, "step": 740 }, { "epoch": 1.2564318852804723, "grad_norm": 0.04519048638967848, "learning_rate": 5.1128748797871314e-06, "loss": 0.013826751708984375, "step": 745 }, { "epoch": 1.2648671446646984, "grad_norm": 0.0491460974626283, "learning_rate": 5.101562395984587e-06, "loss": 0.01336212158203125, "step": 750 }, { "epoch": 1.2733024040489245, "grad_norm": 0.04356609182045035, "learning_rate": 5.090190928533689e-06, "loss": 0.01492156982421875, "step": 755 }, { "epoch": 1.2817376634331505, "grad_norm": 0.03556136795064142, "learning_rate": 5.078760796592524e-06, "loss": 0.0125732421875, "step": 760 }, { "epoch": 1.2901729228173766, "grad_norm": 0.04189977738590891, "learning_rate": 5.067272320965692e-06, "loss": 0.0149322509765625, "step": 765 }, { "epoch": 1.2986081822016027, "grad_norm": 0.05051201336701144, "learning_rate": 5.055725824095301e-06, "loss": 0.01419525146484375, "step": 770 }, { "epoch": 1.3070434415858287, "grad_norm": 0.0416942039130722, "learning_rate": 5.0441216300519126e-06, "loss": 0.01274261474609375, "step": 775 }, { "epoch": 1.3154787009700548, "grad_norm": 0.04629875001130603, "learning_rate": 5.032460064525455e-06, "loss": 0.01363525390625, "step": 780 }, { "epoch": 1.3239139603542809, "grad_norm": 0.03704688355237128, "learning_rate": 5.020741454816074e-06, "loss": 0.01301422119140625, "step": 785 }, { "epoch": 1.332349219738507, "grad_norm": 0.03742406408262459, "learning_rate": 5.00896612982495e-06, "loss": 0.01353302001953125, "step": 790 }, { "epoch": 1.340784479122733, "grad_norm": 0.050480726423335516, "learning_rate": 4.99713442004507e-06, "loss": 0.01196746826171875, "step": 795 }, { "epoch": 1.349219738506959, "grad_norm": 0.03808846024736694, "learning_rate": 4.985246657551943e-06, "loss": 0.0110015869140625, "step": 800 }, { "epoch": 1.349219738506959, "eval_loss": 0.003908403683453798, "eval_margin": -0.010123856463319352, "eval_mean_neg": 0.49688851833343506, "eval_mean_pos": 0.7784863114356995, "eval_runtime": 364.0137, "eval_samples_per_second": 21.947, "eval_steps_per_second": 0.343, "step": 800 }, { "epoch": 1.3576549978911852, "grad_norm": 0.04637758927467518, "learning_rate": 4.973303175994289e-06, "loss": 0.013458251953125, "step": 805 }, { "epoch": 1.3660902572754112, "grad_norm": 0.05066098296531039, "learning_rate": 4.961304310584674e-06, "loss": 0.01515960693359375, "step": 810 }, { "epoch": 1.3745255166596373, "grad_norm": 0.038530384714911596, "learning_rate": 4.949250398090092e-06, "loss": 0.011260223388671876, "step": 815 }, { "epoch": 1.3829607760438634, "grad_norm": 0.040188601844867354, "learning_rate": 4.937141776822525e-06, "loss": 0.0158447265625, "step": 820 }, { "epoch": 1.3913960354280894, "grad_norm": 0.03574613677300634, "learning_rate": 4.92497878662944e-06, "loss": 0.011143875122070313, "step": 825 }, { "epoch": 1.3998312948123155, "grad_norm": 0.05019423126073816, "learning_rate": 4.912761768884255e-06, "loss": 0.01179351806640625, "step": 830 }, { "epoch": 1.4082665541965416, "grad_norm": 0.04311116805857567, "learning_rate": 4.9004910664767545e-06, "loss": 0.01372833251953125, "step": 835 }, { "epoch": 1.4167018135807676, "grad_norm": 0.04928580588462512, "learning_rate": 4.888167023803468e-06, "loss": 0.01297607421875, "step": 840 }, { "epoch": 1.4251370729649937, "grad_norm": 0.054968450905918724, "learning_rate": 4.8757899867580046e-06, "loss": 0.014654541015625, "step": 845 }, { "epoch": 1.4335723323492198, "grad_norm": 0.050366347428194534, "learning_rate": 4.86336030272134e-06, "loss": 0.011295318603515625, "step": 850 }, { "epoch": 1.4420075917334458, "grad_norm": 0.05107215089989217, "learning_rate": 4.850878320552076e-06, "loss": 0.01334228515625, "step": 855 }, { "epoch": 1.450442851117672, "grad_norm": 0.0391963683003482, "learning_rate": 4.838344390576638e-06, "loss": 0.01104736328125, "step": 860 }, { "epoch": 1.458878110501898, "grad_norm": 0.03985676744245212, "learning_rate": 4.825758864579452e-06, "loss": 0.013307952880859375, "step": 865 }, { "epoch": 1.467313369886124, "grad_norm": 0.04852757651119817, "learning_rate": 4.813122095793066e-06, "loss": 0.014328384399414062, "step": 870 }, { "epoch": 1.47574862927035, "grad_norm": 0.0454254941425111, "learning_rate": 4.800434438888235e-06, "loss": 0.012960052490234375, "step": 875 }, { "epoch": 1.4841838886545762, "grad_norm": 0.03868230007157653, "learning_rate": 4.787696249963974e-06, "loss": 0.01402740478515625, "step": 880 }, { "epoch": 1.4926191480388022, "grad_norm": 0.05289135869423979, "learning_rate": 4.774907886537553e-06, "loss": 0.013831901550292968, "step": 885 }, { "epoch": 1.501054407423028, "grad_norm": 0.04594308680556284, "learning_rate": 4.7620697075344736e-06, "loss": 0.012446975708007813, "step": 890 }, { "epoch": 1.5094896668072542, "grad_norm": 0.048917845490978454, "learning_rate": 4.7491820732783866e-06, "loss": 0.011295318603515625, "step": 895 }, { "epoch": 1.5179249261914802, "grad_norm": 0.043266255463378436, "learning_rate": 4.73624534548098e-06, "loss": 0.01407012939453125, "step": 900 }, { "epoch": 1.5179249261914802, "eval_loss": 0.0036048581823706627, "eval_margin": -0.009617562525935711, "eval_mean_neg": 0.4904225468635559, "eval_mean_pos": 0.7793014049530029, "eval_runtime": 363.5397, "eval_samples_per_second": 21.976, "eval_steps_per_second": 0.344, "step": 900 }, { "epoch": 1.5263601855757063, "grad_norm": 0.04363576408467007, "learning_rate": 4.723259887231835e-06, "loss": 0.0138519287109375, "step": 905 }, { "epoch": 1.5347954449599324, "grad_norm": 0.04199459687850267, "learning_rate": 4.710226062988223e-06, "loss": 0.01312255859375, "step": 910 }, { "epoch": 1.5432307043441584, "grad_norm": 0.047436231412077354, "learning_rate": 4.697144238564889e-06, "loss": 0.01208648681640625, "step": 915 }, { "epoch": 1.5516659637283845, "grad_norm": 0.04092453404900873, "learning_rate": 4.684014781123775e-06, "loss": 0.012505340576171874, "step": 920 }, { "epoch": 1.5601012231126106, "grad_norm": 0.045645370405214956, "learning_rate": 4.6708380591637166e-06, "loss": 0.0120208740234375, "step": 925 }, { "epoch": 1.5685364824968366, "grad_norm": 0.04911154284719614, "learning_rate": 4.6576144425101076e-06, "loss": 0.013311767578125, "step": 930 }, { "epoch": 1.5769717418810627, "grad_norm": 0.045881762593597546, "learning_rate": 4.64434430230451e-06, "loss": 0.012969207763671876, "step": 935 }, { "epoch": 1.5854070012652888, "grad_norm": 0.04728445094523914, "learning_rate": 4.631028010994245e-06, "loss": 0.01099395751953125, "step": 940 }, { "epoch": 1.5938422606495148, "grad_norm": 0.03903116673162643, "learning_rate": 4.617665942321937e-06, "loss": 0.0129608154296875, "step": 945 }, { "epoch": 1.602277520033741, "grad_norm": 0.040499425484585065, "learning_rate": 4.6042584713150225e-06, "loss": 0.009827423095703124, "step": 950 }, { "epoch": 1.610712779417967, "grad_norm": 0.047017092872005554, "learning_rate": 4.590805974275228e-06, "loss": 0.01045989990234375, "step": 955 }, { "epoch": 1.619148038802193, "grad_norm": 0.03869016761931018, "learning_rate": 4.577308828768005e-06, "loss": 0.011346435546875, "step": 960 }, { "epoch": 1.6275832981864191, "grad_norm": 0.05726216064413269, "learning_rate": 4.563767413611932e-06, "loss": 0.01296844482421875, "step": 965 }, { "epoch": 1.6360185575706452, "grad_norm": 0.034971593802495975, "learning_rate": 4.550182108868089e-06, "loss": 0.01379852294921875, "step": 970 }, { "epoch": 1.6444538169548713, "grad_norm": 0.04877425067250454, "learning_rate": 4.536553295829384e-06, "loss": 0.012924957275390624, "step": 975 }, { "epoch": 1.6528890763390973, "grad_norm": 0.03927648322180213, "learning_rate": 4.522881357009853e-06, "loss": 0.01293792724609375, "step": 980 }, { "epoch": 1.6613243357233234, "grad_norm": 0.024976847462424127, "learning_rate": 4.5091666761339275e-06, "loss": 0.009877777099609375, "step": 985 }, { "epoch": 1.6697595951075495, "grad_norm": 0.03945379802090875, "learning_rate": 4.495409638125657e-06, "loss": 0.01130523681640625, "step": 990 }, { "epoch": 1.6781948544917755, "grad_norm": 0.03430320161614481, "learning_rate": 4.481610629097917e-06, "loss": 0.009923553466796875, "step": 995 }, { "epoch": 1.6866301138760016, "grad_norm": 0.03895065600017937, "learning_rate": 4.46777003634156e-06, "loss": 0.01330413818359375, "step": 1000 }, { "epoch": 1.6866301138760016, "eval_loss": 0.0034073551651090384, "eval_margin": -0.009528953300398444, "eval_mean_neg": 0.4926661550998688, "eval_mean_pos": 0.7842009663581848, "eval_runtime": 367.0219, "eval_samples_per_second": 21.767, "eval_steps_per_second": 0.341, "step": 1000 }, { "epoch": 1.6950653732602277, "grad_norm": 0.04302786223265218, "learning_rate": 4.453888248314553e-06, "loss": 0.01107330322265625, "step": 1005 }, { "epoch": 1.7035006326444537, "grad_norm": 0.04002206909489744, "learning_rate": 4.439965654631073e-06, "loss": 0.0105499267578125, "step": 1010 }, { "epoch": 1.7119358920286798, "grad_norm": 0.04439497813433074, "learning_rate": 4.426002646050574e-06, "loss": 0.010544586181640624, "step": 1015 }, { "epoch": 1.7203711514129059, "grad_norm": 0.043341839034531496, "learning_rate": 4.411999614466812e-06, "loss": 0.0125335693359375, "step": 1020 }, { "epoch": 1.728806410797132, "grad_norm": 0.03449321841295583, "learning_rate": 4.397956952896858e-06, "loss": 0.010623550415039063, "step": 1025 }, { "epoch": 1.737241670181358, "grad_norm": 0.041185961783139574, "learning_rate": 4.383875055470055e-06, "loss": 0.01031951904296875, "step": 1030 }, { "epoch": 1.745676929565584, "grad_norm": 0.04627446953615271, "learning_rate": 4.3697543174169675e-06, "loss": 0.01590385437011719, "step": 1035 }, { "epoch": 1.7541121889498101, "grad_norm": 0.04582345634360075, "learning_rate": 4.355595135058278e-06, "loss": 0.0119537353515625, "step": 1040 }, { "epoch": 1.7625474483340362, "grad_norm": 0.033580437424405536, "learning_rate": 4.3413979057936715e-06, "loss": 0.01235198974609375, "step": 1045 }, { "epoch": 1.7709827077182623, "grad_norm": 0.03545606353671419, "learning_rate": 4.32716302809068e-06, "loss": 0.012863922119140624, "step": 1050 }, { "epoch": 1.7794179671024883, "grad_norm": 0.03491571698794484, "learning_rate": 4.312890901473496e-06, "loss": 0.01035614013671875, "step": 1055 }, { "epoch": 1.7878532264867144, "grad_norm": 0.04391496148899165, "learning_rate": 4.29858192651176e-06, "loss": 0.011370468139648437, "step": 1060 }, { "epoch": 1.7962884858709405, "grad_norm": 0.049338016603549396, "learning_rate": 4.284236504809324e-06, "loss": 0.011846160888671875, "step": 1065 }, { "epoch": 1.8047237452551665, "grad_norm": 0.035387852478552806, "learning_rate": 4.269855038992971e-06, "loss": 0.011142349243164063, "step": 1070 }, { "epoch": 1.8131590046393926, "grad_norm": 0.043891210942711104, "learning_rate": 4.2554379327011196e-06, "loss": 0.011545944213867187, "step": 1075 }, { "epoch": 1.8215942640236187, "grad_norm": 0.040327331551499056, "learning_rate": 4.240985590572496e-06, "loss": 0.00897674560546875, "step": 1080 }, { "epoch": 1.8300295234078447, "grad_norm": 0.03274271686886844, "learning_rate": 4.226498418234771e-06, "loss": 0.01215667724609375, "step": 1085 }, { "epoch": 1.8384647827920708, "grad_norm": 0.04375742422856697, "learning_rate": 4.2119768222931865e-06, "loss": 0.0109588623046875, "step": 1090 }, { "epoch": 1.8469000421762969, "grad_norm": 0.036163256401816654, "learning_rate": 4.19742121031913e-06, "loss": 0.012054443359375, "step": 1095 }, { "epoch": 1.855335301560523, "grad_norm": 0.04078407955383746, "learning_rate": 4.182831990838709e-06, "loss": 0.0132843017578125, "step": 1100 }, { "epoch": 1.855335301560523, "eval_loss": 0.003225065069273114, "eval_margin": -0.008749207222623932, "eval_mean_neg": 0.49084940552711487, "eval_mean_pos": 0.7849159836769104, "eval_runtime": 366.687, "eval_samples_per_second": 21.787, "eval_steps_per_second": 0.341, "step": 1100 }, { "epoch": 1.863770560944749, "grad_norm": 0.047827239751426935, "learning_rate": 4.168209573321271e-06, "loss": 0.0133697509765625, "step": 1105 }, { "epoch": 1.872205820328975, "grad_norm": 0.0274823880547768, "learning_rate": 4.153554368167927e-06, "loss": 0.010877227783203125, "step": 1110 }, { "epoch": 1.8806410797132012, "grad_norm": 0.052787755841206804, "learning_rate": 4.138866786700016e-06, "loss": 0.0139434814453125, "step": 1115 }, { "epoch": 1.8890763390974272, "grad_norm": 0.029629846825489692, "learning_rate": 4.124147241147577e-06, "loss": 0.011189651489257813, "step": 1120 }, { "epoch": 1.8975115984816533, "grad_norm": 0.039855575258898726, "learning_rate": 4.109396144637764e-06, "loss": 0.010993194580078126, "step": 1125 }, { "epoch": 1.9059468578658794, "grad_norm": 0.03789188882991695, "learning_rate": 4.094613911183265e-06, "loss": 0.01313323974609375, "step": 1130 }, { "epoch": 1.9143821172501054, "grad_norm": 0.03482605825228896, "learning_rate": 4.0798009556706685e-06, "loss": 0.008492279052734374, "step": 1135 }, { "epoch": 1.9228173766343315, "grad_norm": 0.0395626147511318, "learning_rate": 4.064957693848831e-06, "loss": 0.011167144775390625, "step": 1140 }, { "epoch": 1.9312526360185576, "grad_norm": 0.026910728579180684, "learning_rate": 4.050084542317201e-06, "loss": 0.0124908447265625, "step": 1145 }, { "epoch": 1.9396878954027836, "grad_norm": 0.05111929237613795, "learning_rate": 4.0351819185141284e-06, "loss": 0.01279144287109375, "step": 1150 }, { "epoch": 1.9481231547870097, "grad_norm": 0.031631097839140386, "learning_rate": 4.02025024070515e-06, "loss": 0.010783004760742187, "step": 1155 }, { "epoch": 1.9565584141712358, "grad_norm": 0.03921591693735718, "learning_rate": 4.005289927971248e-06, "loss": 0.009867095947265625, "step": 1160 }, { "epoch": 1.9649936735554618, "grad_norm": 0.03786979993880419, "learning_rate": 3.990301400197088e-06, "loss": 0.010943603515625, "step": 1165 }, { "epoch": 1.973428932939688, "grad_norm": 0.033688024912648086, "learning_rate": 3.9752850780592366e-06, "loss": 0.010836410522460937, "step": 1170 }, { "epoch": 1.981864192323914, "grad_norm": 0.0473160707405277, "learning_rate": 3.960241383014353e-06, "loss": 0.011658477783203124, "step": 1175 }, { "epoch": 1.99029945170814, "grad_norm": 0.034470209590808834, "learning_rate": 3.945170737287356e-06, "loss": 0.0096588134765625, "step": 1180 }, { "epoch": 1.9987347110923661, "grad_norm": 0.04035006428036731, "learning_rate": 3.930073563859583e-06, "loss": 0.013312530517578126, "step": 1185 }, { "epoch": 2.006748207507381, "grad_norm": 0.03443773853658945, "learning_rate": 3.914950286456911e-06, "loss": 0.0104766845703125, "step": 1190 }, { "epoch": 2.015183466891607, "grad_norm": 0.02321269258461312, "learning_rate": 3.899801329537865e-06, "loss": 0.008111572265625, "step": 1195 }, { "epoch": 2.023618726275833, "grad_norm": 0.02427731911492366, "learning_rate": 3.884627118281706e-06, "loss": 0.009668731689453125, "step": 1200 }, { "epoch": 2.023618726275833, "eval_loss": 0.0028827113565057516, "eval_margin": -0.008073512017877111, "eval_mean_neg": 0.5066258907318115, "eval_mean_pos": 0.7934735417366028, "eval_runtime": 365.1842, "eval_samples_per_second": 21.877, "eval_steps_per_second": 0.342, "step": 1200 }, { "epoch": 2.0320539856600592, "grad_norm": 0.033776934236771874, "learning_rate": 3.869428078576498e-06, "loss": 0.00937347412109375, "step": 1205 }, { "epoch": 2.0404892450442853, "grad_norm": 0.05070270762284893, "learning_rate": 3.8542046370071575e-06, "loss": 0.008733367919921875, "step": 1210 }, { "epoch": 2.0489245044285114, "grad_norm": 0.028063560546546604, "learning_rate": 3.838957220843472e-06, "loss": 0.00914459228515625, "step": 1215 }, { "epoch": 2.0573597638127374, "grad_norm": 0.041287537117132886, "learning_rate": 3.8236862580281175e-06, "loss": 0.010516357421875, "step": 1220 }, { "epoch": 2.0657950231969635, "grad_norm": 0.03173632436563901, "learning_rate": 3.808392177164642e-06, "loss": 0.010186767578125, "step": 1225 }, { "epoch": 2.0742302825811896, "grad_norm": 0.03149301964970768, "learning_rate": 3.7930754075054406e-06, "loss": 0.010378265380859375, "step": 1230 }, { "epoch": 2.0826655419654156, "grad_norm": 0.03183747792195117, "learning_rate": 3.7777363789397004e-06, "loss": 0.009032630920410156, "step": 1235 }, { "epoch": 2.0911008013496417, "grad_norm": 0.0353065686803631, "learning_rate": 3.7623755219813442e-06, "loss": 0.0096771240234375, "step": 1240 }, { "epoch": 2.0995360607338673, "grad_norm": 0.040800577074973816, "learning_rate": 3.746993267756939e-06, "loss": 0.009685516357421875, "step": 1245 }, { "epoch": 2.107971320118094, "grad_norm": 0.04064182954953987, "learning_rate": 3.7315900479936044e-06, "loss": 0.010097503662109375, "step": 1250 }, { "epoch": 2.1164065795023195, "grad_norm": 0.04908593416403285, "learning_rate": 3.7161662950068846e-06, "loss": 0.009412384033203125, "step": 1255 }, { "epoch": 2.124841838886546, "grad_norm": 0.0284060145446946, "learning_rate": 3.7007224416886276e-06, "loss": 0.00821533203125, "step": 1260 }, { "epoch": 2.1332770982707716, "grad_norm": 0.030842726867602113, "learning_rate": 3.685258921494824e-06, "loss": 0.009014129638671875, "step": 1265 }, { "epoch": 2.141712357654998, "grad_norm": 0.036273158990138075, "learning_rate": 3.6697761684334466e-06, "loss": 0.010558700561523438, "step": 1270 }, { "epoch": 2.1501476170392237, "grad_norm": 0.03693819496482909, "learning_rate": 3.6542746170522717e-06, "loss": 0.010668182373046875, "step": 1275 }, { "epoch": 2.15858287642345, "grad_norm": 0.03797267942950567, "learning_rate": 3.638754702426678e-06, "loss": 0.008889389038085938, "step": 1280 }, { "epoch": 2.167018135807676, "grad_norm": 0.03341108305444907, "learning_rate": 3.6232168601474363e-06, "loss": 0.006923675537109375, "step": 1285 }, { "epoch": 2.175453395191902, "grad_norm": 0.03454779917085028, "learning_rate": 3.607661526308488e-06, "loss": 0.00969085693359375, "step": 1290 }, { "epoch": 2.183888654576128, "grad_norm": 0.035727285557249105, "learning_rate": 3.5920891374947005e-06, "loss": 0.00997161865234375, "step": 1295 }, { "epoch": 2.192323913960354, "grad_norm": 0.031320211315080816, "learning_rate": 3.5765001307696152e-06, "loss": 0.007769393920898438, "step": 1300 }, { "epoch": 2.192323913960354, "eval_loss": 0.0026897923089563847, "eval_margin": -0.0077478337221808975, "eval_mean_neg": 0.49285975098609924, "eval_mean_pos": 0.7862820625305176, "eval_runtime": 362.3777, "eval_samples_per_second": 22.046, "eval_steps_per_second": 0.345, "step": 1300 }, { "epoch": 2.20075917334458, "grad_norm": 0.023952498523963275, "learning_rate": 3.560894943663185e-06, "loss": 0.009902191162109376, "step": 1305 }, { "epoch": 2.2091944327288062, "grad_norm": 0.029616458459003896, "learning_rate": 3.545274014159486e-06, "loss": 0.008718109130859375, "step": 1310 }, { "epoch": 2.2176296921130323, "grad_norm": 0.026768679077660198, "learning_rate": 3.5296377806844334e-06, "loss": 0.006624603271484375, "step": 1315 }, { "epoch": 2.2260649514972584, "grad_norm": 0.03723135315427558, "learning_rate": 3.5139866820934687e-06, "loss": 0.010486793518066407, "step": 1320 }, { "epoch": 2.2345002108814844, "grad_norm": 0.030973900207479872, "learning_rate": 3.498321157659248e-06, "loss": 0.00841064453125, "step": 1325 }, { "epoch": 2.2429354702657105, "grad_norm": 0.042522927349784224, "learning_rate": 3.482641647059313e-06, "loss": 0.010484886169433594, "step": 1330 }, { "epoch": 2.2513707296499366, "grad_norm": 0.036298357689256384, "learning_rate": 3.4669485903637452e-06, "loss": 0.010845947265625, "step": 1335 }, { "epoch": 2.2598059890341626, "grad_norm": 0.04210885166855473, "learning_rate": 3.4512424280228227e-06, "loss": 0.009656906127929688, "step": 1340 }, { "epoch": 2.2682412484183887, "grad_norm": 0.037852259539673916, "learning_rate": 3.435523600854652e-06, "loss": 0.009561920166015625, "step": 1345 }, { "epoch": 2.2766765078026148, "grad_norm": 0.03972030283651443, "learning_rate": 3.4197925500327973e-06, "loss": 0.00974578857421875, "step": 1350 }, { "epoch": 2.285111767186841, "grad_norm": 0.03864567979018308, "learning_rate": 3.4040497170739e-06, "loss": 0.009082794189453125, "step": 1355 }, { "epoch": 2.293547026571067, "grad_norm": 0.03547766099076331, "learning_rate": 3.3882955438252852e-06, "loss": 0.008104705810546875, "step": 1360 }, { "epoch": 2.301982285955293, "grad_norm": 0.042069666240123815, "learning_rate": 3.372530472452561e-06, "loss": 0.010825538635253906, "step": 1365 }, { "epoch": 2.310417545339519, "grad_norm": 0.030187240942476403, "learning_rate": 3.356754945427209e-06, "loss": 0.010921478271484375, "step": 1370 }, { "epoch": 2.318852804723745, "grad_norm": 0.03775236120881388, "learning_rate": 3.3409694055141636e-06, "loss": 0.00971527099609375, "step": 1375 }, { "epoch": 2.327288064107971, "grad_norm": 0.04517333042895106, "learning_rate": 3.3251742957593896e-06, "loss": 0.010394287109375, "step": 1380 }, { "epoch": 2.3357233234921972, "grad_norm": 0.03441694727754078, "learning_rate": 3.3093700594774415e-06, "loss": 0.008525848388671875, "step": 1385 }, { "epoch": 2.3441585828764233, "grad_norm": 0.039855958024762626, "learning_rate": 3.2935571402390243e-06, "loss": 0.01035003662109375, "step": 1390 }, { "epoch": 2.3525938422606494, "grad_norm": 0.036912654679360425, "learning_rate": 3.2777359818585453e-06, "loss": 0.01036224365234375, "step": 1395 }, { "epoch": 2.3610291016448754, "grad_norm": 0.02819486898709386, "learning_rate": 3.2619070283816567e-06, "loss": 0.008788299560546876, "step": 1400 }, { "epoch": 2.3610291016448754, "eval_loss": 0.002835027640685439, "eval_margin": -0.006975951657119778, "eval_mean_neg": 0.4969240725040436, "eval_mean_pos": 0.7925288081169128, "eval_runtime": 364.0594, "eval_samples_per_second": 21.944, "eval_steps_per_second": 0.343, "step": 1400 }, { "epoch": 2.3694643610291015, "grad_norm": 0.0347736325148637, "learning_rate": 3.24607072407279e-06, "loss": 0.00931854248046875, "step": 1405 }, { "epoch": 2.3778996204133276, "grad_norm": 0.02843547221351205, "learning_rate": 3.2302275134026902e-06, "loss": 0.008514404296875, "step": 1410 }, { "epoch": 2.3863348797975537, "grad_norm": 0.035527939183407756, "learning_rate": 3.2143778410359414e-06, "loss": 0.009189605712890625, "step": 1415 }, { "epoch": 2.3947701391817797, "grad_norm": 0.02697400462877436, "learning_rate": 3.1985221518184845e-06, "loss": 0.008056259155273438, "step": 1420 }, { "epoch": 2.403205398566006, "grad_norm": 0.02974726363919492, "learning_rate": 3.1826608907651327e-06, "loss": 0.008675384521484374, "step": 1425 }, { "epoch": 2.411640657950232, "grad_norm": 0.04279127831329293, "learning_rate": 3.1667945030470815e-06, "loss": 0.009341812133789063, "step": 1430 }, { "epoch": 2.420075917334458, "grad_norm": 0.039837807919925805, "learning_rate": 3.1509234339794144e-06, "loss": 0.010208892822265624, "step": 1435 }, { "epoch": 2.428511176718684, "grad_norm": 0.03024657864136027, "learning_rate": 3.1350481290086038e-06, "loss": 0.008173370361328125, "step": 1440 }, { "epoch": 2.43694643610291, "grad_norm": 0.03564318900525913, "learning_rate": 3.119169033700011e-06, "loss": 0.00924224853515625, "step": 1445 }, { "epoch": 2.445381695487136, "grad_norm": 0.028913985964356455, "learning_rate": 3.103286593725377e-06, "loss": 0.008563995361328125, "step": 1450 }, { "epoch": 2.453816954871362, "grad_norm": 0.035875161756803144, "learning_rate": 3.0874012548503173e-06, "loss": 0.009112548828125, "step": 1455 }, { "epoch": 2.4622522142555883, "grad_norm": 0.03817913502015442, "learning_rate": 3.0715134629218095e-06, "loss": 0.007489013671875, "step": 1460 }, { "epoch": 2.4706874736398143, "grad_norm": 0.03470677728941542, "learning_rate": 3.0556236638556803e-06, "loss": 0.012370681762695313, "step": 1465 }, { "epoch": 2.4791227330240404, "grad_norm": 0.042966141209856486, "learning_rate": 3.0397323036240886e-06, "loss": 0.0088165283203125, "step": 1470 }, { "epoch": 2.4875579924082665, "grad_norm": 0.03434953324492014, "learning_rate": 3.023839828243012e-06, "loss": 0.008261871337890626, "step": 1475 }, { "epoch": 2.4959932517924925, "grad_norm": 0.03061507966476803, "learning_rate": 3.007946683759723e-06, "loss": 0.008873748779296874, "step": 1480 }, { "epoch": 2.5044285111767186, "grad_norm": 0.027732115444419583, "learning_rate": 2.9920533162402776e-06, "loss": 0.008371734619140625, "step": 1485 }, { "epoch": 2.5128637705609447, "grad_norm": 0.029951392389848317, "learning_rate": 2.9761601717569896e-06, "loss": 0.00865478515625, "step": 1490 }, { "epoch": 2.5212990299451707, "grad_norm": 0.033652436341082566, "learning_rate": 2.960267696375911e-06, "loss": 0.009691619873046875, "step": 1495 }, { "epoch": 2.529734289329397, "grad_norm": 0.024511774862390433, "learning_rate": 2.9443763361443203e-06, "loss": 0.010028076171875, "step": 1500 }, { "epoch": 2.529734289329397, "eval_loss": 0.002559108193963766, "eval_margin": -0.006972289358776423, "eval_mean_neg": 0.4843982458114624, "eval_mean_pos": 0.7881345152854919, "eval_runtime": 365.5243, "eval_samples_per_second": 21.856, "eval_steps_per_second": 0.342, "step": 1500 }, { "epoch": 2.538169548713623, "grad_norm": 0.034627287332041165, "learning_rate": 2.9284865370781906e-06, "loss": 0.00982513427734375, "step": 1505 }, { "epoch": 2.546604808097849, "grad_norm": 0.03482839500691478, "learning_rate": 2.9125987451496837e-06, "loss": 0.00842742919921875, "step": 1510 }, { "epoch": 2.555040067482075, "grad_norm": 0.034040276652772095, "learning_rate": 2.8967134062746236e-06, "loss": 0.008990859985351563, "step": 1515 }, { "epoch": 2.563475326866301, "grad_norm": 0.03868034786852329, "learning_rate": 2.8808309662999897e-06, "loss": 0.007648468017578125, "step": 1520 }, { "epoch": 2.571910586250527, "grad_norm": 0.0419385930164125, "learning_rate": 2.864951870991397e-06, "loss": 0.009268951416015626, "step": 1525 }, { "epoch": 2.580345845634753, "grad_norm": 0.03690945718603307, "learning_rate": 2.8490765660205857e-06, "loss": 0.00864715576171875, "step": 1530 }, { "epoch": 2.5887811050189793, "grad_norm": 0.02493335874585237, "learning_rate": 2.833205496952919e-06, "loss": 0.00865478515625, "step": 1535 }, { "epoch": 2.5972163644032054, "grad_norm": 0.0355467734297459, "learning_rate": 2.817339109234868e-06, "loss": 0.009038543701171875, "step": 1540 }, { "epoch": 2.6056516237874314, "grad_norm": 0.035706551369837684, "learning_rate": 2.801477848181517e-06, "loss": 0.008769607543945313, "step": 1545 }, { "epoch": 2.6140868831716575, "grad_norm": 0.030590948482880534, "learning_rate": 2.7856221589640584e-06, "loss": 0.010419464111328125, "step": 1550 }, { "epoch": 2.6225221425558836, "grad_norm": 0.031187166990055255, "learning_rate": 2.7697724865973103e-06, "loss": 0.008966064453125, "step": 1555 }, { "epoch": 2.6309574019401096, "grad_norm": 0.03195446103788609, "learning_rate": 2.753929275927211e-06, "loss": 0.00810089111328125, "step": 1560 }, { "epoch": 2.6393926613243357, "grad_norm": 0.03124766219549955, "learning_rate": 2.7380929716183448e-06, "loss": 0.00867919921875, "step": 1565 }, { "epoch": 2.6478279207085618, "grad_norm": 0.04158743972175772, "learning_rate": 2.722264018141455e-06, "loss": 0.008811187744140626, "step": 1570 }, { "epoch": 2.656263180092788, "grad_norm": 0.042358151513616535, "learning_rate": 2.706442859760976e-06, "loss": 0.008480644226074219, "step": 1575 }, { "epoch": 2.664698439477014, "grad_norm": 0.02876853915749735, "learning_rate": 2.6906299405225595e-06, "loss": 0.009603309631347656, "step": 1580 }, { "epoch": 2.67313369886124, "grad_norm": 0.031452133973887623, "learning_rate": 2.6748257042406114e-06, "loss": 0.008524322509765625, "step": 1585 }, { "epoch": 2.681568958245466, "grad_norm": 0.03502577600676223, "learning_rate": 2.659030594485836e-06, "loss": 0.007845306396484375, "step": 1590 }, { "epoch": 2.690004217629692, "grad_norm": 0.029358280910616305, "learning_rate": 2.6432450545727913e-06, "loss": 0.008304595947265625, "step": 1595 }, { "epoch": 2.698439477013918, "grad_norm": 0.037226468621806945, "learning_rate": 2.62746952754744e-06, "loss": 0.0089141845703125, "step": 1600 }, { "epoch": 2.698439477013918, "eval_loss": 0.002468662802129984, "eval_margin": -0.006652700444383006, "eval_mean_neg": 0.5055871605873108, "eval_mean_pos": 0.8004181981086731, "eval_runtime": 363.3432, "eval_samples_per_second": 21.987, "eval_steps_per_second": 0.344, "step": 1600 }, { "epoch": 2.7068747363981442, "grad_norm": 0.029979441347867175, "learning_rate": 2.6117044561747145e-06, "loss": 0.007899856567382813, "step": 1605 }, { "epoch": 2.7153099957823703, "grad_norm": 0.04117264280378634, "learning_rate": 2.5959502829261e-06, "loss": 0.009801483154296875, "step": 1610 }, { "epoch": 2.7237452551665964, "grad_norm": 0.02874139529420723, "learning_rate": 2.5802074499672033e-06, "loss": 0.007126617431640625, "step": 1615 }, { "epoch": 2.7321805145508224, "grad_norm": 0.032009387593884574, "learning_rate": 2.564476399145349e-06, "loss": 0.007319259643554688, "step": 1620 }, { "epoch": 2.7406157739350485, "grad_norm": 0.0343660828009257, "learning_rate": 2.5487575719771774e-06, "loss": 0.010648345947265625, "step": 1625 }, { "epoch": 2.7490510333192746, "grad_norm": 0.033859872264591424, "learning_rate": 2.533051409636255e-06, "loss": 0.007244110107421875, "step": 1630 }, { "epoch": 2.7574862927035007, "grad_norm": 0.032973506044290384, "learning_rate": 2.517358352940688e-06, "loss": 0.008284759521484376, "step": 1635 }, { "epoch": 2.7659215520877267, "grad_norm": 0.03481146191160576, "learning_rate": 2.501678842340753e-06, "loss": 0.00882110595703125, "step": 1640 }, { "epoch": 2.774356811471953, "grad_norm": 0.03862588539253724, "learning_rate": 2.4860133179065323e-06, "loss": 0.00964202880859375, "step": 1645 }, { "epoch": 2.782792070856179, "grad_norm": 0.02979780702601001, "learning_rate": 2.4703622193155676e-06, "loss": 0.009095001220703124, "step": 1650 }, { "epoch": 2.791227330240405, "grad_norm": 0.02658002258647219, "learning_rate": 2.4547259858405147e-06, "loss": 0.008580398559570313, "step": 1655 }, { "epoch": 2.799662589624631, "grad_norm": 0.03237100489547251, "learning_rate": 2.439105056336816e-06, "loss": 0.006137275695800781, "step": 1660 }, { "epoch": 2.808097849008857, "grad_norm": 0.035925961611001624, "learning_rate": 2.423499869230385e-06, "loss": 0.006979179382324219, "step": 1665 }, { "epoch": 2.816533108393083, "grad_norm": 0.028925897672990208, "learning_rate": 2.4079108625053e-06, "loss": 0.007439422607421875, "step": 1670 }, { "epoch": 2.824968367777309, "grad_norm": 0.02643424196739614, "learning_rate": 2.392338473691513e-06, "loss": 0.007563400268554688, "step": 1675 }, { "epoch": 2.8334036271615353, "grad_norm": 0.029469931037551172, "learning_rate": 2.376783139852564e-06, "loss": 0.00782928466796875, "step": 1680 }, { "epoch": 2.8418388865457613, "grad_norm": 0.03519097117769341, "learning_rate": 2.3612452975733225e-06, "loss": 0.0081695556640625, "step": 1685 }, { "epoch": 2.8502741459299874, "grad_norm": 0.041842720836538394, "learning_rate": 2.3457253829477284e-06, "loss": 0.00938720703125, "step": 1690 }, { "epoch": 2.8587094053142135, "grad_norm": 0.02803118980318521, "learning_rate": 2.3302238315665544e-06, "loss": 0.007602310180664063, "step": 1695 }, { "epoch": 2.8671446646984395, "grad_norm": 0.06643247372472408, "learning_rate": 2.314741078505177e-06, "loss": 0.009275436401367188, "step": 1700 }, { "epoch": 2.8671446646984395, "eval_loss": 0.002381447935476899, "eval_margin": -0.0063614378337778395, "eval_mean_neg": 0.4982295334339142, "eval_mean_pos": 0.7957465648651123, "eval_runtime": 364.3957, "eval_samples_per_second": 21.924, "eval_steps_per_second": 0.343, "step": 1700 }, { "epoch": 2.8755799240826656, "grad_norm": 0.04573493937998368, "learning_rate": 2.299277558311373e-06, "loss": 0.008275604248046875, "step": 1705 }, { "epoch": 2.8840151834668917, "grad_norm": 0.030855319414577996, "learning_rate": 2.283833704993116e-06, "loss": 0.008497047424316406, "step": 1710 }, { "epoch": 2.8924504428511177, "grad_norm": 0.03287831217925721, "learning_rate": 2.268409952006397e-06, "loss": 0.006939697265625, "step": 1715 }, { "epoch": 2.900885702235344, "grad_norm": 0.03738971418410914, "learning_rate": 2.253006732243061e-06, "loss": 0.00982208251953125, "step": 1720 }, { "epoch": 2.90932096161957, "grad_norm": 0.02295281003302144, "learning_rate": 2.237624478018656e-06, "loss": 0.00743560791015625, "step": 1725 }, { "epoch": 2.917756221003796, "grad_norm": 0.03960242549923526, "learning_rate": 2.2222636210603002e-06, "loss": 0.008847427368164063, "step": 1730 }, { "epoch": 2.926191480388022, "grad_norm": 0.04741800625952587, "learning_rate": 2.2069245924945604e-06, "loss": 0.009384918212890624, "step": 1735 }, { "epoch": 2.934626739772248, "grad_norm": 0.03409532340357435, "learning_rate": 2.191607822835357e-06, "loss": 0.0076019287109375, "step": 1740 }, { "epoch": 2.943061999156474, "grad_norm": 0.0239211291682541, "learning_rate": 2.1763137419718826e-06, "loss": 0.007954025268554687, "step": 1745 }, { "epoch": 2.9514972585407, "grad_norm": 0.0255275562880085, "learning_rate": 2.161042779156529e-06, "loss": 0.007129669189453125, "step": 1750 }, { "epoch": 2.9599325179249263, "grad_norm": 0.026100931475016395, "learning_rate": 2.1457953629928426e-06, "loss": 0.007111358642578125, "step": 1755 }, { "epoch": 2.9683677773091524, "grad_norm": 0.03040565516608014, "learning_rate": 2.1305719214235017e-06, "loss": 0.00856170654296875, "step": 1760 }, { "epoch": 2.9768030366933784, "grad_norm": 0.031725391054917944, "learning_rate": 2.115372881718295e-06, "loss": 0.00930938720703125, "step": 1765 }, { "epoch": 2.9852382960776045, "grad_norm": 0.025864373534585865, "learning_rate": 2.100198670462137e-06, "loss": 0.007320022583007813, "step": 1770 }, { "epoch": 2.9936735554618306, "grad_norm": 0.013680490985647303, "learning_rate": 2.0850497135430897e-06, "loss": 0.007777786254882813, "step": 1775 }, { "epoch": 3.001687051876845, "grad_norm": 0.03143671946142631, "learning_rate": 2.0699264361404174e-06, "loss": 0.008609771728515625, "step": 1780 }, { "epoch": 3.010122311261071, "grad_norm": 0.021237532660395856, "learning_rate": 2.054829262712645e-06, "loss": 0.007422637939453125, "step": 1785 }, { "epoch": 3.018557570645297, "grad_norm": 0.02970629169587053, "learning_rate": 2.0397586169856488e-06, "loss": 0.008047866821289062, "step": 1790 }, { "epoch": 3.0269928300295232, "grad_norm": 0.0280079357370666, "learning_rate": 2.024714921940763e-06, "loss": 0.008725738525390625, "step": 1795 }, { "epoch": 3.0354280894137493, "grad_norm": 0.05178206206651836, "learning_rate": 2.0096985998029124e-06, "loss": 0.007384490966796875, "step": 1800 }, { "epoch": 3.0354280894137493, "eval_loss": 0.0022954940795898438, "eval_margin": -0.005535545939159009, "eval_mean_neg": 0.49804064631462097, "eval_mean_pos": 0.7978142499923706, "eval_runtime": 362.5441, "eval_samples_per_second": 22.036, "eval_steps_per_second": 0.345, "step": 1800 } ], "logging_steps": 5, "max_steps": 2965, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2008625567629312.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }