| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.6109660574412534, | |
| "eval_steps": 500, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.013054830287206266, | |
| "grad_norm": 43.415225982666016, | |
| "learning_rate": 4.978241949521324e-05, | |
| "loss": 2.9884, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.02610966057441253, | |
| "grad_norm": 49.437705993652344, | |
| "learning_rate": 4.956483899042646e-05, | |
| "loss": 2.9654, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0391644908616188, | |
| "grad_norm": 47.24225997924805, | |
| "learning_rate": 4.934725848563969e-05, | |
| "loss": 3.5102, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.05221932114882506, | |
| "grad_norm": 32.93499755859375, | |
| "learning_rate": 4.912967798085292e-05, | |
| "loss": 2.4143, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.06527415143603134, | |
| "grad_norm": 31.01405143737793, | |
| "learning_rate": 4.891209747606615e-05, | |
| "loss": 2.6864, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0783289817232376, | |
| "grad_norm": 77.32862854003906, | |
| "learning_rate": 4.8694516971279375e-05, | |
| "loss": 2.7403, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.09138381201044386, | |
| "grad_norm": 28.313295364379883, | |
| "learning_rate": 4.84769364664926e-05, | |
| "loss": 2.5107, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.10443864229765012, | |
| "grad_norm": 38.45579528808594, | |
| "learning_rate": 4.825935596170583e-05, | |
| "loss": 2.6546, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.1174934725848564, | |
| "grad_norm": 41.91643142700195, | |
| "learning_rate": 4.8041775456919065e-05, | |
| "loss": 2.82, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.13054830287206268, | |
| "grad_norm": 36.32301712036133, | |
| "learning_rate": 4.782419495213229e-05, | |
| "loss": 2.9063, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.14360313315926893, | |
| "grad_norm": 30.05735969543457, | |
| "learning_rate": 4.760661444734552e-05, | |
| "loss": 2.5942, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.1566579634464752, | |
| "grad_norm": 38.422706604003906, | |
| "learning_rate": 4.738903394255875e-05, | |
| "loss": 2.5929, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.16971279373368145, | |
| "grad_norm": 49.74126052856445, | |
| "learning_rate": 4.7171453437771976e-05, | |
| "loss": 2.5271, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.18276762402088773, | |
| "grad_norm": 31.077625274658203, | |
| "learning_rate": 4.6953872932985203e-05, | |
| "loss": 2.6357, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.195822454308094, | |
| "grad_norm": 24.155317306518555, | |
| "learning_rate": 4.673629242819844e-05, | |
| "loss": 2.5678, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.20887728459530025, | |
| "grad_norm": 30.788408279418945, | |
| "learning_rate": 4.651871192341166e-05, | |
| "loss": 2.8182, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.22193211488250653, | |
| "grad_norm": 25.434738159179688, | |
| "learning_rate": 4.630113141862489e-05, | |
| "loss": 3.303, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.2349869451697128, | |
| "grad_norm": 37.103668212890625, | |
| "learning_rate": 4.608355091383813e-05, | |
| "loss": 2.5012, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.24804177545691905, | |
| "grad_norm": 70.19502258300781, | |
| "learning_rate": 4.586597040905135e-05, | |
| "loss": 2.812, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.26109660574412535, | |
| "grad_norm": 37.93436050415039, | |
| "learning_rate": 4.564838990426458e-05, | |
| "loss": 2.6683, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.2741514360313316, | |
| "grad_norm": 149.83016967773438, | |
| "learning_rate": 4.543080939947781e-05, | |
| "loss": 2.9776, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.28720626631853785, | |
| "grad_norm": 31.74551010131836, | |
| "learning_rate": 4.521322889469104e-05, | |
| "loss": 2.9663, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.3002610966057441, | |
| "grad_norm": 34.0869255065918, | |
| "learning_rate": 4.4995648389904266e-05, | |
| "loss": 2.712, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.3133159268929504, | |
| "grad_norm": 28.85022735595703, | |
| "learning_rate": 4.47780678851175e-05, | |
| "loss": 2.3012, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.3263707571801567, | |
| "grad_norm": 36.18962860107422, | |
| "learning_rate": 4.456048738033072e-05, | |
| "loss": 2.4339, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3394255874673629, | |
| "grad_norm": 27.252077102661133, | |
| "learning_rate": 4.4342906875543956e-05, | |
| "loss": 2.851, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.3524804177545692, | |
| "grad_norm": 38.39606857299805, | |
| "learning_rate": 4.4125326370757184e-05, | |
| "loss": 2.6867, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.36553524804177545, | |
| "grad_norm": 25.8907527923584, | |
| "learning_rate": 4.390774586597041e-05, | |
| "loss": 2.4467, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.3785900783289817, | |
| "grad_norm": 24.98986053466797, | |
| "learning_rate": 4.369016536118364e-05, | |
| "loss": 3.0278, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.391644908616188, | |
| "grad_norm": 24.536916732788086, | |
| "learning_rate": 4.347258485639687e-05, | |
| "loss": 2.3857, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.4046997389033943, | |
| "grad_norm": 22.012798309326172, | |
| "learning_rate": 4.3255004351610094e-05, | |
| "loss": 2.1414, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.4177545691906005, | |
| "grad_norm": 25.466167449951172, | |
| "learning_rate": 4.303742384682333e-05, | |
| "loss": 2.5714, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.4308093994778068, | |
| "grad_norm": 31.785062789916992, | |
| "learning_rate": 4.281984334203655e-05, | |
| "loss": 2.4608, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.44386422976501305, | |
| "grad_norm": 36.67721176147461, | |
| "learning_rate": 4.2602262837249784e-05, | |
| "loss": 2.787, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.45691906005221933, | |
| "grad_norm": 49.02054214477539, | |
| "learning_rate": 4.238468233246302e-05, | |
| "loss": 2.7207, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.4699738903394256, | |
| "grad_norm": 26.876636505126953, | |
| "learning_rate": 4.216710182767624e-05, | |
| "loss": 2.6041, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.4830287206266319, | |
| "grad_norm": 34.956336975097656, | |
| "learning_rate": 4.1949521322889474e-05, | |
| "loss": 3.1905, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.4960835509138381, | |
| "grad_norm": 35.72273254394531, | |
| "learning_rate": 4.17319408181027e-05, | |
| "loss": 2.121, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.5091383812010444, | |
| "grad_norm": 28.895980834960938, | |
| "learning_rate": 4.151436031331593e-05, | |
| "loss": 2.6956, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.5221932114882507, | |
| "grad_norm": 28.925390243530273, | |
| "learning_rate": 4.129677980852916e-05, | |
| "loss": 3.0316, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5352480417754569, | |
| "grad_norm": 34.79185485839844, | |
| "learning_rate": 4.107919930374239e-05, | |
| "loss": 3.0389, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.5483028720626631, | |
| "grad_norm": 30.246923446655273, | |
| "learning_rate": 4.086161879895561e-05, | |
| "loss": 2.8086, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.5613577023498695, | |
| "grad_norm": 32.78372573852539, | |
| "learning_rate": 4.064403829416885e-05, | |
| "loss": 2.814, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.5744125326370757, | |
| "grad_norm": 24.346147537231445, | |
| "learning_rate": 4.0426457789382075e-05, | |
| "loss": 2.8039, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.587467362924282, | |
| "grad_norm": 31.166654586791992, | |
| "learning_rate": 4.02088772845953e-05, | |
| "loss": 2.9612, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.6005221932114883, | |
| "grad_norm": 23.0938777923584, | |
| "learning_rate": 3.999129677980853e-05, | |
| "loss": 2.5463, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.6135770234986945, | |
| "grad_norm": 26.590911865234375, | |
| "learning_rate": 3.977371627502176e-05, | |
| "loss": 2.3543, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.6266318537859008, | |
| "grad_norm": 29.803422927856445, | |
| "learning_rate": 3.9556135770234985e-05, | |
| "loss": 2.7445, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.639686684073107, | |
| "grad_norm": 46.66853713989258, | |
| "learning_rate": 3.933855526544822e-05, | |
| "loss": 2.4905, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.6527415143603134, | |
| "grad_norm": 39.04319381713867, | |
| "learning_rate": 3.912097476066145e-05, | |
| "loss": 1.8311, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6657963446475196, | |
| "grad_norm": 30.50276756286621, | |
| "learning_rate": 3.8903394255874675e-05, | |
| "loss": 2.8152, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.6788511749347258, | |
| "grad_norm": 37.25984191894531, | |
| "learning_rate": 3.868581375108791e-05, | |
| "loss": 2.4759, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.6919060052219321, | |
| "grad_norm": 25.89512062072754, | |
| "learning_rate": 3.846823324630113e-05, | |
| "loss": 2.486, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.7049608355091384, | |
| "grad_norm": 37.318450927734375, | |
| "learning_rate": 3.8250652741514365e-05, | |
| "loss": 2.77, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.7180156657963447, | |
| "grad_norm": 34.45144271850586, | |
| "learning_rate": 3.803307223672759e-05, | |
| "loss": 2.2646, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.7310704960835509, | |
| "grad_norm": 50.494144439697266, | |
| "learning_rate": 3.781549173194082e-05, | |
| "loss": 2.7686, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.7441253263707572, | |
| "grad_norm": 29.753643035888672, | |
| "learning_rate": 3.759791122715405e-05, | |
| "loss": 2.6239, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.7571801566579635, | |
| "grad_norm": 39.54145431518555, | |
| "learning_rate": 3.738033072236728e-05, | |
| "loss": 2.4995, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.7702349869451697, | |
| "grad_norm": 36.82713317871094, | |
| "learning_rate": 3.71627502175805e-05, | |
| "loss": 3.0274, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.783289817232376, | |
| "grad_norm": 36.62627410888672, | |
| "learning_rate": 3.694516971279374e-05, | |
| "loss": 2.2364, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.7963446475195822, | |
| "grad_norm": 18.279882431030273, | |
| "learning_rate": 3.6727589208006965e-05, | |
| "loss": 2.3391, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.8093994778067886, | |
| "grad_norm": 23.61455535888672, | |
| "learning_rate": 3.651000870322019e-05, | |
| "loss": 2.5222, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.8224543080939948, | |
| "grad_norm": 32.03522872924805, | |
| "learning_rate": 3.629242819843342e-05, | |
| "loss": 2.2871, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.835509138381201, | |
| "grad_norm": 45.24649429321289, | |
| "learning_rate": 3.607484769364665e-05, | |
| "loss": 2.6863, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.8485639686684073, | |
| "grad_norm": 24.39188575744629, | |
| "learning_rate": 3.5857267188859876e-05, | |
| "loss": 2.6426, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.8616187989556136, | |
| "grad_norm": 21.67547607421875, | |
| "learning_rate": 3.563968668407311e-05, | |
| "loss": 2.1157, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.8746736292428199, | |
| "grad_norm": 24.245168685913086, | |
| "learning_rate": 3.542210617928634e-05, | |
| "loss": 2.3781, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.8877284595300261, | |
| "grad_norm": 27.57684326171875, | |
| "learning_rate": 3.5204525674499566e-05, | |
| "loss": 2.7673, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.9007832898172323, | |
| "grad_norm": 28.42872428894043, | |
| "learning_rate": 3.49869451697128e-05, | |
| "loss": 2.4503, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.9138381201044387, | |
| "grad_norm": 39.387813568115234, | |
| "learning_rate": 3.476936466492602e-05, | |
| "loss": 2.1765, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.9268929503916449, | |
| "grad_norm": 20.197811126708984, | |
| "learning_rate": 3.4551784160139256e-05, | |
| "loss": 2.555, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.9399477806788512, | |
| "grad_norm": 22.066137313842773, | |
| "learning_rate": 3.4334203655352484e-05, | |
| "loss": 2.4827, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.9530026109660574, | |
| "grad_norm": 32.67851638793945, | |
| "learning_rate": 3.411662315056571e-05, | |
| "loss": 2.762, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.9660574412532638, | |
| "grad_norm": 28.471988677978516, | |
| "learning_rate": 3.389904264577894e-05, | |
| "loss": 2.5872, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.97911227154047, | |
| "grad_norm": 22.934885025024414, | |
| "learning_rate": 3.368146214099217e-05, | |
| "loss": 2.8826, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.9921671018276762, | |
| "grad_norm": 24.063716888427734, | |
| "learning_rate": 3.3463881636205394e-05, | |
| "loss": 2.8909, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 2.3283705711364746, | |
| "eval_runtime": 12.1219, | |
| "eval_samples_per_second": 112.276, | |
| "eval_steps_per_second": 14.107, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 1.0052219321148825, | |
| "grad_norm": 36.15023422241211, | |
| "learning_rate": 3.324630113141863e-05, | |
| "loss": 2.5282, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.0182767624020888, | |
| "grad_norm": 35.99642562866211, | |
| "learning_rate": 3.3028720626631856e-05, | |
| "loss": 2.176, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.031331592689295, | |
| "grad_norm": 30.217031478881836, | |
| "learning_rate": 3.2811140121845084e-05, | |
| "loss": 2.2727, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.0443864229765012, | |
| "grad_norm": 29.16168212890625, | |
| "learning_rate": 3.259355961705831e-05, | |
| "loss": 2.0302, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.0574412532637076, | |
| "grad_norm": 25.400541305541992, | |
| "learning_rate": 3.237597911227154e-05, | |
| "loss": 2.3489, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.0704960835509139, | |
| "grad_norm": 21.281591415405273, | |
| "learning_rate": 3.215839860748477e-05, | |
| "loss": 2.3976, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.08355091383812, | |
| "grad_norm": 23.941238403320312, | |
| "learning_rate": 3.1940818102698e-05, | |
| "loss": 2.6169, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.0966057441253263, | |
| "grad_norm": 26.626665115356445, | |
| "learning_rate": 3.172323759791123e-05, | |
| "loss": 2.6102, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.1096605744125327, | |
| "grad_norm": 28.539621353149414, | |
| "learning_rate": 3.150565709312446e-05, | |
| "loss": 2.019, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.122715404699739, | |
| "grad_norm": 36.77280044555664, | |
| "learning_rate": 3.128807658833769e-05, | |
| "loss": 2.2338, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.1357702349869452, | |
| "grad_norm": 38.7175407409668, | |
| "learning_rate": 3.107049608355091e-05, | |
| "loss": 2.7259, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.1488250652741514, | |
| "grad_norm": 32.9740104675293, | |
| "learning_rate": 3.085291557876415e-05, | |
| "loss": 2.294, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.1618798955613576, | |
| "grad_norm": 35.01115036010742, | |
| "learning_rate": 3.0635335073977374e-05, | |
| "loss": 2.5392, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.174934725848564, | |
| "grad_norm": 40.960968017578125, | |
| "learning_rate": 3.0417754569190606e-05, | |
| "loss": 1.8062, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.1879895561357703, | |
| "grad_norm": 24.627063751220703, | |
| "learning_rate": 3.020017406440383e-05, | |
| "loss": 2.746, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.2010443864229765, | |
| "grad_norm": 28.861692428588867, | |
| "learning_rate": 2.998259355961706e-05, | |
| "loss": 2.1434, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.2140992167101827, | |
| "grad_norm": 30.32466697692871, | |
| "learning_rate": 2.976501305483029e-05, | |
| "loss": 2.8282, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.227154046997389, | |
| "grad_norm": 45.79476547241211, | |
| "learning_rate": 2.954743255004352e-05, | |
| "loss": 2.1972, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.2402088772845954, | |
| "grad_norm": 34.27708435058594, | |
| "learning_rate": 2.9329852045256744e-05, | |
| "loss": 3.0359, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.2532637075718016, | |
| "grad_norm": 33.62773513793945, | |
| "learning_rate": 2.9112271540469975e-05, | |
| "loss": 2.4254, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.2663185378590078, | |
| "grad_norm": 26.693920135498047, | |
| "learning_rate": 2.8894691035683203e-05, | |
| "loss": 1.883, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.279373368146214, | |
| "grad_norm": 36.45111083984375, | |
| "learning_rate": 2.8677110530896434e-05, | |
| "loss": 2.232, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.2924281984334205, | |
| "grad_norm": 34.38032913208008, | |
| "learning_rate": 2.845953002610966e-05, | |
| "loss": 2.4289, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.3054830287206267, | |
| "grad_norm": 19.068925857543945, | |
| "learning_rate": 2.8241949521322892e-05, | |
| "loss": 2.473, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.318537859007833, | |
| "grad_norm": 30.37474250793457, | |
| "learning_rate": 2.8024369016536117e-05, | |
| "loss": 2.6123, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.3315926892950392, | |
| "grad_norm": 29.176149368286133, | |
| "learning_rate": 2.7806788511749348e-05, | |
| "loss": 2.3158, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.3446475195822454, | |
| "grad_norm": 27.721553802490234, | |
| "learning_rate": 2.758920800696258e-05, | |
| "loss": 2.3646, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.3577023498694518, | |
| "grad_norm": 12.712676048278809, | |
| "learning_rate": 2.7371627502175807e-05, | |
| "loss": 2.1114, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.370757180156658, | |
| "grad_norm": 23.869230270385742, | |
| "learning_rate": 2.7154046997389038e-05, | |
| "loss": 2.1335, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.3838120104438643, | |
| "grad_norm": 23.66261100769043, | |
| "learning_rate": 2.6936466492602262e-05, | |
| "loss": 2.0774, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.3968668407310705, | |
| "grad_norm": 44.03451156616211, | |
| "learning_rate": 2.6718885987815496e-05, | |
| "loss": 2.6329, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.4099216710182767, | |
| "grad_norm": 20.849573135375977, | |
| "learning_rate": 2.650130548302872e-05, | |
| "loss": 2.8549, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.4229765013054831, | |
| "grad_norm": 27.392250061035156, | |
| "learning_rate": 2.6283724978241952e-05, | |
| "loss": 2.4537, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.4360313315926894, | |
| "grad_norm": 25.04733657836914, | |
| "learning_rate": 2.606614447345518e-05, | |
| "loss": 1.9425, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.4490861618798956, | |
| "grad_norm": 32.247894287109375, | |
| "learning_rate": 2.584856396866841e-05, | |
| "loss": 2.0056, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.4621409921671018, | |
| "grad_norm": 32.498191833496094, | |
| "learning_rate": 2.5630983463881635e-05, | |
| "loss": 2.6457, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.475195822454308, | |
| "grad_norm": 26.728214263916016, | |
| "learning_rate": 2.5413402959094866e-05, | |
| "loss": 1.6954, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.4882506527415145, | |
| "grad_norm": 63.32724380493164, | |
| "learning_rate": 2.5195822454308094e-05, | |
| "loss": 2.5008, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.5013054830287205, | |
| "grad_norm": 24.65690803527832, | |
| "learning_rate": 2.4978241949521325e-05, | |
| "loss": 1.5901, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.514360313315927, | |
| "grad_norm": 19.37055206298828, | |
| "learning_rate": 2.4760661444734552e-05, | |
| "loss": 1.9548, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.5274151436031331, | |
| "grad_norm": 23.001806259155273, | |
| "learning_rate": 2.4543080939947783e-05, | |
| "loss": 2.382, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.5404699738903394, | |
| "grad_norm": 26.997100830078125, | |
| "learning_rate": 2.432550043516101e-05, | |
| "loss": 2.2834, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.5535248041775458, | |
| "grad_norm": 22.0489559173584, | |
| "learning_rate": 2.410791993037424e-05, | |
| "loss": 2.1768, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.566579634464752, | |
| "grad_norm": 29.986967086791992, | |
| "learning_rate": 2.389033942558747e-05, | |
| "loss": 2.0854, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.5796344647519582, | |
| "grad_norm": 34.0990104675293, | |
| "learning_rate": 2.3672758920800698e-05, | |
| "loss": 2.4429, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.5926892950391645, | |
| "grad_norm": 25.31661033630371, | |
| "learning_rate": 2.3455178416013925e-05, | |
| "loss": 2.3145, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.6057441253263707, | |
| "grad_norm": 30.300716400146484, | |
| "learning_rate": 2.3237597911227156e-05, | |
| "loss": 1.9357, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.6187989556135771, | |
| "grad_norm": 25.158327102661133, | |
| "learning_rate": 2.3020017406440384e-05, | |
| "loss": 2.3195, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.6318537859007833, | |
| "grad_norm": 33.35712432861328, | |
| "learning_rate": 2.280243690165361e-05, | |
| "loss": 2.4724, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.6449086161879896, | |
| "grad_norm": 22.938852310180664, | |
| "learning_rate": 2.258485639686684e-05, | |
| "loss": 2.3876, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.6579634464751958, | |
| "grad_norm": 26.44889259338379, | |
| "learning_rate": 2.236727589208007e-05, | |
| "loss": 2.1164, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.671018276762402, | |
| "grad_norm": 28.251296997070312, | |
| "learning_rate": 2.2149695387293298e-05, | |
| "loss": 2.3045, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.6840731070496084, | |
| "grad_norm": 28.00015640258789, | |
| "learning_rate": 2.193211488250653e-05, | |
| "loss": 1.9546, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.6971279373368147, | |
| "grad_norm": 20.0263729095459, | |
| "learning_rate": 2.171453437771976e-05, | |
| "loss": 2.2491, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.7101827676240209, | |
| "grad_norm": 23.335580825805664, | |
| "learning_rate": 2.1496953872932988e-05, | |
| "loss": 2.5543, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.723237597911227, | |
| "grad_norm": 25.97711181640625, | |
| "learning_rate": 2.1279373368146216e-05, | |
| "loss": 2.2486, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.7362924281984333, | |
| "grad_norm": 24.391855239868164, | |
| "learning_rate": 2.1061792863359443e-05, | |
| "loss": 2.3483, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.7493472584856398, | |
| "grad_norm": 29.249792098999023, | |
| "learning_rate": 2.0844212358572674e-05, | |
| "loss": 2.3703, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.762402088772846, | |
| "grad_norm": 20.213987350463867, | |
| "learning_rate": 2.0626631853785902e-05, | |
| "loss": 2.3178, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.7754569190600522, | |
| "grad_norm": 33.050018310546875, | |
| "learning_rate": 2.040905134899913e-05, | |
| "loss": 2.2622, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.7885117493472587, | |
| "grad_norm": 25.058115005493164, | |
| "learning_rate": 2.019147084421236e-05, | |
| "loss": 2.2577, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.8015665796344646, | |
| "grad_norm": 34.79226303100586, | |
| "learning_rate": 1.997389033942559e-05, | |
| "loss": 2.1586, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.814621409921671, | |
| "grad_norm": 38.56571578979492, | |
| "learning_rate": 1.9756309834638816e-05, | |
| "loss": 2.0911, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.8276762402088773, | |
| "grad_norm": 27.778825759887695, | |
| "learning_rate": 1.9538729329852047e-05, | |
| "loss": 2.2379, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.8407310704960835, | |
| "grad_norm": 27.160274505615234, | |
| "learning_rate": 1.9321148825065275e-05, | |
| "loss": 2.0847, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.85378590078329, | |
| "grad_norm": 26.11197853088379, | |
| "learning_rate": 1.9103568320278503e-05, | |
| "loss": 2.7021, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.866840731070496, | |
| "grad_norm": 28.448244094848633, | |
| "learning_rate": 1.8885987815491734e-05, | |
| "loss": 2.1596, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.8798955613577024, | |
| "grad_norm": 49.074729919433594, | |
| "learning_rate": 1.866840731070496e-05, | |
| "loss": 2.5753, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.8929503916449086, | |
| "grad_norm": 21.96980094909668, | |
| "learning_rate": 1.845082680591819e-05, | |
| "loss": 2.5118, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.9060052219321149, | |
| "grad_norm": 20.993181228637695, | |
| "learning_rate": 1.823324630113142e-05, | |
| "loss": 2.2332, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.9190600522193213, | |
| "grad_norm": 20.049209594726562, | |
| "learning_rate": 1.801566579634465e-05, | |
| "loss": 2.1748, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.9321148825065273, | |
| "grad_norm": 35.51521682739258, | |
| "learning_rate": 1.779808529155788e-05, | |
| "loss": 2.781, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.9451697127937337, | |
| "grad_norm": 25.36643409729004, | |
| "learning_rate": 1.7580504786771106e-05, | |
| "loss": 2.7489, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.95822454308094, | |
| "grad_norm": 38.09309387207031, | |
| "learning_rate": 1.7362924281984334e-05, | |
| "loss": 2.1326, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.9712793733681462, | |
| "grad_norm": 30.636632919311523, | |
| "learning_rate": 1.7145343777197565e-05, | |
| "loss": 2.2151, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.9843342036553526, | |
| "grad_norm": 27.038352966308594, | |
| "learning_rate": 1.6927763272410793e-05, | |
| "loss": 2.4523, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.9973890339425586, | |
| "grad_norm": 19.101573944091797, | |
| "learning_rate": 1.671018276762402e-05, | |
| "loss": 2.518, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.0, | |
|       "eval_loss": null, | |
| "eval_runtime": 11.9494, | |
| "eval_samples_per_second": 113.897, | |
| "eval_steps_per_second": 14.31, | |
| "step": 1532 | |
| }, | |
| { | |
| "epoch": 2.010443864229765, | |
| "grad_norm": 28.463035583496094, | |
| "learning_rate": 1.649260226283725e-05, | |
| "loss": 1.8945, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.023498694516971, | |
| "grad_norm": 30.520097732543945, | |
| "learning_rate": 1.627502175805048e-05, | |
| "loss": 2.3685, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.0365535248041775, | |
| "grad_norm": 19.876482009887695, | |
| "learning_rate": 1.6057441253263707e-05, | |
| "loss": 2.1494, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.049608355091384, | |
| "grad_norm": 23.423219680786133, | |
| "learning_rate": 1.5839860748476938e-05, | |
| "loss": 1.9791, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.06266318537859, | |
| "grad_norm": 20.257450103759766, | |
| "learning_rate": 1.5622280243690166e-05, | |
| "loss": 2.2526, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.0757180156657964, | |
| "grad_norm": 47.68708038330078, | |
| "learning_rate": 1.5404699738903393e-05, | |
| "loss": 1.9962, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.0887728459530024, | |
| "grad_norm": 27.561660766601562, | |
| "learning_rate": 1.5187119234116623e-05, | |
| "loss": 1.6106, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.101827676240209, | |
| "grad_norm": 26.832944869995117, | |
| "learning_rate": 1.4969538729329852e-05, | |
| "loss": 1.5015, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.1148825065274153, | |
| "grad_norm": 45.05983352661133, | |
| "learning_rate": 1.475195822454308e-05, | |
| "loss": 1.8897, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.1279373368146213, | |
| "grad_norm": 19.24533462524414, | |
| "learning_rate": 1.4534377719756313e-05, | |
| "loss": 2.4187, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.1409921671018277, | |
| "grad_norm": 29.476770401000977, | |
| "learning_rate": 1.431679721496954e-05, | |
| "loss": 2.1733, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.1540469973890337, | |
| "grad_norm": 26.505355834960938, | |
| "learning_rate": 1.409921671018277e-05, | |
| "loss": 1.8432, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.16710182767624, | |
| "grad_norm": 28.1693058013916, | |
| "learning_rate": 1.3881636205395997e-05, | |
| "loss": 2.1419, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.1801566579634466, | |
| "grad_norm": 37.704498291015625, | |
| "learning_rate": 1.3664055700609227e-05, | |
| "loss": 2.7488, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.1932114882506526, | |
| "grad_norm": 22.38772964477539, | |
| "learning_rate": 1.3446475195822456e-05, | |
| "loss": 2.3531, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.206266318537859, | |
| "grad_norm": 22.58838653564453, | |
| "learning_rate": 1.3228894691035684e-05, | |
| "loss": 1.8415, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.2193211488250655, | |
| "grad_norm": 30.01149559020996, | |
| "learning_rate": 1.3011314186248913e-05, | |
| "loss": 2.0757, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.2323759791122715, | |
| "grad_norm": 23.964759826660156, | |
| "learning_rate": 1.2793733681462141e-05, | |
| "loss": 2.4956, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.245430809399478, | |
| "grad_norm": 33.133541107177734, | |
| "learning_rate": 1.257615317667537e-05, | |
| "loss": 1.971, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.258485639686684, | |
| "grad_norm": 27.34188461303711, | |
| "learning_rate": 1.23585726718886e-05, | |
| "loss": 2.2084, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.2715404699738904, | |
| "grad_norm": 25.62513542175293, | |
| "learning_rate": 1.2140992167101827e-05, | |
| "loss": 1.9445, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.2845953002610964, | |
| "grad_norm": 33.618385314941406, | |
| "learning_rate": 1.1923411662315057e-05, | |
| "loss": 2.4505, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.297650130548303, | |
| "grad_norm": 25.787757873535156, | |
| "learning_rate": 1.1705831157528286e-05, | |
| "loss": 2.748, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.3107049608355092, | |
| "grad_norm": 24.533018112182617, | |
| "learning_rate": 1.1488250652741515e-05, | |
| "loss": 1.496, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.3237597911227152, | |
| "grad_norm": 41.41205596923828, | |
| "learning_rate": 1.1270670147954745e-05, | |
| "loss": 2.5213, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.3368146214099217, | |
| "grad_norm": 20.163238525390625, | |
| "learning_rate": 1.1053089643167972e-05, | |
| "loss": 1.8429, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.349869451697128, | |
| "grad_norm": 36.58127975463867, | |
| "learning_rate": 1.0835509138381202e-05, | |
| "loss": 2.617, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.362924281984334, | |
| "grad_norm": 29.932636260986328, | |
| "learning_rate": 1.061792863359443e-05, | |
| "loss": 2.2462, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.3759791122715406, | |
| "grad_norm": 20.780025482177734, | |
| "learning_rate": 1.0400348128807659e-05, | |
| "loss": 1.7878, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.3890339425587466, | |
| "grad_norm": 26.663557052612305, | |
| "learning_rate": 1.0182767624020888e-05, | |
| "loss": 2.2757, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.402088772845953, | |
| "grad_norm": 34.194435119628906, | |
| "learning_rate": 9.965187119234116e-06, | |
| "loss": 1.8959, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.4151436031331595, | |
| "grad_norm": 13.468405723571777, | |
| "learning_rate": 9.747606614447347e-06, | |
| "loss": 2.2176, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.4281984334203655, | |
| "grad_norm": 36.56803512573242, | |
| "learning_rate": 9.530026109660575e-06, | |
| "loss": 2.2373, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.441253263707572, | |
| "grad_norm": 25.580917358398438, | |
| "learning_rate": 9.312445604873804e-06, | |
| "loss": 2.004, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.454308093994778, | |
| "grad_norm": 47.98051071166992, | |
| "learning_rate": 9.094865100087033e-06, | |
| "loss": 2.09, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.4673629242819843, | |
| "grad_norm": 44.3414421081543, | |
| "learning_rate": 8.877284595300261e-06, | |
| "loss": 2.4412, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.480417754569191, | |
| "grad_norm": 22.451644897460938, | |
| "learning_rate": 8.65970409051349e-06, | |
| "loss": 1.5646, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.493472584856397, | |
| "grad_norm": 21.15165138244629, | |
| "learning_rate": 8.442123585726718e-06, | |
| "loss": 1.5899, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.506527415143603, | |
| "grad_norm": 49.44068145751953, | |
| "learning_rate": 8.224543080939948e-06, | |
| "loss": 2.0553, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.5195822454308097, | |
| "grad_norm": 30.540063858032227, | |
| "learning_rate": 8.006962576153177e-06, | |
| "loss": 1.7215, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.5326370757180157, | |
| "grad_norm": 29.058853149414062, | |
| "learning_rate": 7.789382071366406e-06, | |
| "loss": 1.7025, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.5456919060052217, | |
| "grad_norm": 36.38139343261719, | |
| "learning_rate": 7.571801566579635e-06, | |
| "loss": 1.9976, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.558746736292428, | |
| "grad_norm": 21.742773056030273, | |
| "learning_rate": 7.354221061792864e-06, | |
| "loss": 1.8528, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.5718015665796345, | |
| "grad_norm": 22.30496597290039, | |
| "learning_rate": 7.136640557006093e-06, | |
| "loss": 1.726, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.584856396866841, | |
| "grad_norm": 28.57794761657715, | |
| "learning_rate": 6.919060052219321e-06, | |
| "loss": 1.9179, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.597911227154047, | |
| "grad_norm": 35.680999755859375, | |
| "learning_rate": 6.70147954743255e-06, | |
| "loss": 1.9789, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.6109660574412534, | |
| "grad_norm": 41.53910827636719, | |
| "learning_rate": 6.483899042645779e-06, | |
| "loss": 2.3311, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2298, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2103832360255488.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |