diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -2,9 +2,9 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.004178715667294215, + "epoch": 0.001511450347744716, "eval_steps": 500, - "global_step": 940, + "global_step": 340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -2388,4206 +2388,6 @@ "learning_rate": 1.9999998248867153e-05, "loss": 4.2528, "step": 340 - }, - { - "epoch": 0.0015158957899439652, - "grad_norm": 14.4375, - "learning_rate": 1.9999998238520747e-05, - "loss": 3.8207, - "step": 341 - }, - { - "epoch": 0.0015203412321432143, - "grad_norm": 10.3125, - "learning_rate": 1.999999822814387e-05, - "loss": 4.2008, - "step": 342 - }, - { - "epoch": 0.0015247866743424636, - "grad_norm": 11.25, - "learning_rate": 1.999999821773651e-05, - "loss": 3.9915, - "step": 343 - }, - { - "epoch": 0.0015292321165417126, - "grad_norm": 11.5625, - "learning_rate": 1.9999998207298682e-05, - "loss": 4.009, - "step": 344 - }, - { - "epoch": 0.001533677558740962, - "grad_norm": 10.8125, - "learning_rate": 1.9999998196830375e-05, - "loss": 3.9844, - "step": 345 - }, - { - "epoch": 0.001538123000940211, - "grad_norm": 9.5, - "learning_rate": 1.999999818633159e-05, - "loss": 3.9852, - "step": 346 - }, - { - "epoch": 0.0015425684431394603, - "grad_norm": 10.9375, - "learning_rate": 1.9999998175802335e-05, - "loss": 3.9036, - "step": 347 - }, - { - "epoch": 0.0015470138853387093, - "grad_norm": 8.9375, - "learning_rate": 1.9999998165242602e-05, - "loss": 3.9887, - "step": 348 - }, - { - "epoch": 0.0015514593275379586, - "grad_norm": 10.25, - "learning_rate": 1.9999998154652393e-05, - "loss": 4.042, - "step": 349 - }, - { - "epoch": 0.0015559047697372077, - "grad_norm": 11.875, - "learning_rate": 1.999999814403171e-05, - "loss": 3.7839, - "step": 350 - }, - { - "epoch": 0.001560350211936457, - "grad_norm": 13.1875, - "learning_rate": 1.9999998133380553e-05, - "loss": 3.7002, - "step": 351 - }, - { - "epoch": 0.001564795654135706, - "grad_norm": 10.0625, - "learning_rate": 1.999999812269892e-05, - "loss": 3.8629, - "step": 352 - }, - { - "epoch": 0.0015692410963349553, - "grad_norm": 11.4375, - "learning_rate": 1.9999998111986808e-05, - "loss": 3.8494, - "step": 353 - }, - { - "epoch": 0.0015736865385342043, - "grad_norm": 13.125, - "learning_rate": 1.999999810124422e-05, - "loss": 3.6433, - "step": 354 - }, - { - "epoch": 0.0015781319807334534, - "grad_norm": 10.9375, - "learning_rate": 1.9999998090471165e-05, - "loss": 3.8246, - "step": 355 - }, - { - "epoch": 0.0015825774229327027, - "grad_norm": 9.6875, - "learning_rate": 1.999999807966763e-05, - "loss": 3.8999, - "step": 356 - }, - { - "epoch": 0.0015870228651319518, - "grad_norm": 10.0625, - "learning_rate": 1.9999998068833614e-05, - "loss": 4.0345, - "step": 357 - }, - { - "epoch": 0.001591468307331201, - "grad_norm": 12.3125, - "learning_rate": 1.999999805796913e-05, - "loss": 3.7046, - "step": 358 - }, - { - "epoch": 0.00159591374953045, - "grad_norm": 10.4375, - "learning_rate": 1.9999998047074167e-05, - "loss": 3.8184, - "step": 359 - }, - { - "epoch": 0.0016003591917296994, - "grad_norm": 12.375, - "learning_rate": 1.999999803614873e-05, - "loss": 3.7053, - "step": 360 - }, - { - "epoch": 0.0016048046339289484, - "grad_norm": 9.375, - "learning_rate": 1.9999998025192816e-05, - "loss": 3.9704, - "step": 361 - }, - { - "epoch": 0.0016092500761281977, - "grad_norm": 10.5, - "learning_rate": 1.999999801420643e-05, - "loss": 3.8118, - "step": 362 - }, - { - "epoch": 0.0016136955183274468, - "grad_norm": 11.3125, - "learning_rate": 1.9999998003189563e-05, - "loss": 3.8241, - "step": 363 - }, - { - "epoch": 0.001618140960526696, - "grad_norm": 12.0625, - "learning_rate": 1.9999997992142227e-05, - "loss": 3.6775, - "step": 364 - }, - { - "epoch": 0.0016225864027259451, - "grad_norm": 12.9375, - "learning_rate": 1.999999798106441e-05, - "loss": 3.5826, - "step": 365 - }, - { - "epoch": 0.0016270318449251944, - "grad_norm": 13.4375, - "learning_rate": 1.999999796995612e-05, - "loss": 3.5549, - "step": 366 - }, - { - "epoch": 0.0016314772871244435, - "grad_norm": 13.3125, - "learning_rate": 1.9999997958817356e-05, - "loss": 3.5974, - "step": 367 - }, - { - "epoch": 0.0016359227293236927, - "grad_norm": 10.6875, - "learning_rate": 1.9999997947648112e-05, - "loss": 4.016, - "step": 368 - }, - { - "epoch": 0.0016403681715229418, - "grad_norm": 11.0625, - "learning_rate": 1.9999997936448398e-05, - "loss": 3.7551, - "step": 369 - }, - { - "epoch": 0.001644813613722191, - "grad_norm": 10.0, - "learning_rate": 1.9999997925218205e-05, - "loss": 3.9777, - "step": 370 - }, - { - "epoch": 0.0016492590559214401, - "grad_norm": 10.5, - "learning_rate": 1.999999791395754e-05, - "loss": 3.7386, - "step": 371 - }, - { - "epoch": 0.0016537044981206892, - "grad_norm": 11.0, - "learning_rate": 1.9999997902666397e-05, - "loss": 3.7703, - "step": 372 - }, - { - "epoch": 0.0016581499403199385, - "grad_norm": 9.75, - "learning_rate": 1.9999997891344778e-05, - "loss": 3.9092, - "step": 373 - }, - { - "epoch": 0.0016625953825191876, - "grad_norm": 13.8125, - "learning_rate": 1.9999997879992687e-05, - "loss": 3.5821, - "step": 374 - }, - { - "epoch": 0.0016670408247184368, - "grad_norm": 12.8125, - "learning_rate": 1.9999997868610116e-05, - "loss": 3.632, - "step": 375 - }, - { - "epoch": 0.001671486266917686, - "grad_norm": 9.4375, - "learning_rate": 1.999999785719707e-05, - "loss": 3.9452, - "step": 376 - }, - { - "epoch": 0.0016759317091169352, - "grad_norm": 12.0625, - "learning_rate": 1.9999997845753555e-05, - "loss": 3.7551, - "step": 377 - }, - { - "epoch": 0.0016803771513161842, - "grad_norm": 13.6875, - "learning_rate": 1.999999783427956e-05, - "loss": 3.5054, - "step": 378 - }, - { - "epoch": 0.0016848225935154335, - "grad_norm": 9.875, - "learning_rate": 1.999999782277509e-05, - "loss": 3.8821, - "step": 379 - }, - { - "epoch": 0.0016892680357146826, - "grad_norm": 10.5625, - "learning_rate": 1.999999781124014e-05, - "loss": 3.8158, - "step": 380 - }, - { - "epoch": 0.0016937134779139319, - "grad_norm": 11.5, - "learning_rate": 1.999999779967472e-05, - "loss": 3.7465, - "step": 381 - }, - { - "epoch": 0.001698158920113181, - "grad_norm": 7.96875, - "learning_rate": 1.9999997788078824e-05, - "loss": 4.1271, - "step": 382 - }, - { - "epoch": 0.0017026043623124302, - "grad_norm": 12.6875, - "learning_rate": 1.999999777645245e-05, - "loss": 3.4796, - "step": 383 - }, - { - "epoch": 0.0017070498045116793, - "grad_norm": 11.9375, - "learning_rate": 1.9999997764795606e-05, - "loss": 3.6678, - "step": 384 - }, - { - "epoch": 0.0017114952467109285, - "grad_norm": 11.5, - "learning_rate": 1.999999775310828e-05, - "loss": 3.6183, - "step": 385 - }, - { - "epoch": 0.0017159406889101776, - "grad_norm": 10.375, - "learning_rate": 1.9999997741390483e-05, - "loss": 3.8081, - "step": 386 - }, - { - "epoch": 0.0017203861311094269, - "grad_norm": 6.25, - "learning_rate": 1.999999772964221e-05, - "loss": 4.1382, - "step": 387 - }, - { - "epoch": 0.001724831573308676, - "grad_norm": 11.25, - "learning_rate": 1.999999771786346e-05, - "loss": 3.6953, - "step": 388 - }, - { - "epoch": 0.001729277015507925, - "grad_norm": 10.25, - "learning_rate": 1.9999997706054234e-05, - "loss": 3.6919, - "step": 389 - }, - { - "epoch": 0.0017337224577071743, - "grad_norm": 10.25, - "learning_rate": 1.9999997694214536e-05, - "loss": 3.8112, - "step": 390 - }, - { - "epoch": 0.0017381678999064233, - "grad_norm": 8.6875, - "learning_rate": 1.9999997682344362e-05, - "loss": 3.8363, - "step": 391 - }, - { - "epoch": 0.0017426133421056726, - "grad_norm": 12.5, - "learning_rate": 1.9999997670443708e-05, - "loss": 3.5642, - "step": 392 - }, - { - "epoch": 0.0017470587843049217, - "grad_norm": 8.625, - "learning_rate": 1.9999997658512585e-05, - "loss": 3.9516, - "step": 393 - }, - { - "epoch": 0.001751504226504171, - "grad_norm": 10.875, - "learning_rate": 1.9999997646550982e-05, - "loss": 3.7024, - "step": 394 - }, - { - "epoch": 0.00175594966870342, - "grad_norm": 11.8125, - "learning_rate": 1.9999997634558907e-05, - "loss": 3.739, - "step": 395 - }, - { - "epoch": 0.0017603951109026693, - "grad_norm": 8.1875, - "learning_rate": 1.9999997622536355e-05, - "loss": 3.995, - "step": 396 - }, - { - "epoch": 0.0017648405531019184, - "grad_norm": 10.5, - "learning_rate": 1.9999997610483323e-05, - "loss": 3.7714, - "step": 397 - }, - { - "epoch": 0.0017692859953011677, - "grad_norm": 11.3125, - "learning_rate": 1.9999997598399822e-05, - "loss": 3.6416, - "step": 398 - }, - { - "epoch": 0.0017737314375004167, - "grad_norm": 10.375, - "learning_rate": 1.9999997586285845e-05, - "loss": 3.8548, - "step": 399 - }, - { - "epoch": 0.001778176879699666, - "grad_norm": 12.3125, - "learning_rate": 1.9999997574141388e-05, - "loss": 3.5211, - "step": 400 - }, - { - "epoch": 0.001782622321898915, - "grad_norm": 10.8125, - "learning_rate": 1.999999756196646e-05, - "loss": 3.8054, - "step": 401 - }, - { - "epoch": 0.0017870677640981643, - "grad_norm": 9.125, - "learning_rate": 1.9999997549761056e-05, - "loss": 3.9125, - "step": 402 - }, - { - "epoch": 0.0017915132062974134, - "grad_norm": 11.875, - "learning_rate": 1.9999997537525174e-05, - "loss": 3.5618, - "step": 403 - }, - { - "epoch": 0.0017959586484966627, - "grad_norm": 11.25, - "learning_rate": 1.999999752525882e-05, - "loss": 3.6129, - "step": 404 - }, - { - "epoch": 0.0018004040906959117, - "grad_norm": 8.4375, - "learning_rate": 1.9999997512961987e-05, - "loss": 3.9453, - "step": 405 - }, - { - "epoch": 0.001804849532895161, - "grad_norm": 11.4375, - "learning_rate": 1.999999750063468e-05, - "loss": 3.6281, - "step": 406 - }, - { - "epoch": 0.00180929497509441, - "grad_norm": 10.125, - "learning_rate": 1.99999974882769e-05, - "loss": 3.8177, - "step": 407 - }, - { - "epoch": 0.0018137404172936591, - "grad_norm": 10.5, - "learning_rate": 1.9999997475888643e-05, - "loss": 3.7559, - "step": 408 - }, - { - "epoch": 0.0018181858594929084, - "grad_norm": 12.5625, - "learning_rate": 1.999999746346991e-05, - "loss": 3.483, - "step": 409 - }, - { - "epoch": 0.0018226313016921575, - "grad_norm": 10.9375, - "learning_rate": 1.9999997451020697e-05, - "loss": 3.717, - "step": 410 - }, - { - "epoch": 0.0018270767438914068, - "grad_norm": 8.75, - "learning_rate": 1.9999997438541015e-05, - "loss": 3.8596, - "step": 411 - }, - { - "epoch": 0.0018315221860906558, - "grad_norm": 12.5, - "learning_rate": 1.9999997426030857e-05, - "loss": 3.556, - "step": 412 - }, - { - "epoch": 0.0018359676282899051, - "grad_norm": 7.9375, - "learning_rate": 1.9999997413490223e-05, - "loss": 3.9077, - "step": 413 - }, - { - "epoch": 0.0018404130704891542, - "grad_norm": 8.5, - "learning_rate": 1.9999997400919112e-05, - "loss": 3.8719, - "step": 414 - }, - { - "epoch": 0.0018448585126884035, - "grad_norm": 12.1875, - "learning_rate": 1.9999997388317528e-05, - "loss": 3.5653, - "step": 415 - }, - { - "epoch": 0.0018493039548876525, - "grad_norm": 10.875, - "learning_rate": 1.999999737568547e-05, - "loss": 3.6182, - "step": 416 - }, - { - "epoch": 0.0018537493970869018, - "grad_norm": 9.25, - "learning_rate": 1.999999736302293e-05, - "loss": 3.8269, - "step": 417 - }, - { - "epoch": 0.0018581948392861509, - "grad_norm": 11.5625, - "learning_rate": 1.9999997350329917e-05, - "loss": 3.5552, - "step": 418 - }, - { - "epoch": 0.0018626402814854001, - "grad_norm": 9.6875, - "learning_rate": 1.9999997337606435e-05, - "loss": 3.7447, - "step": 419 - }, - { - "epoch": 0.0018670857236846492, - "grad_norm": 9.1875, - "learning_rate": 1.999999732485247e-05, - "loss": 3.7601, - "step": 420 - }, - { - "epoch": 0.0018715311658838985, - "grad_norm": 14.25, - "learning_rate": 1.9999997312068032e-05, - "loss": 3.3464, - "step": 421 - }, - { - "epoch": 0.0018759766080831475, - "grad_norm": 11.9375, - "learning_rate": 1.999999729925312e-05, - "loss": 3.6996, - "step": 422 - }, - { - "epoch": 0.0018804220502823968, - "grad_norm": 10.9375, - "learning_rate": 1.999999728640773e-05, - "loss": 3.584, - "step": 423 - }, - { - "epoch": 0.0018848674924816459, - "grad_norm": 10.875, - "learning_rate": 1.9999997273531865e-05, - "loss": 3.516, - "step": 424 - }, - { - "epoch": 0.001889312934680895, - "grad_norm": 10.25, - "learning_rate": 1.999999726062553e-05, - "loss": 3.6636, - "step": 425 - }, - { - "epoch": 0.0018937583768801442, - "grad_norm": 11.25, - "learning_rate": 1.999999724768871e-05, - "loss": 3.5651, - "step": 426 - }, - { - "epoch": 0.0018982038190793933, - "grad_norm": 13.125, - "learning_rate": 1.9999997234721422e-05, - "loss": 3.499, - "step": 427 - }, - { - "epoch": 0.0019026492612786426, - "grad_norm": 10.625, - "learning_rate": 1.9999997221723654e-05, - "loss": 3.6427, - "step": 428 - }, - { - "epoch": 0.0019070947034778916, - "grad_norm": 12.125, - "learning_rate": 1.9999997208695413e-05, - "loss": 3.4008, - "step": 429 - }, - { - "epoch": 0.001911540145677141, - "grad_norm": 6.15625, - "learning_rate": 1.99999971956367e-05, - "loss": 4.3989, - "step": 430 - }, - { - "epoch": 0.00191598558787639, - "grad_norm": 12.375, - "learning_rate": 1.999999718254751e-05, - "loss": 3.4254, - "step": 431 - }, - { - "epoch": 0.0019204310300756392, - "grad_norm": 10.4375, - "learning_rate": 1.999999716942784e-05, - "loss": 3.7712, - "step": 432 - }, - { - "epoch": 0.0019248764722748883, - "grad_norm": 10.375, - "learning_rate": 1.9999997156277694e-05, - "loss": 3.6701, - "step": 433 - }, - { - "epoch": 0.0019293219144741376, - "grad_norm": 11.9375, - "learning_rate": 1.999999714309708e-05, - "loss": 3.5038, - "step": 434 - }, - { - "epoch": 0.0019337673566733867, - "grad_norm": 12.5625, - "learning_rate": 1.9999997129885984e-05, - "loss": 3.4095, - "step": 435 - }, - { - "epoch": 0.001938212798872636, - "grad_norm": 11.3125, - "learning_rate": 1.9999997116644416e-05, - "loss": 3.4975, - "step": 436 - }, - { - "epoch": 0.001942658241071885, - "grad_norm": 10.0625, - "learning_rate": 1.9999997103372372e-05, - "loss": 3.6372, - "step": 437 - }, - { - "epoch": 0.0019471036832711343, - "grad_norm": 9.875, - "learning_rate": 1.999999709006985e-05, - "loss": 3.6131, - "step": 438 - }, - { - "epoch": 0.0019515491254703833, - "grad_norm": 10.1875, - "learning_rate": 1.9999997076736855e-05, - "loss": 3.6679, - "step": 439 - }, - { - "epoch": 0.0019559945676696326, - "grad_norm": 12.0625, - "learning_rate": 1.9999997063373386e-05, - "loss": 3.5668, - "step": 440 - }, - { - "epoch": 0.0019604400098688817, - "grad_norm": 10.1875, - "learning_rate": 1.999999704997944e-05, - "loss": 3.6034, - "step": 441 - }, - { - "epoch": 0.0019648854520681307, - "grad_norm": 8.8125, - "learning_rate": 1.9999997036555018e-05, - "loss": 3.8659, - "step": 442 - }, - { - "epoch": 0.00196933089426738, - "grad_norm": 8.0, - "learning_rate": 1.9999997023100123e-05, - "loss": 3.882, - "step": 443 - }, - { - "epoch": 0.0019737763364666293, - "grad_norm": 9.125, - "learning_rate": 1.9999997009614752e-05, - "loss": 3.7203, - "step": 444 - }, - { - "epoch": 0.0019782217786658784, - "grad_norm": 11.4375, - "learning_rate": 1.9999996996098902e-05, - "loss": 3.4729, - "step": 445 - }, - { - "epoch": 0.0019826672208651274, - "grad_norm": 11.5, - "learning_rate": 1.9999996982552578e-05, - "loss": 3.5027, - "step": 446 - }, - { - "epoch": 0.0019871126630643765, - "grad_norm": 6.53125, - "learning_rate": 1.999999696897578e-05, - "loss": 3.9166, - "step": 447 - }, - { - "epoch": 0.001991558105263626, - "grad_norm": 9.25, - "learning_rate": 1.999999695536851e-05, - "loss": 3.7694, - "step": 448 - }, - { - "epoch": 0.001996003547462875, - "grad_norm": 11.3125, - "learning_rate": 1.9999996941730757e-05, - "loss": 3.5172, - "step": 449 - }, - { - "epoch": 0.002000448989662124, - "grad_norm": 9.6875, - "learning_rate": 1.9999996928062532e-05, - "loss": 3.7209, - "step": 450 - }, - { - "epoch": 0.002004894431861373, - "grad_norm": 11.0, - "learning_rate": 1.9999996914363834e-05, - "loss": 3.4922, - "step": 451 - }, - { - "epoch": 0.0020093398740606227, - "grad_norm": 12.3125, - "learning_rate": 1.9999996900634657e-05, - "loss": 3.3862, - "step": 452 - }, - { - "epoch": 0.0020137853162598717, - "grad_norm": 11.5, - "learning_rate": 1.9999996886875006e-05, - "loss": 3.5077, - "step": 453 - }, - { - "epoch": 0.002018230758459121, - "grad_norm": 11.25, - "learning_rate": 1.999999687308488e-05, - "loss": 3.4912, - "step": 454 - }, - { - "epoch": 0.00202267620065837, - "grad_norm": 8.125, - "learning_rate": 1.9999996859264276e-05, - "loss": 3.8365, - "step": 455 - }, - { - "epoch": 0.0020271216428576194, - "grad_norm": 11.875, - "learning_rate": 1.99999968454132e-05, - "loss": 3.4271, - "step": 456 - }, - { - "epoch": 0.0020315670850568684, - "grad_norm": 10.25, - "learning_rate": 1.999999683153165e-05, - "loss": 3.4894, - "step": 457 - }, - { - "epoch": 0.0020360125272561175, - "grad_norm": 12.0625, - "learning_rate": 1.999999681761962e-05, - "loss": 3.4184, - "step": 458 - }, - { - "epoch": 0.0020404579694553665, - "grad_norm": 8.6875, - "learning_rate": 1.9999996803677116e-05, - "loss": 3.7042, - "step": 459 - }, - { - "epoch": 0.0020449034116546156, - "grad_norm": 10.375, - "learning_rate": 1.999999678970414e-05, - "loss": 3.6001, - "step": 460 - }, - { - "epoch": 0.002049348853853865, - "grad_norm": 10.4375, - "learning_rate": 1.9999996775700685e-05, - "loss": 3.6513, - "step": 461 - }, - { - "epoch": 0.002053794296053114, - "grad_norm": 9.125, - "learning_rate": 1.9999996761666755e-05, - "loss": 3.7353, - "step": 462 - }, - { - "epoch": 0.0020582397382523632, - "grad_norm": 11.4375, - "learning_rate": 1.999999674760235e-05, - "loss": 3.4612, - "step": 463 - }, - { - "epoch": 0.0020626851804516123, - "grad_norm": 10.6875, - "learning_rate": 1.9999996733507467e-05, - "loss": 3.5767, - "step": 464 - }, - { - "epoch": 0.0020671306226508618, - "grad_norm": 12.75, - "learning_rate": 1.9999996719382115e-05, - "loss": 3.2946, - "step": 465 - }, - { - "epoch": 0.002071576064850111, - "grad_norm": 10.875, - "learning_rate": 1.999999670522628e-05, - "loss": 3.5141, - "step": 466 - }, - { - "epoch": 0.00207602150704936, - "grad_norm": 10.8125, - "learning_rate": 1.9999996691039976e-05, - "loss": 3.5843, - "step": 467 - }, - { - "epoch": 0.002080466949248609, - "grad_norm": 8.75, - "learning_rate": 1.999999667682319e-05, - "loss": 3.7526, - "step": 468 - }, - { - "epoch": 0.0020849123914478585, - "grad_norm": 11.0625, - "learning_rate": 1.9999996662575935e-05, - "loss": 3.509, - "step": 469 - }, - { - "epoch": 0.0020893578336471075, - "grad_norm": 12.1875, - "learning_rate": 1.9999996648298202e-05, - "loss": 3.4055, - "step": 470 - }, - { - "epoch": 0.0020938032758463566, - "grad_norm": 8.5625, - "learning_rate": 1.9999996633989993e-05, - "loss": 3.8298, - "step": 471 - }, - { - "epoch": 0.0020982487180456057, - "grad_norm": 10.25, - "learning_rate": 1.999999661965131e-05, - "loss": 3.5558, - "step": 472 - }, - { - "epoch": 0.002102694160244855, - "grad_norm": 11.25, - "learning_rate": 1.9999996605282152e-05, - "loss": 3.405, - "step": 473 - }, - { - "epoch": 0.002107139602444104, - "grad_norm": 9.3125, - "learning_rate": 1.9999996590882514e-05, - "loss": 3.7025, - "step": 474 - }, - { - "epoch": 0.0021115850446433533, - "grad_norm": 12.3125, - "learning_rate": 1.9999996576452406e-05, - "loss": 3.3288, - "step": 475 - }, - { - "epoch": 0.0021160304868426023, - "grad_norm": 9.6875, - "learning_rate": 1.9999996561991823e-05, - "loss": 3.4993, - "step": 476 - }, - { - "epoch": 0.0021204759290418514, - "grad_norm": 11.9375, - "learning_rate": 1.999999654750076e-05, - "loss": 3.4184, - "step": 477 - }, - { - "epoch": 0.002124921371241101, - "grad_norm": 10.9375, - "learning_rate": 1.9999996532979223e-05, - "loss": 3.4869, - "step": 478 - }, - { - "epoch": 0.00212936681344035, - "grad_norm": 9.8125, - "learning_rate": 1.999999651842721e-05, - "loss": 3.6483, - "step": 479 - }, - { - "epoch": 0.002133812255639599, - "grad_norm": 7.96875, - "learning_rate": 1.9999996503844725e-05, - "loss": 3.8066, - "step": 480 - }, - { - "epoch": 0.002138257697838848, - "grad_norm": 8.875, - "learning_rate": 1.9999996489231764e-05, - "loss": 3.6627, - "step": 481 - }, - { - "epoch": 0.0021427031400380976, - "grad_norm": 11.5, - "learning_rate": 1.9999996474588326e-05, - "loss": 3.463, - "step": 482 - }, - { - "epoch": 0.0021471485822373466, - "grad_norm": 10.9375, - "learning_rate": 1.9999996459914412e-05, - "loss": 3.6264, - "step": 483 - }, - { - "epoch": 0.0021515940244365957, - "grad_norm": 11.0625, - "learning_rate": 1.9999996445210025e-05, - "loss": 3.51, - "step": 484 - }, - { - "epoch": 0.0021560394666358448, - "grad_norm": 10.0, - "learning_rate": 1.9999996430475158e-05, - "loss": 3.5382, - "step": 485 - }, - { - "epoch": 0.0021604849088350943, - "grad_norm": 11.1875, - "learning_rate": 1.999999641570982e-05, - "loss": 3.551, - "step": 486 - }, - { - "epoch": 0.0021649303510343433, - "grad_norm": 8.125, - "learning_rate": 1.9999996400914003e-05, - "loss": 3.8123, - "step": 487 - }, - { - "epoch": 0.0021693757932335924, - "grad_norm": 8.75, - "learning_rate": 1.9999996386087715e-05, - "loss": 3.7286, - "step": 488 - }, - { - "epoch": 0.0021738212354328415, - "grad_norm": 11.9375, - "learning_rate": 1.9999996371230947e-05, - "loss": 3.3771, - "step": 489 - }, - { - "epoch": 0.002178266677632091, - "grad_norm": 11.4375, - "learning_rate": 1.9999996356343705e-05, - "loss": 3.4106, - "step": 490 - }, - { - "epoch": 0.00218271211983134, - "grad_norm": 11.5, - "learning_rate": 1.999999634142599e-05, - "loss": 3.402, - "step": 491 - }, - { - "epoch": 0.002187157562030589, - "grad_norm": 10.125, - "learning_rate": 1.9999996326477798e-05, - "loss": 3.4834, - "step": 492 - }, - { - "epoch": 0.002191603004229838, - "grad_norm": 11.4375, - "learning_rate": 1.9999996311499132e-05, - "loss": 3.4038, - "step": 493 - }, - { - "epoch": 0.0021960484464290876, - "grad_norm": 11.625, - "learning_rate": 1.999999629648999e-05, - "loss": 3.2577, - "step": 494 - }, - { - "epoch": 0.0022004938886283367, - "grad_norm": 11.0, - "learning_rate": 1.999999628145037e-05, - "loss": 3.3284, - "step": 495 - }, - { - "epoch": 0.0022049393308275858, - "grad_norm": 12.125, - "learning_rate": 1.999999626638028e-05, - "loss": 3.3102, - "step": 496 - }, - { - "epoch": 0.002209384773026835, - "grad_norm": 9.3125, - "learning_rate": 1.9999996251279707e-05, - "loss": 3.6485, - "step": 497 - }, - { - "epoch": 0.002213830215226084, - "grad_norm": 11.0625, - "learning_rate": 1.9999996236148663e-05, - "loss": 3.414, - "step": 498 - }, - { - "epoch": 0.0022182756574253334, - "grad_norm": 10.75, - "learning_rate": 1.9999996220987143e-05, - "loss": 3.5082, - "step": 499 - }, - { - "epoch": 0.0022227210996245824, - "grad_norm": 10.875, - "learning_rate": 1.999999620579515e-05, - "loss": 3.3798, - "step": 500 - }, - { - "epoch": 0.0022271665418238315, - "grad_norm": 9.75, - "learning_rate": 1.9999996190572676e-05, - "loss": 3.5725, - "step": 501 - }, - { - "epoch": 0.0022316119840230806, - "grad_norm": 10.75, - "learning_rate": 1.999999617531973e-05, - "loss": 3.4291, - "step": 502 - }, - { - "epoch": 0.00223605742622233, - "grad_norm": 8.3125, - "learning_rate": 1.9999996160036312e-05, - "loss": 3.7093, - "step": 503 - }, - { - "epoch": 0.002240502868421579, - "grad_norm": 9.5625, - "learning_rate": 1.9999996144722417e-05, - "loss": 3.6666, - "step": 504 - }, - { - "epoch": 0.002244948310620828, - "grad_norm": 11.25, - "learning_rate": 1.9999996129378042e-05, - "loss": 3.3824, - "step": 505 - }, - { - "epoch": 0.0022493937528200772, - "grad_norm": 10.6875, - "learning_rate": 1.999999611400319e-05, - "loss": 3.462, - "step": 506 - }, - { - "epoch": 0.0022538391950193267, - "grad_norm": 10.8125, - "learning_rate": 1.999999609859787e-05, - "loss": 3.4495, - "step": 507 - }, - { - "epoch": 0.002258284637218576, - "grad_norm": 9.875, - "learning_rate": 1.999999608316207e-05, - "loss": 3.5392, - "step": 508 - }, - { - "epoch": 0.002262730079417825, - "grad_norm": 10.75, - "learning_rate": 1.99999960676958e-05, - "loss": 3.4609, - "step": 509 - }, - { - "epoch": 0.002267175521617074, - "grad_norm": 10.625, - "learning_rate": 1.999999605219905e-05, - "loss": 3.4103, - "step": 510 - }, - { - "epoch": 0.0022716209638163234, - "grad_norm": 9.1875, - "learning_rate": 1.9999996036671825e-05, - "loss": 3.7679, - "step": 511 - }, - { - "epoch": 0.0022760664060155725, - "grad_norm": 12.6875, - "learning_rate": 1.9999996021114123e-05, - "loss": 3.3041, - "step": 512 - }, - { - "epoch": 0.0022805118482148216, - "grad_norm": 11.5, - "learning_rate": 1.999999600552595e-05, - "loss": 3.3168, - "step": 513 - }, - { - "epoch": 0.0022849572904140706, - "grad_norm": 12.375, - "learning_rate": 1.9999995989907298e-05, - "loss": 3.2309, - "step": 514 - }, - { - "epoch": 0.0022894027326133197, - "grad_norm": 10.5625, - "learning_rate": 1.999999597425817e-05, - "loss": 3.5079, - "step": 515 - }, - { - "epoch": 0.002293848174812569, - "grad_norm": 12.375, - "learning_rate": 1.9999995958578568e-05, - "loss": 3.2642, - "step": 516 - }, - { - "epoch": 0.0022982936170118182, - "grad_norm": 13.875, - "learning_rate": 1.9999995942868492e-05, - "loss": 3.0731, - "step": 517 - }, - { - "epoch": 0.0023027390592110673, - "grad_norm": 9.75, - "learning_rate": 1.999999592712794e-05, - "loss": 3.4679, - "step": 518 - }, - { - "epoch": 0.0023071845014103164, - "grad_norm": 9.4375, - "learning_rate": 1.9999995911356912e-05, - "loss": 3.5807, - "step": 519 - }, - { - "epoch": 0.002311629943609566, - "grad_norm": 9.4375, - "learning_rate": 1.9999995895555407e-05, - "loss": 3.5454, - "step": 520 - }, - { - "epoch": 0.002316075385808815, - "grad_norm": 12.625, - "learning_rate": 1.999999587972343e-05, - "loss": 3.2515, - "step": 521 - }, - { - "epoch": 0.002320520828008064, - "grad_norm": 11.625, - "learning_rate": 1.9999995863860976e-05, - "loss": 3.3921, - "step": 522 - }, - { - "epoch": 0.002324966270207313, - "grad_norm": 10.75, - "learning_rate": 1.999999584796805e-05, - "loss": 3.4137, - "step": 523 - }, - { - "epoch": 0.0023294117124065625, - "grad_norm": 10.375, - "learning_rate": 1.999999583204464e-05, - "loss": 3.443, - "step": 524 - }, - { - "epoch": 0.0023338571546058116, - "grad_norm": 9.5625, - "learning_rate": 1.999999581609076e-05, - "loss": 3.5869, - "step": 525 - }, - { - "epoch": 0.0023383025968050607, - "grad_norm": 11.5625, - "learning_rate": 1.9999995800106405e-05, - "loss": 3.3545, - "step": 526 - }, - { - "epoch": 0.0023427480390043097, - "grad_norm": 8.1875, - "learning_rate": 1.9999995784091573e-05, - "loss": 3.687, - "step": 527 - }, - { - "epoch": 0.0023471934812035592, - "grad_norm": 10.5625, - "learning_rate": 1.999999576804627e-05, - "loss": 3.3707, - "step": 528 - }, - { - "epoch": 0.0023516389234028083, - "grad_norm": 11.1875, - "learning_rate": 1.9999995751970485e-05, - "loss": 3.2675, - "step": 529 - }, - { - "epoch": 0.0023560843656020574, - "grad_norm": 10.0, - "learning_rate": 1.9999995735864228e-05, - "loss": 3.4382, - "step": 530 - }, - { - "epoch": 0.0023605298078013064, - "grad_norm": 9.5, - "learning_rate": 1.9999995719727495e-05, - "loss": 3.5451, - "step": 531 - }, - { - "epoch": 0.0023649752500005555, - "grad_norm": 12.625, - "learning_rate": 1.9999995703560285e-05, - "loss": 3.1354, - "step": 532 - }, - { - "epoch": 0.002369420692199805, - "grad_norm": 14.5, - "learning_rate": 1.9999995687362603e-05, - "loss": 3.0024, - "step": 533 - }, - { - "epoch": 0.002373866134399054, - "grad_norm": 11.875, - "learning_rate": 1.999999567113444e-05, - "loss": 3.2358, - "step": 534 - }, - { - "epoch": 0.002378311576598303, - "grad_norm": 11.9375, - "learning_rate": 1.9999995654875806e-05, - "loss": 3.1775, - "step": 535 - }, - { - "epoch": 0.002382757018797552, - "grad_norm": 9.75, - "learning_rate": 1.99999956385867e-05, - "loss": 3.5538, - "step": 536 - }, - { - "epoch": 0.0023872024609968017, - "grad_norm": 11.5, - "learning_rate": 1.9999995622267115e-05, - "loss": 3.2296, - "step": 537 - }, - { - "epoch": 0.0023916479031960507, - "grad_norm": 10.875, - "learning_rate": 1.9999995605917055e-05, - "loss": 3.3412, - "step": 538 - }, - { - "epoch": 0.0023960933453952998, - "grad_norm": 7.96875, - "learning_rate": 1.9999995589536518e-05, - "loss": 3.8301, - "step": 539 - }, - { - "epoch": 0.002400538787594549, - "grad_norm": 9.0, - "learning_rate": 1.9999995573125506e-05, - "loss": 3.6256, - "step": 540 - }, - { - "epoch": 0.0024049842297937983, - "grad_norm": 10.0, - "learning_rate": 1.999999555668402e-05, - "loss": 3.5335, - "step": 541 - }, - { - "epoch": 0.0024094296719930474, - "grad_norm": 8.5625, - "learning_rate": 1.9999995540212055e-05, - "loss": 3.5369, - "step": 542 - }, - { - "epoch": 0.0024138751141922965, - "grad_norm": 12.0625, - "learning_rate": 1.999999552370962e-05, - "loss": 3.2069, - "step": 543 - }, - { - "epoch": 0.0024183205563915455, - "grad_norm": 7.0, - "learning_rate": 1.9999995507176706e-05, - "loss": 3.9889, - "step": 544 - }, - { - "epoch": 0.002422765998590795, - "grad_norm": 9.375, - "learning_rate": 1.9999995490613316e-05, - "loss": 3.4777, - "step": 545 - }, - { - "epoch": 0.002427211440790044, - "grad_norm": 9.625, - "learning_rate": 1.9999995474019452e-05, - "loss": 3.4964, - "step": 546 - }, - { - "epoch": 0.002431656882989293, - "grad_norm": 8.75, - "learning_rate": 1.9999995457395113e-05, - "loss": 3.6935, - "step": 547 - }, - { - "epoch": 0.002436102325188542, - "grad_norm": 9.5, - "learning_rate": 1.9999995440740297e-05, - "loss": 3.422, - "step": 548 - }, - { - "epoch": 0.0024405477673877913, - "grad_norm": 10.0, - "learning_rate": 1.9999995424055008e-05, - "loss": 3.4478, - "step": 549 - }, - { - "epoch": 0.0024449932095870408, - "grad_norm": 10.4375, - "learning_rate": 1.9999995407339243e-05, - "loss": 3.3747, - "step": 550 - }, - { - "epoch": 0.00244943865178629, - "grad_norm": 11.25, - "learning_rate": 1.9999995390593002e-05, - "loss": 3.2678, - "step": 551 - }, - { - "epoch": 0.002453884093985539, - "grad_norm": 13.625, - "learning_rate": 1.9999995373816285e-05, - "loss": 3.0307, - "step": 552 - }, - { - "epoch": 0.002458329536184788, - "grad_norm": 9.9375, - "learning_rate": 1.9999995357009095e-05, - "loss": 3.4527, - "step": 553 - }, - { - "epoch": 0.0024627749783840375, - "grad_norm": 10.375, - "learning_rate": 1.9999995340171425e-05, - "loss": 3.3464, - "step": 554 - }, - { - "epoch": 0.0024672204205832865, - "grad_norm": 10.8125, - "learning_rate": 1.9999995323303282e-05, - "loss": 3.3944, - "step": 555 - }, - { - "epoch": 0.0024716658627825356, - "grad_norm": 10.8125, - "learning_rate": 1.9999995306404666e-05, - "loss": 3.3686, - "step": 556 - }, - { - "epoch": 0.0024761113049817846, - "grad_norm": 7.96875, - "learning_rate": 1.9999995289475574e-05, - "loss": 3.6202, - "step": 557 - }, - { - "epoch": 0.002480556747181034, - "grad_norm": 12.5, - "learning_rate": 1.9999995272516003e-05, - "loss": 3.152, - "step": 558 - }, - { - "epoch": 0.002485002189380283, - "grad_norm": 10.6875, - "learning_rate": 1.999999525552596e-05, - "loss": 3.3756, - "step": 559 - }, - { - "epoch": 0.0024894476315795323, - "grad_norm": 10.125, - "learning_rate": 1.9999995238505438e-05, - "loss": 3.3489, - "step": 560 - }, - { - "epoch": 0.0024938930737787813, - "grad_norm": 12.0625, - "learning_rate": 1.9999995221454444e-05, - "loss": 3.1829, - "step": 561 - }, - { - "epoch": 0.002498338515978031, - "grad_norm": 7.96875, - "learning_rate": 1.999999520437297e-05, - "loss": 3.4863, - "step": 562 - }, - { - "epoch": 0.00250278395817728, - "grad_norm": 11.9375, - "learning_rate": 1.999999518726103e-05, - "loss": 3.1497, - "step": 563 - }, - { - "epoch": 0.002507229400376529, - "grad_norm": 10.75, - "learning_rate": 1.9999995170118607e-05, - "loss": 3.1597, - "step": 564 - }, - { - "epoch": 0.002511674842575778, - "grad_norm": 11.375, - "learning_rate": 1.999999515294571e-05, - "loss": 3.2349, - "step": 565 - }, - { - "epoch": 0.002516120284775027, - "grad_norm": 8.9375, - "learning_rate": 1.9999995135742337e-05, - "loss": 3.5368, - "step": 566 - }, - { - "epoch": 0.0025205657269742766, - "grad_norm": 10.0625, - "learning_rate": 1.999999511850849e-05, - "loss": 3.3565, - "step": 567 - }, - { - "epoch": 0.0025250111691735256, - "grad_norm": 10.3125, - "learning_rate": 1.999999510124417e-05, - "loss": 3.3276, - "step": 568 - }, - { - "epoch": 0.0025294566113727747, - "grad_norm": 11.25, - "learning_rate": 1.999999508394937e-05, - "loss": 3.3477, - "step": 569 - }, - { - "epoch": 0.0025339020535720238, - "grad_norm": 10.75, - "learning_rate": 1.9999995066624096e-05, - "loss": 3.27, - "step": 570 - }, - { - "epoch": 0.0025383474957712733, - "grad_norm": 9.0625, - "learning_rate": 1.9999995049268347e-05, - "loss": 3.4303, - "step": 571 - }, - { - "epoch": 0.0025427929379705223, - "grad_norm": 13.5, - "learning_rate": 1.999999503188212e-05, - "loss": 2.9878, - "step": 572 - }, - { - "epoch": 0.0025472383801697714, - "grad_norm": 11.0625, - "learning_rate": 1.999999501446542e-05, - "loss": 3.1965, - "step": 573 - }, - { - "epoch": 0.0025516838223690204, - "grad_norm": 11.8125, - "learning_rate": 1.999999499701825e-05, - "loss": 3.0944, - "step": 574 - }, - { - "epoch": 0.00255612926456827, - "grad_norm": 9.0, - "learning_rate": 1.9999994979540595e-05, - "loss": 3.5211, - "step": 575 - }, - { - "epoch": 0.002560574706767519, - "grad_norm": 11.625, - "learning_rate": 1.999999496203247e-05, - "loss": 3.1712, - "step": 576 - }, - { - "epoch": 0.002565020148966768, - "grad_norm": 10.625, - "learning_rate": 1.9999994944493868e-05, - "loss": 3.277, - "step": 577 - }, - { - "epoch": 0.002569465591166017, - "grad_norm": 9.6875, - "learning_rate": 1.9999994926924792e-05, - "loss": 3.4259, - "step": 578 - }, - { - "epoch": 0.0025739110333652666, - "grad_norm": 11.75, - "learning_rate": 1.999999490932524e-05, - "loss": 3.2173, - "step": 579 - }, - { - "epoch": 0.0025783564755645157, - "grad_norm": 9.5, - "learning_rate": 1.999999489169521e-05, - "loss": 3.436, - "step": 580 - }, - { - "epoch": 0.0025828019177637647, - "grad_norm": 11.75, - "learning_rate": 1.999999487403471e-05, - "loss": 3.1885, - "step": 581 - }, - { - "epoch": 0.002587247359963014, - "grad_norm": 9.5, - "learning_rate": 1.999999485634373e-05, - "loss": 3.4471, - "step": 582 - }, - { - "epoch": 0.002591692802162263, - "grad_norm": 10.4375, - "learning_rate": 1.9999994838622278e-05, - "loss": 3.3545, - "step": 583 - }, - { - "epoch": 0.0025961382443615124, - "grad_norm": 8.625, - "learning_rate": 1.9999994820870348e-05, - "loss": 3.4657, - "step": 584 - }, - { - "epoch": 0.0026005836865607614, - "grad_norm": 10.3125, - "learning_rate": 1.999999480308794e-05, - "loss": 3.3305, - "step": 585 - }, - { - "epoch": 0.0026050291287600105, - "grad_norm": 10.625, - "learning_rate": 1.9999994785275062e-05, - "loss": 3.2989, - "step": 586 - }, - { - "epoch": 0.0026094745709592596, - "grad_norm": 11.25, - "learning_rate": 1.9999994767431707e-05, - "loss": 3.203, - "step": 587 - }, - { - "epoch": 0.002613920013158509, - "grad_norm": 9.0, - "learning_rate": 1.9999994749557875e-05, - "loss": 3.4859, - "step": 588 - }, - { - "epoch": 0.002618365455357758, - "grad_norm": 9.1875, - "learning_rate": 1.9999994731653567e-05, - "loss": 3.5069, - "step": 589 - }, - { - "epoch": 0.002622810897557007, - "grad_norm": 10.0625, - "learning_rate": 1.9999994713718786e-05, - "loss": 3.3317, - "step": 590 - }, - { - "epoch": 0.0026272563397562562, - "grad_norm": 8.25, - "learning_rate": 1.999999469575353e-05, - "loss": 3.634, - "step": 591 - }, - { - "epoch": 0.0026317017819555057, - "grad_norm": 10.25, - "learning_rate": 1.9999994677757795e-05, - "loss": 3.3247, - "step": 592 - }, - { - "epoch": 0.002636147224154755, - "grad_norm": 10.75, - "learning_rate": 1.9999994659731586e-05, - "loss": 3.2889, - "step": 593 - }, - { - "epoch": 0.002640592666354004, - "grad_norm": 10.5625, - "learning_rate": 1.9999994641674903e-05, - "loss": 3.3302, - "step": 594 - }, - { - "epoch": 0.002645038108553253, - "grad_norm": 11.0, - "learning_rate": 1.9999994623587745e-05, - "loss": 3.26, - "step": 595 - }, - { - "epoch": 0.0026494835507525024, - "grad_norm": 10.5, - "learning_rate": 1.9999994605470113e-05, - "loss": 3.285, - "step": 596 - }, - { - "epoch": 0.0026539289929517515, - "grad_norm": 8.375, - "learning_rate": 1.9999994587322002e-05, - "loss": 3.432, - "step": 597 - }, - { - "epoch": 0.0026583744351510005, - "grad_norm": 10.0625, - "learning_rate": 1.9999994569143418e-05, - "loss": 3.371, - "step": 598 - }, - { - "epoch": 0.0026628198773502496, - "grad_norm": 11.4375, - "learning_rate": 1.9999994550934358e-05, - "loss": 3.091, - "step": 599 - }, - { - "epoch": 0.002667265319549499, - "grad_norm": 11.0, - "learning_rate": 1.999999453269482e-05, - "loss": 3.2493, - "step": 600 - }, - { - "epoch": 0.002671710761748748, - "grad_norm": 11.625, - "learning_rate": 1.999999451442481e-05, - "loss": 3.0888, - "step": 601 - }, - { - "epoch": 0.0026761562039479972, - "grad_norm": 11.5, - "learning_rate": 1.9999994496124323e-05, - "loss": 3.2067, - "step": 602 - }, - { - "epoch": 0.0026806016461472463, - "grad_norm": 11.0625, - "learning_rate": 1.999999447779336e-05, - "loss": 3.2311, - "step": 603 - }, - { - "epoch": 0.0026850470883464954, - "grad_norm": 10.9375, - "learning_rate": 1.9999994459431923e-05, - "loss": 3.2996, - "step": 604 - }, - { - "epoch": 0.002689492530545745, - "grad_norm": 9.0, - "learning_rate": 1.9999994441040012e-05, - "loss": 3.4908, - "step": 605 - }, - { - "epoch": 0.002693937972744994, - "grad_norm": 11.375, - "learning_rate": 1.9999994422617624e-05, - "loss": 3.1737, - "step": 606 - }, - { - "epoch": 0.002698383414944243, - "grad_norm": 11.5625, - "learning_rate": 1.9999994404164758e-05, - "loss": 3.2563, - "step": 607 - }, - { - "epoch": 0.002702828857143492, - "grad_norm": 8.75, - "learning_rate": 1.9999994385681418e-05, - "loss": 3.4574, - "step": 608 - }, - { - "epoch": 0.0027072742993427415, - "grad_norm": 11.125, - "learning_rate": 1.9999994367167605e-05, - "loss": 3.2396, - "step": 609 - }, - { - "epoch": 0.0027117197415419906, - "grad_norm": 10.3125, - "learning_rate": 1.9999994348623316e-05, - "loss": 3.2326, - "step": 610 - }, - { - "epoch": 0.0027161651837412397, - "grad_norm": 8.6875, - "learning_rate": 1.999999433004855e-05, - "loss": 3.5007, - "step": 611 - }, - { - "epoch": 0.0027206106259404887, - "grad_norm": 10.75, - "learning_rate": 1.999999431144331e-05, - "loss": 3.1768, - "step": 612 - }, - { - "epoch": 0.002725056068139738, - "grad_norm": 10.0625, - "learning_rate": 1.9999994292807593e-05, - "loss": 3.3782, - "step": 613 - }, - { - "epoch": 0.0027295015103389873, - "grad_norm": 10.375, - "learning_rate": 1.9999994274141402e-05, - "loss": 3.2574, - "step": 614 - }, - { - "epoch": 0.0027339469525382363, - "grad_norm": 11.3125, - "learning_rate": 1.9999994255444735e-05, - "loss": 3.1049, - "step": 615 - }, - { - "epoch": 0.0027383923947374854, - "grad_norm": 9.625, - "learning_rate": 1.999999423671759e-05, - "loss": 3.3579, - "step": 616 - }, - { - "epoch": 0.002742837836936735, - "grad_norm": 11.75, - "learning_rate": 1.9999994217959973e-05, - "loss": 3.154, - "step": 617 - }, - { - "epoch": 0.002747283279135984, - "grad_norm": 10.25, - "learning_rate": 1.999999419917188e-05, - "loss": 3.4031, - "step": 618 - }, - { - "epoch": 0.002751728721335233, - "grad_norm": 10.875, - "learning_rate": 1.9999994180353313e-05, - "loss": 3.2815, - "step": 619 - }, - { - "epoch": 0.002756174163534482, - "grad_norm": 10.8125, - "learning_rate": 1.999999416150427e-05, - "loss": 3.1934, - "step": 620 - }, - { - "epoch": 0.002760619605733731, - "grad_norm": 12.1875, - "learning_rate": 1.999999414262475e-05, - "loss": 3.1404, - "step": 621 - }, - { - "epoch": 0.0027650650479329806, - "grad_norm": 9.0625, - "learning_rate": 1.9999994123714754e-05, - "loss": 3.4232, - "step": 622 - }, - { - "epoch": 0.0027695104901322297, - "grad_norm": 11.3125, - "learning_rate": 1.9999994104774284e-05, - "loss": 3.1405, - "step": 623 - }, - { - "epoch": 0.0027739559323314788, - "grad_norm": 9.3125, - "learning_rate": 1.9999994085803342e-05, - "loss": 3.4351, - "step": 624 - }, - { - "epoch": 0.002778401374530728, - "grad_norm": 8.8125, - "learning_rate": 1.999999406680192e-05, - "loss": 3.2862, - "step": 625 - }, - { - "epoch": 0.0027828468167299773, - "grad_norm": 8.8125, - "learning_rate": 1.999999404777002e-05, - "loss": 3.4087, - "step": 626 - }, - { - "epoch": 0.0027872922589292264, - "grad_norm": 8.125, - "learning_rate": 1.999999402870765e-05, - "loss": 3.4755, - "step": 627 - }, - { - "epoch": 0.0027917377011284755, - "grad_norm": 10.1875, - "learning_rate": 1.99999940096148e-05, - "loss": 3.276, - "step": 628 - }, - { - "epoch": 0.0027961831433277245, - "grad_norm": 9.25, - "learning_rate": 1.999999399049148e-05, - "loss": 3.3477, - "step": 629 - }, - { - "epoch": 0.002800628585526974, - "grad_norm": 12.375, - "learning_rate": 1.999999397133768e-05, - "loss": 2.9778, - "step": 630 - }, - { - "epoch": 0.002805074027726223, - "grad_norm": 12.25, - "learning_rate": 1.9999993952153408e-05, - "loss": 3.0629, - "step": 631 - }, - { - "epoch": 0.002809519469925472, - "grad_norm": 8.875, - "learning_rate": 1.999999393293866e-05, - "loss": 3.2943, - "step": 632 - }, - { - "epoch": 0.002813964912124721, - "grad_norm": 10.125, - "learning_rate": 1.9999993913693436e-05, - "loss": 3.2947, - "step": 633 - }, - { - "epoch": 0.0028184103543239707, - "grad_norm": 11.5, - "learning_rate": 1.9999993894417735e-05, - "loss": 3.1093, - "step": 634 - }, - { - "epoch": 0.0028228557965232198, - "grad_norm": 9.5625, - "learning_rate": 1.999999387511156e-05, - "loss": 3.3693, - "step": 635 - }, - { - "epoch": 0.002827301238722469, - "grad_norm": 10.3125, - "learning_rate": 1.999999385577491e-05, - "loss": 3.2095, - "step": 636 - }, - { - "epoch": 0.002831746680921718, - "grad_norm": 6.5, - "learning_rate": 1.9999993836407785e-05, - "loss": 3.7453, - "step": 637 - }, - { - "epoch": 0.002836192123120967, - "grad_norm": 9.4375, - "learning_rate": 1.9999993817010182e-05, - "loss": 3.2892, - "step": 638 - }, - { - "epoch": 0.0028406375653202164, - "grad_norm": 11.0, - "learning_rate": 1.9999993797582106e-05, - "loss": 3.1485, - "step": 639 - }, - { - "epoch": 0.0028450830075194655, - "grad_norm": 9.6875, - "learning_rate": 1.9999993778123557e-05, - "loss": 3.3941, - "step": 640 - }, - { - "epoch": 0.0028495284497187146, - "grad_norm": 9.0625, - "learning_rate": 1.9999993758634528e-05, - "loss": 3.2603, - "step": 641 - }, - { - "epoch": 0.0028539738919179636, - "grad_norm": 10.3125, - "learning_rate": 1.9999993739115026e-05, - "loss": 3.2634, - "step": 642 - }, - { - "epoch": 0.002858419334117213, - "grad_norm": 10.0, - "learning_rate": 1.999999371956505e-05, - "loss": 3.2908, - "step": 643 - }, - { - "epoch": 0.002862864776316462, - "grad_norm": 11.1875, - "learning_rate": 1.9999993699984595e-05, - "loss": 3.1282, - "step": 644 - }, - { - "epoch": 0.0028673102185157113, - "grad_norm": 11.6875, - "learning_rate": 1.9999993680373664e-05, - "loss": 3.1469, - "step": 645 - }, - { - "epoch": 0.0028717556607149603, - "grad_norm": 8.3125, - "learning_rate": 1.999999366073226e-05, - "loss": 3.3643, - "step": 646 - }, - { - "epoch": 0.00287620110291421, - "grad_norm": 10.0, - "learning_rate": 1.999999364106038e-05, - "loss": 3.1389, - "step": 647 - }, - { - "epoch": 0.002880646545113459, - "grad_norm": 10.8125, - "learning_rate": 1.9999993621358026e-05, - "loss": 3.2337, - "step": 648 - }, - { - "epoch": 0.002885091987312708, - "grad_norm": 8.3125, - "learning_rate": 1.9999993601625194e-05, - "loss": 3.4623, - "step": 649 - }, - { - "epoch": 0.002889537429511957, - "grad_norm": 9.125, - "learning_rate": 1.9999993581861886e-05, - "loss": 3.2623, - "step": 650 - }, - { - "epoch": 0.0028939828717112065, - "grad_norm": 8.125, - "learning_rate": 1.999999356206811e-05, - "loss": 3.4831, - "step": 651 - }, - { - "epoch": 0.0028984283139104556, - "grad_norm": 10.75, - "learning_rate": 1.999999354224385e-05, - "loss": 3.087, - "step": 652 - }, - { - "epoch": 0.0029028737561097046, - "grad_norm": 8.8125, - "learning_rate": 1.9999993522389118e-05, - "loss": 3.5375, - "step": 653 - }, - { - "epoch": 0.0029073191983089537, - "grad_norm": 10.5625, - "learning_rate": 1.9999993502503908e-05, - "loss": 3.0652, - "step": 654 - }, - { - "epoch": 0.0029117646405082027, - "grad_norm": 11.9375, - "learning_rate": 1.999999348258823e-05, - "loss": 3.0065, - "step": 655 - }, - { - "epoch": 0.0029162100827074522, - "grad_norm": 9.5, - "learning_rate": 1.999999346264207e-05, - "loss": 3.316, - "step": 656 - }, - { - "epoch": 0.0029206555249067013, - "grad_norm": 9.0, - "learning_rate": 1.9999993442665435e-05, - "loss": 3.3838, - "step": 657 - }, - { - "epoch": 0.0029251009671059504, - "grad_norm": 11.625, - "learning_rate": 1.9999993422658327e-05, - "loss": 3.1086, - "step": 658 - }, - { - "epoch": 0.0029295464093051994, - "grad_norm": 10.9375, - "learning_rate": 1.9999993402620743e-05, - "loss": 3.1647, - "step": 659 - }, - { - "epoch": 0.002933991851504449, - "grad_norm": 11.0625, - "learning_rate": 1.9999993382552682e-05, - "loss": 3.1304, - "step": 660 - }, - { - "epoch": 0.002938437293703698, - "grad_norm": 9.875, - "learning_rate": 1.9999993362454145e-05, - "loss": 3.1442, - "step": 661 - }, - { - "epoch": 0.002942882735902947, - "grad_norm": 11.6875, - "learning_rate": 1.9999993342325136e-05, - "loss": 2.9592, - "step": 662 - }, - { - "epoch": 0.002947328178102196, - "grad_norm": 9.0, - "learning_rate": 1.999999332216565e-05, - "loss": 3.312, - "step": 663 - }, - { - "epoch": 0.0029517736203014456, - "grad_norm": 8.3125, - "learning_rate": 1.9999993301975688e-05, - "loss": 3.3625, - "step": 664 - }, - { - "epoch": 0.0029562190625006947, - "grad_norm": 11.1875, - "learning_rate": 1.999999328175525e-05, - "loss": 2.9879, - "step": 665 - }, - { - "epoch": 0.0029606645046999437, - "grad_norm": 6.53125, - "learning_rate": 1.999999326150434e-05, - "loss": 3.6176, - "step": 666 - }, - { - "epoch": 0.002965109946899193, - "grad_norm": 10.875, - "learning_rate": 1.999999324122295e-05, - "loss": 3.1512, - "step": 667 - }, - { - "epoch": 0.0029695553890984423, - "grad_norm": 11.8125, - "learning_rate": 1.9999993220911088e-05, - "loss": 2.9802, - "step": 668 - }, - { - "epoch": 0.0029740008312976914, - "grad_norm": 9.625, - "learning_rate": 1.9999993200568748e-05, - "loss": 3.3643, - "step": 669 - }, - { - "epoch": 0.0029784462734969404, - "grad_norm": 10.375, - "learning_rate": 1.9999993180195935e-05, - "loss": 3.253, - "step": 670 - }, - { - "epoch": 0.0029828917156961895, - "grad_norm": 8.1875, - "learning_rate": 1.9999993159792646e-05, - "loss": 3.5495, - "step": 671 - }, - { - "epoch": 0.0029873371578954385, - "grad_norm": 10.6875, - "learning_rate": 1.999999313935888e-05, - "loss": 3.1558, - "step": 672 - }, - { - "epoch": 0.002991782600094688, - "grad_norm": 8.125, - "learning_rate": 1.999999311889464e-05, - "loss": 3.4163, - "step": 673 - }, - { - "epoch": 0.002996228042293937, - "grad_norm": 9.875, - "learning_rate": 1.9999993098399925e-05, - "loss": 3.2813, - "step": 674 - }, - { - "epoch": 0.003000673484493186, - "grad_norm": 12.0, - "learning_rate": 1.9999993077874734e-05, - "loss": 2.9985, - "step": 675 - }, - { - "epoch": 0.0030051189266924352, - "grad_norm": 9.375, - "learning_rate": 1.9999993057319068e-05, - "loss": 3.3644, - "step": 676 - }, - { - "epoch": 0.0030095643688916847, - "grad_norm": 7.84375, - "learning_rate": 1.9999993036732925e-05, - "loss": 3.5214, - "step": 677 - }, - { - "epoch": 0.003014009811090934, - "grad_norm": 10.6875, - "learning_rate": 1.999999301611631e-05, - "loss": 3.0842, - "step": 678 - }, - { - "epoch": 0.003018455253290183, - "grad_norm": 9.1875, - "learning_rate": 1.9999992995469216e-05, - "loss": 3.1887, - "step": 679 - }, - { - "epoch": 0.003022900695489432, - "grad_norm": 10.8125, - "learning_rate": 1.9999992974791648e-05, - "loss": 3.121, - "step": 680 - }, - { - "epoch": 0.0030273461376886814, - "grad_norm": 11.25, - "learning_rate": 1.9999992954083603e-05, - "loss": 3.0536, - "step": 681 - }, - { - "epoch": 0.0030317915798879305, - "grad_norm": 9.8125, - "learning_rate": 1.9999992933345086e-05, - "loss": 3.3246, - "step": 682 - }, - { - "epoch": 0.0030362370220871795, - "grad_norm": 10.125, - "learning_rate": 1.9999992912576092e-05, - "loss": 3.1238, - "step": 683 - }, - { - "epoch": 0.0030406824642864286, - "grad_norm": 12.3125, - "learning_rate": 1.9999992891776622e-05, - "loss": 2.9796, - "step": 684 - }, - { - "epoch": 0.003045127906485678, - "grad_norm": 10.75, - "learning_rate": 1.9999992870946676e-05, - "loss": 3.2246, - "step": 685 - }, - { - "epoch": 0.003049573348684927, - "grad_norm": 8.9375, - "learning_rate": 1.9999992850086257e-05, - "loss": 3.2809, - "step": 686 - }, - { - "epoch": 0.003054018790884176, - "grad_norm": 10.5625, - "learning_rate": 1.999999282919536e-05, - "loss": 3.1584, - "step": 687 - }, - { - "epoch": 0.0030584642330834253, - "grad_norm": 10.4375, - "learning_rate": 1.999999280827399e-05, - "loss": 3.2956, - "step": 688 - }, - { - "epoch": 0.0030629096752826743, - "grad_norm": 9.75, - "learning_rate": 1.9999992787322142e-05, - "loss": 3.1825, - "step": 689 - }, - { - "epoch": 0.003067355117481924, - "grad_norm": 9.0625, - "learning_rate": 1.999999276633982e-05, - "loss": 3.231, - "step": 690 - }, - { - "epoch": 0.003071800559681173, - "grad_norm": 10.9375, - "learning_rate": 1.9999992745327024e-05, - "loss": 3.0539, - "step": 691 - }, - { - "epoch": 0.003076246001880422, - "grad_norm": 10.625, - "learning_rate": 1.999999272428375e-05, - "loss": 3.1277, - "step": 692 - }, - { - "epoch": 0.003080691444079671, - "grad_norm": 11.8125, - "learning_rate": 1.9999992703210002e-05, - "loss": 3.0296, - "step": 693 - }, - { - "epoch": 0.0030851368862789205, - "grad_norm": 10.4375, - "learning_rate": 1.9999992682105776e-05, - "loss": 3.2257, - "step": 694 - }, - { - "epoch": 0.0030895823284781696, - "grad_norm": 9.5, - "learning_rate": 1.999999266097108e-05, - "loss": 3.1343, - "step": 695 - }, - { - "epoch": 0.0030940277706774186, - "grad_norm": 10.0, - "learning_rate": 1.9999992639805906e-05, - "loss": 3.1851, - "step": 696 - }, - { - "epoch": 0.0030984732128766677, - "grad_norm": 10.75, - "learning_rate": 1.999999261861026e-05, - "loss": 3.1471, - "step": 697 - }, - { - "epoch": 0.003102918655075917, - "grad_norm": 13.0625, - "learning_rate": 1.999999259738413e-05, - "loss": 2.9176, - "step": 698 - }, - { - "epoch": 0.0031073640972751663, - "grad_norm": 7.8125, - "learning_rate": 1.9999992576127528e-05, - "loss": 3.4666, - "step": 699 - }, - { - "epoch": 0.0031118095394744153, - "grad_norm": 5.4375, - "learning_rate": 1.9999992554840455e-05, - "loss": 3.6549, - "step": 700 - }, - { - "epoch": 0.0031162549816736644, - "grad_norm": 8.0625, - "learning_rate": 1.9999992533522902e-05, - "loss": 3.4028, - "step": 701 - }, - { - "epoch": 0.003120700423872914, - "grad_norm": 8.75, - "learning_rate": 1.9999992512174877e-05, - "loss": 3.3475, - "step": 702 - }, - { - "epoch": 0.003125145866072163, - "grad_norm": 10.0625, - "learning_rate": 1.9999992490796375e-05, - "loss": 3.3234, - "step": 703 - }, - { - "epoch": 0.003129591308271412, - "grad_norm": 12.0625, - "learning_rate": 1.9999992469387397e-05, - "loss": 2.9509, - "step": 704 - }, - { - "epoch": 0.003134036750470661, - "grad_norm": 8.625, - "learning_rate": 1.9999992447947943e-05, - "loss": 3.299, - "step": 705 - }, - { - "epoch": 0.0031384821926699106, - "grad_norm": 8.9375, - "learning_rate": 1.9999992426478016e-05, - "loss": 3.2435, - "step": 706 - }, - { - "epoch": 0.0031429276348691596, - "grad_norm": 7.625, - "learning_rate": 1.9999992404977612e-05, - "loss": 3.3967, - "step": 707 - }, - { - "epoch": 0.0031473730770684087, - "grad_norm": 12.25, - "learning_rate": 1.9999992383446733e-05, - "loss": 3.0166, - "step": 708 - }, - { - "epoch": 0.0031518185192676578, - "grad_norm": 10.3125, - "learning_rate": 1.9999992361885377e-05, - "loss": 3.1663, - "step": 709 - }, - { - "epoch": 0.003156263961466907, - "grad_norm": 11.25, - "learning_rate": 1.9999992340293548e-05, - "loss": 3.073, - "step": 710 - }, - { - "epoch": 0.0031607094036661563, - "grad_norm": 10.75, - "learning_rate": 1.9999992318671243e-05, - "loss": 3.0348, - "step": 711 - }, - { - "epoch": 0.0031651548458654054, - "grad_norm": 10.875, - "learning_rate": 1.9999992297018462e-05, - "loss": 3.0667, - "step": 712 - }, - { - "epoch": 0.0031696002880646544, - "grad_norm": 10.0, - "learning_rate": 1.9999992275335205e-05, - "loss": 3.2862, - "step": 713 - }, - { - "epoch": 0.0031740457302639035, - "grad_norm": 9.0625, - "learning_rate": 1.9999992253621475e-05, - "loss": 3.5573, - "step": 714 - }, - { - "epoch": 0.003178491172463153, - "grad_norm": 10.375, - "learning_rate": 1.9999992231877268e-05, - "loss": 3.2112, - "step": 715 - }, - { - "epoch": 0.003182936614662402, - "grad_norm": 14.8125, - "learning_rate": 1.9999992210102585e-05, - "loss": 2.7557, - "step": 716 - }, - { - "epoch": 0.003187382056861651, - "grad_norm": 10.5625, - "learning_rate": 1.9999992188297426e-05, - "loss": 3.2758, - "step": 717 - }, - { - "epoch": 0.0031918274990609, - "grad_norm": 11.5625, - "learning_rate": 1.9999992166461794e-05, - "loss": 3.1905, - "step": 718 - }, - { - "epoch": 0.0031962729412601497, - "grad_norm": 11.3125, - "learning_rate": 1.9999992144595686e-05, - "loss": 2.976, - "step": 719 - }, - { - "epoch": 0.0032007183834593987, - "grad_norm": 11.0625, - "learning_rate": 1.9999992122699102e-05, - "loss": 3.0233, - "step": 720 - }, - { - "epoch": 0.003205163825658648, - "grad_norm": 7.34375, - "learning_rate": 1.999999210077204e-05, - "loss": 3.6071, - "step": 721 - }, - { - "epoch": 0.003209609267857897, - "grad_norm": 10.4375, - "learning_rate": 1.9999992078814508e-05, - "loss": 3.1355, - "step": 722 - }, - { - "epoch": 0.0032140547100571464, - "grad_norm": 9.625, - "learning_rate": 1.9999992056826498e-05, - "loss": 3.2042, - "step": 723 - }, - { - "epoch": 0.0032185001522563954, - "grad_norm": 7.25, - "learning_rate": 1.9999992034808012e-05, - "loss": 3.4196, - "step": 724 - }, - { - "epoch": 0.0032229455944556445, - "grad_norm": 10.8125, - "learning_rate": 1.9999992012759054e-05, - "loss": 3.117, - "step": 725 - }, - { - "epoch": 0.0032273910366548936, - "grad_norm": 9.875, - "learning_rate": 1.9999991990679615e-05, - "loss": 3.1844, - "step": 726 - }, - { - "epoch": 0.0032318364788541426, - "grad_norm": 8.875, - "learning_rate": 1.9999991968569704e-05, - "loss": 3.2937, - "step": 727 - }, - { - "epoch": 0.003236281921053392, - "grad_norm": 12.8125, - "learning_rate": 1.9999991946429316e-05, - "loss": 2.958, - "step": 728 - }, - { - "epoch": 0.003240727363252641, - "grad_norm": 9.5, - "learning_rate": 1.9999991924258456e-05, - "loss": 3.3805, - "step": 729 - }, - { - "epoch": 0.0032451728054518902, - "grad_norm": 9.4375, - "learning_rate": 1.9999991902057116e-05, - "loss": 3.3056, - "step": 730 - }, - { - "epoch": 0.0032496182476511393, - "grad_norm": 8.875, - "learning_rate": 1.9999991879825303e-05, - "loss": 3.3765, - "step": 731 - }, - { - "epoch": 0.003254063689850389, - "grad_norm": 10.5625, - "learning_rate": 1.9999991857563014e-05, - "loss": 3.0629, - "step": 732 - }, - { - "epoch": 0.003258509132049638, - "grad_norm": 10.25, - "learning_rate": 1.9999991835270252e-05, - "loss": 3.1716, - "step": 733 - }, - { - "epoch": 0.003262954574248887, - "grad_norm": 12.125, - "learning_rate": 1.999999181294701e-05, - "loss": 2.8864, - "step": 734 - }, - { - "epoch": 0.003267400016448136, - "grad_norm": 11.875, - "learning_rate": 1.9999991790593296e-05, - "loss": 2.9344, - "step": 735 - }, - { - "epoch": 0.0032718454586473855, - "grad_norm": 8.25, - "learning_rate": 1.9999991768209105e-05, - "loss": 3.4282, - "step": 736 - }, - { - "epoch": 0.0032762909008466345, - "grad_norm": 8.1875, - "learning_rate": 1.999999174579444e-05, - "loss": 3.4134, - "step": 737 - }, - { - "epoch": 0.0032807363430458836, - "grad_norm": 8.625, - "learning_rate": 1.9999991723349298e-05, - "loss": 3.1534, - "step": 738 - }, - { - "epoch": 0.0032851817852451327, - "grad_norm": 8.75, - "learning_rate": 1.9999991700873682e-05, - "loss": 3.5116, - "step": 739 - }, - { - "epoch": 0.003289627227444382, - "grad_norm": 10.625, - "learning_rate": 1.9999991678367593e-05, - "loss": 3.0845, - "step": 740 - }, - { - "epoch": 0.0032940726696436312, - "grad_norm": 9.625, - "learning_rate": 1.9999991655831025e-05, - "loss": 3.2396, - "step": 741 - }, - { - "epoch": 0.0032985181118428803, - "grad_norm": 10.4375, - "learning_rate": 1.9999991633263983e-05, - "loss": 3.0021, - "step": 742 - }, - { - "epoch": 0.0033029635540421294, - "grad_norm": 11.625, - "learning_rate": 1.9999991610666462e-05, - "loss": 3.0806, - "step": 743 - }, - { - "epoch": 0.0033074089962413784, - "grad_norm": 8.25, - "learning_rate": 1.999999158803847e-05, - "loss": 3.3851, - "step": 744 - }, - { - "epoch": 0.003311854438440628, - "grad_norm": 11.375, - "learning_rate": 1.9999991565379998e-05, - "loss": 3.0312, - "step": 745 - }, - { - "epoch": 0.003316299880639877, - "grad_norm": 7.78125, - "learning_rate": 1.9999991542691055e-05, - "loss": 3.3919, - "step": 746 - }, - { - "epoch": 0.003320745322839126, - "grad_norm": 9.4375, - "learning_rate": 1.999999151997164e-05, - "loss": 3.1935, - "step": 747 - }, - { - "epoch": 0.003325190765038375, - "grad_norm": 9.625, - "learning_rate": 1.9999991497221744e-05, - "loss": 3.2646, - "step": 748 - }, - { - "epoch": 0.0033296362072376246, - "grad_norm": 9.25, - "learning_rate": 1.9999991474441372e-05, - "loss": 3.1429, - "step": 749 - }, - { - "epoch": 0.0033340816494368737, - "grad_norm": 9.8125, - "learning_rate": 1.9999991451630527e-05, - "loss": 3.1611, - "step": 750 - }, - { - "epoch": 0.0033385270916361227, - "grad_norm": 9.75, - "learning_rate": 1.9999991428789207e-05, - "loss": 3.1068, - "step": 751 - }, - { - "epoch": 0.003342972533835372, - "grad_norm": 9.1875, - "learning_rate": 1.999999140591741e-05, - "loss": 3.1891, - "step": 752 - }, - { - "epoch": 0.0033474179760346213, - "grad_norm": 9.75, - "learning_rate": 1.9999991383015136e-05, - "loss": 3.2103, - "step": 753 - }, - { - "epoch": 0.0033518634182338703, - "grad_norm": 11.1875, - "learning_rate": 1.999999136008239e-05, - "loss": 2.9437, - "step": 754 - }, - { - "epoch": 0.0033563088604331194, - "grad_norm": 10.8125, - "learning_rate": 1.9999991337119168e-05, - "loss": 3.0051, - "step": 755 - }, - { - "epoch": 0.0033607543026323685, - "grad_norm": 10.5625, - "learning_rate": 1.9999991314125465e-05, - "loss": 3.165, - "step": 756 - }, - { - "epoch": 0.003365199744831618, - "grad_norm": 9.0625, - "learning_rate": 1.9999991291101294e-05, - "loss": 3.3081, - "step": 757 - }, - { - "epoch": 0.003369645187030867, - "grad_norm": 8.0, - "learning_rate": 1.9999991268046646e-05, - "loss": 3.3961, - "step": 758 - }, - { - "epoch": 0.003374090629230116, - "grad_norm": 9.9375, - "learning_rate": 1.9999991244961522e-05, - "loss": 3.1331, - "step": 759 - }, - { - "epoch": 0.003378536071429365, - "grad_norm": 11.4375, - "learning_rate": 1.9999991221845922e-05, - "loss": 3.0869, - "step": 760 - }, - { - "epoch": 0.003382981513628614, - "grad_norm": 9.9375, - "learning_rate": 1.9999991198699845e-05, - "loss": 3.0735, - "step": 761 - }, - { - "epoch": 0.0033874269558278637, - "grad_norm": 10.8125, - "learning_rate": 1.9999991175523296e-05, - "loss": 3.1311, - "step": 762 - }, - { - "epoch": 0.0033918723980271128, - "grad_norm": 13.0625, - "learning_rate": 1.999999115231627e-05, - "loss": 2.8146, - "step": 763 - }, - { - "epoch": 0.003396317840226362, - "grad_norm": 8.125, - "learning_rate": 1.999999112907877e-05, - "loss": 3.3518, - "step": 764 - }, - { - "epoch": 0.003400763282425611, - "grad_norm": 6.53125, - "learning_rate": 1.9999991105810794e-05, - "loss": 3.4741, - "step": 765 - }, - { - "epoch": 0.0034052087246248604, - "grad_norm": 10.3125, - "learning_rate": 1.999999108251234e-05, - "loss": 3.099, - "step": 766 - }, - { - "epoch": 0.0034096541668241095, - "grad_norm": 9.9375, - "learning_rate": 1.9999991059183412e-05, - "loss": 3.1818, - "step": 767 - }, - { - "epoch": 0.0034140996090233585, - "grad_norm": 9.75, - "learning_rate": 1.999999103582401e-05, - "loss": 2.9794, - "step": 768 - }, - { - "epoch": 0.0034185450512226076, - "grad_norm": 11.4375, - "learning_rate": 1.9999991012434132e-05, - "loss": 3.0902, - "step": 769 - }, - { - "epoch": 0.003422990493421857, - "grad_norm": 11.0, - "learning_rate": 1.9999990989013776e-05, - "loss": 3.0924, - "step": 770 - }, - { - "epoch": 0.003427435935621106, - "grad_norm": 9.625, - "learning_rate": 1.999999096556295e-05, - "loss": 3.225, - "step": 771 - }, - { - "epoch": 0.003431881377820355, - "grad_norm": 11.4375, - "learning_rate": 1.9999990942081642e-05, - "loss": 2.9458, - "step": 772 - }, - { - "epoch": 0.0034363268200196043, - "grad_norm": 9.375, - "learning_rate": 1.9999990918569865e-05, - "loss": 3.2132, - "step": 773 - }, - { - "epoch": 0.0034407722622188538, - "grad_norm": 10.1875, - "learning_rate": 1.9999990895027607e-05, - "loss": 3.0489, - "step": 774 - }, - { - "epoch": 0.003445217704418103, - "grad_norm": 11.6875, - "learning_rate": 1.9999990871454877e-05, - "loss": 2.9673, - "step": 775 - }, - { - "epoch": 0.003449663146617352, - "grad_norm": 8.875, - "learning_rate": 1.9999990847851673e-05, - "loss": 3.4372, - "step": 776 - }, - { - "epoch": 0.003454108588816601, - "grad_norm": 9.6875, - "learning_rate": 1.999999082421799e-05, - "loss": 3.1813, - "step": 777 - }, - { - "epoch": 0.00345855403101585, - "grad_norm": 13.375, - "learning_rate": 1.9999990800553835e-05, - "loss": 2.7103, - "step": 778 - }, - { - "epoch": 0.0034629994732150995, - "grad_norm": 10.9375, - "learning_rate": 1.9999990776859203e-05, - "loss": 3.0685, - "step": 779 - }, - { - "epoch": 0.0034674449154143486, - "grad_norm": 9.6875, - "learning_rate": 1.9999990753134094e-05, - "loss": 3.1243, - "step": 780 - }, - { - "epoch": 0.0034718903576135976, - "grad_norm": 10.9375, - "learning_rate": 1.999999072937851e-05, - "loss": 2.9362, - "step": 781 - }, - { - "epoch": 0.0034763357998128467, - "grad_norm": 8.5625, - "learning_rate": 1.9999990705592453e-05, - "loss": 3.3234, - "step": 782 - }, - { - "epoch": 0.003480781242012096, - "grad_norm": 10.5, - "learning_rate": 1.999999068177592e-05, - "loss": 3.0012, - "step": 783 - }, - { - "epoch": 0.0034852266842113453, - "grad_norm": 8.625, - "learning_rate": 1.9999990657928912e-05, - "loss": 3.2321, - "step": 784 - }, - { - "epoch": 0.0034896721264105943, - "grad_norm": 9.75, - "learning_rate": 1.9999990634051426e-05, - "loss": 3.2815, - "step": 785 - }, - { - "epoch": 0.0034941175686098434, - "grad_norm": 9.25, - "learning_rate": 1.9999990610143464e-05, - "loss": 3.3714, - "step": 786 - }, - { - "epoch": 0.003498563010809093, - "grad_norm": 14.0625, - "learning_rate": 1.9999990586205032e-05, - "loss": 2.6527, - "step": 787 - }, - { - "epoch": 0.003503008453008342, - "grad_norm": 8.4375, - "learning_rate": 1.999999056223612e-05, - "loss": 3.3823, - "step": 788 - }, - { - "epoch": 0.003507453895207591, - "grad_norm": 10.3125, - "learning_rate": 1.9999990538236733e-05, - "loss": 3.0229, - "step": 789 - }, - { - "epoch": 0.00351189933740684, - "grad_norm": 10.6875, - "learning_rate": 1.9999990514206873e-05, - "loss": 3.0808, - "step": 790 - }, - { - "epoch": 0.0035163447796060896, - "grad_norm": 13.25, - "learning_rate": 1.9999990490146536e-05, - "loss": 2.8303, - "step": 791 - }, - { - "epoch": 0.0035207902218053386, - "grad_norm": 11.0, - "learning_rate": 1.9999990466055723e-05, - "loss": 3.0774, - "step": 792 - }, - { - "epoch": 0.0035252356640045877, - "grad_norm": 10.75, - "learning_rate": 1.9999990441934437e-05, - "loss": 3.1097, - "step": 793 - }, - { - "epoch": 0.0035296811062038367, - "grad_norm": 8.3125, - "learning_rate": 1.999999041778267e-05, - "loss": 3.3127, - "step": 794 - }, - { - "epoch": 0.003534126548403086, - "grad_norm": 11.8125, - "learning_rate": 1.9999990393600433e-05, - "loss": 2.9924, - "step": 795 - }, - { - "epoch": 0.0035385719906023353, - "grad_norm": 10.4375, - "learning_rate": 1.9999990369387722e-05, - "loss": 3.0534, - "step": 796 - }, - { - "epoch": 0.0035430174328015844, - "grad_norm": 10.0, - "learning_rate": 1.999999034514453e-05, - "loss": 3.1595, - "step": 797 - }, - { - "epoch": 0.0035474628750008334, - "grad_norm": 9.25, - "learning_rate": 1.9999990320870867e-05, - "loss": 3.1229, - "step": 798 - }, - { - "epoch": 0.0035519083172000825, - "grad_norm": 9.8125, - "learning_rate": 1.9999990296566727e-05, - "loss": 3.0361, - "step": 799 - }, - { - "epoch": 0.003556353759399332, - "grad_norm": 10.0, - "learning_rate": 1.999999027223211e-05, - "loss": 3.0411, - "step": 800 - }, - { - "epoch": 0.003560799201598581, - "grad_norm": 8.25, - "learning_rate": 1.9999990247867022e-05, - "loss": 3.3235, - "step": 801 - }, - { - "epoch": 0.00356524464379783, - "grad_norm": 9.875, - "learning_rate": 1.9999990223471457e-05, - "loss": 3.0978, - "step": 802 - }, - { - "epoch": 0.003569690085997079, - "grad_norm": 7.6875, - "learning_rate": 1.999999019904541e-05, - "loss": 3.3701, - "step": 803 - }, - { - "epoch": 0.0035741355281963287, - "grad_norm": 9.625, - "learning_rate": 1.9999990174588894e-05, - "loss": 3.1934, - "step": 804 - }, - { - "epoch": 0.0035785809703955777, - "grad_norm": 7.6875, - "learning_rate": 1.9999990150101903e-05, - "loss": 3.2825, - "step": 805 - }, - { - "epoch": 0.003583026412594827, - "grad_norm": 9.6875, - "learning_rate": 1.9999990125584436e-05, - "loss": 3.1912, - "step": 806 - }, - { - "epoch": 0.003587471854794076, - "grad_norm": 9.375, - "learning_rate": 1.999999010103649e-05, - "loss": 3.2155, - "step": 807 - }, - { - "epoch": 0.0035919172969933254, - "grad_norm": 8.9375, - "learning_rate": 1.9999990076458074e-05, - "loss": 3.2062, - "step": 808 - }, - { - "epoch": 0.0035963627391925744, - "grad_norm": 10.8125, - "learning_rate": 1.9999990051849178e-05, - "loss": 2.9882, - "step": 809 - }, - { - "epoch": 0.0036008081813918235, - "grad_norm": 8.25, - "learning_rate": 1.999999002720981e-05, - "loss": 3.3132, - "step": 810 - }, - { - "epoch": 0.0036052536235910725, - "grad_norm": 11.6875, - "learning_rate": 1.9999990002539965e-05, - "loss": 2.9168, - "step": 811 - }, - { - "epoch": 0.003609699065790322, - "grad_norm": 8.6875, - "learning_rate": 1.9999989977839647e-05, - "loss": 3.2657, - "step": 812 - }, - { - "epoch": 0.003614144507989571, - "grad_norm": 9.375, - "learning_rate": 1.999998995310885e-05, - "loss": 3.3166, - "step": 813 - }, - { - "epoch": 0.00361858995018882, - "grad_norm": 9.625, - "learning_rate": 1.999998992834758e-05, - "loss": 3.2212, - "step": 814 - }, - { - "epoch": 0.0036230353923880692, - "grad_norm": 12.9375, - "learning_rate": 1.9999989903555833e-05, - "loss": 2.7343, - "step": 815 - }, - { - "epoch": 0.0036274808345873183, - "grad_norm": 10.125, - "learning_rate": 1.999998987873361e-05, - "loss": 3.0512, - "step": 816 - }, - { - "epoch": 0.003631926276786568, - "grad_norm": 10.6875, - "learning_rate": 1.9999989853880915e-05, - "loss": 3.0209, - "step": 817 - }, - { - "epoch": 0.003636371718985817, - "grad_norm": 6.375, - "learning_rate": 1.999998982899774e-05, - "loss": 3.4947, - "step": 818 - }, - { - "epoch": 0.003640817161185066, - "grad_norm": 7.3125, - "learning_rate": 1.9999989804084096e-05, - "loss": 3.3555, - "step": 819 - }, - { - "epoch": 0.003645262603384315, - "grad_norm": 10.1875, - "learning_rate": 1.999998977913997e-05, - "loss": 3.0038, - "step": 820 - }, - { - "epoch": 0.0036497080455835645, - "grad_norm": 11.1875, - "learning_rate": 1.999998975416537e-05, - "loss": 3.0145, - "step": 821 - }, - { - "epoch": 0.0036541534877828135, - "grad_norm": 12.3125, - "learning_rate": 1.9999989729160298e-05, - "loss": 2.8447, - "step": 822 - }, - { - "epoch": 0.0036585989299820626, - "grad_norm": 12.125, - "learning_rate": 1.999998970412475e-05, - "loss": 2.8707, - "step": 823 - }, - { - "epoch": 0.0036630443721813117, - "grad_norm": 9.5625, - "learning_rate": 1.9999989679058726e-05, - "loss": 3.3991, - "step": 824 - }, - { - "epoch": 0.003667489814380561, - "grad_norm": 9.125, - "learning_rate": 1.9999989653962224e-05, - "loss": 3.0443, - "step": 825 - }, - { - "epoch": 0.0036719352565798102, - "grad_norm": 9.375, - "learning_rate": 1.999998962883525e-05, - "loss": 3.0836, - "step": 826 - }, - { - "epoch": 0.0036763806987790593, - "grad_norm": 10.0, - "learning_rate": 1.9999989603677798e-05, - "loss": 3.1075, - "step": 827 - }, - { - "epoch": 0.0036808261409783083, - "grad_norm": 9.75, - "learning_rate": 1.999998957848987e-05, - "loss": 3.1439, - "step": 828 - }, - { - "epoch": 0.003685271583177558, - "grad_norm": 6.59375, - "learning_rate": 1.999998955327147e-05, - "loss": 3.519, - "step": 829 - }, - { - "epoch": 0.003689717025376807, - "grad_norm": 12.6875, - "learning_rate": 1.999998952802259e-05, - "loss": 2.7734, - "step": 830 - }, - { - "epoch": 0.003694162467576056, - "grad_norm": 8.625, - "learning_rate": 1.999998950274324e-05, - "loss": 3.2566, - "step": 831 - }, - { - "epoch": 0.003698607909775305, - "grad_norm": 12.1875, - "learning_rate": 1.999998947743341e-05, - "loss": 2.8918, - "step": 832 - }, - { - "epoch": 0.003703053351974554, - "grad_norm": 9.875, - "learning_rate": 1.9999989452093107e-05, - "loss": 2.9881, - "step": 833 - }, - { - "epoch": 0.0037074987941738036, - "grad_norm": 10.8125, - "learning_rate": 1.999998942672233e-05, - "loss": 2.9711, - "step": 834 - }, - { - "epoch": 0.0037119442363730526, - "grad_norm": 11.6875, - "learning_rate": 1.9999989401321075e-05, - "loss": 2.9503, - "step": 835 - }, - { - "epoch": 0.0037163896785723017, - "grad_norm": 10.4375, - "learning_rate": 1.9999989375889347e-05, - "loss": 3.013, - "step": 836 - }, - { - "epoch": 0.0037208351207715508, - "grad_norm": 10.5, - "learning_rate": 1.9999989350427144e-05, - "loss": 3.0924, - "step": 837 - }, - { - "epoch": 0.0037252805629708003, - "grad_norm": 7.28125, - "learning_rate": 1.999998932493446e-05, - "loss": 3.4179, - "step": 838 - }, - { - "epoch": 0.0037297260051700493, - "grad_norm": 10.0625, - "learning_rate": 1.9999989299411305e-05, - "loss": 3.1468, - "step": 839 - }, - { - "epoch": 0.0037341714473692984, - "grad_norm": 7.3125, - "learning_rate": 1.9999989273857676e-05, - "loss": 3.2912, - "step": 840 - }, - { - "epoch": 0.0037386168895685475, - "grad_norm": 10.3125, - "learning_rate": 1.9999989248273568e-05, - "loss": 2.9532, - "step": 841 - }, - { - "epoch": 0.003743062331767797, - "grad_norm": 8.1875, - "learning_rate": 1.9999989222658987e-05, - "loss": 3.2604, - "step": 842 - }, - { - "epoch": 0.003747507773967046, - "grad_norm": 11.25, - "learning_rate": 1.999998919701393e-05, - "loss": 2.9118, - "step": 843 - }, - { - "epoch": 0.003751953216166295, - "grad_norm": 8.875, - "learning_rate": 1.9999989171338395e-05, - "loss": 3.2011, - "step": 844 - }, - { - "epoch": 0.003756398658365544, - "grad_norm": 9.375, - "learning_rate": 1.999998914563239e-05, - "loss": 3.1491, - "step": 845 - }, - { - "epoch": 0.0037608441005647936, - "grad_norm": 10.375, - "learning_rate": 1.9999989119895906e-05, - "loss": 2.9671, - "step": 846 - }, - { - "epoch": 0.0037652895427640427, - "grad_norm": 11.0, - "learning_rate": 1.9999989094128947e-05, - "loss": 2.9712, - "step": 847 - }, - { - "epoch": 0.0037697349849632918, - "grad_norm": 9.1875, - "learning_rate": 1.9999989068331515e-05, - "loss": 3.1568, - "step": 848 - }, - { - "epoch": 0.003774180427162541, - "grad_norm": 10.5625, - "learning_rate": 1.9999989042503603e-05, - "loss": 3.1477, - "step": 849 - }, - { - "epoch": 0.00377862586936179, - "grad_norm": 13.1875, - "learning_rate": 1.999998901664522e-05, - "loss": 2.6454, - "step": 850 - }, - { - "epoch": 0.0037830713115610394, - "grad_norm": 8.8125, - "learning_rate": 1.9999988990756358e-05, - "loss": 3.1792, - "step": 851 - }, - { - "epoch": 0.0037875167537602884, - "grad_norm": 7.84375, - "learning_rate": 1.999998896483702e-05, - "loss": 3.2745, - "step": 852 - }, - { - "epoch": 0.0037919621959595375, - "grad_norm": 10.25, - "learning_rate": 1.999998893888721e-05, - "loss": 3.2199, - "step": 853 - }, - { - "epoch": 0.0037964076381587866, - "grad_norm": 9.375, - "learning_rate": 1.9999988912906925e-05, - "loss": 3.0699, - "step": 854 - }, - { - "epoch": 0.003800853080358036, - "grad_norm": 9.6875, - "learning_rate": 1.9999988886896163e-05, - "loss": 3.0755, - "step": 855 - }, - { - "epoch": 0.003805298522557285, - "grad_norm": 9.5625, - "learning_rate": 1.9999988860854928e-05, - "loss": 3.1034, - "step": 856 - }, - { - "epoch": 0.003809743964756534, - "grad_norm": 10.125, - "learning_rate": 1.9999988834783217e-05, - "loss": 2.9654, - "step": 857 - }, - { - "epoch": 0.0038141894069557833, - "grad_norm": 8.6875, - "learning_rate": 1.9999988808681026e-05, - "loss": 3.174, - "step": 858 - }, - { - "epoch": 0.0038186348491550328, - "grad_norm": 9.9375, - "learning_rate": 1.999998878254836e-05, - "loss": 3.176, - "step": 859 - }, - { - "epoch": 0.003823080291354282, - "grad_norm": 9.5625, - "learning_rate": 1.9999988756385225e-05, - "loss": 3.3015, - "step": 860 - }, - { - "epoch": 0.003827525733553531, - "grad_norm": 8.9375, - "learning_rate": 1.999998873019161e-05, - "loss": 3.2252, - "step": 861 - }, - { - "epoch": 0.00383197117575278, - "grad_norm": 12.0625, - "learning_rate": 1.999998870396752e-05, - "loss": 2.8381, - "step": 862 - }, - { - "epoch": 0.0038364166179520294, - "grad_norm": 11.5625, - "learning_rate": 1.9999988677712957e-05, - "loss": 2.8901, - "step": 863 - }, - { - "epoch": 0.0038408620601512785, - "grad_norm": 8.8125, - "learning_rate": 1.9999988651427915e-05, - "loss": 3.3976, - "step": 864 - }, - { - "epoch": 0.0038453075023505276, - "grad_norm": 11.0625, - "learning_rate": 1.99999886251124e-05, - "loss": 2.8603, - "step": 865 - }, - { - "epoch": 0.0038497529445497766, - "grad_norm": 10.5625, - "learning_rate": 1.999998859876641e-05, - "loss": 3.1128, - "step": 866 - }, - { - "epoch": 0.0038541983867490257, - "grad_norm": 9.1875, - "learning_rate": 1.9999988572389943e-05, - "loss": 3.0814, - "step": 867 - }, - { - "epoch": 0.003858643828948275, - "grad_norm": 5.90625, - "learning_rate": 1.9999988545983e-05, - "loss": 3.5917, - "step": 868 - }, - { - "epoch": 0.0038630892711475242, - "grad_norm": 10.125, - "learning_rate": 1.9999988519545584e-05, - "loss": 3.0457, - "step": 869 - }, - { - "epoch": 0.0038675347133467733, - "grad_norm": 11.5, - "learning_rate": 1.9999988493077688e-05, - "loss": 2.903, - "step": 870 - }, - { - "epoch": 0.0038719801555460224, - "grad_norm": 9.875, - "learning_rate": 1.9999988466579322e-05, - "loss": 3.0651, - "step": 871 - }, - { - "epoch": 0.003876425597745272, - "grad_norm": 9.3125, - "learning_rate": 1.999998844005048e-05, - "loss": 3.0476, - "step": 872 - }, - { - "epoch": 0.003880871039944521, - "grad_norm": 8.3125, - "learning_rate": 1.999998841349116e-05, - "loss": 3.1938, - "step": 873 - }, - { - "epoch": 0.00388531648214377, - "grad_norm": 9.75, - "learning_rate": 1.999998838690137e-05, - "loss": 3.0521, - "step": 874 - }, - { - "epoch": 0.003889761924343019, - "grad_norm": 9.375, - "learning_rate": 1.99999883602811e-05, - "loss": 3.1516, - "step": 875 - }, - { - "epoch": 0.0038942073665422685, - "grad_norm": 9.625, - "learning_rate": 1.9999988333630352e-05, - "loss": 3.3108, - "step": 876 - }, - { - "epoch": 0.0038986528087415176, - "grad_norm": 12.625, - "learning_rate": 1.9999988306949133e-05, - "loss": 2.8075, - "step": 877 - }, - { - "epoch": 0.0039030982509407667, - "grad_norm": 8.75, - "learning_rate": 1.9999988280237438e-05, - "loss": 3.0388, - "step": 878 - }, - { - "epoch": 0.003907543693140016, - "grad_norm": 10.375, - "learning_rate": 1.9999988253495262e-05, - "loss": 3.07, - "step": 879 - }, - { - "epoch": 0.003911989135339265, - "grad_norm": 8.375, - "learning_rate": 1.9999988226722618e-05, - "loss": 3.2086, - "step": 880 - }, - { - "epoch": 0.003916434577538514, - "grad_norm": 10.875, - "learning_rate": 1.9999988199919497e-05, - "loss": 2.9301, - "step": 881 - }, - { - "epoch": 0.003920880019737763, - "grad_norm": 8.5, - "learning_rate": 1.99999881730859e-05, - "loss": 3.2838, - "step": 882 - }, - { - "epoch": 0.003925325461937012, - "grad_norm": 7.0, - "learning_rate": 1.9999988146221826e-05, - "loss": 3.3348, - "step": 883 - }, - { - "epoch": 0.0039297709041362615, - "grad_norm": 9.8125, - "learning_rate": 1.999998811932728e-05, - "loss": 3.1866, - "step": 884 - }, - { - "epoch": 0.0039342163463355105, - "grad_norm": 11.125, - "learning_rate": 1.9999988092402254e-05, - "loss": 2.964, - "step": 885 - }, - { - "epoch": 0.00393866178853476, - "grad_norm": 12.625, - "learning_rate": 1.999998806544676e-05, - "loss": 2.8321, - "step": 886 - }, - { - "epoch": 0.0039431072307340095, - "grad_norm": 10.125, - "learning_rate": 1.9999988038460783e-05, - "loss": 3.1829, - "step": 887 - }, - { - "epoch": 0.003947552672933259, - "grad_norm": 9.25, - "learning_rate": 1.9999988011444332e-05, - "loss": 3.0765, - "step": 888 - }, - { - "epoch": 0.003951998115132508, - "grad_norm": 8.75, - "learning_rate": 1.9999987984397408e-05, - "loss": 3.1571, - "step": 889 - }, - { - "epoch": 0.003956443557331757, - "grad_norm": 11.9375, - "learning_rate": 1.9999987957320008e-05, - "loss": 2.8281, - "step": 890 - }, - { - "epoch": 0.003960888999531006, - "grad_norm": 9.25, - "learning_rate": 1.999998793021213e-05, - "loss": 3.0621, - "step": 891 - }, - { - "epoch": 0.003965334441730255, - "grad_norm": 9.75, - "learning_rate": 1.999998790307378e-05, - "loss": 3.1101, - "step": 892 - }, - { - "epoch": 0.003969779883929504, - "grad_norm": 7.90625, - "learning_rate": 1.9999987875904952e-05, - "loss": 3.3226, - "step": 893 - }, - { - "epoch": 0.003974225326128753, - "grad_norm": 8.9375, - "learning_rate": 1.9999987848705654e-05, - "loss": 3.1219, - "step": 894 - }, - { - "epoch": 0.003978670768328003, - "grad_norm": 11.125, - "learning_rate": 1.9999987821475876e-05, - "loss": 2.9798, - "step": 895 - }, - { - "epoch": 0.003983116210527252, - "grad_norm": 10.0, - "learning_rate": 1.9999987794215624e-05, - "loss": 3.0584, - "step": 896 - }, - { - "epoch": 0.003987561652726501, - "grad_norm": 9.875, - "learning_rate": 1.9999987766924894e-05, - "loss": 2.9607, - "step": 897 - }, - { - "epoch": 0.00399200709492575, - "grad_norm": 8.75, - "learning_rate": 1.9999987739603694e-05, - "loss": 3.264, - "step": 898 - }, - { - "epoch": 0.003996452537124999, - "grad_norm": 10.5, - "learning_rate": 1.9999987712252014e-05, - "loss": 2.9793, - "step": 899 - }, - { - "epoch": 0.004000897979324248, - "grad_norm": 10.4375, - "learning_rate": 1.9999987684869858e-05, - "loss": 2.9847, - "step": 900 - }, - { - "epoch": 0.004005343421523497, - "grad_norm": 11.125, - "learning_rate": 1.999998765745723e-05, - "loss": 2.8297, - "step": 901 - }, - { - "epoch": 0.004009788863722746, - "grad_norm": 10.0625, - "learning_rate": 1.9999987630014127e-05, - "loss": 3.1077, - "step": 902 - }, - { - "epoch": 0.004014234305921995, - "grad_norm": 8.0, - "learning_rate": 1.9999987602540546e-05, - "loss": 3.3845, - "step": 903 - }, - { - "epoch": 0.004018679748121245, - "grad_norm": 11.25, - "learning_rate": 1.999998757503649e-05, - "loss": 2.9843, - "step": 904 - }, - { - "epoch": 0.004023125190320494, - "grad_norm": 8.3125, - "learning_rate": 1.9999987547501957e-05, - "loss": 3.1954, - "step": 905 - }, - { - "epoch": 0.0040275706325197435, - "grad_norm": 10.4375, - "learning_rate": 1.999998751993695e-05, - "loss": 2.9387, - "step": 906 - }, - { - "epoch": 0.0040320160747189925, - "grad_norm": 10.0625, - "learning_rate": 1.999998749234147e-05, - "loss": 2.9554, - "step": 907 - }, - { - "epoch": 0.004036461516918242, - "grad_norm": 9.6875, - "learning_rate": 1.9999987464715512e-05, - "loss": 2.9809, - "step": 908 - }, - { - "epoch": 0.004040906959117491, - "grad_norm": 10.5625, - "learning_rate": 1.999998743705908e-05, - "loss": 3.024, - "step": 909 - }, - { - "epoch": 0.00404535240131674, - "grad_norm": 10.125, - "learning_rate": 1.999998740937217e-05, - "loss": 3.0485, - "step": 910 - }, - { - "epoch": 0.004049797843515989, - "grad_norm": 9.9375, - "learning_rate": 1.999998738165479e-05, - "loss": 3.0016, - "step": 911 - }, - { - "epoch": 0.004054243285715239, - "grad_norm": 9.5, - "learning_rate": 1.9999987353906933e-05, - "loss": 3.0214, - "step": 912 - }, - { - "epoch": 0.004058688727914488, - "grad_norm": 9.1875, - "learning_rate": 1.9999987326128596e-05, - "loss": 3.1299, - "step": 913 - }, - { - "epoch": 0.004063134170113737, - "grad_norm": 11.625, - "learning_rate": 1.9999987298319786e-05, - "loss": 2.8434, - "step": 914 - }, - { - "epoch": 0.004067579612312986, - "grad_norm": 8.3125, - "learning_rate": 1.99999872704805e-05, - "loss": 3.2026, - "step": 915 - }, - { - "epoch": 0.004072025054512235, - "grad_norm": 6.84375, - "learning_rate": 1.999998724261074e-05, - "loss": 3.4353, - "step": 916 - }, - { - "epoch": 0.004076470496711484, - "grad_norm": 7.53125, - "learning_rate": 1.9999987214710505e-05, - "loss": 3.2922, - "step": 917 - }, - { - "epoch": 0.004080915938910733, - "grad_norm": 8.375, - "learning_rate": 1.9999987186779794e-05, - "loss": 3.2543, - "step": 918 - }, - { - "epoch": 0.004085361381109982, - "grad_norm": 10.625, - "learning_rate": 1.999998715881861e-05, - "loss": 3.0364, - "step": 919 - }, - { - "epoch": 0.004089806823309231, - "grad_norm": 10.0, - "learning_rate": 1.9999987130826945e-05, - "loss": 3.1003, - "step": 920 - }, - { - "epoch": 0.004094252265508481, - "grad_norm": 10.3125, - "learning_rate": 1.999998710280481e-05, - "loss": 3.0632, - "step": 921 - }, - { - "epoch": 0.00409869770770773, - "grad_norm": 10.25, - "learning_rate": 1.9999987074752195e-05, - "loss": 3.0327, - "step": 922 - }, - { - "epoch": 0.004103143149906979, - "grad_norm": 10.9375, - "learning_rate": 1.999998704666911e-05, - "loss": 2.8159, - "step": 923 - }, - { - "epoch": 0.004107588592106228, - "grad_norm": 9.8125, - "learning_rate": 1.9999987018555544e-05, - "loss": 3.0028, - "step": 924 - }, - { - "epoch": 0.004112034034305477, - "grad_norm": 10.1875, - "learning_rate": 1.9999986990411505e-05, - "loss": 2.9518, - "step": 925 - }, - { - "epoch": 0.0041164794765047264, - "grad_norm": 9.25, - "learning_rate": 1.9999986962236994e-05, - "loss": 3.1715, - "step": 926 - }, - { - "epoch": 0.0041209249187039755, - "grad_norm": 7.5625, - "learning_rate": 1.9999986934032006e-05, - "loss": 3.3351, - "step": 927 - }, - { - "epoch": 0.004125370360903225, - "grad_norm": 8.5625, - "learning_rate": 1.999998690579654e-05, - "loss": 3.2445, - "step": 928 - }, - { - "epoch": 0.0041298158031024745, - "grad_norm": 6.71875, - "learning_rate": 1.99999868775306e-05, - "loss": 3.3478, - "step": 929 - }, - { - "epoch": 0.0041342612453017236, - "grad_norm": 9.6875, - "learning_rate": 1.9999986849234183e-05, - "loss": 3.1339, - "step": 930 - }, - { - "epoch": 0.004138706687500973, - "grad_norm": 11.375, - "learning_rate": 1.999998682090729e-05, - "loss": 2.8642, - "step": 931 - }, - { - "epoch": 0.004143152129700222, - "grad_norm": 9.4375, - "learning_rate": 1.9999986792549925e-05, - "loss": 3.2864, - "step": 932 - }, - { - "epoch": 0.004147597571899471, - "grad_norm": 10.25, - "learning_rate": 1.9999986764162083e-05, - "loss": 3.0004, - "step": 933 - }, - { - "epoch": 0.00415204301409872, - "grad_norm": 10.3125, - "learning_rate": 1.9999986735743765e-05, - "loss": 2.8484, - "step": 934 - }, - { - "epoch": 0.004156488456297969, - "grad_norm": 12.125, - "learning_rate": 1.9999986707294978e-05, - "loss": 2.8308, - "step": 935 - }, - { - "epoch": 0.004160933898497218, - "grad_norm": 10.3125, - "learning_rate": 1.9999986678815707e-05, - "loss": 3.1088, - "step": 936 - }, - { - "epoch": 0.004165379340696467, - "grad_norm": 10.25, - "learning_rate": 1.9999986650305964e-05, - "loss": 3.259, - "step": 937 - }, - { - "epoch": 0.004169824782895717, - "grad_norm": 9.875, - "learning_rate": 1.9999986621765744e-05, - "loss": 2.9283, - "step": 938 - }, - { - "epoch": 0.004174270225094966, - "grad_norm": 8.1875, - "learning_rate": 1.9999986593195052e-05, - "loss": 3.3917, - "step": 939 - }, - { - "epoch": 0.004178715667294215, - "grad_norm": 9.4375, - "learning_rate": 1.999998656459388e-05, - "loss": 3.1127, - "step": 940 } ], "logging_steps": 1, @@ -6607,7 +2407,7 @@ "attributes": {} } }, - "total_flos": 3.72121100353536e+16, + "total_flos": 1.34596993744896e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null