{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 12.0, "eval_steps": 500, "global_step": 1536, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0390625, "grad_norm": 1.6853455305099487, "learning_rate": 8e-05, "loss": 2.2667, "step": 5 }, { "epoch": 0.078125, "grad_norm": 1.127389907836914, "learning_rate": 0.00018, "loss": 1.2747, "step": 10 }, { "epoch": 0.1171875, "grad_norm": 0.6251115798950195, "learning_rate": 0.00019947575360419398, "loss": 1.2874, "step": 15 }, { "epoch": 0.15625, "grad_norm": 0.8021469116210938, "learning_rate": 0.00019882044560943645, "loss": 1.1816, "step": 20 }, { "epoch": 0.1953125, "grad_norm": 0.7008321285247803, "learning_rate": 0.0001981651376146789, "loss": 0.9091, "step": 25 }, { "epoch": 0.234375, "grad_norm": 0.8806556463241577, "learning_rate": 0.00019750982961992138, "loss": 1.18, "step": 30 }, { "epoch": 0.2734375, "grad_norm": 0.4898707866668701, "learning_rate": 0.00019685452162516385, "loss": 0.8613, "step": 35 }, { "epoch": 0.3125, "grad_norm": 0.818252444267273, "learning_rate": 0.0001961992136304063, "loss": 0.9613, "step": 40 }, { "epoch": 0.3515625, "grad_norm": 0.7560004591941833, "learning_rate": 0.00019554390563564878, "loss": 0.9433, "step": 45 }, { "epoch": 0.390625, "grad_norm": 0.5985464453697205, "learning_rate": 0.00019488859764089122, "loss": 0.8993, "step": 50 }, { "epoch": 0.4296875, "grad_norm": 0.7984416484832764, "learning_rate": 0.0001942332896461337, "loss": 0.9395, "step": 55 }, { "epoch": 0.46875, "grad_norm": 0.5905727744102478, "learning_rate": 0.00019357798165137616, "loss": 0.9598, "step": 60 }, { "epoch": 0.5078125, "grad_norm": 0.47333383560180664, "learning_rate": 0.00019292267365661863, "loss": 0.9031, "step": 65 }, { "epoch": 0.546875, "grad_norm": 0.5469959378242493, "learning_rate": 0.00019226736566186107, "loss": 1.1451, "step": 70 }, { "epoch": 0.5859375, "grad_norm": 0.5932920575141907, "learning_rate": 0.00019161205766710356, "loss": 0.8857, "step": 75 }, { "epoch": 0.625, "grad_norm": 0.5339898467063904, "learning_rate": 0.000190956749672346, "loss": 0.7919, "step": 80 }, { "epoch": 0.6640625, "grad_norm": 0.48470577597618103, "learning_rate": 0.00019030144167758847, "loss": 0.9792, "step": 85 }, { "epoch": 0.703125, "grad_norm": 0.4082311689853668, "learning_rate": 0.00018964613368283094, "loss": 0.9229, "step": 90 }, { "epoch": 0.7421875, "grad_norm": 0.6408493518829346, "learning_rate": 0.0001889908256880734, "loss": 0.7129, "step": 95 }, { "epoch": 0.78125, "grad_norm": 0.45103052258491516, "learning_rate": 0.00018833551769331587, "loss": 0.9419, "step": 100 }, { "epoch": 0.8203125, "grad_norm": 0.6506906747817993, "learning_rate": 0.00018768020969855834, "loss": 0.836, "step": 105 }, { "epoch": 0.859375, "grad_norm": 0.6588282585144043, "learning_rate": 0.0001870249017038008, "loss": 0.885, "step": 110 }, { "epoch": 0.8984375, "grad_norm": 0.5844029188156128, "learning_rate": 0.00018636959370904325, "loss": 0.8624, "step": 115 }, { "epoch": 0.9375, "grad_norm": 0.538287878036499, "learning_rate": 0.00018571428571428572, "loss": 0.9129, "step": 120 }, { "epoch": 0.9765625, "grad_norm": 0.3959498107433319, "learning_rate": 0.00018505897771952819, "loss": 1.0968, "step": 125 }, { "epoch": 1.015625, "grad_norm": 0.46326589584350586, "learning_rate": 0.00018440366972477065, "loss": 0.7563, "step": 130 }, { "epoch": 1.0546875, "grad_norm": 0.6401046514511108, "learning_rate": 0.00018374836173001312, "loss": 0.7854, "step": 135 }, { "epoch": 1.09375, "grad_norm": 0.6093747615814209, "learning_rate": 0.0001830930537352556, "loss": 0.7031, "step": 140 }, { "epoch": 1.1328125, "grad_norm": 0.48366278409957886, "learning_rate": 0.00018243774574049803, "loss": 0.4555, "step": 145 }, { "epoch": 1.171875, "grad_norm": 0.5257757902145386, "learning_rate": 0.0001817824377457405, "loss": 0.6499, "step": 150 }, { "epoch": 1.2109375, "grad_norm": 0.7223408818244934, "learning_rate": 0.00018112712975098296, "loss": 0.5893, "step": 155 }, { "epoch": 1.25, "grad_norm": 0.4492509663105011, "learning_rate": 0.00018047182175622543, "loss": 0.8623, "step": 160 }, { "epoch": 1.2890625, "grad_norm": 0.6466461420059204, "learning_rate": 0.0001798165137614679, "loss": 0.7997, "step": 165 }, { "epoch": 1.328125, "grad_norm": 0.6021189093589783, "learning_rate": 0.00017916120576671037, "loss": 0.7151, "step": 170 }, { "epoch": 1.3671875, "grad_norm": 0.43464839458465576, "learning_rate": 0.00017850589777195283, "loss": 0.626, "step": 175 }, { "epoch": 1.40625, "grad_norm": 0.49049654603004456, "learning_rate": 0.00017785058977719527, "loss": 0.7601, "step": 180 }, { "epoch": 1.4453125, "grad_norm": 0.6579009294509888, "learning_rate": 0.00017719528178243777, "loss": 0.6411, "step": 185 }, { "epoch": 1.484375, "grad_norm": 0.7494032382965088, "learning_rate": 0.0001765399737876802, "loss": 0.589, "step": 190 }, { "epoch": 1.5234375, "grad_norm": 0.5080376267433167, "learning_rate": 0.00017588466579292268, "loss": 0.6848, "step": 195 }, { "epoch": 1.5625, "grad_norm": 0.49630534648895264, "learning_rate": 0.00017522935779816515, "loss": 0.6273, "step": 200 }, { "epoch": 1.6015625, "grad_norm": 0.6087814569473267, "learning_rate": 0.0001745740498034076, "loss": 0.6083, "step": 205 }, { "epoch": 1.640625, "grad_norm": 0.607954740524292, "learning_rate": 0.00017391874180865005, "loss": 0.8664, "step": 210 }, { "epoch": 1.6796875, "grad_norm": 0.44959601759910583, "learning_rate": 0.00017326343381389255, "loss": 0.5538, "step": 215 }, { "epoch": 1.71875, "grad_norm": 0.5550365447998047, "learning_rate": 0.000172608125819135, "loss": 0.4869, "step": 220 }, { "epoch": 1.7578125, "grad_norm": 0.6531190872192383, "learning_rate": 0.00017195281782437746, "loss": 0.7142, "step": 225 }, { "epoch": 1.796875, "grad_norm": 0.6506574153900146, "learning_rate": 0.00017129750982961995, "loss": 0.6573, "step": 230 }, { "epoch": 1.8359375, "grad_norm": 0.5597310662269592, "learning_rate": 0.0001706422018348624, "loss": 0.6261, "step": 235 }, { "epoch": 1.875, "grad_norm": 0.5404195189476013, "learning_rate": 0.00016998689384010486, "loss": 0.5054, "step": 240 }, { "epoch": 1.9140625, "grad_norm": 0.611003041267395, "learning_rate": 0.00016933158584534733, "loss": 0.6949, "step": 245 }, { "epoch": 1.953125, "grad_norm": 0.4925813674926758, "learning_rate": 0.0001686762778505898, "loss": 0.6684, "step": 250 }, { "epoch": 1.9921875, "grad_norm": 0.5423117876052856, "learning_rate": 0.00016802096985583224, "loss": 0.7782, "step": 255 }, { "epoch": 2.03125, "grad_norm": 0.4928165078163147, "learning_rate": 0.00016736566186107473, "loss": 0.4515, "step": 260 }, { "epoch": 2.0703125, "grad_norm": 0.6966648101806641, "learning_rate": 0.00016671035386631717, "loss": 0.4123, "step": 265 }, { "epoch": 2.109375, "grad_norm": 0.7156907916069031, "learning_rate": 0.00016605504587155964, "loss": 0.591, "step": 270 }, { "epoch": 2.1484375, "grad_norm": 0.5283113718032837, "learning_rate": 0.0001653997378768021, "loss": 0.4631, "step": 275 }, { "epoch": 2.1875, "grad_norm": 0.7045680284500122, "learning_rate": 0.00016474442988204457, "loss": 0.4343, "step": 280 }, { "epoch": 2.2265625, "grad_norm": 0.7731931805610657, "learning_rate": 0.00016408912188728701, "loss": 0.4591, "step": 285 }, { "epoch": 2.265625, "grad_norm": 0.7124219536781311, "learning_rate": 0.0001634338138925295, "loss": 0.4534, "step": 290 }, { "epoch": 2.3046875, "grad_norm": 0.66915363073349, "learning_rate": 0.00016277850589777198, "loss": 0.5908, "step": 295 }, { "epoch": 2.34375, "grad_norm": 0.6559345722198486, "learning_rate": 0.00016212319790301442, "loss": 0.5065, "step": 300 }, { "epoch": 2.3828125, "grad_norm": 0.776062548160553, "learning_rate": 0.00016146788990825688, "loss": 0.4548, "step": 305 }, { "epoch": 2.421875, "grad_norm": 0.5407435297966003, "learning_rate": 0.00016081258191349935, "loss": 0.4586, "step": 310 }, { "epoch": 2.4609375, "grad_norm": 0.7619644403457642, "learning_rate": 0.00016015727391874182, "loss": 0.6065, "step": 315 }, { "epoch": 2.5, "grad_norm": 0.6659480333328247, "learning_rate": 0.0001595019659239843, "loss": 0.4892, "step": 320 }, { "epoch": 2.5390625, "grad_norm": 0.586632490158081, "learning_rate": 0.00015884665792922676, "loss": 0.5094, "step": 325 }, { "epoch": 2.578125, "grad_norm": 0.6501973867416382, "learning_rate": 0.0001581913499344692, "loss": 0.5017, "step": 330 }, { "epoch": 2.6171875, "grad_norm": 0.5939526557922363, "learning_rate": 0.00015753604193971166, "loss": 0.357, "step": 335 }, { "epoch": 2.65625, "grad_norm": 0.6541431546211243, "learning_rate": 0.00015688073394495413, "loss": 0.5687, "step": 340 }, { "epoch": 2.6953125, "grad_norm": 0.7392444014549255, "learning_rate": 0.0001562254259501966, "loss": 0.5922, "step": 345 }, { "epoch": 2.734375, "grad_norm": 0.7246791124343872, "learning_rate": 0.00015557011795543907, "loss": 0.388, "step": 350 }, { "epoch": 2.7734375, "grad_norm": 1.0469605922698975, "learning_rate": 0.00015491480996068153, "loss": 0.4028, "step": 355 }, { "epoch": 2.8125, "grad_norm": 0.7362831830978394, "learning_rate": 0.000154259501965924, "loss": 0.6255, "step": 360 }, { "epoch": 2.8515625, "grad_norm": 0.6058784127235413, "learning_rate": 0.00015360419397116644, "loss": 0.5189, "step": 365 }, { "epoch": 2.890625, "grad_norm": 0.6939958333969116, "learning_rate": 0.00015294888597640894, "loss": 0.501, "step": 370 }, { "epoch": 2.9296875, "grad_norm": 0.8468016982078552, "learning_rate": 0.00015229357798165138, "loss": 0.5747, "step": 375 }, { "epoch": 2.96875, "grad_norm": 0.6065675616264343, "learning_rate": 0.00015163826998689384, "loss": 0.3813, "step": 380 }, { "epoch": 3.0078125, "grad_norm": 0.5093637108802795, "learning_rate": 0.0001509829619921363, "loss": 0.5125, "step": 385 }, { "epoch": 3.046875, "grad_norm": 0.7048936486244202, "learning_rate": 0.00015032765399737878, "loss": 0.3766, "step": 390 }, { "epoch": 3.0859375, "grad_norm": 1.191715955734253, "learning_rate": 0.00014967234600262122, "loss": 0.4224, "step": 395 }, { "epoch": 3.125, "grad_norm": 0.6624323129653931, "learning_rate": 0.00014901703800786372, "loss": 0.4212, "step": 400 }, { "epoch": 3.1640625, "grad_norm": 1.3422083854675293, "learning_rate": 0.00014836173001310616, "loss": 0.3319, "step": 405 }, { "epoch": 3.203125, "grad_norm": 0.5813243985176086, "learning_rate": 0.00014770642201834862, "loss": 0.4468, "step": 410 }, { "epoch": 3.2421875, "grad_norm": 0.7296664118766785, "learning_rate": 0.0001470511140235911, "loss": 0.3234, "step": 415 }, { "epoch": 3.28125, "grad_norm": 0.7492959499359131, "learning_rate": 0.00014639580602883356, "loss": 0.3102, "step": 420 }, { "epoch": 3.3203125, "grad_norm": 0.994613528251648, "learning_rate": 0.000145740498034076, "loss": 0.4286, "step": 425 }, { "epoch": 3.359375, "grad_norm": 0.9514994025230408, "learning_rate": 0.0001450851900393185, "loss": 0.3267, "step": 430 }, { "epoch": 3.3984375, "grad_norm": 0.7083520293235779, "learning_rate": 0.00014442988204456096, "loss": 0.4743, "step": 435 }, { "epoch": 3.4375, "grad_norm": 0.6460224390029907, "learning_rate": 0.0001437745740498034, "loss": 0.3128, "step": 440 }, { "epoch": 3.4765625, "grad_norm": 0.7406665086746216, "learning_rate": 0.0001431192660550459, "loss": 0.3762, "step": 445 }, { "epoch": 3.515625, "grad_norm": 0.7346643805503845, "learning_rate": 0.00014246395806028834, "loss": 0.3573, "step": 450 }, { "epoch": 3.5546875, "grad_norm": 0.3775249123573303, "learning_rate": 0.0001418086500655308, "loss": 0.3488, "step": 455 }, { "epoch": 3.59375, "grad_norm": 0.9807206988334656, "learning_rate": 0.00014115334207077327, "loss": 0.2672, "step": 460 }, { "epoch": 3.6328125, "grad_norm": 0.5825705528259277, "learning_rate": 0.00014049803407601574, "loss": 0.2388, "step": 465 }, { "epoch": 3.671875, "grad_norm": 1.1724300384521484, "learning_rate": 0.00013984272608125818, "loss": 0.2998, "step": 470 }, { "epoch": 3.7109375, "grad_norm": 0.6543852090835571, "learning_rate": 0.00013918741808650068, "loss": 0.3704, "step": 475 }, { "epoch": 3.75, "grad_norm": 0.6687126755714417, "learning_rate": 0.00013853211009174312, "loss": 0.3478, "step": 480 }, { "epoch": 3.7890625, "grad_norm": 0.8228131532669067, "learning_rate": 0.00013787680209698558, "loss": 0.3139, "step": 485 }, { "epoch": 3.828125, "grad_norm": 0.65690678358078, "learning_rate": 0.00013722149410222805, "loss": 0.4469, "step": 490 }, { "epoch": 3.8671875, "grad_norm": 0.7769365906715393, "learning_rate": 0.00013656618610747052, "loss": 0.4247, "step": 495 }, { "epoch": 3.90625, "grad_norm": 0.7008833289146423, "learning_rate": 0.000135910878112713, "loss": 0.4327, "step": 500 }, { "epoch": 3.9453125, "grad_norm": 0.6874434947967529, "learning_rate": 0.00013525557011795545, "loss": 0.5225, "step": 505 }, { "epoch": 3.984375, "grad_norm": 0.4368758499622345, "learning_rate": 0.00013460026212319792, "loss": 0.3697, "step": 510 }, { "epoch": 4.0234375, "grad_norm": 1.0020313262939453, "learning_rate": 0.00013394495412844036, "loss": 0.2611, "step": 515 }, { "epoch": 4.0625, "grad_norm": 0.8647730946540833, "learning_rate": 0.00013328964613368286, "loss": 0.2297, "step": 520 }, { "epoch": 4.1015625, "grad_norm": 1.0684905052185059, "learning_rate": 0.0001326343381389253, "loss": 0.2594, "step": 525 }, { "epoch": 4.140625, "grad_norm": 0.6783558130264282, "learning_rate": 0.00013197903014416777, "loss": 0.2302, "step": 530 }, { "epoch": 4.1796875, "grad_norm": 0.7600467205047607, "learning_rate": 0.00013132372214941023, "loss": 0.1469, "step": 535 }, { "epoch": 4.21875, "grad_norm": 0.9370886087417603, "learning_rate": 0.0001306684141546527, "loss": 0.3683, "step": 540 }, { "epoch": 4.2578125, "grad_norm": 0.6307783722877502, "learning_rate": 0.00013001310615989514, "loss": 0.1998, "step": 545 }, { "epoch": 4.296875, "grad_norm": 0.9554206728935242, "learning_rate": 0.0001293577981651376, "loss": 0.3081, "step": 550 }, { "epoch": 4.3359375, "grad_norm": 0.8178610801696777, "learning_rate": 0.00012870249017038008, "loss": 0.2767, "step": 555 }, { "epoch": 4.375, "grad_norm": 0.6448714137077332, "learning_rate": 0.00012804718217562254, "loss": 0.2248, "step": 560 }, { "epoch": 4.4140625, "grad_norm": 0.9795539379119873, "learning_rate": 0.000127391874180865, "loss": 0.3152, "step": 565 }, { "epoch": 4.453125, "grad_norm": 0.7778314352035522, "learning_rate": 0.00012673656618610748, "loss": 0.3028, "step": 570 }, { "epoch": 4.4921875, "grad_norm": 0.9457613825798035, "learning_rate": 0.00012608125819134995, "loss": 0.309, "step": 575 }, { "epoch": 4.53125, "grad_norm": 0.7530558705329895, "learning_rate": 0.0001254259501965924, "loss": 0.3125, "step": 580 }, { "epoch": 4.5703125, "grad_norm": 0.7017265558242798, "learning_rate": 0.00012477064220183488, "loss": 0.273, "step": 585 }, { "epoch": 4.609375, "grad_norm": 0.8178383708000183, "learning_rate": 0.00012411533420707732, "loss": 0.3205, "step": 590 }, { "epoch": 4.6484375, "grad_norm": 1.5198026895523071, "learning_rate": 0.0001234600262123198, "loss": 0.3607, "step": 595 }, { "epoch": 4.6875, "grad_norm": 0.8270261883735657, "learning_rate": 0.00012280471821756226, "loss": 0.3297, "step": 600 }, { "epoch": 4.7265625, "grad_norm": 0.8817920088768005, "learning_rate": 0.00012214941022280473, "loss": 0.1601, "step": 605 }, { "epoch": 4.765625, "grad_norm": 0.9366243481636047, "learning_rate": 0.00012149410222804718, "loss": 0.1668, "step": 610 }, { "epoch": 4.8046875, "grad_norm": 0.706917405128479, "learning_rate": 0.00012083879423328965, "loss": 0.2827, "step": 615 }, { "epoch": 4.84375, "grad_norm": 0.8291599154472351, "learning_rate": 0.0001201834862385321, "loss": 0.2726, "step": 620 }, { "epoch": 4.8828125, "grad_norm": 0.6848894357681274, "learning_rate": 0.00011952817824377458, "loss": 0.1808, "step": 625 }, { "epoch": 4.921875, "grad_norm": 0.9057679176330566, "learning_rate": 0.00011887287024901705, "loss": 0.3169, "step": 630 }, { "epoch": 4.9609375, "grad_norm": 0.570704460144043, "learning_rate": 0.0001182175622542595, "loss": 0.3689, "step": 635 }, { "epoch": 5.0, "grad_norm": 0.8146092295646667, "learning_rate": 0.00011756225425950199, "loss": 0.3264, "step": 640 }, { "epoch": 5.0390625, "grad_norm": 0.5888718366622925, "learning_rate": 0.00011690694626474443, "loss": 0.167, "step": 645 }, { "epoch": 5.078125, "grad_norm": 0.9465442299842834, "learning_rate": 0.00011625163826998691, "loss": 0.1873, "step": 650 }, { "epoch": 5.1171875, "grad_norm": 0.5365155339241028, "learning_rate": 0.00011559633027522936, "loss": 0.1689, "step": 655 }, { "epoch": 5.15625, "grad_norm": 0.9071202278137207, "learning_rate": 0.00011494102228047183, "loss": 0.1666, "step": 660 }, { "epoch": 5.1953125, "grad_norm": 0.7092397212982178, "learning_rate": 0.00011428571428571428, "loss": 0.1557, "step": 665 }, { "epoch": 5.234375, "grad_norm": 0.7074161767959595, "learning_rate": 0.00011363040629095676, "loss": 0.2316, "step": 670 }, { "epoch": 5.2734375, "grad_norm": 0.9464021325111389, "learning_rate": 0.0001129750982961992, "loss": 0.2005, "step": 675 }, { "epoch": 5.3125, "grad_norm": 0.6366726160049438, "learning_rate": 0.00011231979030144169, "loss": 0.1569, "step": 680 }, { "epoch": 5.3515625, "grad_norm": 0.6061714291572571, "learning_rate": 0.00011166448230668414, "loss": 0.2708, "step": 685 }, { "epoch": 5.390625, "grad_norm": 0.8460837602615356, "learning_rate": 0.00011100917431192661, "loss": 0.1406, "step": 690 }, { "epoch": 5.4296875, "grad_norm": 0.7344151139259338, "learning_rate": 0.00011035386631716909, "loss": 0.1954, "step": 695 }, { "epoch": 5.46875, "grad_norm": 1.1099109649658203, "learning_rate": 0.00010969855832241154, "loss": 0.1321, "step": 700 }, { "epoch": 5.5078125, "grad_norm": 0.8708857297897339, "learning_rate": 0.00010904325032765401, "loss": 0.2918, "step": 705 }, { "epoch": 5.546875, "grad_norm": 0.8755677938461304, "learning_rate": 0.00010838794233289647, "loss": 0.2321, "step": 710 }, { "epoch": 5.5859375, "grad_norm": 0.7879914045333862, "learning_rate": 0.00010773263433813893, "loss": 0.2525, "step": 715 }, { "epoch": 5.625, "grad_norm": 0.8013678193092346, "learning_rate": 0.00010707732634338139, "loss": 0.1955, "step": 720 }, { "epoch": 5.6640625, "grad_norm": 0.6184900999069214, "learning_rate": 0.00010642201834862387, "loss": 0.1356, "step": 725 }, { "epoch": 5.703125, "grad_norm": 1.0018306970596313, "learning_rate": 0.00010576671035386632, "loss": 0.2501, "step": 730 }, { "epoch": 5.7421875, "grad_norm": 0.9539072513580322, "learning_rate": 0.00010511140235910879, "loss": 0.3376, "step": 735 }, { "epoch": 5.78125, "grad_norm": 0.7880743741989136, "learning_rate": 0.00010445609436435124, "loss": 0.176, "step": 740 }, { "epoch": 5.8203125, "grad_norm": 0.7900522351264954, "learning_rate": 0.00010380078636959371, "loss": 0.2542, "step": 745 }, { "epoch": 5.859375, "grad_norm": 0.7261202931404114, "learning_rate": 0.00010314547837483617, "loss": 0.1128, "step": 750 }, { "epoch": 5.8984375, "grad_norm": 0.8230142593383789, "learning_rate": 0.00010249017038007865, "loss": 0.2231, "step": 755 }, { "epoch": 5.9375, "grad_norm": 0.3808448612689972, "learning_rate": 0.0001018348623853211, "loss": 0.151, "step": 760 }, { "epoch": 5.9765625, "grad_norm": 0.4841325581073761, "learning_rate": 0.00010117955439056357, "loss": 0.1336, "step": 765 }, { "epoch": 6.015625, "grad_norm": 0.29059118032455444, "learning_rate": 0.00010052424639580605, "loss": 0.2318, "step": 770 }, { "epoch": 6.0546875, "grad_norm": 0.6378641724586487, "learning_rate": 9.986893840104849e-05, "loss": 0.1794, "step": 775 }, { "epoch": 6.09375, "grad_norm": 1.158392310142517, "learning_rate": 9.921363040629096e-05, "loss": 0.1575, "step": 780 }, { "epoch": 6.1328125, "grad_norm": 0.4778974652290344, "learning_rate": 9.855832241153343e-05, "loss": 0.1201, "step": 785 }, { "epoch": 6.171875, "grad_norm": 0.4302467107772827, "learning_rate": 9.790301441677588e-05, "loss": 0.0934, "step": 790 }, { "epoch": 6.2109375, "grad_norm": 0.5038356781005859, "learning_rate": 9.724770642201836e-05, "loss": 0.1497, "step": 795 }, { "epoch": 6.25, "grad_norm": 0.6529866456985474, "learning_rate": 9.659239842726083e-05, "loss": 0.1439, "step": 800 }, { "epoch": 6.2890625, "grad_norm": 0.8134426474571228, "learning_rate": 9.593709043250328e-05, "loss": 0.1469, "step": 805 }, { "epoch": 6.328125, "grad_norm": 1.0741759538650513, "learning_rate": 9.528178243774575e-05, "loss": 0.0793, "step": 810 }, { "epoch": 6.3671875, "grad_norm": 0.37064865231513977, "learning_rate": 9.462647444298822e-05, "loss": 0.0891, "step": 815 }, { "epoch": 6.40625, "grad_norm": 0.6766513586044312, "learning_rate": 9.397116644823067e-05, "loss": 0.2092, "step": 820 }, { "epoch": 6.4453125, "grad_norm": 0.45673248171806335, "learning_rate": 9.331585845347314e-05, "loss": 0.1063, "step": 825 }, { "epoch": 6.484375, "grad_norm": 0.6083511710166931, "learning_rate": 9.266055045871561e-05, "loss": 0.0958, "step": 830 }, { "epoch": 6.5234375, "grad_norm": 0.954582691192627, "learning_rate": 9.200524246395806e-05, "loss": 0.1522, "step": 835 }, { "epoch": 6.5625, "grad_norm": 0.6275842785835266, "learning_rate": 9.134993446920053e-05, "loss": 0.1387, "step": 840 }, { "epoch": 6.6015625, "grad_norm": 0.826816976070404, "learning_rate": 9.069462647444298e-05, "loss": 0.1997, "step": 845 }, { "epoch": 6.640625, "grad_norm": 0.5855023264884949, "learning_rate": 9.003931847968545e-05, "loss": 0.1423, "step": 850 }, { "epoch": 6.6796875, "grad_norm": 0.37608104944229126, "learning_rate": 8.938401048492792e-05, "loss": 0.1563, "step": 855 }, { "epoch": 6.71875, "grad_norm": 1.1068248748779297, "learning_rate": 8.872870249017037e-05, "loss": 0.1066, "step": 860 }, { "epoch": 6.7578125, "grad_norm": 0.8714601397514343, "learning_rate": 8.807339449541285e-05, "loss": 0.1076, "step": 865 }, { "epoch": 6.796875, "grad_norm": 0.6995155215263367, "learning_rate": 8.741808650065532e-05, "loss": 0.0935, "step": 870 }, { "epoch": 6.8359375, "grad_norm": 0.895413875579834, "learning_rate": 8.676277850589778e-05, "loss": 0.1802, "step": 875 }, { "epoch": 6.875, "grad_norm": 0.8599961400032043, "learning_rate": 8.610747051114024e-05, "loss": 0.2149, "step": 880 }, { "epoch": 6.9140625, "grad_norm": 0.6649323105812073, "learning_rate": 8.545216251638271e-05, "loss": 0.1272, "step": 885 }, { "epoch": 6.953125, "grad_norm": 0.6272252798080444, "learning_rate": 8.479685452162516e-05, "loss": 0.1269, "step": 890 }, { "epoch": 6.9921875, "grad_norm": 0.837714672088623, "learning_rate": 8.414154652686763e-05, "loss": 0.1932, "step": 895 }, { "epoch": 7.03125, "grad_norm": 0.7026847004890442, "learning_rate": 8.34862385321101e-05, "loss": 0.1475, "step": 900 }, { "epoch": 7.0703125, "grad_norm": 0.24409687519073486, "learning_rate": 8.283093053735255e-05, "loss": 0.0635, "step": 905 }, { "epoch": 7.109375, "grad_norm": 0.26595592498779297, "learning_rate": 8.217562254259502e-05, "loss": 0.0448, "step": 910 }, { "epoch": 7.1484375, "grad_norm": 0.6503292322158813, "learning_rate": 8.152031454783749e-05, "loss": 0.0652, "step": 915 }, { "epoch": 7.1875, "grad_norm": 1.0240068435668945, "learning_rate": 8.086500655307994e-05, "loss": 0.077, "step": 920 }, { "epoch": 7.2265625, "grad_norm": 0.36204642057418823, "learning_rate": 8.020969855832241e-05, "loss": 0.0827, "step": 925 }, { "epoch": 7.265625, "grad_norm": 0.8305175304412842, "learning_rate": 7.955439056356488e-05, "loss": 0.1117, "step": 930 }, { "epoch": 7.3046875, "grad_norm": 0.31086069345474243, "learning_rate": 7.889908256880735e-05, "loss": 0.0912, "step": 935 }, { "epoch": 7.34375, "grad_norm": 0.5125362873077393, "learning_rate": 7.824377457404981e-05, "loss": 0.0819, "step": 940 }, { "epoch": 7.3828125, "grad_norm": 0.6713749766349792, "learning_rate": 7.758846657929227e-05, "loss": 0.1059, "step": 945 }, { "epoch": 7.421875, "grad_norm": 0.6156826615333557, "learning_rate": 7.693315858453474e-05, "loss": 0.1326, "step": 950 }, { "epoch": 7.4609375, "grad_norm": 0.7549245953559875, "learning_rate": 7.62778505897772e-05, "loss": 0.0854, "step": 955 }, { "epoch": 7.5, "grad_norm": 0.8916281461715698, "learning_rate": 7.562254259501966e-05, "loss": 0.0898, "step": 960 }, { "epoch": 7.5390625, "grad_norm": 0.7383102178573608, "learning_rate": 7.496723460026212e-05, "loss": 0.0699, "step": 965 }, { "epoch": 7.578125, "grad_norm": 0.7137540578842163, "learning_rate": 7.431192660550459e-05, "loss": 0.0991, "step": 970 }, { "epoch": 7.6171875, "grad_norm": 0.3973597586154938, "learning_rate": 7.365661861074705e-05, "loss": 0.089, "step": 975 }, { "epoch": 7.65625, "grad_norm": 0.8355888724327087, "learning_rate": 7.300131061598951e-05, "loss": 0.0913, "step": 980 }, { "epoch": 7.6953125, "grad_norm": 0.5336706042289734, "learning_rate": 7.234600262123198e-05, "loss": 0.1516, "step": 985 }, { "epoch": 7.734375, "grad_norm": 0.8123258352279663, "learning_rate": 7.169069462647444e-05, "loss": 0.1144, "step": 990 }, { "epoch": 7.7734375, "grad_norm": 0.4773609936237335, "learning_rate": 7.103538663171692e-05, "loss": 0.0785, "step": 995 }, { "epoch": 7.8125, "grad_norm": 0.8169093132019043, "learning_rate": 7.038007863695938e-05, "loss": 0.1227, "step": 1000 }, { "epoch": 7.8515625, "grad_norm": 0.27247804403305054, "learning_rate": 6.972477064220184e-05, "loss": 0.086, "step": 1005 }, { "epoch": 7.890625, "grad_norm": 0.5810950398445129, "learning_rate": 6.90694626474443e-05, "loss": 0.0948, "step": 1010 }, { "epoch": 7.9296875, "grad_norm": 0.9459575414657593, "learning_rate": 6.841415465268677e-05, "loss": 0.0889, "step": 1015 }, { "epoch": 7.96875, "grad_norm": 0.127482607960701, "learning_rate": 6.775884665792923e-05, "loss": 0.0981, "step": 1020 }, { "epoch": 8.0078125, "grad_norm": 0.3091827630996704, "learning_rate": 6.71035386631717e-05, "loss": 0.1035, "step": 1025 }, { "epoch": 8.046875, "grad_norm": 0.4560360610485077, "learning_rate": 6.644823066841416e-05, "loss": 0.0834, "step": 1030 }, { "epoch": 8.0859375, "grad_norm": 0.6077558398246765, "learning_rate": 6.579292267365662e-05, "loss": 0.0711, "step": 1035 }, { "epoch": 8.125, "grad_norm": 0.7591610550880432, "learning_rate": 6.513761467889909e-05, "loss": 0.0877, "step": 1040 }, { "epoch": 8.1640625, "grad_norm": 0.38261088728904724, "learning_rate": 6.448230668414155e-05, "loss": 0.0422, "step": 1045 }, { "epoch": 8.203125, "grad_norm": 0.8100435137748718, "learning_rate": 6.382699868938401e-05, "loss": 0.0883, "step": 1050 }, { "epoch": 8.2421875, "grad_norm": 0.25259244441986084, "learning_rate": 6.317169069462647e-05, "loss": 0.0696, "step": 1055 }, { "epoch": 8.28125, "grad_norm": 0.7604616284370422, "learning_rate": 6.251638269986894e-05, "loss": 0.0761, "step": 1060 }, { "epoch": 8.3203125, "grad_norm": 0.395271897315979, "learning_rate": 6.186107470511141e-05, "loss": 0.0769, "step": 1065 }, { "epoch": 8.359375, "grad_norm": 0.6258074045181274, "learning_rate": 6.120576671035388e-05, "loss": 0.0543, "step": 1070 }, { "epoch": 8.3984375, "grad_norm": 0.13846486806869507, "learning_rate": 6.055045871559634e-05, "loss": 0.0526, "step": 1075 }, { "epoch": 8.4375, "grad_norm": 0.7763333916664124, "learning_rate": 5.98951507208388e-05, "loss": 0.0649, "step": 1080 }, { "epoch": 8.4765625, "grad_norm": 0.6369220614433289, "learning_rate": 5.923984272608126e-05, "loss": 0.043, "step": 1085 }, { "epoch": 8.515625, "grad_norm": 0.6248875856399536, "learning_rate": 5.858453473132373e-05, "loss": 0.0648, "step": 1090 }, { "epoch": 8.5546875, "grad_norm": 0.1796266883611679, "learning_rate": 5.792922673656619e-05, "loss": 0.0487, "step": 1095 }, { "epoch": 8.59375, "grad_norm": 0.7085462212562561, "learning_rate": 5.727391874180865e-05, "loss": 0.0745, "step": 1100 }, { "epoch": 8.6328125, "grad_norm": 0.27906715869903564, "learning_rate": 5.661861074705112e-05, "loss": 0.0382, "step": 1105 }, { "epoch": 8.671875, "grad_norm": 0.4836632013320923, "learning_rate": 5.596330275229358e-05, "loss": 0.0714, "step": 1110 }, { "epoch": 8.7109375, "grad_norm": 0.5871438384056091, "learning_rate": 5.530799475753604e-05, "loss": 0.0427, "step": 1115 }, { "epoch": 8.75, "grad_norm": 0.3616584241390228, "learning_rate": 5.4652686762778507e-05, "loss": 0.0458, "step": 1120 }, { "epoch": 8.7890625, "grad_norm": 0.9580535292625427, "learning_rate": 5.399737876802097e-05, "loss": 0.0759, "step": 1125 }, { "epoch": 8.828125, "grad_norm": 0.6458576321601868, "learning_rate": 5.334207077326344e-05, "loss": 0.069, "step": 1130 }, { "epoch": 8.8671875, "grad_norm": 0.21817967295646667, "learning_rate": 5.26867627785059e-05, "loss": 0.0511, "step": 1135 }, { "epoch": 8.90625, "grad_norm": 0.6552639603614807, "learning_rate": 5.203145478374837e-05, "loss": 0.0707, "step": 1140 }, { "epoch": 8.9453125, "grad_norm": 0.5542663931846619, "learning_rate": 5.137614678899083e-05, "loss": 0.0811, "step": 1145 }, { "epoch": 8.984375, "grad_norm": 0.2486066222190857, "learning_rate": 5.072083879423329e-05, "loss": 0.0491, "step": 1150 }, { "epoch": 9.0234375, "grad_norm": 0.10281497240066528, "learning_rate": 5.006553079947576e-05, "loss": 0.0513, "step": 1155 }, { "epoch": 9.0625, "grad_norm": 0.5462967753410339, "learning_rate": 4.941022280471822e-05, "loss": 0.0444, "step": 1160 }, { "epoch": 9.1015625, "grad_norm": 0.32824379205703735, "learning_rate": 4.875491480996068e-05, "loss": 0.0419, "step": 1165 }, { "epoch": 9.140625, "grad_norm": 0.15365761518478394, "learning_rate": 4.809960681520315e-05, "loss": 0.0532, "step": 1170 }, { "epoch": 9.1796875, "grad_norm": 0.4261007308959961, "learning_rate": 4.744429882044561e-05, "loss": 0.0476, "step": 1175 }, { "epoch": 9.21875, "grad_norm": 0.5910694599151611, "learning_rate": 4.678899082568808e-05, "loss": 0.0437, "step": 1180 }, { "epoch": 9.2578125, "grad_norm": 0.30444568395614624, "learning_rate": 4.613368283093054e-05, "loss": 0.0485, "step": 1185 }, { "epoch": 9.296875, "grad_norm": 0.21978724002838135, "learning_rate": 4.5478374836173006e-05, "loss": 0.0422, "step": 1190 }, { "epoch": 9.3359375, "grad_norm": 0.754964292049408, "learning_rate": 4.482306684141547e-05, "loss": 0.0474, "step": 1195 }, { "epoch": 9.375, "grad_norm": 0.08515404164791107, "learning_rate": 4.416775884665793e-05, "loss": 0.0333, "step": 1200 }, { "epoch": 9.4140625, "grad_norm": 0.22733353078365326, "learning_rate": 4.3512450851900395e-05, "loss": 0.0408, "step": 1205 }, { "epoch": 9.453125, "grad_norm": 0.2974601089954376, "learning_rate": 4.2857142857142856e-05, "loss": 0.0493, "step": 1210 }, { "epoch": 9.4921875, "grad_norm": 0.7275934815406799, "learning_rate": 4.2201834862385324e-05, "loss": 0.0538, "step": 1215 }, { "epoch": 9.53125, "grad_norm": 0.4996713101863861, "learning_rate": 4.154652686762779e-05, "loss": 0.0383, "step": 1220 }, { "epoch": 9.5703125, "grad_norm": 0.4535525441169739, "learning_rate": 4.089121887287025e-05, "loss": 0.0654, "step": 1225 }, { "epoch": 9.609375, "grad_norm": 0.44223669171333313, "learning_rate": 4.023591087811271e-05, "loss": 0.0369, "step": 1230 }, { "epoch": 9.6484375, "grad_norm": 0.08399149775505066, "learning_rate": 3.958060288335518e-05, "loss": 0.0374, "step": 1235 }, { "epoch": 9.6875, "grad_norm": 0.4127291142940521, "learning_rate": 3.892529488859764e-05, "loss": 0.0379, "step": 1240 }, { "epoch": 9.7265625, "grad_norm": 0.21898190677165985, "learning_rate": 3.82699868938401e-05, "loss": 0.0323, "step": 1245 }, { "epoch": 9.765625, "grad_norm": 0.1628551185131073, "learning_rate": 3.761467889908257e-05, "loss": 0.0438, "step": 1250 }, { "epoch": 9.8046875, "grad_norm": 0.42952653765678406, "learning_rate": 3.695937090432504e-05, "loss": 0.0358, "step": 1255 }, { "epoch": 9.84375, "grad_norm": 0.23393145203590393, "learning_rate": 3.63040629095675e-05, "loss": 0.0424, "step": 1260 }, { "epoch": 9.8828125, "grad_norm": 0.3994542956352234, "learning_rate": 3.564875491480996e-05, "loss": 0.0416, "step": 1265 }, { "epoch": 9.921875, "grad_norm": 0.32643911242485046, "learning_rate": 3.499344692005243e-05, "loss": 0.0378, "step": 1270 }, { "epoch": 9.9609375, "grad_norm": 0.13228672742843628, "learning_rate": 3.433813892529489e-05, "loss": 0.0302, "step": 1275 }, { "epoch": 10.0, "grad_norm": 0.5824608206748962, "learning_rate": 3.3682830930537356e-05, "loss": 0.0534, "step": 1280 }, { "epoch": 10.0390625, "grad_norm": 0.36108893156051636, "learning_rate": 3.302752293577982e-05, "loss": 0.0316, "step": 1285 }, { "epoch": 10.078125, "grad_norm": 0.21287214756011963, "learning_rate": 3.2372214941022284e-05, "loss": 0.0325, "step": 1290 }, { "epoch": 10.1171875, "grad_norm": 0.3202109932899475, "learning_rate": 3.1716906946264745e-05, "loss": 0.0299, "step": 1295 }, { "epoch": 10.15625, "grad_norm": 0.21489982306957245, "learning_rate": 3.1061598951507206e-05, "loss": 0.0272, "step": 1300 }, { "epoch": 10.1953125, "grad_norm": 0.1839279681444168, "learning_rate": 3.0406290956749674e-05, "loss": 0.0284, "step": 1305 }, { "epoch": 10.234375, "grad_norm": 0.36773788928985596, "learning_rate": 2.9750982961992135e-05, "loss": 0.0304, "step": 1310 }, { "epoch": 10.2734375, "grad_norm": 0.13723714649677277, "learning_rate": 2.9095674967234606e-05, "loss": 0.0367, "step": 1315 }, { "epoch": 10.3125, "grad_norm": 0.40129488706588745, "learning_rate": 2.8440366972477066e-05, "loss": 0.04, "step": 1320 }, { "epoch": 10.3515625, "grad_norm": 0.08511374145746231, "learning_rate": 2.778505897771953e-05, "loss": 0.0246, "step": 1325 }, { "epoch": 10.390625, "grad_norm": 0.43909040093421936, "learning_rate": 2.7129750982961995e-05, "loss": 0.0318, "step": 1330 }, { "epoch": 10.4296875, "grad_norm": 0.11963684856891632, "learning_rate": 2.6474442988204456e-05, "loss": 0.0313, "step": 1335 }, { "epoch": 10.46875, "grad_norm": 0.07002709805965424, "learning_rate": 2.581913499344692e-05, "loss": 0.0234, "step": 1340 }, { "epoch": 10.5078125, "grad_norm": 0.5138005614280701, "learning_rate": 2.5163826998689384e-05, "loss": 0.0333, "step": 1345 }, { "epoch": 10.546875, "grad_norm": 0.5884416699409485, "learning_rate": 2.450851900393185e-05, "loss": 0.0334, "step": 1350 }, { "epoch": 10.5859375, "grad_norm": 0.2413049191236496, "learning_rate": 2.3853211009174313e-05, "loss": 0.029, "step": 1355 }, { "epoch": 10.625, "grad_norm": 0.07464331388473511, "learning_rate": 2.3197903014416777e-05, "loss": 0.0294, "step": 1360 }, { "epoch": 10.6640625, "grad_norm": 0.38783198595046997, "learning_rate": 2.254259501965924e-05, "loss": 0.0304, "step": 1365 }, { "epoch": 10.703125, "grad_norm": 0.46144524216651917, "learning_rate": 2.1887287024901702e-05, "loss": 0.0321, "step": 1370 }, { "epoch": 10.7421875, "grad_norm": 0.10308784991502762, "learning_rate": 2.123197903014417e-05, "loss": 0.0329, "step": 1375 }, { "epoch": 10.78125, "grad_norm": 0.33099478483200073, "learning_rate": 2.0576671035386634e-05, "loss": 0.0323, "step": 1380 }, { "epoch": 10.8203125, "grad_norm": 0.381788045167923, "learning_rate": 1.9921363040629095e-05, "loss": 0.0268, "step": 1385 }, { "epoch": 10.859375, "grad_norm": 0.08423493057489395, "learning_rate": 1.9266055045871563e-05, "loss": 0.0304, "step": 1390 }, { "epoch": 10.8984375, "grad_norm": 0.5857261419296265, "learning_rate": 1.8610747051114023e-05, "loss": 0.0327, "step": 1395 }, { "epoch": 10.9375, "grad_norm": 0.12234900146722794, "learning_rate": 1.7955439056356488e-05, "loss": 0.0298, "step": 1400 }, { "epoch": 10.9765625, "grad_norm": 0.4933612048625946, "learning_rate": 1.7300131061598955e-05, "loss": 0.0329, "step": 1405 }, { "epoch": 11.015625, "grad_norm": 0.21089200675487518, "learning_rate": 1.6644823066841416e-05, "loss": 0.0274, "step": 1410 }, { "epoch": 11.0546875, "grad_norm": 0.10511160641908646, "learning_rate": 1.598951507208388e-05, "loss": 0.0254, "step": 1415 }, { "epoch": 11.09375, "grad_norm": 0.10039519518613815, "learning_rate": 1.5334207077326345e-05, "loss": 0.0211, "step": 1420 }, { "epoch": 11.1328125, "grad_norm": 0.17804010212421417, "learning_rate": 1.4678899082568809e-05, "loss": 0.0239, "step": 1425 }, { "epoch": 11.171875, "grad_norm": 0.21066808700561523, "learning_rate": 1.4023591087811271e-05, "loss": 0.0302, "step": 1430 }, { "epoch": 11.2109375, "grad_norm": 0.14561991393566132, "learning_rate": 1.3368283093053736e-05, "loss": 0.0204, "step": 1435 }, { "epoch": 11.25, "grad_norm": 0.0932135209441185, "learning_rate": 1.2712975098296202e-05, "loss": 0.0284, "step": 1440 }, { "epoch": 11.2890625, "grad_norm": 0.2561885416507721, "learning_rate": 1.2057667103538664e-05, "loss": 0.0246, "step": 1445 }, { "epoch": 11.328125, "grad_norm": 0.20448650419712067, "learning_rate": 1.1402359108781127e-05, "loss": 0.0282, "step": 1450 }, { "epoch": 11.3671875, "grad_norm": 0.11967150866985321, "learning_rate": 1.0747051114023591e-05, "loss": 0.0271, "step": 1455 }, { "epoch": 11.40625, "grad_norm": 0.12555184960365295, "learning_rate": 1.0091743119266055e-05, "loss": 0.0248, "step": 1460 }, { "epoch": 11.4453125, "grad_norm": 0.2812488079071045, "learning_rate": 9.43643512450852e-06, "loss": 0.0312, "step": 1465 }, { "epoch": 11.484375, "grad_norm": 0.06791621446609497, "learning_rate": 8.781127129750984e-06, "loss": 0.0246, "step": 1470 }, { "epoch": 11.5234375, "grad_norm": 0.08099279552698135, "learning_rate": 8.125819134993446e-06, "loss": 0.0243, "step": 1475 }, { "epoch": 11.5625, "grad_norm": 0.1286236196756363, "learning_rate": 7.4705111402359114e-06, "loss": 0.0215, "step": 1480 }, { "epoch": 11.6015625, "grad_norm": 0.2546003460884094, "learning_rate": 6.815203145478376e-06, "loss": 0.0248, "step": 1485 }, { "epoch": 11.640625, "grad_norm": 0.18944767117500305, "learning_rate": 6.159895150720839e-06, "loss": 0.024, "step": 1490 }, { "epoch": 11.6796875, "grad_norm": 0.20657788217067719, "learning_rate": 5.504587155963303e-06, "loss": 0.0206, "step": 1495 }, { "epoch": 11.71875, "grad_norm": 0.2377331256866455, "learning_rate": 4.849279161205767e-06, "loss": 0.0254, "step": 1500 }, { "epoch": 11.7578125, "grad_norm": 0.10646895319223404, "learning_rate": 4.193971166448231e-06, "loss": 0.0214, "step": 1505 }, { "epoch": 11.796875, "grad_norm": 0.3463532328605652, "learning_rate": 3.538663171690695e-06, "loss": 0.0252, "step": 1510 }, { "epoch": 11.8359375, "grad_norm": 0.17272751033306122, "learning_rate": 2.8833551769331587e-06, "loss": 0.0316, "step": 1515 }, { "epoch": 11.875, "grad_norm": 0.3141430914402008, "learning_rate": 2.2280471821756225e-06, "loss": 0.0251, "step": 1520 }, { "epoch": 11.9140625, "grad_norm": 0.28511035442352295, "learning_rate": 1.5727391874180865e-06, "loss": 0.0267, "step": 1525 }, { "epoch": 11.953125, "grad_norm": 0.10313425958156586, "learning_rate": 9.174311926605506e-07, "loss": 0.0196, "step": 1530 }, { "epoch": 11.9921875, "grad_norm": 0.28020963072776794, "learning_rate": 2.6212319790301444e-07, "loss": 0.024, "step": 1535 }, { "epoch": 12.0, "step": 1536, "total_flos": 8.765889328981094e+16, "train_loss": 0.2845887634175597, "train_runtime": 2956.9183, "train_samples_per_second": 4.152, "train_steps_per_second": 0.519 } ], "logging_steps": 5, "max_steps": 1536, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.765889328981094e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }