{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.915492957746478, "eval_steps": 500, "global_step": 880, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11, "grad_norm": 0.25962337851524353, "learning_rate": 7.407407407407407e-05, "loss": 1.49, "step": 10 }, { "epoch": 0.23, "grad_norm": 0.30703499913215637, "learning_rate": 0.00014814814814814815, "loss": 1.3093, "step": 20 }, { "epoch": 0.34, "grad_norm": 0.286909818649292, "learning_rate": 0.00019929660023446658, "loss": 1.0751, "step": 30 }, { "epoch": 0.45, "grad_norm": 0.1287854015827179, "learning_rate": 0.00019695193434935522, "loss": 0.9134, "step": 40 }, { "epoch": 0.56, "grad_norm": 0.11795257776975632, "learning_rate": 0.00019460726846424385, "loss": 0.8577, "step": 50 }, { "epoch": 0.68, "grad_norm": 0.10355807095766068, "learning_rate": 0.0001922626025791325, "loss": 0.8221, "step": 60 }, { "epoch": 0.79, "grad_norm": 0.1252811998128891, "learning_rate": 0.00018991793669402113, "loss": 0.8195, "step": 70 }, { "epoch": 0.9, "grad_norm": 0.07339867949485779, "learning_rate": 0.00018757327080890974, "loss": 0.9076, "step": 80 }, { "epoch": 1.01, "grad_norm": 0.08330203592777252, "learning_rate": 0.00018522860492379837, "loss": 0.8739, "step": 90 }, { "epoch": 1.13, "grad_norm": 0.07597564905881882, "learning_rate": 0.00018288393903868698, "loss": 0.8268, "step": 100 }, { "epoch": 1.24, "grad_norm": 0.07317127287387848, "learning_rate": 0.00018053927315357562, "loss": 0.8032, "step": 110 }, { "epoch": 1.35, "grad_norm": 0.08088891208171844, "learning_rate": 0.00017819460726846423, "loss": 0.8332, "step": 120 }, { "epoch": 1.46, "grad_norm": 0.078943632543087, "learning_rate": 0.0001758499413833529, "loss": 0.8537, "step": 130 }, { "epoch": 1.58, "grad_norm": 0.06703902035951614, "learning_rate": 0.0001735052754982415, "loss": 0.8002, "step": 140 }, { "epoch": 1.69, "grad_norm": 0.09882781654596329, "learning_rate": 0.00017116060961313014, "loss": 0.8755, "step": 150 }, { "epoch": 1.8, "grad_norm": 0.08114507794380188, "learning_rate": 0.00016881594372801878, "loss": 0.7776, "step": 160 }, { "epoch": 1.92, "grad_norm": 0.0816078633069992, "learning_rate": 0.00016647127784290739, "loss": 0.8618, "step": 170 }, { "epoch": 2.03, "grad_norm": 0.08353616297245026, "learning_rate": 0.00016412661195779602, "loss": 0.8155, "step": 180 }, { "epoch": 2.14, "grad_norm": 0.08548751473426819, "learning_rate": 0.00016178194607268463, "loss": 0.7937, "step": 190 }, { "epoch": 2.25, "grad_norm": 0.08857505023479462, "learning_rate": 0.00015943728018757327, "loss": 0.8395, "step": 200 }, { "epoch": 2.37, "grad_norm": 0.08709154278039932, "learning_rate": 0.0001570926143024619, "loss": 0.806, "step": 210 }, { "epoch": 2.48, "grad_norm": 0.10497643798589706, "learning_rate": 0.00015474794841735054, "loss": 0.81, "step": 220 }, { "epoch": 2.59, "grad_norm": 0.09364147484302521, "learning_rate": 0.00015240328253223918, "loss": 0.7869, "step": 230 }, { "epoch": 2.7, "grad_norm": 0.09616605192422867, "learning_rate": 0.0001500586166471278, "loss": 0.8653, "step": 240 }, { "epoch": 2.82, "grad_norm": 0.08535677939653397, "learning_rate": 0.00014771395076201643, "loss": 0.8312, "step": 250 }, { "epoch": 2.93, "grad_norm": 0.07922427356243134, "learning_rate": 0.00014536928487690504, "loss": 0.8172, "step": 260 }, { "epoch": 3.04, "grad_norm": 0.08045646548271179, "learning_rate": 0.00014302461899179367, "loss": 0.7989, "step": 270 }, { "epoch": 3.15, "grad_norm": 0.09606581926345825, "learning_rate": 0.00014067995310668228, "loss": 0.8209, "step": 280 }, { "epoch": 3.27, "grad_norm": 0.0926663875579834, "learning_rate": 0.00013833528722157095, "loss": 0.7844, "step": 290 }, { "epoch": 3.38, "grad_norm": 0.1092584952712059, "learning_rate": 0.00013599062133645955, "loss": 0.8255, "step": 300 }, { "epoch": 3.49, "grad_norm": 0.11442829668521881, "learning_rate": 0.0001336459554513482, "loss": 0.8044, "step": 310 }, { "epoch": 3.61, "grad_norm": 0.10057694464921951, "learning_rate": 0.00013130128956623683, "loss": 0.7837, "step": 320 }, { "epoch": 3.72, "grad_norm": 0.11962959170341492, "learning_rate": 0.00012895662368112544, "loss": 0.84, "step": 330 }, { "epoch": 3.83, "grad_norm": 0.10885532945394516, "learning_rate": 0.00012661195779601407, "loss": 0.8587, "step": 340 }, { "epoch": 3.94, "grad_norm": 0.10991047322750092, "learning_rate": 0.00012426729191090268, "loss": 0.7863, "step": 350 }, { "epoch": 4.06, "grad_norm": 0.1319924294948578, "learning_rate": 0.00012192262602579132, "loss": 0.7578, "step": 360 }, { "epoch": 4.17, "grad_norm": 0.11193032562732697, "learning_rate": 0.00011957796014067997, "loss": 0.7627, "step": 370 }, { "epoch": 4.28, "grad_norm": 0.12446644902229309, "learning_rate": 0.0001172332942555686, "loss": 0.7904, "step": 380 }, { "epoch": 4.39, "grad_norm": 0.130494624376297, "learning_rate": 0.00011488862837045722, "loss": 0.8083, "step": 390 }, { "epoch": 4.51, "grad_norm": 0.13371142745018005, "learning_rate": 0.00011254396248534584, "loss": 0.8268, "step": 400 }, { "epoch": 4.62, "grad_norm": 0.14128689467906952, "learning_rate": 0.00011019929660023446, "loss": 0.784, "step": 410 }, { "epoch": 4.73, "grad_norm": 0.1244727149605751, "learning_rate": 0.0001078546307151231, "loss": 0.8522, "step": 420 }, { "epoch": 4.85, "grad_norm": 0.13961324095726013, "learning_rate": 0.00010550996483001172, "loss": 0.8023, "step": 430 }, { "epoch": 4.96, "grad_norm": 0.13047081232070923, "learning_rate": 0.00010316529894490035, "loss": 0.7936, "step": 440 }, { "epoch": 5.07, "grad_norm": 0.13891151547431946, "learning_rate": 0.000100820633059789, "loss": 0.7946, "step": 450 }, { "epoch": 5.18, "grad_norm": 0.1592375487089157, "learning_rate": 9.847596717467761e-05, "loss": 0.7945, "step": 460 }, { "epoch": 5.3, "grad_norm": 0.1546567976474762, "learning_rate": 9.613130128956624e-05, "loss": 0.7681, "step": 470 }, { "epoch": 5.41, "grad_norm": 0.16185538470745087, "learning_rate": 9.378663540445487e-05, "loss": 0.8085, "step": 480 }, { "epoch": 5.52, "grad_norm": 0.13699068129062653, "learning_rate": 9.144196951934349e-05, "loss": 0.7998, "step": 490 }, { "epoch": 5.63, "grad_norm": 0.17498376965522766, "learning_rate": 8.909730363423211e-05, "loss": 0.8519, "step": 500 }, { "epoch": 5.75, "grad_norm": 0.1639554798603058, "learning_rate": 8.675263774912075e-05, "loss": 0.8053, "step": 510 }, { "epoch": 5.86, "grad_norm": 0.14512798190116882, "learning_rate": 8.440797186400939e-05, "loss": 0.7585, "step": 520 }, { "epoch": 5.97, "grad_norm": 0.15474794805049896, "learning_rate": 8.206330597889801e-05, "loss": 0.7231, "step": 530 }, { "epoch": 6.08, "grad_norm": 0.16797874867916107, "learning_rate": 7.971864009378663e-05, "loss": 0.7668, "step": 540 }, { "epoch": 6.2, "grad_norm": 0.16117724776268005, "learning_rate": 7.737397420867527e-05, "loss": 0.7881, "step": 550 }, { "epoch": 6.31, "grad_norm": 0.16954748332500458, "learning_rate": 7.50293083235639e-05, "loss": 0.7327, "step": 560 }, { "epoch": 6.42, "grad_norm": 0.17700648307800293, "learning_rate": 7.268464243845252e-05, "loss": 0.8237, "step": 570 }, { "epoch": 6.54, "grad_norm": 0.180480495095253, "learning_rate": 7.033997655334114e-05, "loss": 0.7942, "step": 580 }, { "epoch": 6.65, "grad_norm": 0.17488466203212738, "learning_rate": 6.799531066822978e-05, "loss": 0.7894, "step": 590 }, { "epoch": 6.76, "grad_norm": 0.17924261093139648, "learning_rate": 6.565064478311841e-05, "loss": 0.76, "step": 600 }, { "epoch": 6.87, "grad_norm": 0.19026713073253632, "learning_rate": 6.330597889800704e-05, "loss": 0.7691, "step": 610 }, { "epoch": 6.99, "grad_norm": 0.18890035152435303, "learning_rate": 6.096131301289566e-05, "loss": 0.7678, "step": 620 }, { "epoch": 7.1, "grad_norm": 0.17174021899700165, "learning_rate": 5.86166471277843e-05, "loss": 0.762, "step": 630 }, { "epoch": 7.21, "grad_norm": 0.1877211183309555, "learning_rate": 5.627198124267292e-05, "loss": 0.8003, "step": 640 }, { "epoch": 7.32, "grad_norm": 0.18858322501182556, "learning_rate": 5.392731535756155e-05, "loss": 0.7147, "step": 650 }, { "epoch": 7.44, "grad_norm": 0.20587189495563507, "learning_rate": 5.1582649472450174e-05, "loss": 0.8007, "step": 660 }, { "epoch": 7.55, "grad_norm": 0.1961081176996231, "learning_rate": 4.9237983587338804e-05, "loss": 0.7319, "step": 670 }, { "epoch": 7.66, "grad_norm": 0.18121910095214844, "learning_rate": 4.6893317702227434e-05, "loss": 0.7718, "step": 680 }, { "epoch": 7.77, "grad_norm": 0.21982868015766144, "learning_rate": 4.454865181711606e-05, "loss": 0.7633, "step": 690 }, { "epoch": 7.89, "grad_norm": 0.2214839607477188, "learning_rate": 4.2203985932004694e-05, "loss": 0.7596, "step": 700 }, { "epoch": 8.0, "grad_norm": 0.2265605926513672, "learning_rate": 3.985932004689332e-05, "loss": 0.7734, "step": 710 }, { "epoch": 8.11, "grad_norm": 0.19514672458171844, "learning_rate": 3.751465416178195e-05, "loss": 0.7566, "step": 720 }, { "epoch": 8.23, "grad_norm": 0.20062579214572906, "learning_rate": 3.516998827667057e-05, "loss": 0.727, "step": 730 }, { "epoch": 8.34, "grad_norm": 0.1994123011827469, "learning_rate": 3.282532239155921e-05, "loss": 0.7859, "step": 740 }, { "epoch": 8.45, "grad_norm": 0.19639483094215393, "learning_rate": 3.048065650644783e-05, "loss": 0.7591, "step": 750 }, { "epoch": 8.56, "grad_norm": 0.21627792716026306, "learning_rate": 2.813599062133646e-05, "loss": 0.7817, "step": 760 }, { "epoch": 8.68, "grad_norm": 0.22825635969638824, "learning_rate": 2.5791324736225087e-05, "loss": 0.7656, "step": 770 }, { "epoch": 8.79, "grad_norm": 0.22350715100765228, "learning_rate": 2.3446658851113717e-05, "loss": 0.7532, "step": 780 }, { "epoch": 8.9, "grad_norm": 0.19662606716156006, "learning_rate": 2.1101992966002347e-05, "loss": 0.7261, "step": 790 }, { "epoch": 9.01, "grad_norm": 0.23435229063034058, "learning_rate": 1.8757327080890974e-05, "loss": 0.7549, "step": 800 }, { "epoch": 9.13, "grad_norm": 0.21477362513542175, "learning_rate": 1.6412661195779604e-05, "loss": 0.7281, "step": 810 }, { "epoch": 9.24, "grad_norm": 0.21965694427490234, "learning_rate": 1.406799531066823e-05, "loss": 0.7395, "step": 820 }, { "epoch": 9.35, "grad_norm": 0.23001712560653687, "learning_rate": 1.1723329425556858e-05, "loss": 0.7111, "step": 830 }, { "epoch": 9.46, "grad_norm": 0.2103584110736847, "learning_rate": 9.378663540445487e-06, "loss": 0.732, "step": 840 }, { "epoch": 9.58, "grad_norm": 0.2366890013217926, "learning_rate": 7.033997655334115e-06, "loss": 0.7846, "step": 850 }, { "epoch": 9.69, "grad_norm": 0.24565377831459045, "learning_rate": 4.689331770222743e-06, "loss": 0.736, "step": 860 }, { "epoch": 9.8, "grad_norm": 0.24520188570022583, "learning_rate": 2.3446658851113717e-06, "loss": 0.7424, "step": 870 }, { "epoch": 9.92, "grad_norm": 0.2370171844959259, "learning_rate": 0.0, "loss": 0.772, "step": 880 } ], "logging_steps": 10, "max_steps": 880, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 5.719447641532662e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }