| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 9.915492957746478, | |
| "eval_steps": 500, | |
| "global_step": 880, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.25962337851524353, | |
| "learning_rate": 7.407407407407407e-05, | |
| "loss": 1.49, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.30703499913215637, | |
| "learning_rate": 0.00014814814814814815, | |
| "loss": 1.3093, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.286909818649292, | |
| "learning_rate": 0.00019929660023446658, | |
| "loss": 1.0751, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.1287854015827179, | |
| "learning_rate": 0.00019695193434935522, | |
| "loss": 0.9134, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.11795257776975632, | |
| "learning_rate": 0.00019460726846424385, | |
| "loss": 0.8577, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.10355807095766068, | |
| "learning_rate": 0.0001922626025791325, | |
| "loss": 0.8221, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.1252811998128891, | |
| "learning_rate": 0.00018991793669402113, | |
| "loss": 0.8195, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.07339867949485779, | |
| "learning_rate": 0.00018757327080890974, | |
| "loss": 0.9076, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 0.08330203592777252, | |
| "learning_rate": 0.00018522860492379837, | |
| "loss": 0.8739, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 0.07597564905881882, | |
| "learning_rate": 0.00018288393903868698, | |
| "loss": 0.8268, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 0.07317127287387848, | |
| "learning_rate": 0.00018053927315357562, | |
| "loss": 0.8032, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 0.08088891208171844, | |
| "learning_rate": 0.00017819460726846423, | |
| "loss": 0.8332, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 0.078943632543087, | |
| "learning_rate": 0.0001758499413833529, | |
| "loss": 0.8537, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 0.06703902035951614, | |
| "learning_rate": 0.0001735052754982415, | |
| "loss": 0.8002, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 0.09882781654596329, | |
| "learning_rate": 0.00017116060961313014, | |
| "loss": 0.8755, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.08114507794380188, | |
| "learning_rate": 0.00016881594372801878, | |
| "loss": 0.7776, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 0.0816078633069992, | |
| "learning_rate": 0.00016647127784290739, | |
| "loss": 0.8618, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 0.08353616297245026, | |
| "learning_rate": 0.00016412661195779602, | |
| "loss": 0.8155, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 0.08548751473426819, | |
| "learning_rate": 0.00016178194607268463, | |
| "loss": 0.7937, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 0.08857505023479462, | |
| "learning_rate": 0.00015943728018757327, | |
| "loss": 0.8395, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 0.08709154278039932, | |
| "learning_rate": 0.0001570926143024619, | |
| "loss": 0.806, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 0.10497643798589706, | |
| "learning_rate": 0.00015474794841735054, | |
| "loss": 0.81, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 0.09364147484302521, | |
| "learning_rate": 0.00015240328253223918, | |
| "loss": 0.7869, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 0.09616605192422867, | |
| "learning_rate": 0.0001500586166471278, | |
| "loss": 0.8653, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 0.08535677939653397, | |
| "learning_rate": 0.00014771395076201643, | |
| "loss": 0.8312, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 0.07922427356243134, | |
| "learning_rate": 0.00014536928487690504, | |
| "loss": 0.8172, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 0.08045646548271179, | |
| "learning_rate": 0.00014302461899179367, | |
| "loss": 0.7989, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 0.09606581926345825, | |
| "learning_rate": 0.00014067995310668228, | |
| "loss": 0.8209, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 0.0926663875579834, | |
| "learning_rate": 0.00013833528722157095, | |
| "loss": 0.7844, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 0.1092584952712059, | |
| "learning_rate": 0.00013599062133645955, | |
| "loss": 0.8255, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 3.49, | |
| "grad_norm": 0.11442829668521881, | |
| "learning_rate": 0.0001336459554513482, | |
| "loss": 0.8044, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "grad_norm": 0.10057694464921951, | |
| "learning_rate": 0.00013130128956623683, | |
| "loss": 0.7837, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 3.72, | |
| "grad_norm": 0.11962959170341492, | |
| "learning_rate": 0.00012895662368112544, | |
| "loss": 0.84, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 0.10885532945394516, | |
| "learning_rate": 0.00012661195779601407, | |
| "loss": 0.8587, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 0.10991047322750092, | |
| "learning_rate": 0.00012426729191090268, | |
| "loss": 0.7863, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 4.06, | |
| "grad_norm": 0.1319924294948578, | |
| "learning_rate": 0.00012192262602579132, | |
| "loss": 0.7578, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 4.17, | |
| "grad_norm": 0.11193032562732697, | |
| "learning_rate": 0.00011957796014067997, | |
| "loss": 0.7627, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 4.28, | |
| "grad_norm": 0.12446644902229309, | |
| "learning_rate": 0.0001172332942555686, | |
| "loss": 0.7904, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 4.39, | |
| "grad_norm": 0.130494624376297, | |
| "learning_rate": 0.00011488862837045722, | |
| "loss": 0.8083, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 4.51, | |
| "grad_norm": 0.13371142745018005, | |
| "learning_rate": 0.00011254396248534584, | |
| "loss": 0.8268, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "grad_norm": 0.14128689467906952, | |
| "learning_rate": 0.00011019929660023446, | |
| "loss": 0.784, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 4.73, | |
| "grad_norm": 0.1244727149605751, | |
| "learning_rate": 0.0001078546307151231, | |
| "loss": 0.8522, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 4.85, | |
| "grad_norm": 0.13961324095726013, | |
| "learning_rate": 0.00010550996483001172, | |
| "loss": 0.8023, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 0.13047081232070923, | |
| "learning_rate": 0.00010316529894490035, | |
| "loss": 0.7936, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 5.07, | |
| "grad_norm": 0.13891151547431946, | |
| "learning_rate": 0.000100820633059789, | |
| "loss": 0.7946, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 5.18, | |
| "grad_norm": 0.1592375487089157, | |
| "learning_rate": 9.847596717467761e-05, | |
| "loss": 0.7945, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 5.3, | |
| "grad_norm": 0.1546567976474762, | |
| "learning_rate": 9.613130128956624e-05, | |
| "loss": 0.7681, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 5.41, | |
| "grad_norm": 0.16185538470745087, | |
| "learning_rate": 9.378663540445487e-05, | |
| "loss": 0.8085, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 5.52, | |
| "grad_norm": 0.13699068129062653, | |
| "learning_rate": 9.144196951934349e-05, | |
| "loss": 0.7998, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 5.63, | |
| "grad_norm": 0.17498376965522766, | |
| "learning_rate": 8.909730363423211e-05, | |
| "loss": 0.8519, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 5.75, | |
| "grad_norm": 0.1639554798603058, | |
| "learning_rate": 8.675263774912075e-05, | |
| "loss": 0.8053, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 5.86, | |
| "grad_norm": 0.14512798190116882, | |
| "learning_rate": 8.440797186400939e-05, | |
| "loss": 0.7585, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 5.97, | |
| "grad_norm": 0.15474794805049896, | |
| "learning_rate": 8.206330597889801e-05, | |
| "loss": 0.7231, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 6.08, | |
| "grad_norm": 0.16797874867916107, | |
| "learning_rate": 7.971864009378663e-05, | |
| "loss": 0.7668, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 6.2, | |
| "grad_norm": 0.16117724776268005, | |
| "learning_rate": 7.737397420867527e-05, | |
| "loss": 0.7881, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 6.31, | |
| "grad_norm": 0.16954748332500458, | |
| "learning_rate": 7.50293083235639e-05, | |
| "loss": 0.7327, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 6.42, | |
| "grad_norm": 0.17700648307800293, | |
| "learning_rate": 7.268464243845252e-05, | |
| "loss": 0.8237, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 6.54, | |
| "grad_norm": 0.180480495095253, | |
| "learning_rate": 7.033997655334114e-05, | |
| "loss": 0.7942, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 6.65, | |
| "grad_norm": 0.17488466203212738, | |
| "learning_rate": 6.799531066822978e-05, | |
| "loss": 0.7894, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 6.76, | |
| "grad_norm": 0.17924261093139648, | |
| "learning_rate": 6.565064478311841e-05, | |
| "loss": 0.76, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 6.87, | |
| "grad_norm": 0.19026713073253632, | |
| "learning_rate": 6.330597889800704e-05, | |
| "loss": 0.7691, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 6.99, | |
| "grad_norm": 0.18890035152435303, | |
| "learning_rate": 6.096131301289566e-05, | |
| "loss": 0.7678, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 7.1, | |
| "grad_norm": 0.17174021899700165, | |
| "learning_rate": 5.86166471277843e-05, | |
| "loss": 0.762, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 7.21, | |
| "grad_norm": 0.1877211183309555, | |
| "learning_rate": 5.627198124267292e-05, | |
| "loss": 0.8003, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 7.32, | |
| "grad_norm": 0.18858322501182556, | |
| "learning_rate": 5.392731535756155e-05, | |
| "loss": 0.7147, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 7.44, | |
| "grad_norm": 0.20587189495563507, | |
| "learning_rate": 5.1582649472450174e-05, | |
| "loss": 0.8007, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 7.55, | |
| "grad_norm": 0.1961081176996231, | |
| "learning_rate": 4.9237983587338804e-05, | |
| "loss": 0.7319, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 7.66, | |
| "grad_norm": 0.18121910095214844, | |
| "learning_rate": 4.6893317702227434e-05, | |
| "loss": 0.7718, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 7.77, | |
| "grad_norm": 0.21982868015766144, | |
| "learning_rate": 4.454865181711606e-05, | |
| "loss": 0.7633, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 7.89, | |
| "grad_norm": 0.2214839607477188, | |
| "learning_rate": 4.2203985932004694e-05, | |
| "loss": 0.7596, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.2265605926513672, | |
| "learning_rate": 3.985932004689332e-05, | |
| "loss": 0.7734, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 8.11, | |
| "grad_norm": 0.19514672458171844, | |
| "learning_rate": 3.751465416178195e-05, | |
| "loss": 0.7566, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 8.23, | |
| "grad_norm": 0.20062579214572906, | |
| "learning_rate": 3.516998827667057e-05, | |
| "loss": 0.727, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 8.34, | |
| "grad_norm": 0.1994123011827469, | |
| "learning_rate": 3.282532239155921e-05, | |
| "loss": 0.7859, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 8.45, | |
| "grad_norm": 0.19639483094215393, | |
| "learning_rate": 3.048065650644783e-05, | |
| "loss": 0.7591, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 8.56, | |
| "grad_norm": 0.21627792716026306, | |
| "learning_rate": 2.813599062133646e-05, | |
| "loss": 0.7817, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 8.68, | |
| "grad_norm": 0.22825635969638824, | |
| "learning_rate": 2.5791324736225087e-05, | |
| "loss": 0.7656, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 8.79, | |
| "grad_norm": 0.22350715100765228, | |
| "learning_rate": 2.3446658851113717e-05, | |
| "loss": 0.7532, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 8.9, | |
| "grad_norm": 0.19662606716156006, | |
| "learning_rate": 2.1101992966002347e-05, | |
| "loss": 0.7261, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 9.01, | |
| "grad_norm": 0.23435229063034058, | |
| "learning_rate": 1.8757327080890974e-05, | |
| "loss": 0.7549, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 9.13, | |
| "grad_norm": 0.21477362513542175, | |
| "learning_rate": 1.6412661195779604e-05, | |
| "loss": 0.7281, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 9.24, | |
| "grad_norm": 0.21965694427490234, | |
| "learning_rate": 1.406799531066823e-05, | |
| "loss": 0.7395, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 9.35, | |
| "grad_norm": 0.23001712560653687, | |
| "learning_rate": 1.1723329425556858e-05, | |
| "loss": 0.7111, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 9.46, | |
| "grad_norm": 0.2103584110736847, | |
| "learning_rate": 9.378663540445487e-06, | |
| "loss": 0.732, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 9.58, | |
| "grad_norm": 0.2366890013217926, | |
| "learning_rate": 7.033997655334115e-06, | |
| "loss": 0.7846, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 9.69, | |
| "grad_norm": 0.24565377831459045, | |
| "learning_rate": 4.689331770222743e-06, | |
| "loss": 0.736, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 9.8, | |
| "grad_norm": 0.24520188570022583, | |
| "learning_rate": 2.3446658851113717e-06, | |
| "loss": 0.7424, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 9.92, | |
| "grad_norm": 0.2370171844959259, | |
| "learning_rate": 0.0, | |
| "loss": 0.772, | |
| "step": 880 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 880, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "total_flos": 5.719447641532662e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |