{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.142095914742452,
  "eval_steps": 1000,
  "global_step": 720,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.14209591474245115,
      "grad_norm": 2.0780832767486572,
      "learning_rate": 0.0003999999920056205,
      "loss": 2.8691,
      "step": 10
    },
    {
      "epoch": 0.2841918294849023,
      "grad_norm": 1.7782244682312012,
      "learning_rate": 0.0003999999643707292,
      "loss": 1.628,
      "step": 20
    },
    {
      "epoch": 0.42628774422735344,
      "grad_norm": 1.9934731721878052,
      "learning_rate": 0.0003999999169966327,
      "loss": 1.2529,
      "step": 30
    },
    {
      "epoch": 0.5683836589698046,
      "grad_norm": 0.9428825378417969,
      "learning_rate": 0.00039999984988333586,
      "loss": 1.2003,
      "step": 40
    },
    {
      "epoch": 0.7104795737122558,
      "grad_norm": 0.8945791721343994,
      "learning_rate": 0.00039999976303084516,
      "loss": 1.1966,
      "step": 50
    },
    {
      "epoch": 0.8525754884547069,
      "grad_norm": 0.7803293466567993,
      "learning_rate": 0.0003999996564391692,
      "loss": 1.1803,
      "step": 60
    },
    {
      "epoch": 0.9946714031971581,
      "grad_norm": 0.6183452010154724,
      "learning_rate": 0.00039999953010831846,
      "loss": 1.1677,
      "step": 70
    },
    {
      "epoch": 1.1278863232682061,
      "grad_norm": 0.4529220461845398,
      "learning_rate": 0.0003999993840383055,
      "loss": 1.1569,
      "step": 80
    },
    {
      "epoch": 1.2699822380106571,
      "grad_norm": 0.4726000130176544,
      "learning_rate": 0.0003999992182291447,
      "loss": 1.1582,
      "step": 90
    },
    {
      "epoch": 1.4120781527531083,
      "grad_norm": 0.4730006754398346,
      "learning_rate": 0.0003999990326808524,
      "loss": 1.1674,
      "step": 100
    },
    {
      "epoch": 1.5541740674955595,
      "grad_norm": 0.44843927025794983,
      "learning_rate": 0.000399998827393447,
      "loss": 1.1764,
      "step": 110
    },
    {
      "epoch": 1.6962699822380105,
      "grad_norm": 0.3917418420314789,
      "learning_rate": 0.00039999860236694857,
      "loss": 1.1582,
      "step": 120
    },
    {
      "epoch": 1.838365896980462,
      "grad_norm": 0.4569181203842163,
      "learning_rate": 0.00039999835760137954,
      "loss": 1.164,
      "step": 130
    },
    {
      "epoch": 1.980461811722913,
      "grad_norm": 0.40389877557754517,
      "learning_rate": 0.0003999980930967639,
      "loss": 1.1747,
      "step": 140
    },
    {
      "epoch": 2.113676731793961,
      "grad_norm": 0.540137767791748,
      "learning_rate": 0.0003999978088531279,
      "loss": 1.1524,
      "step": 150
    },
    {
      "epoch": 2.2557726465364123,
      "grad_norm": 0.515496551990509,
      "learning_rate": 0.00039999750487049944,
      "loss": 1.1403,
      "step": 160
    },
    {
      "epoch": 2.3978685612788633,
      "grad_norm": 0.8278722763061523,
      "learning_rate": 0.0003999971811489086,
      "loss": 1.1333,
      "step": 170
    },
    {
      "epoch": 2.5399644760213143,
      "grad_norm": 0.7663260102272034,
      "learning_rate": 0.0003999968376883874,
      "loss": 1.1295,
      "step": 180
    },
    {
      "epoch": 2.6820603907637657,
      "grad_norm": 0.9943056106567383,
      "learning_rate": 0.0003999964744889696,
      "loss": 1.1283,
      "step": 190
    },
    {
      "epoch": 2.8241563055062167,
      "grad_norm": 0.8294516205787659,
      "learning_rate": 0.0003999960915506912,
      "loss": 1.1071,
      "step": 200
    },
    {
      "epoch": 2.966252220248668,
      "grad_norm": 0.8728694915771484,
      "learning_rate": 0.00039999568887358984,
      "loss": 1.1187,
      "step": 210
    },
    {
      "epoch": 3.0994671403197156,
      "grad_norm": 0.7422124147415161,
      "learning_rate": 0.00039999526645770534,
      "loss": 1.0982,
      "step": 220
    },
    {
      "epoch": 3.241563055062167,
      "grad_norm": 0.7153337597846985,
      "learning_rate": 0.0003999948243030794,
      "loss": 1.0884,
      "step": 230
    },
    {
      "epoch": 3.383658969804618,
      "grad_norm": 1.1282627582550049,
      "learning_rate": 0.0003999943624097556,
      "loss": 1.0818,
      "step": 240
    },
    {
      "epoch": 3.5257548845470694,
      "grad_norm": 0.6651580333709717,
      "learning_rate": 0.0003999938807777796,
      "loss": 1.088,
      "step": 250
    },
    {
      "epoch": 3.6678507992895204,
      "grad_norm": 0.9163554310798645,
      "learning_rate": 0.0003999933794071989,
      "loss": 1.0817,
      "step": 260
    },
    {
      "epoch": 3.8099467140319714,
      "grad_norm": 1.1498912572860718,
      "learning_rate": 0.000399992858298063,
      "loss": 1.0977,
      "step": 270
    },
    {
      "epoch": 3.952042628774423,
      "grad_norm": 0.840567409992218,
      "learning_rate": 0.0003999923174504233,
      "loss": 1.08,
      "step": 280
    },
    {
      "epoch": 4.085257548845471,
      "grad_norm": 0.8114287257194519,
      "learning_rate": 0.00039999175686433324,
      "loss": 1.0712,
      "step": 290
    },
    {
      "epoch": 4.227353463587922,
      "grad_norm": 0.6106327772140503,
      "learning_rate": 0.00039999117653984805,
      "loss": 1.0403,
      "step": 300
    },
    {
      "epoch": 4.369449378330373,
      "grad_norm": 0.6414577960968018,
      "learning_rate": 0.0003999905764770251,
      "loss": 1.0464,
      "step": 310
    },
    {
      "epoch": 4.511545293072825,
      "grad_norm": 0.6204348802566528,
      "learning_rate": 0.00039998995667592355,
      "loss": 1.0475,
      "step": 320
    },
    {
      "epoch": 4.653641207815276,
      "grad_norm": 0.6873607039451599,
      "learning_rate": 0.0003999893171366046,
      "loss": 1.0575,
      "step": 330
    },
    {
      "epoch": 4.7957371225577266,
      "grad_norm": 0.6087599396705627,
      "learning_rate": 0.0003999886578591313,
      "loss": 1.0519,
      "step": 340
    },
    {
      "epoch": 4.9378330373001775,
      "grad_norm": 0.582972526550293,
      "learning_rate": 0.0003999879788435689,
      "loss": 1.0688,
      "step": 350
    },
    {
      "epoch": 5.071047957371226,
      "grad_norm": 0.759579062461853,
      "learning_rate": 0.0003999872800899843,
      "loss": 1.0349,
      "step": 360
    },
    {
      "epoch": 5.213143872113677,
      "grad_norm": 1.00662100315094,
      "learning_rate": 0.00039998656159844636,
      "loss": 0.9849,
      "step": 370
    },
    {
      "epoch": 5.355239786856128,
      "grad_norm": 0.9577768445014954,
      "learning_rate": 0.00039998582336902617,
      "loss": 0.9796,
      "step": 380
    },
    {
      "epoch": 5.497335701598579,
      "grad_norm": 0.7910038232803345,
      "learning_rate": 0.0003999850654017965,
      "loss": 1.0001,
      "step": 390
    },
    {
      "epoch": 5.63943161634103,
      "grad_norm": 0.9464731216430664,
      "learning_rate": 0.00039998428769683216,
      "loss": 1.0018,
      "step": 400
    },
    {
      "epoch": 5.781527531083482,
      "grad_norm": 0.8687145709991455,
      "learning_rate": 0.0003999834902542099,
      "loss": 1.0081,
      "step": 410
    },
    {
      "epoch": 5.923623445825933,
      "grad_norm": 1.0073539018630981,
      "learning_rate": 0.00039998267307400854,
      "loss": 1.0058,
      "step": 420
    },
    {
      "epoch": 6.05683836589698,
      "grad_norm": 0.9562686085700989,
      "learning_rate": 0.00039998183615630857,
      "loss": 0.9797,
      "step": 430
    },
    {
      "epoch": 6.198934280639431,
      "grad_norm": 1.238220453262329,
      "learning_rate": 0.00039998097950119264,
      "loss": 0.8872,
      "step": 440
    },
    {
      "epoch": 6.341030195381883,
      "grad_norm": 1.7170666456222534,
      "learning_rate": 0.00039998010310874537,
      "loss": 0.8911,
      "step": 450
    },
    {
      "epoch": 6.483126110124334,
      "grad_norm": 1.2328404188156128,
      "learning_rate": 0.0003999792069790532,
      "loss": 0.8988,
      "step": 460
    },
    {
      "epoch": 6.625222024866785,
      "grad_norm": 1.1609599590301514,
      "learning_rate": 0.00039997829111220454,
      "loss": 0.8969,
      "step": 470
    },
    {
      "epoch": 6.767317939609236,
      "grad_norm": 1.470108985900879,
      "learning_rate": 0.00039997735550828984,
      "loss": 0.9174,
      "step": 480
    },
    {
      "epoch": 6.909413854351687,
      "grad_norm": 1.2714250087738037,
      "learning_rate": 0.00039997640016740146,
      "loss": 0.9379,
      "step": 490
    },
    {
      "epoch": 7.042628774422735,
      "grad_norm": 1.3590861558914185,
      "learning_rate": 0.00039997542508963364,
      "loss": 0.8876,
      "step": 500
    },
    {
      "epoch": 7.184724689165186,
      "grad_norm": 1.5455981492996216,
      "learning_rate": 0.0003999744302750826,
      "loss": 0.7601,
      "step": 510
    },
    {
      "epoch": 7.326820603907637,
      "grad_norm": 1.5377577543258667,
      "learning_rate": 0.00039997341572384654,
      "loss": 0.7651,
      "step": 520
    },
    {
      "epoch": 7.468916518650089,
      "grad_norm": 1.4564142227172852,
      "learning_rate": 0.0003999723814360256,
      "loss": 0.7778,
      "step": 530
    },
    {
      "epoch": 7.61101243339254,
      "grad_norm": 1.4464644193649292,
      "learning_rate": 0.00039997132741172186,
      "loss": 0.776,
      "step": 540
    },
    {
      "epoch": 7.753108348134991,
      "grad_norm": 1.7535293102264404,
      "learning_rate": 0.0003999702536510394,
      "loss": 0.8003,
      "step": 550
    },
    {
      "epoch": 7.895204262877442,
      "grad_norm": 1.4069218635559082,
      "learning_rate": 0.00039996916015408413,
      "loss": 0.8235,
      "step": 560
    },
    {
      "epoch": 8.02841918294849,
      "grad_norm": 1.3131003379821777,
      "learning_rate": 0.00039996804692096397,
      "loss": 0.7923,
      "step": 570
    },
    {
      "epoch": 8.170515097690942,
      "grad_norm": 2.1319029331207275,
      "learning_rate": 0.0003999669139517888,
      "loss": 0.6309,
      "step": 580
    },
    {
      "epoch": 8.312611012433393,
      "grad_norm": 2.138604164123535,
      "learning_rate": 0.00039996576124667053,
      "loss": 0.6185,
      "step": 590
    },
    {
      "epoch": 8.454706927175843,
      "grad_norm": 1.6516451835632324,
      "learning_rate": 0.0003999645888057228,
      "loss": 0.6471,
      "step": 600
    },
    {
      "epoch": 8.596802841918295,
      "grad_norm": 1.8185665607452393,
      "learning_rate": 0.00039996339662906133,
      "loss": 0.6771,
      "step": 610
    },
    {
      "epoch": 8.738898756660745,
      "grad_norm": 1.5840052366256714,
      "learning_rate": 0.0003999621847168039,
      "loss": 0.6825,
      "step": 620
    },
    {
      "epoch": 8.880994671403197,
      "grad_norm": 1.642910122871399,
      "learning_rate": 0.00039996095306907,
      "loss": 0.6948,
      "step": 630
    },
    {
      "epoch": 9.014209591474245,
      "grad_norm": 1.924458384513855,
      "learning_rate": 0.0003999597016859813,
      "loss": 0.6845,
      "step": 640
    },
    {
      "epoch": 9.156305506216697,
      "grad_norm": 1.9011436700820923,
      "learning_rate": 0.0003999584305676612,
      "loss": 0.5251,
      "step": 650
    },
    {
      "epoch": 9.298401420959147,
      "grad_norm": 1.732225775718689,
      "learning_rate": 0.0003999571397142352,
      "loss": 0.5294,
      "step": 660
    },
    {
      "epoch": 9.440497335701599,
      "grad_norm": 1.7909804582595825,
      "learning_rate": 0.00039995582912583067,
      "loss": 0.5454,
      "step": 670
    },
    {
      "epoch": 9.58259325044405,
      "grad_norm": 1.893707036972046,
      "learning_rate": 0.000399954498802577,
      "loss": 0.5569,
      "step": 680
    },
    {
      "epoch": 9.7246891651865,
      "grad_norm": 1.6991349458694458,
      "learning_rate": 0.0003999531487446055,
      "loss": 0.5672,
      "step": 690
    },
    {
      "epoch": 9.866785079928952,
      "grad_norm": 2.1794662475585938,
      "learning_rate": 0.0003999517789520494,
      "loss": 0.5933,
      "step": 700
    },
    {
      "epoch": 10.0,
      "grad_norm": 2.9984130859375,
      "learning_rate": 0.0003999503894250439,
      "loss": 0.5991,
      "step": 710
    },
    {
      "epoch": 10.142095914742452,
      "grad_norm": 1.656080961227417,
      "learning_rate": 0.0003999489801637261,
      "loss": 0.4284,
      "step": 720
    }
  ],
  "logging_steps": 10,
  "max_steps": 100000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1429,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.950552752639181e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}