{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1232, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016233766233766232, "grad_norm": 64.73149845425546, "learning_rate": 7.258064516129033e-07, "loss": 3.3198, "step": 10 }, { "epoch": 0.032467532467532464, "grad_norm": 33.62473079554, "learning_rate": 1.5322580645161292e-06, "loss": 3.0328, "step": 20 }, { "epoch": 0.048701298701298704, "grad_norm": 10.429545095678938, "learning_rate": 2.338709677419355e-06, "loss": 1.8054, "step": 30 }, { "epoch": 0.06493506493506493, "grad_norm": 2.1555639207187185, "learning_rate": 3.145161290322581e-06, "loss": 1.2848, "step": 40 }, { "epoch": 0.08116883116883117, "grad_norm": 2.109927665115888, "learning_rate": 3.951612903225807e-06, "loss": 0.9412, "step": 50 }, { "epoch": 0.09740259740259741, "grad_norm": 3.835176262683339, "learning_rate": 4.758064516129033e-06, "loss": 0.9246, "step": 60 }, { "epoch": 0.11363636363636363, "grad_norm": 2.3665002147902174, "learning_rate": 5.564516129032258e-06, "loss": 0.6407, "step": 70 }, { "epoch": 0.12987012987012986, "grad_norm": 2.0758395514553736, "learning_rate": 6.370967741935485e-06, "loss": 0.4818, "step": 80 }, { "epoch": 0.1461038961038961, "grad_norm": 1.3197745285431102, "learning_rate": 7.177419354838711e-06, "loss": 0.3839, "step": 90 }, { "epoch": 0.16233766233766234, "grad_norm": 1.167725739422115, "learning_rate": 7.983870967741935e-06, "loss": 0.3852, "step": 100 }, { "epoch": 0.17857142857142858, "grad_norm": 1.4271468826660396, "learning_rate": 8.790322580645163e-06, "loss": 0.4254, "step": 110 }, { "epoch": 0.19480519480519481, "grad_norm": 1.3041783516757963, "learning_rate": 9.596774193548389e-06, "loss": 0.3735, "step": 120 }, { "epoch": 0.21103896103896103, "grad_norm": 1.6445704066341653, "learning_rate": 9.999497549864013e-06, "loss": 0.3139, "step": 130 }, { "epoch": 0.22727272727272727, "grad_norm": 2.26609564584476, "learning_rate": 9.995478554650548e-06, "loss": 0.3639, "step": 140 }, { "epoch": 0.2435064935064935, "grad_norm": 1.3241916296568497, "learning_rate": 9.987443795012786e-06, "loss": 0.3085, "step": 150 }, { "epoch": 0.2597402597402597, "grad_norm": 1.8795485094172018, "learning_rate": 9.975399729931894e-06, "loss": 0.3401, "step": 160 }, { "epoch": 0.275974025974026, "grad_norm": 1.2518399347916536, "learning_rate": 9.959356041388799e-06, "loss": 0.3232, "step": 170 }, { "epoch": 0.2922077922077922, "grad_norm": 1.3254250765286968, "learning_rate": 9.939325626581032e-06, "loss": 0.3167, "step": 180 }, { "epoch": 0.30844155844155846, "grad_norm": 1.3464203708686748, "learning_rate": 9.915324587554933e-06, "loss": 0.2968, "step": 190 }, { "epoch": 0.3246753246753247, "grad_norm": 0.9391334903099222, "learning_rate": 9.887372218261547e-06, "loss": 0.2933, "step": 200 }, { "epoch": 0.3409090909090909, "grad_norm": 1.3224980977395502, "learning_rate": 9.8554909890466e-06, "loss": 0.3062, "step": 210 }, { "epoch": 0.35714285714285715, "grad_norm": 2.727455179143666, "learning_rate": 9.819706528587036e-06, "loss": 0.2979, "step": 220 }, { "epoch": 0.37337662337662336, "grad_norm": 1.5019191545607462, "learning_rate": 9.780047603288656e-06, "loss": 0.2838, "step": 230 }, { "epoch": 0.38961038961038963, "grad_norm": 1.6879557721862124, "learning_rate": 9.736546094161375e-06, "loss": 0.2995, "step": 240 }, { "epoch": 0.40584415584415584, "grad_norm": 1.4809220185905931, "learning_rate": 9.689236971190715e-06, "loss": 0.2975, "step": 250 }, { "epoch": 0.42207792207792205, "grad_norm": 1.9452778336930967, "learning_rate": 9.638158265226155e-06, "loss": 0.2862, "step": 260 }, { "epoch": 0.4383116883116883, "grad_norm": 2.1580249714158084, "learning_rate": 9.583351037408886e-06, "loss": 0.2805, "step": 270 }, { "epoch": 0.45454545454545453, "grad_norm": 2.2778945637898516, "learning_rate": 9.52485934616359e-06, "loss": 0.2783, "step": 280 }, { "epoch": 0.4707792207792208, "grad_norm": 1.3731917008379857, "learning_rate": 9.46273021178077e-06, "loss": 0.2539, "step": 290 }, { "epoch": 0.487012987012987, "grad_norm": 0.9640448637905654, "learning_rate": 9.397013578618073e-06, "loss": 0.2732, "step": 300 }, { "epoch": 0.5032467532467533, "grad_norm": 1.879194030041846, "learning_rate": 9.327762274951042e-06, "loss": 0.2789, "step": 310 }, { "epoch": 0.5194805194805194, "grad_norm": 2.0274067648187875, "learning_rate": 9.255031970505518e-06, "loss": 0.2995, "step": 320 }, { "epoch": 0.5357142857142857, "grad_norm": 1.1439713018352973, "learning_rate": 9.178881131705882e-06, "loss": 0.2626, "step": 330 }, { "epoch": 0.551948051948052, "grad_norm": 1.217033674995871, "learning_rate": 9.099370974675074e-06, "loss": 0.2437, "step": 340 }, { "epoch": 0.5681818181818182, "grad_norm": 1.1243152390495403, "learning_rate": 9.016565416024181e-06, "loss": 0.2676, "step": 350 }, { "epoch": 0.5844155844155844, "grad_norm": 1.4806002470901694, "learning_rate": 8.930531021471167e-06, "loss": 0.2656, "step": 360 }, { "epoch": 0.6006493506493507, "grad_norm": 1.1288154615791715, "learning_rate": 8.841336952330033e-06, "loss": 0.2906, "step": 370 }, { "epoch": 0.6168831168831169, "grad_norm": 1.492508732730717, "learning_rate": 8.749054909913439e-06, "loss": 0.2576, "step": 380 }, { "epoch": 0.6331168831168831, "grad_norm": 1.1458749024369619, "learning_rate": 8.653759077893453e-06, "loss": 0.264, "step": 390 }, { "epoch": 0.6493506493506493, "grad_norm": 0.9392039887042416, "learning_rate": 8.555526062666803e-06, "loss": 0.2606, "step": 400 }, { "epoch": 0.6655844155844156, "grad_norm": 1.891912924932221, "learning_rate": 8.454434831772544e-06, "loss": 0.2685, "step": 410 }, { "epoch": 0.6818181818181818, "grad_norm": 1.313388756424559, "learning_rate": 8.350566650411633e-06, "loss": 0.2611, "step": 420 }, { "epoch": 0.698051948051948, "grad_norm": 1.0777638252230697, "learning_rate": 8.244005016119482e-06, "loss": 0.2475, "step": 430 }, { "epoch": 0.7142857142857143, "grad_norm": 1.0188317103197562, "learning_rate": 8.13483559164398e-06, "loss": 0.2855, "step": 440 }, { "epoch": 0.7305194805194806, "grad_norm": 0.8703924340642463, "learning_rate": 8.02314613608292e-06, "loss": 0.2518, "step": 450 }, { "epoch": 0.7467532467532467, "grad_norm": 1.0173301268541866, "learning_rate": 7.909026434336252e-06, "loss": 0.2696, "step": 460 }, { "epoch": 0.762987012987013, "grad_norm": 1.2218157167293324, "learning_rate": 7.792568224929797e-06, "loss": 0.2612, "step": 470 }, { "epoch": 0.7792207792207793, "grad_norm": 1.1796113115060929, "learning_rate": 7.673865126268506e-06, "loss": 0.2506, "step": 480 }, { "epoch": 0.7954545454545454, "grad_norm": 1.376127532504407, "learning_rate": 7.55301256137851e-06, "loss": 0.2459, "step": 490 }, { "epoch": 0.8116883116883117, "grad_norm": 0.8520527207533138, "learning_rate": 7.430107681198477e-06, "loss": 0.2296, "step": 500 }, { "epoch": 0.827922077922078, "grad_norm": 1.6057696993562458, "learning_rate": 7.305249286481928e-06, "loss": 0.2707, "step": 510 }, { "epoch": 0.8441558441558441, "grad_norm": 1.1118264521022472, "learning_rate": 7.1785377483733045e-06, "loss": 0.2453, "step": 520 }, { "epoch": 0.8603896103896104, "grad_norm": 0.986033743735511, "learning_rate": 7.050074927721639e-06, "loss": 0.2653, "step": 530 }, { "epoch": 0.8766233766233766, "grad_norm": 1.1047991137937976, "learning_rate": 6.9199640931966615e-06, "loss": 0.2401, "step": 540 }, { "epoch": 0.8928571428571429, "grad_norm": 0.8182265905464219, "learning_rate": 6.788309838273211e-06, "loss": 0.2453, "step": 550 }, { "epoch": 0.9090909090909091, "grad_norm": 1.0242844709669037, "learning_rate": 6.655217997150642e-06, "loss": 0.2562, "step": 560 }, { "epoch": 0.9253246753246753, "grad_norm": 1.7261090545065465, "learning_rate": 6.520795559674851e-06, "loss": 0.2618, "step": 570 }, { "epoch": 0.9415584415584416, "grad_norm": 1.166838903170947, "learning_rate": 6.385150585331299e-06, "loss": 0.2445, "step": 580 }, { "epoch": 0.9577922077922078, "grad_norm": 1.0102156282992951, "learning_rate": 6.248392116378167e-06, "loss": 0.2381, "step": 590 }, { "epoch": 0.974025974025974, "grad_norm": 1.2728489163523269, "learning_rate": 6.110630090189493e-06, "loss": 0.2495, "step": 600 }, { "epoch": 0.9902597402597403, "grad_norm": 1.0402345685218206, "learning_rate": 5.971975250878722e-06, "loss": 0.2607, "step": 610 }, { "epoch": 1.0064935064935066, "grad_norm": 0.9570526081816403, "learning_rate": 5.832539060273763e-06, "loss": 0.2594, "step": 620 }, { "epoch": 1.0227272727272727, "grad_norm": 0.8166533778024863, "learning_rate": 5.692433608315059e-06, "loss": 0.1734, "step": 630 }, { "epoch": 1.0389610389610389, "grad_norm": 0.8521218142738738, "learning_rate": 5.5517715229487554e-06, "loss": 0.1661, "step": 640 }, { "epoch": 1.0551948051948052, "grad_norm": 2.0344524322648225, "learning_rate": 5.410665879587366e-06, "loss": 0.1773, "step": 650 }, { "epoch": 1.0714285714285714, "grad_norm": 1.0760586346052228, "learning_rate": 5.269230110210725e-06, "loss": 0.1832, "step": 660 }, { "epoch": 1.0876623376623376, "grad_norm": 0.6221075887847101, "learning_rate": 5.127577912180312e-06, "loss": 0.171, "step": 670 }, { "epoch": 1.103896103896104, "grad_norm": 2.4289309131257037, "learning_rate": 4.9858231568402325e-06, "loss": 0.1869, "step": 680 }, { "epoch": 1.12012987012987, "grad_norm": 0.7117445693553235, "learning_rate": 4.844079797978345e-06, "loss": 0.1715, "step": 690 }, { "epoch": 1.1363636363636362, "grad_norm": 2.0791195295672558, "learning_rate": 4.7024617802211105e-06, "loss": 0.1918, "step": 700 }, { "epoch": 1.1525974025974026, "grad_norm": 1.3384619147891226, "learning_rate": 4.5610829474358056e-06, "loss": 0.1849, "step": 710 }, { "epoch": 1.1688311688311688, "grad_norm": 0.7431163309985721, "learning_rate": 4.420056951213726e-06, "loss": 0.1706, "step": 720 }, { "epoch": 1.1850649350649352, "grad_norm": 1.0917320780107198, "learning_rate": 4.279497159507984e-06, "loss": 0.1774, "step": 730 }, { "epoch": 1.2012987012987013, "grad_norm": 0.6433212768530576, "learning_rate": 4.139516565499277e-06, "loss": 0.1725, "step": 740 }, { "epoch": 1.2175324675324675, "grad_norm": 1.0367698466098962, "learning_rate": 4.000227696762967e-06, "loss": 0.2098, "step": 750 }, { "epoch": 1.2337662337662338, "grad_norm": 1.1673609045846502, "learning_rate": 3.861742524810421e-06, "loss": 0.1837, "step": 760 }, { "epoch": 1.25, "grad_norm": 1.036708817011231, "learning_rate": 3.7241723750773812e-06, "loss": 0.1819, "step": 770 }, { "epoch": 1.2662337662337662, "grad_norm": 2.085452988223766, "learning_rate": 3.587627837431679e-06, "loss": 0.168, "step": 780 }, { "epoch": 1.2824675324675325, "grad_norm": 1.1525296924575061, "learning_rate": 3.4522186772722915e-06, "loss": 0.1516, "step": 790 }, { "epoch": 1.2987012987012987, "grad_norm": 1.9400938633777363, "learning_rate": 3.3180537472911334e-06, "loss": 0.1749, "step": 800 }, { "epoch": 1.314935064935065, "grad_norm": 0.8648790732830897, "learning_rate": 3.185240899968587e-06, "loss": 0.1665, "step": 810 }, { "epoch": 1.3311688311688312, "grad_norm": 1.1576768419228283, "learning_rate": 3.053886900873062e-06, "loss": 0.1847, "step": 820 }, { "epoch": 1.3474025974025974, "grad_norm": 0.7801672735784758, "learning_rate": 2.9240973428343135e-06, "loss": 0.1852, "step": 830 }, { "epoch": 1.3636363636363638, "grad_norm": 0.6679216934793272, "learning_rate": 2.79597656105949e-06, "loss": 0.1622, "step": 840 }, { "epoch": 1.37987012987013, "grad_norm": 2.0257365743503892, "learning_rate": 2.6696275492601726e-06, "loss": 0.2013, "step": 850 }, { "epoch": 1.396103896103896, "grad_norm": 1.3270716986636226, "learning_rate": 2.545151876857803e-06, "loss": 0.1926, "step": 860 }, { "epoch": 1.4123376623376624, "grad_norm": 1.4591313868086215, "learning_rate": 2.422649607334083e-06, "loss": 0.1865, "step": 870 }, { "epoch": 1.4285714285714286, "grad_norm": 0.718475522026256, "learning_rate": 2.3022192177919465e-06, "loss": 0.1704, "step": 880 }, { "epoch": 1.4448051948051948, "grad_norm": 0.8779553819300654, "learning_rate": 2.1839575197918156e-06, "loss": 0.1704, "step": 890 }, { "epoch": 1.4610389610389611, "grad_norm": 0.8981166608440325, "learning_rate": 2.0679595815267395e-06, "loss": 0.1894, "step": 900 }, { "epoch": 1.4772727272727273, "grad_norm": 1.8819536746716359, "learning_rate": 1.954318651398977e-06, "loss": 0.1838, "step": 910 }, { "epoch": 1.4935064935064934, "grad_norm": 1.2305604028991983, "learning_rate": 1.8431260830595126e-06, "loss": 0.1667, "step": 920 }, { "epoch": 1.5097402597402598, "grad_norm": 0.8829569169119944, "learning_rate": 1.7344712619706772e-06, "loss": 0.1588, "step": 930 }, { "epoch": 1.525974025974026, "grad_norm": 0.7948818738193303, "learning_rate": 1.6284415335509879e-06, "loss": 0.1743, "step": 940 }, { "epoch": 1.5422077922077921, "grad_norm": 2.0055570853229376, "learning_rate": 1.525122132959933e-06, "loss": 0.2021, "step": 950 }, { "epoch": 1.5584415584415585, "grad_norm": 1.0617311431240737, "learning_rate": 1.4245961165791344e-06, "loss": 0.1842, "step": 960 }, { "epoch": 1.5746753246753247, "grad_norm": 1.1336171320027224, "learning_rate": 1.326944295245009e-06, "loss": 0.1679, "step": 970 }, { "epoch": 1.5909090909090908, "grad_norm": 0.8275801739941498, "learning_rate": 1.2322451692865617e-06, "loss": 0.1649, "step": 980 }, { "epoch": 1.6071428571428572, "grad_norm": 1.133730110796439, "learning_rate": 1.1405748654205566e-06, "loss": 0.1455, "step": 990 }, { "epoch": 1.6233766233766234, "grad_norm": 1.4124753406859976, "learning_rate": 1.052007075554789e-06, "loss": 0.178, "step": 1000 }, { "epoch": 1.6396103896103895, "grad_norm": 1.4093028955797295, "learning_rate": 9.666129975486394e-07, "loss": 0.1811, "step": 1010 }, { "epoch": 1.655844155844156, "grad_norm": 1.0258966054963048, "learning_rate": 8.844612779785583e-07, "loss": 0.1714, "step": 1020 }, { "epoch": 1.672077922077922, "grad_norm": 0.8268048059208093, "learning_rate": 8.056179569544642e-07, "loss": 0.1684, "step": 1030 }, { "epoch": 1.6883116883116882, "grad_norm": 1.1758327197059883, "learning_rate": 7.301464150314313e-07, "loss": 0.1578, "step": 1040 }, { "epoch": 1.7045454545454546, "grad_norm": 1.3848141640091782, "learning_rate": 6.581073222593442e-07, "loss": 0.1841, "step": 1050 }, { "epoch": 1.7207792207792207, "grad_norm": 1.6039492944313922, "learning_rate": 5.89558589411463e-07, "loss": 0.1711, "step": 1060 }, { "epoch": 1.737012987012987, "grad_norm": 0.9846284823379949, "learning_rate": 5.245553214311283e-07, "loss": 0.1839, "step": 1070 }, { "epoch": 1.7532467532467533, "grad_norm": 0.8126959112794907, "learning_rate": 4.6314977313400065e-07, "loss": 0.1937, "step": 1080 }, { "epoch": 1.7694805194805194, "grad_norm": 0.912662021903635, "learning_rate": 4.053913072014748e-07, "loss": 0.1858, "step": 1090 }, { "epoch": 1.7857142857142856, "grad_norm": 2.2200942828573904, "learning_rate": 3.513263544990153e-07, "loss": 0.1668, "step": 1100 }, { "epoch": 1.801948051948052, "grad_norm": 1.9601203935539797, "learning_rate": 3.0099837675131525e-07, "loss": 0.1825, "step": 1110 }, { "epoch": 1.8181818181818183, "grad_norm": 1.8341781108000885, "learning_rate": 2.5444783160429975e-07, "loss": 0.1628, "step": 1120 }, { "epoch": 1.8344155844155843, "grad_norm": 0.9358762565242741, "learning_rate": 2.1171214010203723e-07, "loss": 0.1309, "step": 1130 }, { "epoch": 1.8506493506493507, "grad_norm": 0.5034065948539621, "learning_rate": 1.7282565660471483e-07, "loss": 0.1579, "step": 1140 }, { "epoch": 1.866883116883117, "grad_norm": 1.2734857167293319, "learning_rate": 1.3781964117186743e-07, "loss": 0.1515, "step": 1150 }, { "epoch": 1.883116883116883, "grad_norm": 0.9554751930775003, "learning_rate": 1.0672223443304042e-07, "loss": 0.1615, "step": 1160 }, { "epoch": 1.8993506493506493, "grad_norm": 1.2542362308070503, "learning_rate": 7.955843496610882e-08, "loss": 0.1533, "step": 1170 }, { "epoch": 1.9155844155844157, "grad_norm": 1.0939753700107246, "learning_rate": 5.6350079201422655e-08, "loss": 0.1799, "step": 1180 }, { "epoch": 1.9318181818181817, "grad_norm": 0.853147045662764, "learning_rate": 3.711582386794421e-08, "loss": 0.1704, "step": 1190 }, { "epoch": 1.948051948051948, "grad_norm": 1.278737413341965, "learning_rate": 2.1871130995476665e-08, "loss": 0.1924, "step": 1200 }, { "epoch": 1.9642857142857144, "grad_norm": 2.050204404171476, "learning_rate": 1.0628255485052308e-08, "loss": 0.1678, "step": 1210 }, { "epoch": 1.9805194805194806, "grad_norm": 0.8404109096247463, "learning_rate": 3.396235257464575e-09, "loss": 0.1525, "step": 1220 }, { "epoch": 1.9967532467532467, "grad_norm": 0.7586755640395566, "learning_rate": 1.8088398786586525e-10, "loss": 0.1676, "step": 1230 }, { "epoch": 2.0, "step": 1232, "total_flos": 610243285417984.0, "train_loss": 0.3112858794266721, "train_runtime": 22231.3385, "train_samples_per_second": 0.332, "train_steps_per_second": 0.055 } ], "logging_steps": 10, "max_steps": 1232, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 610243285417984.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }