{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.995962314939435, "eval_steps": 500, "global_step": 1113, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.026917900403768506, "grad_norm": 13.069798469543457, "learning_rate": 1.7857142857142859e-06, "loss": 0.3801, "step": 10 }, { "epoch": 0.05383580080753701, "grad_norm": 2.184347629547119, "learning_rate": 3.5714285714285718e-06, "loss": 0.31, "step": 20 }, { "epoch": 0.08075370121130551, "grad_norm": 0.8302198648452759, "learning_rate": 5.357142857142857e-06, "loss": 0.2645, "step": 30 }, { "epoch": 0.10767160161507403, "grad_norm": 1.0656105279922485, "learning_rate": 7.1428571428571436e-06, "loss": 0.2539, "step": 40 }, { "epoch": 0.13458950201884254, "grad_norm": 0.5782439708709717, "learning_rate": 8.92857142857143e-06, "loss": 0.2512, "step": 50 }, { "epoch": 0.16150740242261102, "grad_norm": 0.5837422609329224, "learning_rate": 1.0714285714285714e-05, "loss": 0.2508, "step": 60 }, { "epoch": 0.18842530282637954, "grad_norm": 0.6315082907676697, "learning_rate": 1.25e-05, "loss": 0.2481, "step": 70 }, { "epoch": 0.21534320323014805, "grad_norm": 0.649541974067688, "learning_rate": 1.4285714285714287e-05, "loss": 0.2418, "step": 80 }, { "epoch": 0.24226110363391656, "grad_norm": 0.5448735356330872, "learning_rate": 1.6071428571428572e-05, "loss": 0.2479, "step": 90 }, { "epoch": 0.2691790040376851, "grad_norm": 0.5558776259422302, "learning_rate": 1.785714285714286e-05, "loss": 0.2421, "step": 100 }, { "epoch": 0.2960969044414536, "grad_norm": 0.3394428789615631, "learning_rate": 1.9642857142857145e-05, "loss": 0.2388, "step": 110 }, { "epoch": 0.32301480484522205, "grad_norm": 0.3705368936061859, "learning_rate": 1.9996848199254315e-05, "loss": 0.2407, "step": 120 }, { "epoch": 0.34993270524899056, "grad_norm": 0.3354800343513489, "learning_rate": 1.9984047413708153e-05, "loss": 0.2325, "step": 130 }, { "epoch": 0.3768506056527591, "grad_norm": 0.2792787253856659, "learning_rate": 1.9961413253717214e-05, "loss": 0.2385, "step": 140 }, { "epoch": 0.4037685060565276, "grad_norm": 0.6989262104034424, "learning_rate": 1.9928968011860973e-05, "loss": 0.2372, "step": 150 }, { "epoch": 0.4306864064602961, "grad_norm": 0.4628732204437256, "learning_rate": 1.988674364373809e-05, "loss": 0.2332, "step": 160 }, { "epoch": 0.4576043068640646, "grad_norm": 1.1485790014266968, "learning_rate": 1.9834781736493057e-05, "loss": 0.2362, "step": 170 }, { "epoch": 0.4845222072678331, "grad_norm": 0.3115156292915344, "learning_rate": 1.9773133467856672e-05, "loss": 0.2347, "step": 180 }, { "epoch": 0.5114401076716016, "grad_norm": 0.2576087415218353, "learning_rate": 1.9701859555740647e-05, "loss": 0.2404, "step": 190 }, { "epoch": 0.5383580080753702, "grad_norm": 0.3003959059715271, "learning_rate": 1.9621030198436007e-05, "loss": 0.234, "step": 200 }, { "epoch": 0.5652759084791387, "grad_norm": 0.22878509759902954, "learning_rate": 1.9530725005474195e-05, "loss": 0.2347, "step": 210 }, { "epoch": 0.5921938088829072, "grad_norm": 0.26122385263442993, "learning_rate": 1.9431032919218957e-05, "loss": 0.2446, "step": 220 }, { "epoch": 0.6191117092866757, "grad_norm": 0.22441260516643524, "learning_rate": 1.9322052127266234e-05, "loss": 0.2398, "step": 230 }, { "epoch": 0.6460296096904441, "grad_norm": 0.2252231240272522, "learning_rate": 1.9203889965738354e-05, "loss": 0.2377, "step": 240 }, { "epoch": 0.6729475100942126, "grad_norm": 0.30187228322029114, "learning_rate": 1.9076662813567772e-05, "loss": 0.2355, "step": 250 }, { "epoch": 0.6998654104979811, "grad_norm": 0.2517610192298889, "learning_rate": 1.894049597787443e-05, "loss": 0.2402, "step": 260 }, { "epoch": 0.7267833109017496, "grad_norm": 0.30307725071907043, "learning_rate": 1.879552357054971e-05, "loss": 0.2378, "step": 270 }, { "epoch": 0.7537012113055181, "grad_norm": 0.26731035113334656, "learning_rate": 1.8641888376168483e-05, "loss": 0.2378, "step": 280 }, { "epoch": 0.7806191117092867, "grad_norm": 0.22943764925003052, "learning_rate": 1.847974171135933e-05, "loss": 0.235, "step": 290 }, { "epoch": 0.8075370121130552, "grad_norm": 0.19347825646400452, "learning_rate": 1.830924327577149e-05, "loss": 0.2329, "step": 300 }, { "epoch": 0.8344549125168237, "grad_norm": 0.22859790921211243, "learning_rate": 1.8130560994785325e-05, "loss": 0.2289, "step": 310 }, { "epoch": 0.8613728129205922, "grad_norm": 0.2617790699005127, "learning_rate": 1.7943870854121126e-05, "loss": 0.2294, "step": 320 }, { "epoch": 0.8882907133243607, "grad_norm": 0.23600426316261292, "learning_rate": 1.7749356726509286e-05, "loss": 0.2304, "step": 330 }, { "epoch": 0.9152086137281292, "grad_norm": 0.2116561233997345, "learning_rate": 1.7547210190592446e-05, "loss": 0.2379, "step": 340 }, { "epoch": 0.9421265141318977, "grad_norm": 0.19537119567394257, "learning_rate": 1.733763034223804e-05, "loss": 0.2309, "step": 350 }, { "epoch": 0.9690444145356663, "grad_norm": 0.22050656378269196, "learning_rate": 1.7120823598447077e-05, "loss": 0.2281, "step": 360 }, { "epoch": 0.9959623149394348, "grad_norm": 0.1890714466571808, "learning_rate": 1.6897003494052217e-05, "loss": 0.2327, "step": 370 }, { "epoch": 1.0228802153432033, "grad_norm": 0.1974857598543167, "learning_rate": 1.6666390471405504e-05, "loss": 0.2265, "step": 380 }, { "epoch": 1.0497981157469718, "grad_norm": 0.2218897044658661, "learning_rate": 1.642921166326278e-05, "loss": 0.2385, "step": 390 }, { "epoch": 1.0767160161507403, "grad_norm": 0.35485249757766724, "learning_rate": 1.6185700669078674e-05, "loss": 0.2274, "step": 400 }, { "epoch": 1.1036339165545088, "grad_norm": 0.40264761447906494, "learning_rate": 1.5936097324932487e-05, "loss": 0.2287, "step": 410 }, { "epoch": 1.1305518169582773, "grad_norm": 0.2551412284374237, "learning_rate": 1.568064746731156e-05, "loss": 0.2395, "step": 420 }, { "epoch": 1.1574697173620458, "grad_norm": 0.19965523481369019, "learning_rate": 1.5419602690984805e-05, "loss": 0.2331, "step": 430 }, { "epoch": 1.1843876177658144, "grad_norm": 0.18600021302700043, "learning_rate": 1.5153220101204839e-05, "loss": 0.2354, "step": 440 }, { "epoch": 1.2113055181695827, "grad_norm": 0.2717427909374237, "learning_rate": 1.4881762060482814e-05, "loss": 0.231, "step": 450 }, { "epoch": 1.2382234185733512, "grad_norm": 0.3491940498352051, "learning_rate": 1.4605495930185303e-05, "loss": 0.2302, "step": 460 }, { "epoch": 1.2651413189771197, "grad_norm": 0.18677066266536713, "learning_rate": 1.4324693807207785e-05, "loss": 0.2311, "step": 470 }, { "epoch": 1.2920592193808882, "grad_norm": 0.24856720864772797, "learning_rate": 1.4039632255984078e-05, "loss": 0.2258, "step": 480 }, { "epoch": 1.3189771197846567, "grad_norm": 0.1940755695104599, "learning_rate": 1.375059203609562e-05, "loss": 0.2304, "step": 490 }, { "epoch": 1.3458950201884252, "grad_norm": 0.2115495502948761, "learning_rate": 1.3457857825748959e-05, "loss": 0.2255, "step": 500 }, { "epoch": 1.3458950201884252, "eval_loss": 0.24458986520767212, "eval_runtime": 62.1158, "eval_samples_per_second": 85.051, "eval_steps_per_second": 21.267, "step": 500 }, { "epoch": 1.3728129205921937, "grad_norm": 0.30106064677238464, "learning_rate": 1.3161717941393703e-05, "loss": 0.2293, "step": 510 }, { "epoch": 1.3997308209959622, "grad_norm": 0.21698522567749023, "learning_rate": 1.2862464053757196e-05, "loss": 0.2301, "step": 520 }, { "epoch": 1.4266487213997308, "grad_norm": 0.21992221474647522, "learning_rate": 1.2560390900575472e-05, "loss": 0.2264, "step": 530 }, { "epoch": 1.4535666218034993, "grad_norm": 0.25674089789390564, "learning_rate": 1.2255795996303526e-05, "loss": 0.2261, "step": 540 }, { "epoch": 1.4804845222072678, "grad_norm": 0.2653080224990845, "learning_rate": 1.1948979339090758e-05, "loss": 0.2243, "step": 550 }, { "epoch": 1.5074024226110363, "grad_norm": 0.3156011998653412, "learning_rate": 1.1640243115310219e-05, "loss": 0.2353, "step": 560 }, { "epoch": 1.5343203230148048, "grad_norm": 0.21554109454154968, "learning_rate": 1.1329891401932631e-05, "loss": 0.2294, "step": 570 }, { "epoch": 1.5612382234185733, "grad_norm": 0.18904979526996613, "learning_rate": 1.1018229867038358e-05, "loss": 0.2272, "step": 580 }, { "epoch": 1.5881561238223418, "grad_norm": 0.23018983006477356, "learning_rate": 1.0705565468762274e-05, "loss": 0.2294, "step": 590 }, { "epoch": 1.6150740242261103, "grad_norm": 0.2061055600643158, "learning_rate": 1.0392206152968058e-05, "loss": 0.2266, "step": 600 }, { "epoch": 1.6419919246298789, "grad_norm": 0.20794202387332916, "learning_rate": 1.0078460549949647e-05, "loss": 0.2357, "step": 610 }, { "epoch": 1.6689098250336474, "grad_norm": 0.19699296355247498, "learning_rate": 9.764637670458595e-06, "loss": 0.224, "step": 620 }, { "epoch": 1.695827725437416, "grad_norm": 0.22355449199676514, "learning_rate": 9.451046601356725e-06, "loss": 0.2365, "step": 630 }, { "epoch": 1.7227456258411844, "grad_norm": 0.20971466600894928, "learning_rate": 9.137996201193807e-06, "loss": 0.2328, "step": 640 }, { "epoch": 1.749663526244953, "grad_norm": 0.24429140985012054, "learning_rate": 8.825794796010101e-06, "loss": 0.2213, "step": 650 }, { "epoch": 1.7765814266487214, "grad_norm": 0.2615514397621155, "learning_rate": 8.514749875663397e-06, "loss": 0.2291, "step": 660 }, { "epoch": 1.80349932705249, "grad_norm": 0.29951363801956177, "learning_rate": 8.20516779097958e-06, "loss": 0.2294, "step": 670 }, { "epoch": 1.8304172274562585, "grad_norm": 0.19812524318695068, "learning_rate": 7.897353452025077e-06, "loss": 0.2288, "step": 680 }, { "epoch": 1.857335127860027, "grad_norm": 0.21179044246673584, "learning_rate": 7.591610027798287e-06, "loss": 0.2294, "step": 690 }, { "epoch": 1.8842530282637955, "grad_norm": 0.193583145737648, "learning_rate": 7.2882386476358304e-06, "loss": 0.227, "step": 700 }, { "epoch": 1.911170928667564, "grad_norm": 0.20502911508083344, "learning_rate": 6.9875381046276605e-06, "loss": 0.2258, "step": 710 }, { "epoch": 1.9380888290713325, "grad_norm": 0.19676484167575836, "learning_rate": 6.689804561333164e-06, "loss": 0.2272, "step": 720 }, { "epoch": 1.965006729475101, "grad_norm": 0.20092357695102692, "learning_rate": 6.39533125808812e-06, "loss": 0.2292, "step": 730 }, { "epoch": 1.9919246298788695, "grad_norm": 0.22104892134666443, "learning_rate": 6.104408224189746e-06, "loss": 0.2269, "step": 740 }, { "epoch": 2.018842530282638, "grad_norm": 0.1946035623550415, "learning_rate": 5.8173219922443516e-06, "loss": 0.2193, "step": 750 }, { "epoch": 2.0457604306864066, "grad_norm": 0.22905437648296356, "learning_rate": 5.5343553159588884e-06, "loss": 0.2353, "step": 760 }, { "epoch": 2.072678331090175, "grad_norm": 0.23081299662590027, "learning_rate": 5.2557868916543996e-06, "loss": 0.2229, "step": 770 }, { "epoch": 2.0995962314939436, "grad_norm": 0.21353456377983093, "learning_rate": 4.981891083775597e-06, "loss": 0.2215, "step": 780 }, { "epoch": 2.126514131897712, "grad_norm": 0.20833438634872437, "learning_rate": 4.712937654666971e-06, "loss": 0.2231, "step": 790 }, { "epoch": 2.1534320323014806, "grad_norm": 0.20027689635753632, "learning_rate": 4.4491914988815055e-06, "loss": 0.2281, "step": 800 }, { "epoch": 2.180349932705249, "grad_norm": 0.22123222053050995, "learning_rate": 4.190912382283749e-06, "loss": 0.2278, "step": 810 }, { "epoch": 2.2072678331090176, "grad_norm": 0.28094470500946045, "learning_rate": 3.9383546862041955e-06, "loss": 0.2228, "step": 820 }, { "epoch": 2.234185733512786, "grad_norm": 0.3237360417842865, "learning_rate": 3.6917671568969006e-06, "loss": 0.2291, "step": 830 }, { "epoch": 2.2611036339165547, "grad_norm": 0.21679522097110748, "learning_rate": 3.4513926605471504e-06, "loss": 0.2285, "step": 840 }, { "epoch": 2.288021534320323, "grad_norm": 0.21422189474105835, "learning_rate": 3.2174679440704616e-06, "loss": 0.2279, "step": 850 }, { "epoch": 2.3149394347240917, "grad_norm": 0.2352222353219986, "learning_rate": 2.9902234019385056e-06, "loss": 0.2264, "step": 860 }, { "epoch": 2.34185733512786, "grad_norm": 0.23439514636993408, "learning_rate": 2.7698828492615992e-06, "loss": 0.2269, "step": 870 }, { "epoch": 2.3687752355316287, "grad_norm": 0.22924348711967468, "learning_rate": 2.5566633013512753e-06, "loss": 0.2267, "step": 880 }, { "epoch": 2.3956931359353972, "grad_norm": 0.23167449235916138, "learning_rate": 2.350774759980027e-06, "loss": 0.2254, "step": 890 }, { "epoch": 2.4226110363391653, "grad_norm": 0.2599547803401947, "learning_rate": 2.1524200065487565e-06, "loss": 0.2291, "step": 900 }, { "epoch": 2.449528936742934, "grad_norm": 0.22817839682102203, "learning_rate": 1.961794402365611e-06, "loss": 0.2284, "step": 910 }, { "epoch": 2.4764468371467023, "grad_norm": 0.2169758379459381, "learning_rate": 1.7790856962329584e-06, "loss": 0.2286, "step": 920 }, { "epoch": 2.503364737550471, "grad_norm": 0.21095937490463257, "learning_rate": 1.6044738395319648e-06, "loss": 0.2253, "step": 930 }, { "epoch": 2.5302826379542394, "grad_norm": 0.21286533772945404, "learning_rate": 1.4381308089869283e-06, "loss": 0.2193, "step": 940 }, { "epoch": 2.557200538358008, "grad_norm": 0.2127334177494049, "learning_rate": 1.2802204372839178e-06, "loss": 0.2198, "step": 950 }, { "epoch": 2.5841184387617764, "grad_norm": 0.19859924912452698, "learning_rate": 1.130898251710547e-06, "loss": 0.2212, "step": 960 }, { "epoch": 2.611036339165545, "grad_norm": 0.23916248977184296, "learning_rate": 9.903113209758098e-07, "loss": 0.2245, "step": 970 }, { "epoch": 2.6379542395693134, "grad_norm": 0.24261216819286346, "learning_rate": 8.585981103608343e-07, "loss": 0.2241, "step": 980 }, { "epoch": 2.664872139973082, "grad_norm": 0.22423197329044342, "learning_rate": 7.358883453432398e-07, "loss": 0.2241, "step": 990 }, { "epoch": 2.6917900403768504, "grad_norm": 0.30151936411857605, "learning_rate": 6.223028838293898e-07, "loss": 0.2265, "step": 1000 }, { "epoch": 2.6917900403768504, "eval_loss": 0.2421317845582962, "eval_runtime": 62.8408, "eval_samples_per_second": 84.07, "eval_steps_per_second": 21.021, "step": 1000 }, { "epoch": 2.718707940780619, "grad_norm": 0.23664213716983795, "learning_rate": 5.179535971203953e-07, "loss": 0.2199, "step": 1010 }, { "epoch": 2.7456258411843875, "grad_norm": 0.21507257223129272, "learning_rate": 4.2294325972911274e-07, "loss": 0.2265, "step": 1020 }, { "epoch": 2.772543741588156, "grad_norm": 0.1968134194612503, "learning_rate": 3.3736544815663017e-07, "loss": 0.2204, "step": 1030 }, { "epoch": 2.7994616419919245, "grad_norm": 0.2121606171131134, "learning_rate": 2.6130444872797143e-07, "loss": 0.2187, "step": 1040 }, { "epoch": 2.826379542395693, "grad_norm": 0.21338069438934326, "learning_rate": 1.9483517457776436e-07, "loss": 0.2156, "step": 1050 }, { "epoch": 2.8532974427994615, "grad_norm": 0.22012507915496826, "learning_rate": 1.3802309186764619e-07, "loss": 0.2176, "step": 1060 }, { "epoch": 2.88021534320323, "grad_norm": 0.2376081794500351, "learning_rate": 9.092415530807975e-08, "loss": 0.2206, "step": 1070 }, { "epoch": 2.9071332436069985, "grad_norm": 0.21504898369312286, "learning_rate": 5.3584753048073756e-08, "loss": 0.2233, "step": 1080 }, { "epoch": 2.934051144010767, "grad_norm": 0.2161342054605484, "learning_rate": 2.604166098709504e-08, "loss": 0.2263, "step": 1090 }, { "epoch": 2.9609690444145356, "grad_norm": 0.26196786761283875, "learning_rate": 8.322006554171147e-09, "loss": 0.23, "step": 1100 }, { "epoch": 2.987886944818304, "grad_norm": 0.26092347502708435, "learning_rate": 4.432419898459106e-10, "loss": 0.2229, "step": 1110 }, { "epoch": 2.995962314939435, "step": 1113, "total_flos": 5.031637962748592e+18, "train_loss": 0.232770404511492, "train_runtime": 3990.8809, "train_samples_per_second": 35.74, "train_steps_per_second": 0.279 } ], "logging_steps": 10, "max_steps": 1113, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.031637962748592e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }