{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1677, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011926058437686345, "grad_norm": 0.859375, "learning_rate": 2.9761904761904765e-07, "loss": 1.3391, "step": 20 }, { "epoch": 0.02385211687537269, "grad_norm": 0.9765625, "learning_rate": 5.952380952380953e-07, "loss": 1.3358, "step": 40 }, { "epoch": 0.03577817531305903, "grad_norm": 0.484375, "learning_rate": 8.928571428571429e-07, "loss": 1.2273, "step": 60 }, { "epoch": 0.04770423375074538, "grad_norm": 0.6015625, "learning_rate": 1.1904761904761906e-06, "loss": 1.2731, "step": 80 }, { "epoch": 0.05963029218843172, "grad_norm": 0.4921875, "learning_rate": 1.4880952380952381e-06, "loss": 1.2757, "step": 100 }, { "epoch": 0.07155635062611806, "grad_norm": 0.5859375, "learning_rate": 1.7857142857142859e-06, "loss": 1.2827, "step": 120 }, { "epoch": 0.08348240906380441, "grad_norm": 0.52734375, "learning_rate": 2.0833333333333334e-06, "loss": 1.2206, "step": 140 }, { "epoch": 0.09540846750149076, "grad_norm": 0.66015625, "learning_rate": 2.380952380952381e-06, "loss": 1.2604, "step": 160 }, { "epoch": 0.1073345259391771, "grad_norm": 0.412109375, "learning_rate": 2.6785714285714285e-06, "loss": 1.2433, "step": 180 }, { "epoch": 0.11926058437686345, "grad_norm": 0.6015625, "learning_rate": 2.9761904761904763e-06, "loss": 1.2834, "step": 200 }, { "epoch": 0.13118664281454978, "grad_norm": 0.69140625, "learning_rate": 3.273809523809524e-06, "loss": 1.263, "step": 220 }, { "epoch": 0.14311270125223613, "grad_norm": 0.3984375, "learning_rate": 3.5714285714285718e-06, "loss": 1.2525, "step": 240 }, { "epoch": 0.15503875968992248, "grad_norm": 0.298828125, "learning_rate": 3.869047619047619e-06, "loss": 1.2064, "step": 260 }, { "epoch": 0.16696481812760883, "grad_norm": 0.431640625, "learning_rate": 4.166666666666667e-06, "loss": 1.1364, "step": 280 }, { "epoch": 0.17889087656529518, "grad_norm": 0.55078125, "learning_rate": 4.464285714285715e-06, "loss": 1.1594, "step": 300 }, { "epoch": 0.19081693500298152, "grad_norm": 0.291015625, "learning_rate": 4.761904761904762e-06, "loss": 1.1471, "step": 320 }, { "epoch": 0.20274299344066785, "grad_norm": 0.2578125, "learning_rate": 4.99989023370455e-06, "loss": 1.138, "step": 340 }, { "epoch": 0.2146690518783542, "grad_norm": 0.232421875, "learning_rate": 4.996049425354717e-06, "loss": 1.1786, "step": 360 }, { "epoch": 0.22659511031604054, "grad_norm": 0.8671875, "learning_rate": 4.986729937340083e-06, "loss": 1.2042, "step": 380 }, { "epoch": 0.2385211687537269, "grad_norm": 0.29296875, "learning_rate": 4.971952225381176e-06, "loss": 1.1528, "step": 400 }, { "epoch": 0.2504472271914132, "grad_norm": 0.26953125, "learning_rate": 4.951748725674643e-06, "loss": 1.1932, "step": 420 }, { "epoch": 0.26237328562909956, "grad_norm": 0.328125, "learning_rate": 4.9261637836977315e-06, "loss": 1.1587, "step": 440 }, { "epoch": 0.2742993440667859, "grad_norm": 0.2119140625, "learning_rate": 4.895253556872611e-06, "loss": 1.2024, "step": 460 }, { "epoch": 0.28622540250447226, "grad_norm": 0.2236328125, "learning_rate": 4.8590858913041775e-06, "loss": 1.1471, "step": 480 }, { "epoch": 0.2981514609421586, "grad_norm": 0.291015625, "learning_rate": 4.817740172861903e-06, "loss": 1.137, "step": 500 }, { "epoch": 0.31007751937984496, "grad_norm": 0.234375, "learning_rate": 4.771307152932579e-06, "loss": 1.1693, "step": 520 }, { "epoch": 0.3220035778175313, "grad_norm": 0.28125, "learning_rate": 4.719888749226442e-06, "loss": 1.1901, "step": 540 }, { "epoch": 0.33392963625521765, "grad_norm": 0.28125, "learning_rate": 4.663597822073865e-06, "loss": 1.1139, "step": 560 }, { "epoch": 0.345855694692904, "grad_norm": 0.26953125, "learning_rate": 4.602557926703675e-06, "loss": 1.1683, "step": 580 }, { "epoch": 0.35778175313059035, "grad_norm": 0.375, "learning_rate": 4.536903042046778e-06, "loss": 1.1746, "step": 600 }, { "epoch": 0.3697078115682767, "grad_norm": 0.216796875, "learning_rate": 4.4667772766604065e-06, "loss": 1.1092, "step": 620 }, { "epoch": 0.38163387000596305, "grad_norm": 0.392578125, "learning_rate": 4.392334552418421e-06, "loss": 1.125, "step": 640 }, { "epoch": 0.3935599284436494, "grad_norm": 0.25390625, "learning_rate": 4.313738266661979e-06, "loss": 1.1584, "step": 660 }, { "epoch": 0.4054859868813357, "grad_norm": 0.2216796875, "learning_rate": 4.231160933552109e-06, "loss": 1.1235, "step": 680 }, { "epoch": 0.41741204531902204, "grad_norm": 0.330078125, "learning_rate": 4.144783805411415e-06, "loss": 1.2566, "step": 700 }, { "epoch": 0.4293381037567084, "grad_norm": 0.208984375, "learning_rate": 4.054796474886038e-06, "loss": 1.164, "step": 720 }, { "epoch": 0.44126416219439474, "grad_norm": 0.248046875, "learning_rate": 3.961396458801099e-06, "loss": 1.1195, "step": 740 }, { "epoch": 0.4531902206320811, "grad_norm": 0.255859375, "learning_rate": 3.864788764623042e-06, "loss": 1.1012, "step": 760 }, { "epoch": 0.46511627906976744, "grad_norm": 0.4296875, "learning_rate": 3.7651854404804757e-06, "loss": 1.1042, "step": 780 }, { "epoch": 0.4770423375074538, "grad_norm": 0.1953125, "learning_rate": 3.662805109731168e-06, "loss": 1.1627, "step": 800 }, { "epoch": 0.48896839594514013, "grad_norm": 0.2119140625, "learning_rate": 3.557872491096812e-06, "loss": 1.1711, "step": 820 }, { "epoch": 0.5008944543828264, "grad_norm": 0.322265625, "learning_rate": 3.450617905418834e-06, "loss": 1.1929, "step": 840 }, { "epoch": 0.5128205128205128, "grad_norm": 0.193359375, "learning_rate": 3.341276770117877e-06, "loss": 1.0958, "step": 860 }, { "epoch": 0.5247465712581991, "grad_norm": 0.2294921875, "learning_rate": 3.2300890824665942e-06, "loss": 1.1335, "step": 880 }, { "epoch": 0.5366726296958855, "grad_norm": 0.3125, "learning_rate": 3.117298892809953e-06, "loss": 1.137, "step": 900 }, { "epoch": 0.5485986881335718, "grad_norm": 0.171875, "learning_rate": 3.003153768889276e-06, "loss": 1.1752, "step": 920 }, { "epoch": 0.5605247465712582, "grad_norm": 0.1826171875, "learning_rate": 2.887904252445806e-06, "loss": 1.1044, "step": 940 }, { "epoch": 0.5724508050089445, "grad_norm": 0.49609375, "learning_rate": 2.7718033092965267e-06, "loss": 1.1124, "step": 960 }, { "epoch": 0.5843768634466309, "grad_norm": 0.25390625, "learning_rate": 2.655105774089278e-06, "loss": 1.2478, "step": 980 }, { "epoch": 0.5963029218843172, "grad_norm": 0.20703125, "learning_rate": 2.538067790955892e-06, "loss": 1.1365, "step": 1000 }, { "epoch": 0.6082289803220036, "grad_norm": 0.2578125, "learning_rate": 2.420946251291103e-06, "loss": 1.0598, "step": 1020 }, { "epoch": 0.6201550387596899, "grad_norm": 0.2109375, "learning_rate": 2.303998229891249e-06, "loss": 1.1299, "step": 1040 }, { "epoch": 0.6320810971973763, "grad_norm": 0.29296875, "learning_rate": 2.18748042069042e-06, "loss": 1.148, "step": 1060 }, { "epoch": 0.6440071556350626, "grad_norm": 0.2412109375, "learning_rate": 2.0716485733325834e-06, "loss": 1.1469, "step": 1080 }, { "epoch": 0.655933214072749, "grad_norm": 0.27734375, "learning_rate": 1.95675693181636e-06, "loss": 1.1275, "step": 1100 }, { "epoch": 0.6678592725104353, "grad_norm": 0.25, "learning_rate": 1.8430576764446046e-06, "loss": 1.1711, "step": 1120 }, { "epoch": 0.6797853309481217, "grad_norm": 0.2412109375, "learning_rate": 1.730800370303683e-06, "loss": 1.1191, "step": 1140 }, { "epoch": 0.691711389385808, "grad_norm": 0.328125, "learning_rate": 1.6202314114873693e-06, "loss": 1.2033, "step": 1160 }, { "epoch": 0.7036374478234944, "grad_norm": 0.24609375, "learning_rate": 1.51159349226773e-06, "loss": 1.1747, "step": 1180 }, { "epoch": 0.7155635062611807, "grad_norm": 0.24609375, "learning_rate": 1.4051250664000515e-06, "loss": 1.1467, "step": 1200 }, { "epoch": 0.727489564698867, "grad_norm": 0.21484375, "learning_rate": 1.3010598257310642e-06, "loss": 1.1213, "step": 1220 }, { "epoch": 0.7394156231365534, "grad_norm": 0.423828125, "learning_rate": 1.1996261872592754e-06, "loss": 1.1539, "step": 1240 }, { "epoch": 0.7513416815742398, "grad_norm": 0.296875, "learning_rate": 1.1010467917732783e-06, "loss": 1.0518, "step": 1260 }, { "epoch": 0.7632677400119261, "grad_norm": 0.263671875, "learning_rate": 1.005538015168487e-06, "loss": 1.1907, "step": 1280 }, { "epoch": 0.7751937984496124, "grad_norm": 0.2109375, "learning_rate": 9.133094935149592e-07, "loss": 1.0732, "step": 1300 }, { "epoch": 0.7871198568872988, "grad_norm": 0.177734375, "learning_rate": 8.245636629187121e-07, "loss": 1.1658, "step": 1320 }, { "epoch": 0.7990459153249851, "grad_norm": 0.19921875, "learning_rate": 7.394953151865444e-07, "loss": 1.0766, "step": 1340 }, { "epoch": 0.8109719737626714, "grad_norm": 0.208984375, "learning_rate": 6.582911702696334e-07, "loss": 1.1737, "step": 1360 }, { "epoch": 0.8228980322003577, "grad_norm": 0.212890625, "learning_rate": 5.811294664243752e-07, "loss": 1.0915, "step": 1380 }, { "epoch": 0.8348240906380441, "grad_norm": 0.1884765625, "learning_rate": 5.081795689900398e-07, "loss": 1.1312, "step": 1400 }, { "epoch": 0.8467501490757304, "grad_norm": 0.181640625, "learning_rate": 4.396015986419483e-07, "loss": 1.1867, "step": 1420 }, { "epoch": 0.8586762075134168, "grad_norm": 0.1904296875, "learning_rate": 3.7554607993613823e-07, "loss": 1.1985, "step": 1440 }, { "epoch": 0.8706022659511031, "grad_norm": 0.25, "learning_rate": 3.1615361091693694e-07, "loss": 1.1426, "step": 1460 }, { "epoch": 0.8825283243887895, "grad_norm": 0.2060546875, "learning_rate": 2.615545545126416e-07, "loss": 1.1924, "step": 1480 }, { "epoch": 0.8944543828264758, "grad_norm": 0.21484375, "learning_rate": 2.118687523966559e-07, "loss": 1.1344, "step": 1500 }, { "epoch": 0.9063804412641622, "grad_norm": 0.2138671875, "learning_rate": 1.6720526194217186e-07, "loss": 1.153, "step": 1520 }, { "epoch": 0.9183064997018485, "grad_norm": 0.1708984375, "learning_rate": 1.2766211684773156e-07, "loss": 1.1558, "step": 1540 }, { "epoch": 0.9302325581395349, "grad_norm": 0.21484375, "learning_rate": 9.332611195910585e-08, "loss": 1.1415, "step": 1560 }, { "epoch": 0.9421586165772212, "grad_norm": 0.349609375, "learning_rate": 6.427261275978369e-08, "loss": 1.1919, "step": 1580 }, { "epoch": 0.9540846750149076, "grad_norm": 0.2353515625, "learning_rate": 4.056538994822945e-08, "loss": 1.0785, "step": 1600 }, { "epoch": 0.9660107334525939, "grad_norm": 0.228515625, "learning_rate": 2.2256479464999315e-08, "loss": 1.1849, "step": 1620 }, { "epoch": 0.9779367918902803, "grad_norm": 0.32421875, "learning_rate": 9.386068276959204e-09, "loss": 1.1015, "step": 1640 }, { "epoch": 0.9898628503279666, "grad_norm": 0.17578125, "learning_rate": 1.982406169283857e-09, "loss": 1.1445, "step": 1660 }, { "epoch": 1.0, "step": 1677, "total_flos": 1.5441332068889395e+17, "train_loss": 1.1665670079849415, "train_runtime": 5262.9957, "train_samples_per_second": 1.274, "train_steps_per_second": 0.319 } ], "logging_steps": 20, "max_steps": 1677, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5441332068889395e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }