{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 34595, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07226477814713109, "grad_norm": 5.687336444854736, "learning_rate": 2.9566411331117216e-05, "loss": 1.6256, "step": 500 }, { "epoch": 0.14452955629426217, "grad_norm": 13.803632736206055, "learning_rate": 2.9132822662234428e-05, "loss": 1.1029, "step": 1000 }, { "epoch": 0.21679433444139326, "grad_norm": 10.664984703063965, "learning_rate": 2.8699233993351643e-05, "loss": 0.9223, "step": 1500 }, { "epoch": 0.28905911258852435, "grad_norm": 10.714991569519043, "learning_rate": 2.8265645324468854e-05, "loss": 0.9753, "step": 2000 }, { "epoch": 0.36132389073565546, "grad_norm": 20.49856185913086, "learning_rate": 2.783205665558607e-05, "loss": 0.9205, "step": 2500 }, { "epoch": 0.4335886688827865, "grad_norm": 32.648468017578125, "learning_rate": 2.739846798670328e-05, "loss": 0.8662, "step": 3000 }, { "epoch": 0.5058534470299176, "grad_norm": 1.8131040334701538, "learning_rate": 2.6964879317820496e-05, "loss": 0.8642, "step": 3500 }, { "epoch": 0.5781182251770487, "grad_norm": 108.92765808105469, "learning_rate": 2.6531290648937708e-05, "loss": 0.8668, "step": 4000 }, { "epoch": 0.6503830033241798, "grad_norm": 70.46207427978516, "learning_rate": 2.6097701980054923e-05, "loss": 0.8507, "step": 4500 }, { "epoch": 0.7226477814713109, "grad_norm": 41.309329986572266, "learning_rate": 2.5664113311172135e-05, "loss": 0.8232, "step": 5000 }, { "epoch": 0.794912559618442, "grad_norm": 66.08831787109375, "learning_rate": 2.523052464228935e-05, "loss": 0.8212, "step": 5500 }, { "epoch": 0.867177337765573, "grad_norm": 8.820863723754883, "learning_rate": 2.4796935973406565e-05, "loss": 0.7775, "step": 6000 }, { "epoch": 0.9394421159127041, "grad_norm": 3.9847397804260254, "learning_rate": 2.4363347304523777e-05, "loss": 0.7593, "step": 6500 }, { "epoch": 1.0117068940598353, "grad_norm": 2.6096384525299072, "learning_rate": 2.392975863564099e-05, "loss": 0.7395, "step": 7000 }, { "epoch": 1.0839716722069663, "grad_norm": 0.2627348303794861, "learning_rate": 2.34961699667582e-05, "loss": 0.5237, "step": 7500 }, { "epoch": 1.1562364503540974, "grad_norm": 13.379576683044434, "learning_rate": 2.3062581297875416e-05, "loss": 0.5469, "step": 8000 }, { "epoch": 1.2285012285012284, "grad_norm": 1.7146592140197754, "learning_rate": 2.2628992628992627e-05, "loss": 0.555, "step": 8500 }, { "epoch": 1.3007660066483595, "grad_norm": 40.48280334472656, "learning_rate": 2.2195403960109842e-05, "loss": 0.503, "step": 9000 }, { "epoch": 1.3730307847954908, "grad_norm": 42.039703369140625, "learning_rate": 2.1761815291227054e-05, "loss": 0.4805, "step": 9500 }, { "epoch": 1.4452955629426218, "grad_norm": 88.76701354980469, "learning_rate": 2.132822662234427e-05, "loss": 0.449, "step": 10000 }, { "epoch": 1.517560341089753, "grad_norm": 2.241396188735962, "learning_rate": 2.089463795346148e-05, "loss": 0.546, "step": 10500 }, { "epoch": 1.589825119236884, "grad_norm": 40.65458297729492, "learning_rate": 2.0461049284578696e-05, "loss": 0.5435, "step": 11000 }, { "epoch": 1.662089897384015, "grad_norm": 8.819799423217773, "learning_rate": 2.002746061569591e-05, "loss": 0.4942, "step": 11500 }, { "epoch": 1.734354675531146, "grad_norm": 2.431903839111328, "learning_rate": 1.9593871946813123e-05, "loss": 0.5563, "step": 12000 }, { "epoch": 1.8066194536782771, "grad_norm": 45.27000427246094, "learning_rate": 1.9160283277930338e-05, "loss": 0.4843, "step": 12500 }, { "epoch": 1.8788842318254084, "grad_norm": 0.5083319544792175, "learning_rate": 1.872669460904755e-05, "loss": 0.5188, "step": 13000 }, { "epoch": 1.9511490099725393, "grad_norm": 146.18194580078125, "learning_rate": 1.8293105940164765e-05, "loss": 0.5161, "step": 13500 }, { "epoch": 2.0234137881196705, "grad_norm": 6.673525333404541, "learning_rate": 1.7859517271281977e-05, "loss": 0.4454, "step": 14000 }, { "epoch": 2.0956785662668014, "grad_norm": 2.377631187438965, "learning_rate": 1.7425928602399192e-05, "loss": 0.2991, "step": 14500 }, { "epoch": 2.1679433444139327, "grad_norm": 2.739825963973999, "learning_rate": 1.6992339933516404e-05, "loss": 0.3334, "step": 15000 }, { "epoch": 2.240208122561064, "grad_norm": 2.2864904403686523, "learning_rate": 1.655875126463362e-05, "loss": 0.3069, "step": 15500 }, { "epoch": 2.312472900708195, "grad_norm": 2.948648691177368, "learning_rate": 1.6125162595750834e-05, "loss": 0.3202, "step": 16000 }, { "epoch": 2.384737678855326, "grad_norm": 0.33637821674346924, "learning_rate": 1.5691573926868046e-05, "loss": 0.3559, "step": 16500 }, { "epoch": 2.457002457002457, "grad_norm": 61.67799377441406, "learning_rate": 1.5257985257985259e-05, "loss": 0.3195, "step": 17000 }, { "epoch": 2.529267235149588, "grad_norm": 0.03277577832341194, "learning_rate": 1.482439658910247e-05, "loss": 0.3241, "step": 17500 }, { "epoch": 2.601532013296719, "grad_norm": 0.016185609623789787, "learning_rate": 1.4390807920219684e-05, "loss": 0.3099, "step": 18000 }, { "epoch": 2.6737967914438503, "grad_norm": 27.654953002929688, "learning_rate": 1.39572192513369e-05, "loss": 0.286, "step": 18500 }, { "epoch": 2.7460615695909816, "grad_norm": 0.4255935251712799, "learning_rate": 1.3523630582454113e-05, "loss": 0.3362, "step": 19000 }, { "epoch": 2.8183263477381124, "grad_norm": 17.48617935180664, "learning_rate": 1.3090041913571326e-05, "loss": 0.346, "step": 19500 }, { "epoch": 2.8905911258852437, "grad_norm": 184.4835968017578, "learning_rate": 1.265645324468854e-05, "loss": 0.3076, "step": 20000 }, { "epoch": 2.9628559040323745, "grad_norm": 4.913192272186279, "learning_rate": 1.2222864575805753e-05, "loss": 0.2971, "step": 20500 }, { "epoch": 3.035120682179506, "grad_norm": 19.735275268554688, "learning_rate": 1.1789275906922966e-05, "loss": 0.2479, "step": 21000 }, { "epoch": 3.1073854603266367, "grad_norm": 0.6235244274139404, "learning_rate": 1.135568723804018e-05, "loss": 0.1986, "step": 21500 }, { "epoch": 3.179650238473768, "grad_norm": 0.0023228218778967857, "learning_rate": 1.0922098569157393e-05, "loss": 0.2097, "step": 22000 }, { "epoch": 3.2519150166208988, "grad_norm": 0.006465112790465355, "learning_rate": 1.0488509900274607e-05, "loss": 0.1967, "step": 22500 }, { "epoch": 3.32417979476803, "grad_norm": 0.03349950537085533, "learning_rate": 1.0054921231391818e-05, "loss": 0.1744, "step": 23000 }, { "epoch": 3.3964445729151613, "grad_norm": 0.0003899051807820797, "learning_rate": 9.621332562509032e-06, "loss": 0.1687, "step": 23500 }, { "epoch": 3.468709351062292, "grad_norm": 0.004888344090431929, "learning_rate": 9.187743893626247e-06, "loss": 0.19, "step": 24000 }, { "epoch": 3.5409741292094234, "grad_norm": 0.2528671324253082, "learning_rate": 8.75415522474346e-06, "loss": 0.1723, "step": 24500 }, { "epoch": 3.6132389073565543, "grad_norm": 32.603153228759766, "learning_rate": 8.320566555860674e-06, "loss": 0.1692, "step": 25000 }, { "epoch": 3.6855036855036856, "grad_norm": 0.0015944232000038028, "learning_rate": 7.886977886977887e-06, "loss": 0.1671, "step": 25500 }, { "epoch": 3.757768463650817, "grad_norm": 0.02464761771261692, "learning_rate": 7.453389218095101e-06, "loss": 0.1688, "step": 26000 }, { "epoch": 3.8300332417979477, "grad_norm": 0.4081017076969147, "learning_rate": 7.019800549212314e-06, "loss": 0.1813, "step": 26500 }, { "epoch": 3.9022980199450785, "grad_norm": 0.32034188508987427, "learning_rate": 6.586211880329528e-06, "loss": 0.1801, "step": 27000 }, { "epoch": 3.97456279809221, "grad_norm": 3.1471328735351562, "learning_rate": 6.152623211446741e-06, "loss": 0.1667, "step": 27500 }, { "epoch": 4.046827576239341, "grad_norm": 0.0016349386423826218, "learning_rate": 5.7190345425639545e-06, "loss": 0.1641, "step": 28000 }, { "epoch": 4.119092354386472, "grad_norm": 1.3927046060562134, "learning_rate": 5.285445873681168e-06, "loss": 0.1069, "step": 28500 }, { "epoch": 4.191357132533603, "grad_norm": 0.0006769502069801092, "learning_rate": 4.851857204798381e-06, "loss": 0.0815, "step": 29000 }, { "epoch": 4.263621910680734, "grad_norm": 0.0053017823956906796, "learning_rate": 4.418268535915595e-06, "loss": 0.1065, "step": 29500 }, { "epoch": 4.335886688827865, "grad_norm": 0.0034885455388575792, "learning_rate": 3.984679867032808e-06, "loss": 0.085, "step": 30000 }, { "epoch": 4.408151466974997, "grad_norm": 1.5222100019454956, "learning_rate": 3.5510911981500216e-06, "loss": 0.1032, "step": 30500 }, { "epoch": 4.480416245122128, "grad_norm": 0.0017290544928982854, "learning_rate": 3.117502529267235e-06, "loss": 0.0941, "step": 31000 }, { "epoch": 4.552681023269258, "grad_norm": 0.0026269140653312206, "learning_rate": 2.683913860384449e-06, "loss": 0.0981, "step": 31500 }, { "epoch": 4.62494580141639, "grad_norm": 5.1134233474731445, "learning_rate": 2.250325191501662e-06, "loss": 0.0995, "step": 32000 }, { "epoch": 4.697210579563521, "grad_norm": 0.0005454424535855651, "learning_rate": 1.8167365226188756e-06, "loss": 0.0927, "step": 32500 }, { "epoch": 4.769475357710652, "grad_norm": 0.0004920898354612291, "learning_rate": 1.383147853736089e-06, "loss": 0.0756, "step": 33000 }, { "epoch": 4.841740135857783, "grad_norm": 0.0010318453423678875, "learning_rate": 9.495591848533026e-07, "loss": 0.0837, "step": 33500 }, { "epoch": 4.914004914004914, "grad_norm": 45.28452682495117, "learning_rate": 5.15970515970516e-07, "loss": 0.0928, "step": 34000 }, { "epoch": 4.986269692152045, "grad_norm": 0.00040028223884291947, "learning_rate": 8.238184708772944e-08, "loss": 0.0767, "step": 34500 }, { "epoch": 5.0, "step": 34595, "total_flos": 3.6156938713344e+16, "train_loss": 0.40735532518307727, "train_runtime": 4631.1638, "train_samples_per_second": 29.879, "train_steps_per_second": 7.47 } ], "logging_steps": 500, "max_steps": 34595, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.6156938713344e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }