{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 852.0, "learning_rate": 2.222222222222222e-07, "loss": 4.3787, "step": 10 }, { "epoch": 0.04, "grad_norm": 972.0, "learning_rate": 4.444444444444444e-07, "loss": 4.7261, "step": 20 }, { "epoch": 0.06, "grad_norm": 1128.0, "learning_rate": 6.666666666666666e-07, "loss": 3.5648, "step": 30 }, { "epoch": 0.08, "grad_norm": 1128.0, "learning_rate": 8.888888888888888e-07, "loss": 2.7698, "step": 40 }, { "epoch": 0.1, "grad_norm": 700.0, "learning_rate": 9.965635738831615e-07, "loss": 1.9925, "step": 50 }, { "epoch": 0.12, "grad_norm": 211.0, "learning_rate": 9.896907216494845e-07, "loss": 0.5663, "step": 60 }, { "epoch": 0.14, "grad_norm": 43.75, "learning_rate": 9.828178694158075e-07, "loss": 0.3221, "step": 70 }, { "epoch": 0.16, "grad_norm": 23.375, "learning_rate": 9.759450171821305e-07, "loss": 0.168, "step": 80 }, { "epoch": 0.18, "grad_norm": 10.125, "learning_rate": 9.690721649484535e-07, "loss": 0.3897, "step": 90 }, { "epoch": 0.2, "grad_norm": 334.0, "learning_rate": 9.621993127147767e-07, "loss": 0.1535, "step": 100 }, { "epoch": 0.22, "grad_norm": 8.0, "learning_rate": 9.553264604810997e-07, "loss": 0.3741, "step": 110 }, { "epoch": 0.24, "grad_norm": 18.375, "learning_rate": 9.484536082474226e-07, "loss": 0.2235, "step": 120 }, { "epoch": 0.26, "grad_norm": 1.0, "learning_rate": 9.415807560137456e-07, "loss": 0.3256, "step": 130 }, { "epoch": 0.28, "grad_norm": 98.5, "learning_rate": 9.347079037800687e-07, "loss": 0.3158, "step": 140 }, { "epoch": 0.3, "grad_norm": 0.796875, "learning_rate": 9.278350515463918e-07, "loss": 0.8076, "step": 150 }, { "epoch": 0.32, "grad_norm": 1.0078125, "learning_rate": 9.209621993127147e-07, "loss": 0.2682, "step": 160 }, { "epoch": 0.34, "grad_norm": 1.1484375, "learning_rate": 9.140893470790378e-07, "loss": 0.4375, "step": 170 }, { "epoch": 0.36, "grad_norm": 0.404296875, "learning_rate": 9.072164948453608e-07, "loss": 0.4296, "step": 180 }, { "epoch": 0.38, "grad_norm": 0.396484375, "learning_rate": 9.003436426116838e-07, "loss": 0.046, "step": 190 }, { "epoch": 0.4, "grad_norm": 196.0, "learning_rate": 8.934707903780069e-07, "loss": 0.3079, "step": 200 }, { "epoch": 0.42, "grad_norm": 20.5, "learning_rate": 8.865979381443298e-07, "loss": 0.5322, "step": 210 }, { "epoch": 0.44, "grad_norm": 169.0, "learning_rate": 8.797250859106528e-07, "loss": 0.4256, "step": 220 }, { "epoch": 0.46, "grad_norm": 0.609375, "learning_rate": 8.728522336769759e-07, "loss": 0.192, "step": 230 }, { "epoch": 0.48, "grad_norm": 159.0, "learning_rate": 8.659793814432989e-07, "loss": 0.4527, "step": 240 }, { "epoch": 0.5, "grad_norm": 9.125, "learning_rate": 8.591065292096219e-07, "loss": 0.3642, "step": 250 }, { "epoch": 0.52, "grad_norm": 284.0, "learning_rate": 8.52233676975945e-07, "loss": 0.1506, "step": 260 }, { "epoch": 0.54, "grad_norm": 0.57421875, "learning_rate": 8.45360824742268e-07, "loss": 0.1674, "step": 270 }, { "epoch": 0.56, "grad_norm": 360.0, "learning_rate": 8.384879725085911e-07, "loss": 0.239, "step": 280 }, { "epoch": 0.58, "grad_norm": 0.408203125, "learning_rate": 8.316151202749141e-07, "loss": 0.1217, "step": 290 }, { "epoch": 0.6, "grad_norm": 0.90625, "learning_rate": 8.24742268041237e-07, "loss": 0.0678, "step": 300 }, { "epoch": 0.62, "grad_norm": 0.52734375, "learning_rate": 8.178694158075601e-07, "loss": 0.2232, "step": 310 }, { "epoch": 0.64, "grad_norm": 8.4375, "learning_rate": 8.109965635738831e-07, "loss": 0.257, "step": 320 }, { "epoch": 0.66, "grad_norm": 348.0, "learning_rate": 8.041237113402062e-07, "loss": 0.5143, "step": 330 }, { "epoch": 0.68, "grad_norm": 0.1279296875, "learning_rate": 7.972508591065292e-07, "loss": 0.1554, "step": 340 }, { "epoch": 0.7, "grad_norm": 0.8203125, "learning_rate": 7.903780068728521e-07, "loss": 0.1433, "step": 350 }, { "epoch": 0.72, "grad_norm": 11.5625, "learning_rate": 7.835051546391752e-07, "loss": 0.1622, "step": 360 }, { "epoch": 0.74, "grad_norm": 1.53125, "learning_rate": 7.766323024054983e-07, "loss": 0.1386, "step": 370 }, { "epoch": 0.76, "grad_norm": 1.765625, "learning_rate": 7.697594501718213e-07, "loss": 0.206, "step": 380 }, { "epoch": 0.78, "grad_norm": 242.0, "learning_rate": 7.628865979381443e-07, "loss": 0.2306, "step": 390 }, { "epoch": 0.8, "grad_norm": 180.0, "learning_rate": 7.560137457044673e-07, "loss": 0.3283, "step": 400 }, { "epoch": 0.82, "grad_norm": 0.7265625, "learning_rate": 7.491408934707904e-07, "loss": 0.1725, "step": 410 }, { "epoch": 0.84, "grad_norm": 0.279296875, "learning_rate": 7.422680412371134e-07, "loss": 0.2467, "step": 420 }, { "epoch": 0.86, "grad_norm": 61.25, "learning_rate": 7.353951890034364e-07, "loss": 0.3496, "step": 430 }, { "epoch": 0.88, "grad_norm": 430.0, "learning_rate": 7.285223367697594e-07, "loss": 0.235, "step": 440 }, { "epoch": 0.9, "grad_norm": 39.0, "learning_rate": 7.216494845360824e-07, "loss": 0.3074, "step": 450 }, { "epoch": 0.92, "grad_norm": 126.0, "learning_rate": 7.147766323024054e-07, "loss": 0.2102, "step": 460 }, { "epoch": 0.94, "grad_norm": 0.349609375, "learning_rate": 7.079037800687286e-07, "loss": 0.128, "step": 470 }, { "epoch": 0.96, "grad_norm": 9.8125, "learning_rate": 7.010309278350515e-07, "loss": 0.2278, "step": 480 }, { "epoch": 0.98, "grad_norm": 3.359375, "learning_rate": 6.941580756013746e-07, "loss": 0.1644, "step": 490 }, { "epoch": 1.0, "grad_norm": 0.244140625, "learning_rate": 6.872852233676976e-07, "loss": 0.3245, "step": 500 }, { "epoch": 1.0, "eval_loss": 0.2981492877006531, "eval_model_preparation_time": 0.0055, "eval_runtime": 90.7988, "eval_samples_per_second": 9.923, "eval_steps_per_second": 2.489, "step": 500 }, { "epoch": 1.02, "grad_norm": 1.4765625, "learning_rate": 6.804123711340206e-07, "loss": 0.2128, "step": 510 }, { "epoch": 1.04, "grad_norm": 0.07958984375, "learning_rate": 6.735395189003437e-07, "loss": 0.0455, "step": 520 }, { "epoch": 1.06, "grad_norm": 175.0, "learning_rate": 6.666666666666666e-07, "loss": 0.2633, "step": 530 }, { "epoch": 1.08, "grad_norm": 0.1328125, "learning_rate": 6.597938144329896e-07, "loss": 0.1368, "step": 540 }, { "epoch": 1.1, "grad_norm": 42.0, "learning_rate": 6.529209621993127e-07, "loss": 0.3963, "step": 550 }, { "epoch": 1.12, "grad_norm": 248.0, "learning_rate": 6.460481099656357e-07, "loss": 0.1807, "step": 560 }, { "epoch": 1.1400000000000001, "grad_norm": 230.0, "learning_rate": 6.391752577319586e-07, "loss": 0.1865, "step": 570 }, { "epoch": 1.16, "grad_norm": 175.0, "learning_rate": 6.323024054982817e-07, "loss": 0.2382, "step": 580 }, { "epoch": 1.18, "grad_norm": 0.671875, "learning_rate": 6.254295532646048e-07, "loss": 0.1976, "step": 590 }, { "epoch": 1.2, "grad_norm": 0.326171875, "learning_rate": 6.185567010309279e-07, "loss": 0.0799, "step": 600 }, { "epoch": 1.22, "grad_norm": 0.69921875, "learning_rate": 6.116838487972509e-07, "loss": 0.252, "step": 610 }, { "epoch": 1.24, "grad_norm": 0.1494140625, "learning_rate": 6.048109965635738e-07, "loss": 0.1807, "step": 620 }, { "epoch": 1.26, "grad_norm": 140.0, "learning_rate": 5.979381443298969e-07, "loss": 0.2544, "step": 630 }, { "epoch": 1.28, "grad_norm": 0.107421875, "learning_rate": 5.910652920962199e-07, "loss": 0.2119, "step": 640 }, { "epoch": 1.3, "grad_norm": 0.51171875, "learning_rate": 5.841924398625429e-07, "loss": 0.2212, "step": 650 }, { "epoch": 1.32, "grad_norm": 100.5, "learning_rate": 5.773195876288659e-07, "loss": 0.2884, "step": 660 }, { "epoch": 1.34, "grad_norm": 0.51171875, "learning_rate": 5.704467353951889e-07, "loss": 0.2897, "step": 670 }, { "epoch": 1.3599999999999999, "grad_norm": 0.53515625, "learning_rate": 5.63573883161512e-07, "loss": 0.2315, "step": 680 }, { "epoch": 1.38, "grad_norm": 0.52734375, "learning_rate": 5.56701030927835e-07, "loss": 0.1976, "step": 690 }, { "epoch": 1.4, "grad_norm": 0.27734375, "learning_rate": 5.498281786941581e-07, "loss": 0.0962, "step": 700 }, { "epoch": 1.42, "grad_norm": 0.52734375, "learning_rate": 5.429553264604811e-07, "loss": 0.1074, "step": 710 }, { "epoch": 1.44, "grad_norm": 212.0, "learning_rate": 5.360824742268041e-07, "loss": 0.2251, "step": 720 }, { "epoch": 1.46, "grad_norm": 146.0, "learning_rate": 5.292096219931271e-07, "loss": 0.198, "step": 730 }, { "epoch": 1.48, "grad_norm": 8.375, "learning_rate": 5.223367697594502e-07, "loss": 0.2187, "step": 740 }, { "epoch": 1.5, "grad_norm": 0.322265625, "learning_rate": 5.154639175257731e-07, "loss": 0.1811, "step": 750 }, { "epoch": 1.52, "grad_norm": 1.609375, "learning_rate": 5.085910652920962e-07, "loss": 0.114, "step": 760 }, { "epoch": 1.54, "grad_norm": 126.0, "learning_rate": 5.017182130584192e-07, "loss": 0.0166, "step": 770 }, { "epoch": 1.56, "grad_norm": 170.0, "learning_rate": 4.948453608247422e-07, "loss": 0.0905, "step": 780 }, { "epoch": 1.58, "grad_norm": 27.875, "learning_rate": 4.879725085910652e-07, "loss": 0.1969, "step": 790 }, { "epoch": 1.6, "grad_norm": 210.0, "learning_rate": 4.810996563573884e-07, "loss": 0.1211, "step": 800 }, { "epoch": 1.62, "grad_norm": 246.0, "learning_rate": 4.742268041237113e-07, "loss": 0.2062, "step": 810 }, { "epoch": 1.6400000000000001, "grad_norm": 0.546875, "learning_rate": 4.6735395189003437e-07, "loss": 0.1751, "step": 820 }, { "epoch": 1.6600000000000001, "grad_norm": 0.310546875, "learning_rate": 4.6048109965635733e-07, "loss": 0.21, "step": 830 }, { "epoch": 1.6800000000000002, "grad_norm": 0.205078125, "learning_rate": 4.536082474226804e-07, "loss": 0.1722, "step": 840 }, { "epoch": 1.7, "grad_norm": 0.31640625, "learning_rate": 4.4673539518900345e-07, "loss": 0.2915, "step": 850 }, { "epoch": 1.72, "grad_norm": 0.1943359375, "learning_rate": 4.398625429553264e-07, "loss": 0.0555, "step": 860 }, { "epoch": 1.74, "grad_norm": 0.72265625, "learning_rate": 4.3298969072164947e-07, "loss": 0.0485, "step": 870 }, { "epoch": 1.76, "grad_norm": 1.3046875, "learning_rate": 4.261168384879725e-07, "loss": 0.0254, "step": 880 }, { "epoch": 1.78, "grad_norm": 143.0, "learning_rate": 4.1924398625429554e-07, "loss": 0.1165, "step": 890 }, { "epoch": 1.8, "grad_norm": 196.0, "learning_rate": 4.123711340206185e-07, "loss": 0.2069, "step": 900 }, { "epoch": 1.8199999999999998, "grad_norm": 0.1796875, "learning_rate": 4.0549828178694155e-07, "loss": 0.2645, "step": 910 }, { "epoch": 1.8399999999999999, "grad_norm": 213.0, "learning_rate": 3.986254295532646e-07, "loss": 0.0918, "step": 920 }, { "epoch": 1.8599999999999999, "grad_norm": 338.0, "learning_rate": 3.917525773195876e-07, "loss": 0.2423, "step": 930 }, { "epoch": 1.88, "grad_norm": 280.0, "learning_rate": 3.8487972508591063e-07, "loss": 0.2567, "step": 940 }, { "epoch": 1.9, "grad_norm": 4.5, "learning_rate": 3.7800687285223364e-07, "loss": 0.0007, "step": 950 }, { "epoch": 1.92, "grad_norm": 1.40625, "learning_rate": 3.711340206185567e-07, "loss": 0.2774, "step": 960 }, { "epoch": 1.94, "grad_norm": 196.0, "learning_rate": 3.642611683848797e-07, "loss": 0.1821, "step": 970 }, { "epoch": 1.96, "grad_norm": 95.0, "learning_rate": 3.573883161512027e-07, "loss": 0.333, "step": 980 }, { "epoch": 1.98, "grad_norm": 0.2177734375, "learning_rate": 3.5051546391752573e-07, "loss": 0.1336, "step": 990 }, { "epoch": 2.0, "grad_norm": 0.314453125, "learning_rate": 3.436426116838488e-07, "loss": 0.1321, "step": 1000 }, { "epoch": 2.0, "eval_loss": 0.2803506553173065, "eval_model_preparation_time": 0.0055, "eval_runtime": 90.8543, "eval_samples_per_second": 9.917, "eval_steps_per_second": 2.487, "step": 1000 }, { "epoch": 2.02, "grad_norm": 0.365234375, "learning_rate": 3.3676975945017185e-07, "loss": 0.2451, "step": 1010 }, { "epoch": 2.04, "grad_norm": 3.890625, "learning_rate": 3.298969072164948e-07, "loss": 0.2094, "step": 1020 }, { "epoch": 2.06, "grad_norm": 115.5, "learning_rate": 3.2302405498281787e-07, "loss": 0.3415, "step": 1030 }, { "epoch": 2.08, "grad_norm": 242.0, "learning_rate": 3.161512027491409e-07, "loss": 0.2187, "step": 1040 }, { "epoch": 2.1, "grad_norm": 158.0, "learning_rate": 3.0927835051546394e-07, "loss": 0.2182, "step": 1050 }, { "epoch": 2.12, "grad_norm": 24.625, "learning_rate": 3.024054982817869e-07, "loss": 0.0973, "step": 1060 }, { "epoch": 2.14, "grad_norm": 0.70703125, "learning_rate": 2.9553264604810995e-07, "loss": 0.1385, "step": 1070 }, { "epoch": 2.16, "grad_norm": 175.0, "learning_rate": 2.8865979381443296e-07, "loss": 0.1739, "step": 1080 }, { "epoch": 2.18, "grad_norm": 0.73828125, "learning_rate": 2.81786941580756e-07, "loss": 0.0563, "step": 1090 }, { "epoch": 2.2, "grad_norm": 308.0, "learning_rate": 2.7491408934707903e-07, "loss": 0.4428, "step": 1100 }, { "epoch": 2.22, "grad_norm": 2.984375, "learning_rate": 2.6804123711340204e-07, "loss": 0.1983, "step": 1110 }, { "epoch": 2.24, "grad_norm": 0.9765625, "learning_rate": 2.611683848797251e-07, "loss": 0.1572, "step": 1120 }, { "epoch": 2.26, "grad_norm": 0.234375, "learning_rate": 2.542955326460481e-07, "loss": 0.2067, "step": 1130 }, { "epoch": 2.2800000000000002, "grad_norm": 246.0, "learning_rate": 2.474226804123711e-07, "loss": 0.3285, "step": 1140 }, { "epoch": 2.3, "grad_norm": 125.5, "learning_rate": 2.405498281786942e-07, "loss": 0.2141, "step": 1150 }, { "epoch": 2.32, "grad_norm": 0.4140625, "learning_rate": 2.3367697594501719e-07, "loss": 0.045, "step": 1160 }, { "epoch": 2.34, "grad_norm": 0.271484375, "learning_rate": 2.268041237113402e-07, "loss": 0.1434, "step": 1170 }, { "epoch": 2.36, "grad_norm": 120.0, "learning_rate": 2.199312714776632e-07, "loss": 0.2549, "step": 1180 }, { "epoch": 2.38, "grad_norm": 0.58203125, "learning_rate": 2.1305841924398624e-07, "loss": 0.1642, "step": 1190 }, { "epoch": 2.4, "grad_norm": 0.26953125, "learning_rate": 2.0618556701030925e-07, "loss": 0.1319, "step": 1200 }, { "epoch": 2.42, "grad_norm": 4.65625, "learning_rate": 1.993127147766323e-07, "loss": 0.0939, "step": 1210 }, { "epoch": 2.44, "grad_norm": 7.40625, "learning_rate": 1.9243986254295532e-07, "loss": 0.0408, "step": 1220 }, { "epoch": 2.46, "grad_norm": 158.0, "learning_rate": 1.8556701030927835e-07, "loss": 0.0666, "step": 1230 }, { "epoch": 2.48, "grad_norm": 0.59765625, "learning_rate": 1.7869415807560136e-07, "loss": 0.1051, "step": 1240 }, { "epoch": 2.5, "grad_norm": 2.046875, "learning_rate": 1.718213058419244e-07, "loss": 0.1325, "step": 1250 }, { "epoch": 2.52, "grad_norm": 196.0, "learning_rate": 1.649484536082474e-07, "loss": 0.1399, "step": 1260 }, { "epoch": 2.54, "grad_norm": 1.2265625, "learning_rate": 1.5807560137457044e-07, "loss": 0.1376, "step": 1270 }, { "epoch": 2.56, "grad_norm": 382.0, "learning_rate": 1.5120274914089345e-07, "loss": 0.4664, "step": 1280 }, { "epoch": 2.58, "grad_norm": 0.203125, "learning_rate": 1.4432989690721648e-07, "loss": 0.0066, "step": 1290 }, { "epoch": 2.6, "grad_norm": 0.232421875, "learning_rate": 1.3745704467353952e-07, "loss": 0.1785, "step": 1300 }, { "epoch": 2.62, "grad_norm": 150.0, "learning_rate": 1.3058419243986255e-07, "loss": 0.215, "step": 1310 }, { "epoch": 2.64, "grad_norm": 176.0, "learning_rate": 1.2371134020618556e-07, "loss": 0.2701, "step": 1320 }, { "epoch": 2.66, "grad_norm": 60.25, "learning_rate": 1.1683848797250859e-07, "loss": 0.1865, "step": 1330 }, { "epoch": 2.68, "grad_norm": 0.55859375, "learning_rate": 1.099656357388316e-07, "loss": 0.0705, "step": 1340 }, { "epoch": 2.7, "grad_norm": 11.5625, "learning_rate": 1.0309278350515462e-07, "loss": 0.1022, "step": 1350 }, { "epoch": 2.7199999999999998, "grad_norm": 130.0, "learning_rate": 9.621993127147766e-08, "loss": 0.1533, "step": 1360 }, { "epoch": 2.74, "grad_norm": 0.19140625, "learning_rate": 8.934707903780068e-08, "loss": 0.2216, "step": 1370 }, { "epoch": 2.76, "grad_norm": 0.107421875, "learning_rate": 8.24742268041237e-08, "loss": 0.1827, "step": 1380 }, { "epoch": 2.7800000000000002, "grad_norm": 144.0, "learning_rate": 7.560137457044672e-08, "loss": 0.1647, "step": 1390 }, { "epoch": 2.8, "grad_norm": 2.71875, "learning_rate": 6.872852233676976e-08, "loss": 0.2745, "step": 1400 }, { "epoch": 2.82, "grad_norm": 0.5390625, "learning_rate": 6.185567010309278e-08, "loss": 0.2536, "step": 1410 }, { "epoch": 2.84, "grad_norm": 0.10693359375, "learning_rate": 5.49828178694158e-08, "loss": 0.1457, "step": 1420 }, { "epoch": 2.86, "grad_norm": 0.1328125, "learning_rate": 4.810996563573883e-08, "loss": 0.1346, "step": 1430 }, { "epoch": 2.88, "grad_norm": 1.28125, "learning_rate": 4.123711340206185e-08, "loss": 0.0228, "step": 1440 }, { "epoch": 2.9, "grad_norm": 37.25, "learning_rate": 3.436426116838488e-08, "loss": 0.1333, "step": 1450 }, { "epoch": 2.92, "grad_norm": 203.0, "learning_rate": 2.74914089347079e-08, "loss": 0.0583, "step": 1460 }, { "epoch": 2.94, "grad_norm": 162.0, "learning_rate": 2.0618556701030925e-08, "loss": 0.0219, "step": 1470 }, { "epoch": 2.96, "grad_norm": 180.0, "learning_rate": 1.374570446735395e-08, "loss": 0.0781, "step": 1480 }, { "epoch": 2.98, "grad_norm": 147.0, "learning_rate": 6.872852233676975e-09, "loss": 0.0778, "step": 1490 }, { "epoch": 3.0, "grad_norm": 179.0, "learning_rate": 0.0, "loss": 0.1859, "step": 1500 }, { "epoch": 3.0, "eval_loss": 0.2740519940853119, "eval_model_preparation_time": 0.0055, "eval_runtime": 90.7126, "eval_samples_per_second": 9.932, "eval_steps_per_second": 2.491, "step": 1500 } ], "logging_steps": 10, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0457851977728e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }