{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004, "grad_norm": 115.85907745361328, "learning_rate": 4.997995991983968e-05, "loss": 3.7476, "step": 10 }, { "epoch": 0.008, "grad_norm": 66.54586029052734, "learning_rate": 4.977955911823648e-05, "loss": 2.8414, "step": 20 }, { "epoch": 0.012, "grad_norm": 55.67858123779297, "learning_rate": 4.957915831663327e-05, "loss": 2.3755, "step": 30 }, { "epoch": 0.016, "grad_norm": 49.31084060668945, "learning_rate": 4.937875751503006e-05, "loss": 1.9815, "step": 40 }, { "epoch": 0.02, "grad_norm": 50.2120475769043, "learning_rate": 4.917835671342685e-05, "loss": 2.2454, "step": 50 }, { "epoch": 0.024, "grad_norm": 33.266048431396484, "learning_rate": 4.897795591182365e-05, "loss": 2.4204, "step": 60 }, { "epoch": 0.028, "grad_norm": 25.996694564819336, "learning_rate": 4.877755511022044e-05, "loss": 1.4711, "step": 70 }, { "epoch": 0.032, "grad_norm": 43.97441864013672, "learning_rate": 4.8577154308617234e-05, "loss": 1.697, "step": 80 }, { "epoch": 0.036, "grad_norm": 37.83395767211914, "learning_rate": 4.8376753507014026e-05, "loss": 1.7633, "step": 90 }, { "epoch": 0.04, "grad_norm": 26.54341697692871, "learning_rate": 4.8176352705410824e-05, "loss": 1.3992, "step": 100 }, { "epoch": 0.044, "grad_norm": 42.20634841918945, "learning_rate": 4.797595190380762e-05, "loss": 1.6443, "step": 110 }, { "epoch": 0.048, "grad_norm": 24.657821655273438, "learning_rate": 4.7775551102204415e-05, "loss": 1.8471, "step": 120 }, { "epoch": 0.052, "grad_norm": 38.727420806884766, "learning_rate": 4.7575150300601207e-05, "loss": 1.5293, "step": 130 }, { "epoch": 0.056, "grad_norm": 31.97869873046875, "learning_rate": 4.7374749498998e-05, "loss": 1.6212, "step": 140 }, { "epoch": 0.06, "grad_norm": 31.056962966918945, "learning_rate": 4.717434869739479e-05, "loss": 1.4407, "step": 150 }, { "epoch": 0.064, "grad_norm": 29.63347053527832, "learning_rate": 4.697394789579159e-05, "loss": 1.1833, "step": 160 }, { "epoch": 0.068, "grad_norm": 44.844268798828125, "learning_rate": 4.677354709418838e-05, "loss": 1.5756, "step": 170 }, { "epoch": 0.072, "grad_norm": 31.4070987701416, "learning_rate": 4.657314629258517e-05, "loss": 1.4358, "step": 180 }, { "epoch": 0.076, "grad_norm": 26.982776641845703, "learning_rate": 4.6372745490981964e-05, "loss": 1.234, "step": 190 }, { "epoch": 0.08, "grad_norm": 19.802730560302734, "learning_rate": 4.617234468937876e-05, "loss": 1.3504, "step": 200 }, { "epoch": 0.084, "grad_norm": 33.88198471069336, "learning_rate": 4.5971943887775554e-05, "loss": 1.5029, "step": 210 }, { "epoch": 0.088, "grad_norm": 24.533716201782227, "learning_rate": 4.5771543086172346e-05, "loss": 1.2978, "step": 220 }, { "epoch": 0.092, "grad_norm": 27.563339233398438, "learning_rate": 4.557114228456914e-05, "loss": 1.7014, "step": 230 }, { "epoch": 0.096, "grad_norm": 29.428752899169922, "learning_rate": 4.5370741482965936e-05, "loss": 1.3845, "step": 240 }, { "epoch": 0.1, "grad_norm": 20.272520065307617, "learning_rate": 4.517034068136273e-05, "loss": 1.2192, "step": 250 }, { "epoch": 0.104, "grad_norm": 47.12469482421875, "learning_rate": 4.496993987975952e-05, "loss": 1.2814, "step": 260 }, { "epoch": 0.108, "grad_norm": 18.20330238342285, "learning_rate": 4.476953907815631e-05, "loss": 1.2717, "step": 270 }, { "epoch": 0.112, "grad_norm": 39.07451248168945, "learning_rate": 4.456913827655311e-05, "loss": 1.3291, "step": 280 }, { "epoch": 0.116, "grad_norm": 50.37272644042969, "learning_rate": 4.43687374749499e-05, "loss": 1.3691, "step": 290 }, { "epoch": 0.12, "grad_norm": 23.233367919921875, "learning_rate": 4.4168336673346694e-05, "loss": 1.4183, "step": 300 }, { "epoch": 0.124, "grad_norm": 22.46800422668457, "learning_rate": 4.3967935871743486e-05, "loss": 1.1226, "step": 310 }, { "epoch": 0.128, "grad_norm": 24.424856185913086, "learning_rate": 4.3767535070140284e-05, "loss": 1.3413, "step": 320 }, { "epoch": 0.132, "grad_norm": 14.698283195495605, "learning_rate": 4.3567134268537076e-05, "loss": 1.2009, "step": 330 }, { "epoch": 0.136, "grad_norm": 23.5820369720459, "learning_rate": 4.336673346693387e-05, "loss": 1.0799, "step": 340 }, { "epoch": 0.14, "grad_norm": 26.510631561279297, "learning_rate": 4.316633266533066e-05, "loss": 1.0801, "step": 350 }, { "epoch": 0.144, "grad_norm": 18.498275756835938, "learning_rate": 4.296593186372745e-05, "loss": 1.1631, "step": 360 }, { "epoch": 0.148, "grad_norm": 35.2937126159668, "learning_rate": 4.2765531062124256e-05, "loss": 1.3777, "step": 370 }, { "epoch": 0.152, "grad_norm": 40.05356216430664, "learning_rate": 4.256513026052105e-05, "loss": 1.1189, "step": 380 }, { "epoch": 0.156, "grad_norm": 18.918344497680664, "learning_rate": 4.236472945891784e-05, "loss": 1.0721, "step": 390 }, { "epoch": 0.16, "grad_norm": 20.29583168029785, "learning_rate": 4.216432865731463e-05, "loss": 0.9337, "step": 400 }, { "epoch": 0.164, "grad_norm": 19.447803497314453, "learning_rate": 4.2081993569131834e-05, "loss": 1.2047, "step": 410 }, { "epoch": 0.168, "grad_norm": 26.82716178894043, "learning_rate": 4.188102893890675e-05, "loss": 1.1294, "step": 420 }, { "epoch": 0.172, "grad_norm": 12.995594024658203, "learning_rate": 4.168006430868168e-05, "loss": 1.0033, "step": 430 }, { "epoch": 0.176, "grad_norm": 21.796598434448242, "learning_rate": 4.14790996784566e-05, "loss": 0.8864, "step": 440 }, { "epoch": 0.18, "grad_norm": 13.911988258361816, "learning_rate": 4.1278135048231516e-05, "loss": 0.8974, "step": 450 }, { "epoch": 0.184, "grad_norm": 25.945011138916016, "learning_rate": 4.1077170418006434e-05, "loss": 1.261, "step": 460 }, { "epoch": 0.188, "grad_norm": 19.943857192993164, "learning_rate": 4.087620578778135e-05, "loss": 1.0262, "step": 470 }, { "epoch": 0.192, "grad_norm": 23.558696746826172, "learning_rate": 4.067524115755627e-05, "loss": 0.9572, "step": 480 }, { "epoch": 0.196, "grad_norm": 42.70231628417969, "learning_rate": 4.047427652733119e-05, "loss": 1.044, "step": 490 }, { "epoch": 0.2, "grad_norm": 16.41856575012207, "learning_rate": 4.027331189710611e-05, "loss": 1.0562, "step": 500 }, { "epoch": 0.204, "grad_norm": 13.945029258728027, "learning_rate": 4.0072347266881035e-05, "loss": 0.9637, "step": 510 }, { "epoch": 0.208, "grad_norm": 16.55429458618164, "learning_rate": 3.9871382636655953e-05, "loss": 0.8104, "step": 520 }, { "epoch": 0.212, "grad_norm": 24.434778213500977, "learning_rate": 3.967041800643087e-05, "loss": 1.1942, "step": 530 }, { "epoch": 0.216, "grad_norm": 20.01283836364746, "learning_rate": 3.946945337620579e-05, "loss": 0.9209, "step": 540 }, { "epoch": 0.22, "grad_norm": 18.98524284362793, "learning_rate": 3.926848874598071e-05, "loss": 0.7445, "step": 550 }, { "epoch": 0.224, "grad_norm": 22.44414710998535, "learning_rate": 3.906752411575563e-05, "loss": 0.9528, "step": 560 }, { "epoch": 0.228, "grad_norm": 19.79057502746582, "learning_rate": 3.886655948553055e-05, "loss": 0.9216, "step": 570 }, { "epoch": 0.232, "grad_norm": 17.453460693359375, "learning_rate": 3.866559485530547e-05, "loss": 0.9834, "step": 580 }, { "epoch": 0.236, "grad_norm": 29.218969345092773, "learning_rate": 3.846463022508039e-05, "loss": 0.9945, "step": 590 }, { "epoch": 0.24, "grad_norm": 17.652963638305664, "learning_rate": 3.826366559485531e-05, "loss": 0.9308, "step": 600 }, { "epoch": 0.244, "grad_norm": 20.76468849182129, "learning_rate": 3.806270096463023e-05, "loss": 0.8735, "step": 610 }, { "epoch": 0.248, "grad_norm": 26.41815757751465, "learning_rate": 3.786173633440515e-05, "loss": 1.2878, "step": 620 }, { "epoch": 0.252, "grad_norm": 41.02421951293945, "learning_rate": 3.7660771704180066e-05, "loss": 1.1299, "step": 630 }, { "epoch": 0.256, "grad_norm": 17.34744644165039, "learning_rate": 3.7459807073954985e-05, "loss": 0.8315, "step": 640 }, { "epoch": 0.26, "grad_norm": 14.293941497802734, "learning_rate": 3.725884244372991e-05, "loss": 0.8405, "step": 650 }, { "epoch": 0.264, "grad_norm": 15.149956703186035, "learning_rate": 3.705787781350483e-05, "loss": 1.0297, "step": 660 }, { "epoch": 0.268, "grad_norm": 17.754810333251953, "learning_rate": 3.685691318327975e-05, "loss": 0.9322, "step": 670 }, { "epoch": 0.272, "grad_norm": 21.743669509887695, "learning_rate": 3.6655948553054666e-05, "loss": 0.9069, "step": 680 }, { "epoch": 0.276, "grad_norm": 29.161598205566406, "learning_rate": 3.6454983922829585e-05, "loss": 0.8633, "step": 690 }, { "epoch": 0.28, "grad_norm": 16.16539192199707, "learning_rate": 3.6254019292604503e-05, "loss": 0.7662, "step": 700 }, { "epoch": 0.284, "grad_norm": 25.36922264099121, "learning_rate": 3.605305466237942e-05, "loss": 0.7945, "step": 710 }, { "epoch": 0.288, "grad_norm": 24.251853942871094, "learning_rate": 3.585209003215435e-05, "loss": 0.9693, "step": 720 }, { "epoch": 0.292, "grad_norm": 15.235057830810547, "learning_rate": 3.5651125401929266e-05, "loss": 0.8969, "step": 730 }, { "epoch": 0.296, "grad_norm": 14.464040756225586, "learning_rate": 3.5450160771704185e-05, "loss": 0.7205, "step": 740 }, { "epoch": 0.3, "grad_norm": 23.044424057006836, "learning_rate": 3.5249196141479104e-05, "loss": 0.839, "step": 750 }, { "epoch": 0.304, "grad_norm": 25.620925903320312, "learning_rate": 3.504823151125402e-05, "loss": 0.8887, "step": 760 }, { "epoch": 0.308, "grad_norm": 10.347396850585938, "learning_rate": 3.484726688102894e-05, "loss": 0.645, "step": 770 }, { "epoch": 0.312, "grad_norm": 19.114471435546875, "learning_rate": 3.464630225080386e-05, "loss": 0.8341, "step": 780 }, { "epoch": 0.316, "grad_norm": 17.528043746948242, "learning_rate": 3.4445337620578785e-05, "loss": 0.7108, "step": 790 }, { "epoch": 0.32, "grad_norm": 13.186959266662598, "learning_rate": 3.4244372990353704e-05, "loss": 0.7991, "step": 800 }, { "epoch": 0.324, "grad_norm": 25.02106475830078, "learning_rate": 3.404340836012862e-05, "loss": 0.7284, "step": 810 }, { "epoch": 0.328, "grad_norm": 14.035198211669922, "learning_rate": 3.384244372990354e-05, "loss": 0.7589, "step": 820 }, { "epoch": 0.332, "grad_norm": 11.368013381958008, "learning_rate": 3.364147909967846e-05, "loss": 0.7638, "step": 830 }, { "epoch": 0.336, "grad_norm": 21.951080322265625, "learning_rate": 3.344051446945338e-05, "loss": 0.7869, "step": 840 }, { "epoch": 0.34, "grad_norm": 17.966073989868164, "learning_rate": 3.32395498392283e-05, "loss": 0.6792, "step": 850 }, { "epoch": 0.344, "grad_norm": 36.02198791503906, "learning_rate": 3.3038585209003216e-05, "loss": 0.6968, "step": 860 }, { "epoch": 0.348, "grad_norm": 32.43560791015625, "learning_rate": 3.283762057877814e-05, "loss": 0.7523, "step": 870 }, { "epoch": 0.352, "grad_norm": 30.29490852355957, "learning_rate": 3.263665594855306e-05, "loss": 0.6548, "step": 880 }, { "epoch": 0.356, "grad_norm": 8.957921981811523, "learning_rate": 3.243569131832798e-05, "loss": 0.7151, "step": 890 }, { "epoch": 0.36, "grad_norm": 15.583487510681152, "learning_rate": 3.22347266881029e-05, "loss": 0.625, "step": 900 }, { "epoch": 0.364, "grad_norm": 23.470478057861328, "learning_rate": 3.2033762057877816e-05, "loss": 0.9571, "step": 910 }, { "epoch": 0.368, "grad_norm": 31.515092849731445, "learning_rate": 3.1832797427652735e-05, "loss": 0.7395, "step": 920 }, { "epoch": 0.372, "grad_norm": 14.246073722839355, "learning_rate": 3.1631832797427654e-05, "loss": 0.6884, "step": 930 }, { "epoch": 0.376, "grad_norm": 25.352590560913086, "learning_rate": 3.143086816720258e-05, "loss": 0.772, "step": 940 }, { "epoch": 0.38, "grad_norm": 14.441354751586914, "learning_rate": 3.12299035369775e-05, "loss": 0.7406, "step": 950 }, { "epoch": 0.384, "grad_norm": 29.33234405517578, "learning_rate": 3.102893890675242e-05, "loss": 0.7242, "step": 960 }, { "epoch": 0.388, "grad_norm": 16.018104553222656, "learning_rate": 3.0827974276527335e-05, "loss": 0.9183, "step": 970 }, { "epoch": 0.392, "grad_norm": 14.766180992126465, "learning_rate": 3.0627009646302254e-05, "loss": 0.8971, "step": 980 }, { "epoch": 0.396, "grad_norm": 10.733450889587402, "learning_rate": 3.042604501607717e-05, "loss": 0.5546, "step": 990 }, { "epoch": 0.4, "grad_norm": 15.318602561950684, "learning_rate": 3.0225080385852088e-05, "loss": 0.7582, "step": 1000 }, { "epoch": 0.404, "grad_norm": 16.220378875732422, "learning_rate": 3.0024115755627013e-05, "loss": 0.7034, "step": 1010 }, { "epoch": 0.408, "grad_norm": 24.005294799804688, "learning_rate": 2.9823151125401932e-05, "loss": 0.692, "step": 1020 }, { "epoch": 0.412, "grad_norm": 10.050074577331543, "learning_rate": 2.962218649517685e-05, "loss": 0.6043, "step": 1030 }, { "epoch": 0.416, "grad_norm": 14.405035972595215, "learning_rate": 2.942122186495177e-05, "loss": 0.779, "step": 1040 }, { "epoch": 0.42, "grad_norm": 21.141399383544922, "learning_rate": 2.9220257234726688e-05, "loss": 0.6058, "step": 1050 }, { "epoch": 0.424, "grad_norm": 12.982873916625977, "learning_rate": 2.9019292604501607e-05, "loss": 0.6479, "step": 1060 }, { "epoch": 0.428, "grad_norm": 11.213690757751465, "learning_rate": 2.8818327974276526e-05, "loss": 0.5357, "step": 1070 }, { "epoch": 0.432, "grad_norm": 9.400264739990234, "learning_rate": 2.861736334405145e-05, "loss": 0.7004, "step": 1080 }, { "epoch": 0.436, "grad_norm": 24.078365325927734, "learning_rate": 2.841639871382637e-05, "loss": 0.574, "step": 1090 }, { "epoch": 0.44, "grad_norm": 10.422481536865234, "learning_rate": 2.821543408360129e-05, "loss": 0.5758, "step": 1100 }, { "epoch": 0.444, "grad_norm": 14.57651424407959, "learning_rate": 2.8014469453376207e-05, "loss": 0.6732, "step": 1110 }, { "epoch": 0.448, "grad_norm": 11.885628700256348, "learning_rate": 2.7813504823151126e-05, "loss": 0.5472, "step": 1120 }, { "epoch": 0.452, "grad_norm": 14.696109771728516, "learning_rate": 2.7612540192926045e-05, "loss": 0.6875, "step": 1130 }, { "epoch": 0.456, "grad_norm": 10.769063949584961, "learning_rate": 2.7411575562700963e-05, "loss": 0.63, "step": 1140 }, { "epoch": 0.46, "grad_norm": 14.238389015197754, "learning_rate": 2.7210610932475882e-05, "loss": 0.6574, "step": 1150 }, { "epoch": 0.464, "grad_norm": 10.263813972473145, "learning_rate": 2.7009646302250807e-05, "loss": 0.6817, "step": 1160 }, { "epoch": 0.468, "grad_norm": 18.851377487182617, "learning_rate": 2.6808681672025726e-05, "loss": 0.49, "step": 1170 }, { "epoch": 0.472, "grad_norm": 17.136259078979492, "learning_rate": 2.6607717041800645e-05, "loss": 0.6844, "step": 1180 }, { "epoch": 0.476, "grad_norm": 12.520773887634277, "learning_rate": 2.6406752411575564e-05, "loss": 0.542, "step": 1190 }, { "epoch": 0.48, "grad_norm": 8.42679214477539, "learning_rate": 2.6205787781350482e-05, "loss": 0.517, "step": 1200 }, { "epoch": 0.484, "grad_norm": 16.89554214477539, "learning_rate": 2.60048231511254e-05, "loss": 0.5378, "step": 1210 }, { "epoch": 0.488, "grad_norm": 20.790712356567383, "learning_rate": 2.580385852090032e-05, "loss": 0.4598, "step": 1220 }, { "epoch": 0.492, "grad_norm": 13.592977523803711, "learning_rate": 2.5602893890675245e-05, "loss": 0.5201, "step": 1230 }, { "epoch": 0.496, "grad_norm": 17.89519500732422, "learning_rate": 2.5401929260450164e-05, "loss": 0.5422, "step": 1240 }, { "epoch": 0.5, "grad_norm": 12.296363830566406, "learning_rate": 2.5200964630225083e-05, "loss": 0.5155, "step": 1250 }, { "epoch": 0.504, "grad_norm": 14.820162773132324, "learning_rate": 2.5e-05, "loss": 0.5141, "step": 1260 }, { "epoch": 0.508, "grad_norm": 14.70346736907959, "learning_rate": 2.479903536977492e-05, "loss": 0.5812, "step": 1270 }, { "epoch": 0.512, "grad_norm": 18.85811424255371, "learning_rate": 2.4598070739549842e-05, "loss": 0.5159, "step": 1280 }, { "epoch": 0.516, "grad_norm": 12.222372055053711, "learning_rate": 2.439710610932476e-05, "loss": 0.4363, "step": 1290 }, { "epoch": 0.52, "grad_norm": 49.18086242675781, "learning_rate": 2.419614147909968e-05, "loss": 0.7475, "step": 1300 }, { "epoch": 0.524, "grad_norm": 12.499253273010254, "learning_rate": 2.3995176848874598e-05, "loss": 0.6813, "step": 1310 }, { "epoch": 0.528, "grad_norm": 14.978521347045898, "learning_rate": 2.379421221864952e-05, "loss": 0.5517, "step": 1320 }, { "epoch": 0.532, "grad_norm": 16.008121490478516, "learning_rate": 2.359324758842444e-05, "loss": 0.5561, "step": 1330 }, { "epoch": 0.536, "grad_norm": 17.769926071166992, "learning_rate": 2.3392282958199358e-05, "loss": 0.5745, "step": 1340 }, { "epoch": 0.54, "grad_norm": 11.014586448669434, "learning_rate": 2.3191318327974276e-05, "loss": 0.4567, "step": 1350 }, { "epoch": 0.544, "grad_norm": 10.055672645568848, "learning_rate": 2.29903536977492e-05, "loss": 0.5071, "step": 1360 }, { "epoch": 0.548, "grad_norm": 10.68797779083252, "learning_rate": 2.2789389067524117e-05, "loss": 0.6491, "step": 1370 }, { "epoch": 0.552, "grad_norm": 9.747801780700684, "learning_rate": 2.2588424437299036e-05, "loss": 0.4888, "step": 1380 }, { "epoch": 0.556, "grad_norm": 11.837730407714844, "learning_rate": 2.2387459807073958e-05, "loss": 0.4526, "step": 1390 }, { "epoch": 0.56, "grad_norm": 14.05400562286377, "learning_rate": 2.2186495176848876e-05, "loss": 0.6307, "step": 1400 }, { "epoch": 0.564, "grad_norm": 12.709571838378906, "learning_rate": 2.1985530546623795e-05, "loss": 0.427, "step": 1410 }, { "epoch": 0.568, "grad_norm": 10.895342826843262, "learning_rate": 2.1784565916398714e-05, "loss": 0.4546, "step": 1420 }, { "epoch": 0.572, "grad_norm": 13.848987579345703, "learning_rate": 2.1583601286173636e-05, "loss": 0.4159, "step": 1430 }, { "epoch": 0.576, "grad_norm": 16.61017608642578, "learning_rate": 2.1382636655948555e-05, "loss": 0.733, "step": 1440 }, { "epoch": 0.58, "grad_norm": 10.283616065979004, "learning_rate": 2.1181672025723473e-05, "loss": 0.4581, "step": 1450 }, { "epoch": 0.584, "grad_norm": 11.248019218444824, "learning_rate": 2.0980707395498395e-05, "loss": 0.462, "step": 1460 }, { "epoch": 0.588, "grad_norm": 10.817742347717285, "learning_rate": 2.0779742765273314e-05, "loss": 0.451, "step": 1470 }, { "epoch": 0.592, "grad_norm": 10.615836143493652, "learning_rate": 2.0578778135048233e-05, "loss": 0.5714, "step": 1480 }, { "epoch": 0.596, "grad_norm": 12.22169017791748, "learning_rate": 2.037781350482315e-05, "loss": 0.8718, "step": 1490 }, { "epoch": 0.6, "grad_norm": 13.216890335083008, "learning_rate": 2.0176848874598074e-05, "loss": 0.3766, "step": 1500 }, { "epoch": 0.604, "grad_norm": 13.472972869873047, "learning_rate": 1.9975884244372992e-05, "loss": 0.5891, "step": 1510 }, { "epoch": 0.608, "grad_norm": 20.179187774658203, "learning_rate": 1.977491961414791e-05, "loss": 0.3744, "step": 1520 }, { "epoch": 0.612, "grad_norm": 12.630617141723633, "learning_rate": 1.957395498392283e-05, "loss": 0.4564, "step": 1530 }, { "epoch": 0.616, "grad_norm": 20.459508895874023, "learning_rate": 1.9372990353697752e-05, "loss": 0.4566, "step": 1540 }, { "epoch": 0.62, "grad_norm": 16.580251693725586, "learning_rate": 1.917202572347267e-05, "loss": 0.447, "step": 1550 }, { "epoch": 0.624, "grad_norm": 13.90858268737793, "learning_rate": 1.897106109324759e-05, "loss": 0.4133, "step": 1560 }, { "epoch": 0.628, "grad_norm": 10.297750473022461, "learning_rate": 1.877009646302251e-05, "loss": 0.9703, "step": 1570 }, { "epoch": 0.632, "grad_norm": 19.886884689331055, "learning_rate": 1.856913183279743e-05, "loss": 0.6253, "step": 1580 }, { "epoch": 0.636, "grad_norm": 10.709681510925293, "learning_rate": 1.836816720257235e-05, "loss": 0.4657, "step": 1590 }, { "epoch": 0.64, "grad_norm": 16.986331939697266, "learning_rate": 1.8167202572347267e-05, "loss": 0.3986, "step": 1600 }, { "epoch": 0.644, "grad_norm": 13.649085998535156, "learning_rate": 1.796623794212219e-05, "loss": 0.4337, "step": 1610 }, { "epoch": 0.648, "grad_norm": 7.645134449005127, "learning_rate": 1.7765273311897108e-05, "loss": 0.3901, "step": 1620 }, { "epoch": 0.652, "grad_norm": 11.727263450622559, "learning_rate": 1.7564308681672027e-05, "loss": 0.3545, "step": 1630 }, { "epoch": 0.656, "grad_norm": 6.705881595611572, "learning_rate": 1.736334405144695e-05, "loss": 0.3471, "step": 1640 }, { "epoch": 0.66, "grad_norm": 12.363304138183594, "learning_rate": 1.7162379421221868e-05, "loss": 0.4351, "step": 1650 }, { "epoch": 0.664, "grad_norm": 20.208723068237305, "learning_rate": 1.6961414790996786e-05, "loss": 0.4284, "step": 1660 }, { "epoch": 0.668, "grad_norm": 10.82363224029541, "learning_rate": 1.6760450160771705e-05, "loss": 0.3369, "step": 1670 }, { "epoch": 0.672, "grad_norm": 9.544486045837402, "learning_rate": 1.6559485530546627e-05, "loss": 0.4059, "step": 1680 }, { "epoch": 0.676, "grad_norm": 8.426627159118652, "learning_rate": 1.6358520900321546e-05, "loss": 0.4494, "step": 1690 }, { "epoch": 0.68, "grad_norm": 8.424084663391113, "learning_rate": 1.6157556270096464e-05, "loss": 0.4807, "step": 1700 }, { "epoch": 0.684, "grad_norm": 9.49954891204834, "learning_rate": 1.5956591639871383e-05, "loss": 0.3647, "step": 1710 }, { "epoch": 0.688, "grad_norm": 14.690208435058594, "learning_rate": 1.5755627009646305e-05, "loss": 0.3715, "step": 1720 }, { "epoch": 0.692, "grad_norm": 12.074922561645508, "learning_rate": 1.5554662379421224e-05, "loss": 0.491, "step": 1730 }, { "epoch": 0.696, "grad_norm": 13.278485298156738, "learning_rate": 1.5353697749196143e-05, "loss": 0.4185, "step": 1740 }, { "epoch": 0.7, "grad_norm": 12.987263679504395, "learning_rate": 1.5152733118971063e-05, "loss": 0.5613, "step": 1750 }, { "epoch": 0.704, "grad_norm": 6.863049030303955, "learning_rate": 1.4951768488745982e-05, "loss": 0.3245, "step": 1760 }, { "epoch": 0.708, "grad_norm": 11.087668418884277, "learning_rate": 1.47508038585209e-05, "loss": 0.4174, "step": 1770 }, { "epoch": 0.712, "grad_norm": 5.16309118270874, "learning_rate": 1.4549839228295819e-05, "loss": 0.3233, "step": 1780 }, { "epoch": 0.716, "grad_norm": 12.031776428222656, "learning_rate": 1.4348874598070741e-05, "loss": 0.3574, "step": 1790 }, { "epoch": 0.72, "grad_norm": 13.569413185119629, "learning_rate": 1.414790996784566e-05, "loss": 0.5619, "step": 1800 }, { "epoch": 0.724, "grad_norm": 5.905683517456055, "learning_rate": 1.3946945337620579e-05, "loss": 0.3286, "step": 1810 }, { "epoch": 0.728, "grad_norm": 10.061610221862793, "learning_rate": 1.3745980707395497e-05, "loss": 0.3899, "step": 1820 }, { "epoch": 0.732, "grad_norm": 12.293854713439941, "learning_rate": 1.354501607717042e-05, "loss": 0.4059, "step": 1830 }, { "epoch": 0.736, "grad_norm": 13.248871803283691, "learning_rate": 1.3344051446945338e-05, "loss": 0.4339, "step": 1840 }, { "epoch": 0.74, "grad_norm": 9.589434623718262, "learning_rate": 1.3143086816720257e-05, "loss": 0.4178, "step": 1850 }, { "epoch": 0.744, "grad_norm": 8.538604736328125, "learning_rate": 1.2942122186495179e-05, "loss": 0.3152, "step": 1860 }, { "epoch": 0.748, "grad_norm": 18.58129119873047, "learning_rate": 1.2741157556270097e-05, "loss": 0.4276, "step": 1870 }, { "epoch": 0.752, "grad_norm": 8.69501781463623, "learning_rate": 1.2540192926045016e-05, "loss": 0.4304, "step": 1880 }, { "epoch": 0.756, "grad_norm": 14.74836254119873, "learning_rate": 1.2339228295819937e-05, "loss": 0.3541, "step": 1890 }, { "epoch": 0.76, "grad_norm": 7.415429592132568, "learning_rate": 1.2138263665594855e-05, "loss": 0.3713, "step": 1900 }, { "epoch": 0.764, "grad_norm": 8.78702163696289, "learning_rate": 1.1937299035369776e-05, "loss": 0.3025, "step": 1910 }, { "epoch": 0.768, "grad_norm": 2.6222591400146484, "learning_rate": 1.1736334405144696e-05, "loss": 0.2279, "step": 1920 }, { "epoch": 0.772, "grad_norm": 8.457213401794434, "learning_rate": 1.1535369774919615e-05, "loss": 0.3841, "step": 1930 }, { "epoch": 0.776, "grad_norm": 9.097604751586914, "learning_rate": 1.1334405144694535e-05, "loss": 0.3436, "step": 1940 }, { "epoch": 0.78, "grad_norm": 9.933280944824219, "learning_rate": 1.1133440514469454e-05, "loss": 0.4276, "step": 1950 }, { "epoch": 0.784, "grad_norm": 9.58340072631836, "learning_rate": 1.0932475884244374e-05, "loss": 0.281, "step": 1960 }, { "epoch": 0.788, "grad_norm": 13.846723556518555, "learning_rate": 1.0731511254019293e-05, "loss": 0.2836, "step": 1970 }, { "epoch": 0.792, "grad_norm": 30.122060775756836, "learning_rate": 1.0530546623794213e-05, "loss": 0.3722, "step": 1980 }, { "epoch": 0.796, "grad_norm": 8.666303634643555, "learning_rate": 1.0329581993569132e-05, "loss": 0.2778, "step": 1990 }, { "epoch": 0.8, "grad_norm": 7.968908786773682, "learning_rate": 1.0128617363344052e-05, "loss": 0.2778, "step": 2000 }, { "epoch": 0.804, "grad_norm": 6.987481594085693, "learning_rate": 9.927652733118971e-06, "loss": 0.2778, "step": 2010 }, { "epoch": 0.808, "grad_norm": 11.547746658325195, "learning_rate": 9.726688102893891e-06, "loss": 0.3181, "step": 2020 }, { "epoch": 0.812, "grad_norm": 7.187608242034912, "learning_rate": 9.525723472668812e-06, "loss": 0.3211, "step": 2030 }, { "epoch": 0.816, "grad_norm": 14.975872039794922, "learning_rate": 9.32475884244373e-06, "loss": 0.2335, "step": 2040 }, { "epoch": 0.82, "grad_norm": 5.20744514465332, "learning_rate": 9.123794212218651e-06, "loss": 0.3012, "step": 2050 }, { "epoch": 0.824, "grad_norm": 9.876429557800293, "learning_rate": 8.92282958199357e-06, "loss": 0.3095, "step": 2060 }, { "epoch": 0.828, "grad_norm": 7.847969055175781, "learning_rate": 8.72186495176849e-06, "loss": 0.3336, "step": 2070 }, { "epoch": 0.832, "grad_norm": 5.847342014312744, "learning_rate": 8.520900321543409e-06, "loss": 0.3471, "step": 2080 }, { "epoch": 0.836, "grad_norm": 12.866349220275879, "learning_rate": 8.319935691318329e-06, "loss": 0.5096, "step": 2090 }, { "epoch": 0.84, "grad_norm": 5.676148891448975, "learning_rate": 8.118971061093248e-06, "loss": 0.3028, "step": 2100 }, { "epoch": 0.844, "grad_norm": 10.639591217041016, "learning_rate": 7.918006430868168e-06, "loss": 0.3065, "step": 2110 }, { "epoch": 0.848, "grad_norm": 4.760974884033203, "learning_rate": 7.717041800643089e-06, "loss": 0.2966, "step": 2120 }, { "epoch": 0.852, "grad_norm": 11.585098266601562, "learning_rate": 7.516077170418006e-06, "loss": 0.3466, "step": 2130 }, { "epoch": 0.856, "grad_norm": 4.685300827026367, "learning_rate": 7.315112540192927e-06, "loss": 0.397, "step": 2140 }, { "epoch": 0.86, "grad_norm": 7.504604339599609, "learning_rate": 7.1141479099678455e-06, "loss": 0.2216, "step": 2150 }, { "epoch": 0.864, "grad_norm": 16.2221736907959, "learning_rate": 6.913183279742766e-06, "loss": 0.3863, "step": 2160 }, { "epoch": 0.868, "grad_norm": 9.542070388793945, "learning_rate": 6.732315112540192e-06, "loss": 0.2478, "step": 2170 }, { "epoch": 0.872, "grad_norm": 12.575300216674805, "learning_rate": 6.531350482315113e-06, "loss": 0.3366, "step": 2180 }, { "epoch": 0.876, "grad_norm": 12.5275297164917, "learning_rate": 6.330385852090033e-06, "loss": 0.42, "step": 2190 }, { "epoch": 0.88, "grad_norm": 7.2454514503479, "learning_rate": 6.129421221864952e-06, "loss": 0.2943, "step": 2200 }, { "epoch": 0.884, "grad_norm": 5.587230682373047, "learning_rate": 5.928456591639871e-06, "loss": 0.2124, "step": 2210 }, { "epoch": 0.888, "grad_norm": 12.111313819885254, "learning_rate": 5.727491961414791e-06, "loss": 0.4733, "step": 2220 }, { "epoch": 0.892, "grad_norm": 13.287424087524414, "learning_rate": 5.526527331189711e-06, "loss": 0.3033, "step": 2230 }, { "epoch": 0.896, "grad_norm": 16.977447509765625, "learning_rate": 5.325562700964631e-06, "loss": 0.3256, "step": 2240 }, { "epoch": 0.9, "grad_norm": 6.201746463775635, "learning_rate": 5.12459807073955e-06, "loss": 0.1995, "step": 2250 }, { "epoch": 0.904, "grad_norm": 10.706707954406738, "learning_rate": 4.92363344051447e-06, "loss": 0.3303, "step": 2260 }, { "epoch": 0.908, "grad_norm": 14.989971160888672, "learning_rate": 4.7226688102893895e-06, "loss": 0.2842, "step": 2270 }, { "epoch": 0.912, "grad_norm": 15.570676803588867, "learning_rate": 4.521704180064309e-06, "loss": 0.212, "step": 2280 }, { "epoch": 0.916, "grad_norm": 3.9141690731048584, "learning_rate": 4.320739549839229e-06, "loss": 0.263, "step": 2290 }, { "epoch": 0.92, "grad_norm": 6.531361103057861, "learning_rate": 4.119774919614148e-06, "loss": 0.2503, "step": 2300 }, { "epoch": 0.924, "grad_norm": 8.718388557434082, "learning_rate": 3.918810289389068e-06, "loss": 0.2793, "step": 2310 }, { "epoch": 0.928, "grad_norm": 11.070220947265625, "learning_rate": 3.7178456591639876e-06, "loss": 0.2585, "step": 2320 }, { "epoch": 0.932, "grad_norm": 7.7603759765625, "learning_rate": 3.516881028938907e-06, "loss": 0.5099, "step": 2330 }, { "epoch": 0.936, "grad_norm": 15.190449714660645, "learning_rate": 3.3159163987138267e-06, "loss": 0.2601, "step": 2340 }, { "epoch": 0.94, "grad_norm": 9.578422546386719, "learning_rate": 3.1149517684887463e-06, "loss": 0.264, "step": 2350 }, { "epoch": 0.944, "grad_norm": 10.097108840942383, "learning_rate": 2.913987138263666e-06, "loss": 0.6162, "step": 2360 }, { "epoch": 0.948, "grad_norm": 8.5523042678833, "learning_rate": 2.7130225080385853e-06, "loss": 0.2649, "step": 2370 }, { "epoch": 0.952, "grad_norm": 8.127545356750488, "learning_rate": 2.512057877813505e-06, "loss": 0.4394, "step": 2380 }, { "epoch": 0.956, "grad_norm": 8.142045021057129, "learning_rate": 2.311093247588425e-06, "loss": 0.2339, "step": 2390 }, { "epoch": 0.96, "grad_norm": 6.302300930023193, "learning_rate": 2.1101286173633444e-06, "loss": 0.2135, "step": 2400 }, { "epoch": 0.964, "grad_norm": 4.838590621948242, "learning_rate": 1.909163987138264e-06, "loss": 0.3181, "step": 2410 }, { "epoch": 0.968, "grad_norm": 6.775151252746582, "learning_rate": 1.7081993569131833e-06, "loss": 0.2516, "step": 2420 }, { "epoch": 0.972, "grad_norm": 13.491608619689941, "learning_rate": 1.507234726688103e-06, "loss": 0.2294, "step": 2430 }, { "epoch": 0.976, "grad_norm": 7.067889213562012, "learning_rate": 1.3062700964630226e-06, "loss": 0.2206, "step": 2440 }, { "epoch": 0.98, "grad_norm": 6.7473530769348145, "learning_rate": 1.1053054662379423e-06, "loss": 0.2003, "step": 2450 }, { "epoch": 0.984, "grad_norm": 9.074109077453613, "learning_rate": 9.043408360128617e-07, "loss": 0.2343, "step": 2460 }, { "epoch": 0.988, "grad_norm": 10.73699951171875, "learning_rate": 7.033762057877814e-07, "loss": 0.6721, "step": 2470 }, { "epoch": 0.992, "grad_norm": 5.29847526550293, "learning_rate": 5.02411575562701e-07, "loss": 0.236, "step": 2480 }, { "epoch": 0.996, "grad_norm": 10.172593116760254, "learning_rate": 3.014469453376206e-07, "loss": 0.216, "step": 2490 }, { "epoch": 1.0, "grad_norm": 16.507993698120117, "learning_rate": 1.0048231511254019e-07, "loss": 0.2836, "step": 2500 } ], "logging_steps": 10, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.513321594098893e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }