{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999781770568124, "eval_steps": 500, "global_step": 1718, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0058194515166945515, "grad_norm": 292.5841369628906, "learning_rate": 1.1654988945205933e-07, "loss": 3.4335, "step": 10 }, { "epoch": 0.011638903033389103, "grad_norm": 310.4792785644531, "learning_rate": 1.5163490216845022e-07, "loss": 3.3043, "step": 20 }, { "epoch": 0.017458354550083655, "grad_norm": 255.72767639160156, "learning_rate": 1.721583189448638e-07, "loss": 2.6653, "step": 30 }, { "epoch": 0.023277806066778206, "grad_norm": 148.07073974609375, "learning_rate": 1.867199148848411e-07, "loss": 1.4797, "step": 40 }, { "epoch": 0.029097257583472758, "grad_norm": 68.8450927734375, "learning_rate": 1.9801476618772772e-07, "loss": 0.8688, "step": 50 }, { "epoch": 0.03491670910016731, "grad_norm": 17.056119918823242, "learning_rate": 2e-07, "loss": 0.4755, "step": 60 }, { "epoch": 0.04073616061686186, "grad_norm": 18.346956253051758, "learning_rate": 2e-07, "loss": 0.4493, "step": 70 }, { "epoch": 0.04655561213355641, "grad_norm": 12.33590316772461, "learning_rate": 2e-07, "loss": 0.4181, "step": 80 }, { "epoch": 0.052375063650250964, "grad_norm": 10.863751411437988, "learning_rate": 2e-07, "loss": 0.423, "step": 90 }, { "epoch": 0.058194515166945515, "grad_norm": 12.51539421081543, "learning_rate": 2e-07, "loss": 0.4138, "step": 100 }, { "epoch": 0.06401396668364007, "grad_norm": 10.982370376586914, "learning_rate": 2e-07, "loss": 0.3545, "step": 110 }, { "epoch": 0.06983341820033462, "grad_norm": 14.997520446777344, "learning_rate": 2e-07, "loss": 0.4036, "step": 120 }, { "epoch": 0.07565286971702917, "grad_norm": 15.653407096862793, "learning_rate": 2e-07, "loss": 0.3872, "step": 130 }, { "epoch": 0.08147232123372372, "grad_norm": 14.640938758850098, "learning_rate": 2e-07, "loss": 0.3816, "step": 140 }, { "epoch": 0.08729177275041827, "grad_norm": 12.07015609741211, "learning_rate": 2e-07, "loss": 0.3119, "step": 150 }, { "epoch": 0.09311122426711282, "grad_norm": 23.116605758666992, "learning_rate": 2e-07, "loss": 0.3326, "step": 160 }, { "epoch": 0.09893067578380738, "grad_norm": 14.743234634399414, "learning_rate": 2e-07, "loss": 0.3304, "step": 170 }, { "epoch": 0.10475012730050193, "grad_norm": 13.361212730407715, "learning_rate": 2e-07, "loss": 0.3284, "step": 180 }, { "epoch": 0.11056957881719648, "grad_norm": 14.600347518920898, "learning_rate": 2e-07, "loss": 0.2876, "step": 190 }, { "epoch": 0.11638903033389103, "grad_norm": 16.508502960205078, "learning_rate": 2e-07, "loss": 0.3143, "step": 200 }, { "epoch": 0.12220848185058558, "grad_norm": 11.057723999023438, "learning_rate": 2e-07, "loss": 0.2918, "step": 210 }, { "epoch": 0.12802793336728013, "grad_norm": 14.558637619018555, "learning_rate": 2e-07, "loss": 0.3074, "step": 220 }, { "epoch": 0.13384738488397468, "grad_norm": 11.879530906677246, "learning_rate": 2e-07, "loss": 0.28, "step": 230 }, { "epoch": 0.13966683640066924, "grad_norm": 11.994890213012695, "learning_rate": 2e-07, "loss": 0.2601, "step": 240 }, { "epoch": 0.1454862879173638, "grad_norm": 15.544328689575195, "learning_rate": 2e-07, "loss": 0.281, "step": 250 }, { "epoch": 0.15130573943405834, "grad_norm": 11.14696979522705, "learning_rate": 2e-07, "loss": 0.2891, "step": 260 }, { "epoch": 0.1571251909507529, "grad_norm": 8.271623611450195, "learning_rate": 2e-07, "loss": 0.2932, "step": 270 }, { "epoch": 0.16294464246744744, "grad_norm": 16.06687355041504, "learning_rate": 2e-07, "loss": 0.2363, "step": 280 }, { "epoch": 0.168764093984142, "grad_norm": 8.106555938720703, "learning_rate": 2e-07, "loss": 0.2313, "step": 290 }, { "epoch": 0.17458354550083655, "grad_norm": 13.634657859802246, "learning_rate": 2e-07, "loss": 0.2727, "step": 300 }, { "epoch": 0.1804029970175311, "grad_norm": 14.710253715515137, "learning_rate": 2e-07, "loss": 0.2649, "step": 310 }, { "epoch": 0.18622244853422565, "grad_norm": 9.026782035827637, "learning_rate": 2e-07, "loss": 0.2547, "step": 320 }, { "epoch": 0.1920419000509202, "grad_norm": 10.011273384094238, "learning_rate": 2e-07, "loss": 0.2771, "step": 330 }, { "epoch": 0.19786135156761475, "grad_norm": 13.526799201965332, "learning_rate": 2e-07, "loss": 0.2589, "step": 340 }, { "epoch": 0.2036808030843093, "grad_norm": 16.426071166992188, "learning_rate": 2e-07, "loss": 0.2436, "step": 350 }, { "epoch": 0.20950025460100385, "grad_norm": 14.218461036682129, "learning_rate": 2e-07, "loss": 0.2593, "step": 360 }, { "epoch": 0.2153197061176984, "grad_norm": 6.507007122039795, "learning_rate": 2e-07, "loss": 0.2245, "step": 370 }, { "epoch": 0.22113915763439296, "grad_norm": 19.18690299987793, "learning_rate": 2e-07, "loss": 0.2447, "step": 380 }, { "epoch": 0.2269586091510875, "grad_norm": 7.621412754058838, "learning_rate": 2e-07, "loss": 0.2131, "step": 390 }, { "epoch": 0.23277806066778206, "grad_norm": 9.732011795043945, "learning_rate": 2e-07, "loss": 0.2495, "step": 400 }, { "epoch": 0.2385975121844766, "grad_norm": 15.301701545715332, "learning_rate": 2e-07, "loss": 0.2427, "step": 410 }, { "epoch": 0.24441696370117116, "grad_norm": 8.665528297424316, "learning_rate": 2e-07, "loss": 0.2281, "step": 420 }, { "epoch": 0.2502364152178657, "grad_norm": 9.586000442504883, "learning_rate": 2e-07, "loss": 0.232, "step": 430 }, { "epoch": 0.25605586673456027, "grad_norm": 13.73252010345459, "learning_rate": 2e-07, "loss": 0.2326, "step": 440 }, { "epoch": 0.2618753182512548, "grad_norm": 14.053579330444336, "learning_rate": 2e-07, "loss": 0.2483, "step": 450 }, { "epoch": 0.26769476976794937, "grad_norm": 9.641685485839844, "learning_rate": 2e-07, "loss": 0.2461, "step": 460 }, { "epoch": 0.2735142212846439, "grad_norm": 13.011364936828613, "learning_rate": 2e-07, "loss": 0.1929, "step": 470 }, { "epoch": 0.2793336728013385, "grad_norm": 13.232110023498535, "learning_rate": 2e-07, "loss": 0.204, "step": 480 }, { "epoch": 0.285153124318033, "grad_norm": 10.551194190979004, "learning_rate": 2e-07, "loss": 0.2341, "step": 490 }, { "epoch": 0.2909725758347276, "grad_norm": 11.238757133483887, "learning_rate": 2e-07, "loss": 0.2342, "step": 500 }, { "epoch": 0.2967920273514221, "grad_norm": 14.688443183898926, "learning_rate": 2e-07, "loss": 0.2177, "step": 510 }, { "epoch": 0.3026114788681167, "grad_norm": 14.724873542785645, "learning_rate": 2e-07, "loss": 0.1969, "step": 520 }, { "epoch": 0.30843093038481123, "grad_norm": 11.847085952758789, "learning_rate": 2e-07, "loss": 0.1847, "step": 530 }, { "epoch": 0.3142503819015058, "grad_norm": 11.497530937194824, "learning_rate": 2e-07, "loss": 0.2017, "step": 540 }, { "epoch": 0.32006983341820033, "grad_norm": 11.654367446899414, "learning_rate": 2e-07, "loss": 0.2206, "step": 550 }, { "epoch": 0.3258892849348949, "grad_norm": 12.561141967773438, "learning_rate": 2e-07, "loss": 0.2275, "step": 560 }, { "epoch": 0.33170873645158944, "grad_norm": 10.0696382522583, "learning_rate": 2e-07, "loss": 0.2008, "step": 570 }, { "epoch": 0.337528187968284, "grad_norm": 11.895383834838867, "learning_rate": 2e-07, "loss": 0.2192, "step": 580 }, { "epoch": 0.34334763948497854, "grad_norm": 15.39511489868164, "learning_rate": 2e-07, "loss": 0.2113, "step": 590 }, { "epoch": 0.3491670910016731, "grad_norm": 11.026963233947754, "learning_rate": 2e-07, "loss": 0.2042, "step": 600 }, { "epoch": 0.35498654251836764, "grad_norm": 10.234797477722168, "learning_rate": 2e-07, "loss": 0.2026, "step": 610 }, { "epoch": 0.3608059940350622, "grad_norm": 11.014677047729492, "learning_rate": 2e-07, "loss": 0.2051, "step": 620 }, { "epoch": 0.36662544555175675, "grad_norm": 9.119762420654297, "learning_rate": 2e-07, "loss": 0.2137, "step": 630 }, { "epoch": 0.3724448970684513, "grad_norm": 6.889257907867432, "learning_rate": 2e-07, "loss": 0.1647, "step": 640 }, { "epoch": 0.37826434858514585, "grad_norm": 7.318251132965088, "learning_rate": 2e-07, "loss": 0.1973, "step": 650 }, { "epoch": 0.3840838001018404, "grad_norm": 13.953539848327637, "learning_rate": 2e-07, "loss": 0.1962, "step": 660 }, { "epoch": 0.38990325161853495, "grad_norm": 11.493965148925781, "learning_rate": 2e-07, "loss": 0.2017, "step": 670 }, { "epoch": 0.3957227031352295, "grad_norm": 9.594313621520996, "learning_rate": 2e-07, "loss": 0.2475, "step": 680 }, { "epoch": 0.40154215465192405, "grad_norm": 8.401884078979492, "learning_rate": 2e-07, "loss": 0.1946, "step": 690 }, { "epoch": 0.4073616061686186, "grad_norm": 9.508219718933105, "learning_rate": 2e-07, "loss": 0.2113, "step": 700 }, { "epoch": 0.41318105768531316, "grad_norm": 11.010660171508789, "learning_rate": 2e-07, "loss": 0.2058, "step": 710 }, { "epoch": 0.4190005092020077, "grad_norm": 15.212239265441895, "learning_rate": 2e-07, "loss": 0.2155, "step": 720 }, { "epoch": 0.42481996071870226, "grad_norm": 8.596692085266113, "learning_rate": 2e-07, "loss": 0.2143, "step": 730 }, { "epoch": 0.4306394122353968, "grad_norm": 18.07278823852539, "learning_rate": 2e-07, "loss": 0.1929, "step": 740 }, { "epoch": 0.43645886375209136, "grad_norm": 11.786556243896484, "learning_rate": 2e-07, "loss": 0.2149, "step": 750 }, { "epoch": 0.4422783152687859, "grad_norm": 5.36111307144165, "learning_rate": 2e-07, "loss": 0.201, "step": 760 }, { "epoch": 0.44809776678548047, "grad_norm": 17.974634170532227, "learning_rate": 2e-07, "loss": 0.2216, "step": 770 }, { "epoch": 0.453917218302175, "grad_norm": 7.572098731994629, "learning_rate": 2e-07, "loss": 0.2137, "step": 780 }, { "epoch": 0.45973666981886957, "grad_norm": 10.07806396484375, "learning_rate": 2e-07, "loss": 0.2011, "step": 790 }, { "epoch": 0.4655561213355641, "grad_norm": 11.195019721984863, "learning_rate": 2e-07, "loss": 0.1976, "step": 800 }, { "epoch": 0.4713755728522587, "grad_norm": 13.922369956970215, "learning_rate": 2e-07, "loss": 0.1755, "step": 810 }, { "epoch": 0.4771950243689532, "grad_norm": 14.419588088989258, "learning_rate": 2e-07, "loss": 0.2261, "step": 820 }, { "epoch": 0.4830144758856478, "grad_norm": 7.6839070320129395, "learning_rate": 2e-07, "loss": 0.1957, "step": 830 }, { "epoch": 0.4888339274023423, "grad_norm": 18.723840713500977, "learning_rate": 2e-07, "loss": 0.1964, "step": 840 }, { "epoch": 0.4946533789190369, "grad_norm": 11.51264476776123, "learning_rate": 2e-07, "loss": 0.2429, "step": 850 }, { "epoch": 0.5004728304357314, "grad_norm": 14.401782989501953, "learning_rate": 2e-07, "loss": 0.2077, "step": 860 }, { "epoch": 0.506292281952426, "grad_norm": 7.909298896789551, "learning_rate": 2e-07, "loss": 0.1981, "step": 870 }, { "epoch": 0.5121117334691205, "grad_norm": 9.616816520690918, "learning_rate": 2e-07, "loss": 0.2059, "step": 880 }, { "epoch": 0.5179311849858151, "grad_norm": 14.229058265686035, "learning_rate": 2e-07, "loss": 0.1838, "step": 890 }, { "epoch": 0.5237506365025096, "grad_norm": 8.7423734664917, "learning_rate": 2e-07, "loss": 0.1883, "step": 900 }, { "epoch": 0.5295700880192042, "grad_norm": 7.891780853271484, "learning_rate": 2e-07, "loss": 0.2177, "step": 910 }, { "epoch": 0.5353895395358987, "grad_norm": 15.503663063049316, "learning_rate": 2e-07, "loss": 0.2028, "step": 920 }, { "epoch": 0.5412089910525933, "grad_norm": 10.924641609191895, "learning_rate": 2e-07, "loss": 0.1923, "step": 930 }, { "epoch": 0.5470284425692878, "grad_norm": 14.79828929901123, "learning_rate": 2e-07, "loss": 0.2322, "step": 940 }, { "epoch": 0.5528478940859824, "grad_norm": 11.492549896240234, "learning_rate": 2e-07, "loss": 0.2306, "step": 950 }, { "epoch": 0.558667345602677, "grad_norm": 5.280726909637451, "learning_rate": 2e-07, "loss": 0.1846, "step": 960 }, { "epoch": 0.5644867971193716, "grad_norm": 7.49058198928833, "learning_rate": 2e-07, "loss": 0.186, "step": 970 }, { "epoch": 0.570306248636066, "grad_norm": 17.624052047729492, "learning_rate": 2e-07, "loss": 0.2145, "step": 980 }, { "epoch": 0.5761257001527607, "grad_norm": 11.165759086608887, "learning_rate": 2e-07, "loss": 0.201, "step": 990 }, { "epoch": 0.5819451516694552, "grad_norm": 7.842067718505859, "learning_rate": 2e-07, "loss": 0.1968, "step": 1000 }, { "epoch": 0.5877646031861498, "grad_norm": 9.594118118286133, "learning_rate": 2e-07, "loss": 0.2136, "step": 1010 }, { "epoch": 0.5935840547028443, "grad_norm": 14.04518985748291, "learning_rate": 2e-07, "loss": 0.2075, "step": 1020 }, { "epoch": 0.5994035062195389, "grad_norm": 15.331314086914062, "learning_rate": 2e-07, "loss": 0.2023, "step": 1030 }, { "epoch": 0.6052229577362334, "grad_norm": 8.51887321472168, "learning_rate": 2e-07, "loss": 0.1713, "step": 1040 }, { "epoch": 0.611042409252928, "grad_norm": 11.08820629119873, "learning_rate": 2e-07, "loss": 0.1868, "step": 1050 }, { "epoch": 0.6168618607696225, "grad_norm": 12.212711334228516, "learning_rate": 2e-07, "loss": 0.1876, "step": 1060 }, { "epoch": 0.6226813122863171, "grad_norm": 11.730500221252441, "learning_rate": 2e-07, "loss": 0.1934, "step": 1070 }, { "epoch": 0.6285007638030116, "grad_norm": 9.89484977722168, "learning_rate": 2e-07, "loss": 0.2097, "step": 1080 }, { "epoch": 0.6343202153197062, "grad_norm": 7.291867256164551, "learning_rate": 2e-07, "loss": 0.1582, "step": 1090 }, { "epoch": 0.6401396668364007, "grad_norm": 7.9038920402526855, "learning_rate": 2e-07, "loss": 0.21, "step": 1100 }, { "epoch": 0.6459591183530953, "grad_norm": 18.028404235839844, "learning_rate": 2e-07, "loss": 0.2293, "step": 1110 }, { "epoch": 0.6517785698697898, "grad_norm": 12.41182804107666, "learning_rate": 2e-07, "loss": 0.2048, "step": 1120 }, { "epoch": 0.6575980213864844, "grad_norm": 11.216751098632812, "learning_rate": 2e-07, "loss": 0.1903, "step": 1130 }, { "epoch": 0.6634174729031789, "grad_norm": 13.232751846313477, "learning_rate": 2e-07, "loss": 0.19, "step": 1140 }, { "epoch": 0.6692369244198735, "grad_norm": 9.882608413696289, "learning_rate": 2e-07, "loss": 0.1936, "step": 1150 }, { "epoch": 0.675056375936568, "grad_norm": 9.839518547058105, "learning_rate": 2e-07, "loss": 0.2019, "step": 1160 }, { "epoch": 0.6808758274532626, "grad_norm": 12.619885444641113, "learning_rate": 2e-07, "loss": 0.2124, "step": 1170 }, { "epoch": 0.6866952789699571, "grad_norm": 12.874058723449707, "learning_rate": 2e-07, "loss": 0.1875, "step": 1180 }, { "epoch": 0.6925147304866517, "grad_norm": 13.787298202514648, "learning_rate": 2e-07, "loss": 0.2071, "step": 1190 }, { "epoch": 0.6983341820033462, "grad_norm": 11.749211311340332, "learning_rate": 2e-07, "loss": 0.1957, "step": 1200 }, { "epoch": 0.7041536335200408, "grad_norm": 12.892156600952148, "learning_rate": 2e-07, "loss": 0.1748, "step": 1210 }, { "epoch": 0.7099730850367353, "grad_norm": 14.410128593444824, "learning_rate": 2e-07, "loss": 0.1934, "step": 1220 }, { "epoch": 0.7157925365534299, "grad_norm": 11.821023941040039, "learning_rate": 2e-07, "loss": 0.2113, "step": 1230 }, { "epoch": 0.7216119880701244, "grad_norm": 7.469109058380127, "learning_rate": 2e-07, "loss": 0.177, "step": 1240 }, { "epoch": 0.727431439586819, "grad_norm": 11.144964218139648, "learning_rate": 2e-07, "loss": 0.2026, "step": 1250 }, { "epoch": 0.7332508911035135, "grad_norm": 8.659164428710938, "learning_rate": 2e-07, "loss": 0.1866, "step": 1260 }, { "epoch": 0.7390703426202081, "grad_norm": 12.093481063842773, "learning_rate": 2e-07, "loss": 0.1819, "step": 1270 }, { "epoch": 0.7448897941369026, "grad_norm": 9.76320743560791, "learning_rate": 2e-07, "loss": 0.1826, "step": 1280 }, { "epoch": 0.7507092456535972, "grad_norm": 9.821404457092285, "learning_rate": 2e-07, "loss": 0.1916, "step": 1290 }, { "epoch": 0.7565286971702917, "grad_norm": 13.206873893737793, "learning_rate": 2e-07, "loss": 0.2037, "step": 1300 }, { "epoch": 0.7623481486869863, "grad_norm": 11.560912132263184, "learning_rate": 2e-07, "loss": 0.179, "step": 1310 }, { "epoch": 0.7681676002036808, "grad_norm": 14.256608009338379, "learning_rate": 2e-07, "loss": 0.1879, "step": 1320 }, { "epoch": 0.7739870517203754, "grad_norm": 17.668615341186523, "learning_rate": 2e-07, "loss": 0.1772, "step": 1330 }, { "epoch": 0.7798065032370699, "grad_norm": 11.932788848876953, "learning_rate": 2e-07, "loss": 0.1657, "step": 1340 }, { "epoch": 0.7856259547537645, "grad_norm": 8.989192008972168, "learning_rate": 2e-07, "loss": 0.18, "step": 1350 }, { "epoch": 0.791445406270459, "grad_norm": 8.768953323364258, "learning_rate": 2e-07, "loss": 0.2002, "step": 1360 }, { "epoch": 0.7972648577871536, "grad_norm": 16.538890838623047, "learning_rate": 2e-07, "loss": 0.1693, "step": 1370 }, { "epoch": 0.8030843093038481, "grad_norm": 7.928899765014648, "learning_rate": 2e-07, "loss": 0.1926, "step": 1380 }, { "epoch": 0.8089037608205427, "grad_norm": 11.980950355529785, "learning_rate": 2e-07, "loss": 0.2042, "step": 1390 }, { "epoch": 0.8147232123372372, "grad_norm": 14.4302978515625, "learning_rate": 2e-07, "loss": 0.1797, "step": 1400 }, { "epoch": 0.8205426638539318, "grad_norm": 11.229952812194824, "learning_rate": 2e-07, "loss": 0.17, "step": 1410 }, { "epoch": 0.8263621153706263, "grad_norm": 11.317793846130371, "learning_rate": 2e-07, "loss": 0.1728, "step": 1420 }, { "epoch": 0.8321815668873209, "grad_norm": 10.537130355834961, "learning_rate": 2e-07, "loss": 0.1787, "step": 1430 }, { "epoch": 0.8380010184040154, "grad_norm": 11.120368003845215, "learning_rate": 2e-07, "loss": 0.1621, "step": 1440 }, { "epoch": 0.84382046992071, "grad_norm": 13.397139549255371, "learning_rate": 2e-07, "loss": 0.1639, "step": 1450 }, { "epoch": 0.8496399214374045, "grad_norm": 12.438237190246582, "learning_rate": 2e-07, "loss": 0.1769, "step": 1460 }, { "epoch": 0.8554593729540991, "grad_norm": 11.550435066223145, "learning_rate": 2e-07, "loss": 0.2065, "step": 1470 }, { "epoch": 0.8612788244707936, "grad_norm": 6.573584079742432, "learning_rate": 2e-07, "loss": 0.2021, "step": 1480 }, { "epoch": 0.8670982759874882, "grad_norm": 6.305631637573242, "learning_rate": 2e-07, "loss": 0.1811, "step": 1490 }, { "epoch": 0.8729177275041827, "grad_norm": 9.760597229003906, "learning_rate": 2e-07, "loss": 0.1937, "step": 1500 }, { "epoch": 0.8787371790208773, "grad_norm": 13.0894193649292, "learning_rate": 2e-07, "loss": 0.176, "step": 1510 }, { "epoch": 0.8845566305375718, "grad_norm": 7.848855495452881, "learning_rate": 2e-07, "loss": 0.1773, "step": 1520 }, { "epoch": 0.8903760820542664, "grad_norm": 14.45218563079834, "learning_rate": 2e-07, "loss": 0.2096, "step": 1530 }, { "epoch": 0.8961955335709609, "grad_norm": 7.301393032073975, "learning_rate": 2e-07, "loss": 0.1656, "step": 1540 }, { "epoch": 0.9020149850876555, "grad_norm": 10.425517082214355, "learning_rate": 2e-07, "loss": 0.1802, "step": 1550 }, { "epoch": 0.90783443660435, "grad_norm": 12.547024726867676, "learning_rate": 2e-07, "loss": 0.1922, "step": 1560 }, { "epoch": 0.9136538881210446, "grad_norm": 12.041275978088379, "learning_rate": 2e-07, "loss": 0.1827, "step": 1570 }, { "epoch": 0.9194733396377391, "grad_norm": 10.960613250732422, "learning_rate": 2e-07, "loss": 0.2234, "step": 1580 }, { "epoch": 0.9252927911544337, "grad_norm": 11.155454635620117, "learning_rate": 2e-07, "loss": 0.1615, "step": 1590 }, { "epoch": 0.9311122426711282, "grad_norm": 17.65553092956543, "learning_rate": 2e-07, "loss": 0.1892, "step": 1600 }, { "epoch": 0.9369316941878228, "grad_norm": 18.907163619995117, "learning_rate": 2e-07, "loss": 0.1915, "step": 1610 }, { "epoch": 0.9427511457045173, "grad_norm": 14.12991714477539, "learning_rate": 2e-07, "loss": 0.2156, "step": 1620 }, { "epoch": 0.948570597221212, "grad_norm": 9.105369567871094, "learning_rate": 2e-07, "loss": 0.1808, "step": 1630 }, { "epoch": 0.9543900487379064, "grad_norm": 10.135030746459961, "learning_rate": 2e-07, "loss": 0.1842, "step": 1640 }, { "epoch": 0.9602095002546011, "grad_norm": 33.01081466674805, "learning_rate": 2e-07, "loss": 0.1757, "step": 1650 }, { "epoch": 0.9660289517712956, "grad_norm": 7.512867450714111, "learning_rate": 2e-07, "loss": 0.1591, "step": 1660 }, { "epoch": 0.9718484032879902, "grad_norm": 7.911075592041016, "learning_rate": 2e-07, "loss": 0.1885, "step": 1670 }, { "epoch": 0.9776678548046847, "grad_norm": 4.568904399871826, "learning_rate": 2e-07, "loss": 0.1929, "step": 1680 }, { "epoch": 0.9834873063213793, "grad_norm": 14.508501052856445, "learning_rate": 2e-07, "loss": 0.2354, "step": 1690 }, { "epoch": 0.9893067578380738, "grad_norm": 9.957372665405273, "learning_rate": 2e-07, "loss": 0.1523, "step": 1700 }, { "epoch": 0.9951262093547684, "grad_norm": 8.227477073669434, "learning_rate": 2e-07, "loss": 0.1816, "step": 1710 }, { "epoch": 0.999781770568124, "step": 1718, "total_flos": 6.783175773021798e+16, "train_loss": 0.28378574687193936, "train_runtime": 68908.423, "train_samples_per_second": 0.798, "train_steps_per_second": 0.025 } ], "logging_steps": 10, "max_steps": 1718, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.783175773021798e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }