{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.999781770568124,
  "eval_steps": 500,
  "global_step": 1718,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0058194515166945515,
      "grad_norm": 292.5841369628906,
      "learning_rate": 1.1654988945205933e-07,
      "loss": 3.4335,
      "step": 10
    },
    {
      "epoch": 0.011638903033389103,
      "grad_norm": 310.4792785644531,
      "learning_rate": 1.5163490216845022e-07,
      "loss": 3.3043,
      "step": 20
    },
    {
      "epoch": 0.017458354550083655,
      "grad_norm": 255.72767639160156,
      "learning_rate": 1.721583189448638e-07,
      "loss": 2.6653,
      "step": 30
    },
    {
      "epoch": 0.023277806066778206,
      "grad_norm": 148.07073974609375,
      "learning_rate": 1.867199148848411e-07,
      "loss": 1.4797,
      "step": 40
    },
    {
      "epoch": 0.029097257583472758,
      "grad_norm": 68.8450927734375,
      "learning_rate": 1.9801476618772772e-07,
      "loss": 0.8688,
      "step": 50
    },
    {
      "epoch": 0.03491670910016731,
      "grad_norm": 17.056119918823242,
      "learning_rate": 2e-07,
      "loss": 0.4755,
      "step": 60
    },
    {
      "epoch": 0.04073616061686186,
      "grad_norm": 18.346956253051758,
      "learning_rate": 2e-07,
      "loss": 0.4493,
      "step": 70
    },
    {
      "epoch": 0.04655561213355641,
      "grad_norm": 12.33590316772461,
      "learning_rate": 2e-07,
      "loss": 0.4181,
      "step": 80
    },
    {
      "epoch": 0.052375063650250964,
      "grad_norm": 10.863751411437988,
      "learning_rate": 2e-07,
      "loss": 0.423,
      "step": 90
    },
    {
      "epoch": 0.058194515166945515,
      "grad_norm": 12.51539421081543,
      "learning_rate": 2e-07,
      "loss": 0.4138,
      "step": 100
    },
    {
      "epoch": 0.06401396668364007,
      "grad_norm": 10.982370376586914,
      "learning_rate": 2e-07,
      "loss": 0.3545,
      "step": 110
    },
    {
      "epoch": 0.06983341820033462,
      "grad_norm": 14.997520446777344,
      "learning_rate": 2e-07,
      "loss": 0.4036,
      "step": 120
    },
    {
      "epoch": 0.07565286971702917,
      "grad_norm": 15.653407096862793,
      "learning_rate": 2e-07,
      "loss": 0.3872,
      "step": 130
    },
    {
      "epoch": 0.08147232123372372,
      "grad_norm": 14.640938758850098,
      "learning_rate": 2e-07,
      "loss": 0.3816,
      "step": 140
    },
    {
      "epoch": 0.08729177275041827,
      "grad_norm": 12.07015609741211,
      "learning_rate": 2e-07,
      "loss": 0.3119,
      "step": 150
    },
    {
      "epoch": 0.09311122426711282,
      "grad_norm": 23.116605758666992,
      "learning_rate": 2e-07,
      "loss": 0.3326,
      "step": 160
    },
    {
      "epoch": 0.09893067578380738,
      "grad_norm": 14.743234634399414,
      "learning_rate": 2e-07,
      "loss": 0.3304,
      "step": 170
    },
    {
      "epoch": 0.10475012730050193,
      "grad_norm": 13.361212730407715,
      "learning_rate": 2e-07,
      "loss": 0.3284,
      "step": 180
    },
    {
      "epoch": 0.11056957881719648,
      "grad_norm": 14.600347518920898,
      "learning_rate": 2e-07,
      "loss": 0.2876,
      "step": 190
    },
    {
      "epoch": 0.11638903033389103,
      "grad_norm": 16.508502960205078,
      "learning_rate": 2e-07,
      "loss": 0.3143,
      "step": 200
    },
    {
      "epoch": 0.12220848185058558,
      "grad_norm": 11.057723999023438,
      "learning_rate": 2e-07,
      "loss": 0.2918,
      "step": 210
    },
    {
      "epoch": 0.12802793336728013,
      "grad_norm": 14.558637619018555,
      "learning_rate": 2e-07,
      "loss": 0.3074,
      "step": 220
    },
    {
      "epoch": 0.13384738488397468,
      "grad_norm": 11.879530906677246,
      "learning_rate": 2e-07,
      "loss": 0.28,
      "step": 230
    },
    {
      "epoch": 0.13966683640066924,
      "grad_norm": 11.994890213012695,
      "learning_rate": 2e-07,
      "loss": 0.2601,
      "step": 240
    },
    {
      "epoch": 0.1454862879173638,
      "grad_norm": 15.544328689575195,
      "learning_rate": 2e-07,
      "loss": 0.281,
      "step": 250
    },
    {
      "epoch": 0.15130573943405834,
      "grad_norm": 11.14696979522705,
      "learning_rate": 2e-07,
      "loss": 0.2891,
      "step": 260
    },
    {
      "epoch": 0.1571251909507529,
      "grad_norm": 8.271623611450195,
      "learning_rate": 2e-07,
      "loss": 0.2932,
      "step": 270
    },
    {
      "epoch": 0.16294464246744744,
      "grad_norm": 16.06687355041504,
      "learning_rate": 2e-07,
      "loss": 0.2363,
      "step": 280
    },
    {
      "epoch": 0.168764093984142,
      "grad_norm": 8.106555938720703,
      "learning_rate": 2e-07,
      "loss": 0.2313,
      "step": 290
    },
    {
      "epoch": 0.17458354550083655,
      "grad_norm": 13.634657859802246,
      "learning_rate": 2e-07,
      "loss": 0.2727,
      "step": 300
    },
    {
      "epoch": 0.1804029970175311,
      "grad_norm": 14.710253715515137,
      "learning_rate": 2e-07,
      "loss": 0.2649,
      "step": 310
    },
    {
      "epoch": 0.18622244853422565,
      "grad_norm": 9.026782035827637,
      "learning_rate": 2e-07,
      "loss": 0.2547,
      "step": 320
    },
    {
      "epoch": 0.1920419000509202,
      "grad_norm": 10.011273384094238,
      "learning_rate": 2e-07,
      "loss": 0.2771,
      "step": 330
    },
    {
      "epoch": 0.19786135156761475,
      "grad_norm": 13.526799201965332,
      "learning_rate": 2e-07,
      "loss": 0.2589,
      "step": 340
    },
    {
      "epoch": 0.2036808030843093,
      "grad_norm": 16.426071166992188,
      "learning_rate": 2e-07,
      "loss": 0.2436,
      "step": 350
    },
    {
      "epoch": 0.20950025460100385,
      "grad_norm": 14.218461036682129,
      "learning_rate": 2e-07,
      "loss": 0.2593,
      "step": 360
    },
    {
      "epoch": 0.2153197061176984,
      "grad_norm": 6.507007122039795,
      "learning_rate": 2e-07,
      "loss": 0.2245,
      "step": 370
    },
    {
      "epoch": 0.22113915763439296,
      "grad_norm": 19.18690299987793,
      "learning_rate": 2e-07,
      "loss": 0.2447,
      "step": 380
    },
    {
      "epoch": 0.2269586091510875,
      "grad_norm": 7.621412754058838,
      "learning_rate": 2e-07,
      "loss": 0.2131,
      "step": 390
    },
    {
      "epoch": 0.23277806066778206,
      "grad_norm": 9.732011795043945,
      "learning_rate": 2e-07,
      "loss": 0.2495,
      "step": 400
    },
    {
      "epoch": 0.2385975121844766,
      "grad_norm": 15.301701545715332,
      "learning_rate": 2e-07,
      "loss": 0.2427,
      "step": 410
    },
    {
      "epoch": 0.24441696370117116,
      "grad_norm": 8.665528297424316,
      "learning_rate": 2e-07,
      "loss": 0.2281,
      "step": 420
    },
    {
      "epoch": 0.2502364152178657,
      "grad_norm": 9.586000442504883,
      "learning_rate": 2e-07,
      "loss": 0.232,
      "step": 430
    },
    {
      "epoch": 0.25605586673456027,
      "grad_norm": 13.73252010345459,
      "learning_rate": 2e-07,
      "loss": 0.2326,
      "step": 440
    },
    {
      "epoch": 0.2618753182512548,
      "grad_norm": 14.053579330444336,
      "learning_rate": 2e-07,
      "loss": 0.2483,
      "step": 450
    },
    {
      "epoch": 0.26769476976794937,
      "grad_norm": 9.641685485839844,
      "learning_rate": 2e-07,
      "loss": 0.2461,
      "step": 460
    },
    {
      "epoch": 0.2735142212846439,
      "grad_norm": 13.011364936828613,
      "learning_rate": 2e-07,
      "loss": 0.1929,
      "step": 470
    },
    {
      "epoch": 0.2793336728013385,
      "grad_norm": 13.232110023498535,
      "learning_rate": 2e-07,
      "loss": 0.204,
      "step": 480
    },
    {
      "epoch": 0.285153124318033,
      "grad_norm": 10.551194190979004,
      "learning_rate": 2e-07,
      "loss": 0.2341,
      "step": 490
    },
    {
      "epoch": 0.2909725758347276,
      "grad_norm": 11.238757133483887,
      "learning_rate": 2e-07,
      "loss": 0.2342,
      "step": 500
    },
    {
      "epoch": 0.2967920273514221,
      "grad_norm": 14.688443183898926,
      "learning_rate": 2e-07,
      "loss": 0.2177,
      "step": 510
    },
    {
      "epoch": 0.3026114788681167,
      "grad_norm": 14.724873542785645,
      "learning_rate": 2e-07,
      "loss": 0.1969,
      "step": 520
    },
    {
      "epoch": 0.30843093038481123,
      "grad_norm": 11.847085952758789,
      "learning_rate": 2e-07,
      "loss": 0.1847,
      "step": 530
    },
    {
      "epoch": 0.3142503819015058,
      "grad_norm": 11.497530937194824,
      "learning_rate": 2e-07,
      "loss": 0.2017,
      "step": 540
    },
    {
      "epoch": 0.32006983341820033,
      "grad_norm": 11.654367446899414,
      "learning_rate": 2e-07,
      "loss": 0.2206,
      "step": 550
    },
    {
      "epoch": 0.3258892849348949,
      "grad_norm": 12.561141967773438,
      "learning_rate": 2e-07,
      "loss": 0.2275,
      "step": 560
    },
    {
      "epoch": 0.33170873645158944,
      "grad_norm": 10.0696382522583,
      "learning_rate": 2e-07,
      "loss": 0.2008,
      "step": 570
    },
    {
      "epoch": 0.337528187968284,
      "grad_norm": 11.895383834838867,
      "learning_rate": 2e-07,
      "loss": 0.2192,
      "step": 580
    },
    {
      "epoch": 0.34334763948497854,
      "grad_norm": 15.39511489868164,
      "learning_rate": 2e-07,
      "loss": 0.2113,
      "step": 590
    },
    {
      "epoch": 0.3491670910016731,
      "grad_norm": 11.026963233947754,
      "learning_rate": 2e-07,
      "loss": 0.2042,
      "step": 600
    },
    {
      "epoch": 0.35498654251836764,
      "grad_norm": 10.234797477722168,
      "learning_rate": 2e-07,
      "loss": 0.2026,
      "step": 610
    },
    {
      "epoch": 0.3608059940350622,
      "grad_norm": 11.014677047729492,
      "learning_rate": 2e-07,
      "loss": 0.2051,
      "step": 620
    },
    {
      "epoch": 0.36662544555175675,
      "grad_norm": 9.119762420654297,
      "learning_rate": 2e-07,
      "loss": 0.2137,
      "step": 630
    },
    {
      "epoch": 0.3724448970684513,
      "grad_norm": 6.889257907867432,
      "learning_rate": 2e-07,
      "loss": 0.1647,
      "step": 640
    },
    {
      "epoch": 0.37826434858514585,
      "grad_norm": 7.318251132965088,
      "learning_rate": 2e-07,
      "loss": 0.1973,
      "step": 650
    },
    {
      "epoch": 0.3840838001018404,
      "grad_norm": 13.953539848327637,
      "learning_rate": 2e-07,
      "loss": 0.1962,
      "step": 660
    },
    {
      "epoch": 0.38990325161853495,
      "grad_norm": 11.493965148925781,
      "learning_rate": 2e-07,
      "loss": 0.2017,
      "step": 670
    },
    {
      "epoch": 0.3957227031352295,
      "grad_norm": 9.594313621520996,
      "learning_rate": 2e-07,
      "loss": 0.2475,
      "step": 680
    },
    {
      "epoch": 0.40154215465192405,
      "grad_norm": 8.401884078979492,
      "learning_rate": 2e-07,
      "loss": 0.1946,
      "step": 690
    },
    {
      "epoch": 0.4073616061686186,
      "grad_norm": 9.508219718933105,
      "learning_rate": 2e-07,
      "loss": 0.2113,
      "step": 700
    },
    {
      "epoch": 0.41318105768531316,
      "grad_norm": 11.010660171508789,
      "learning_rate": 2e-07,
      "loss": 0.2058,
      "step": 710
    },
    {
      "epoch": 0.4190005092020077,
      "grad_norm": 15.212239265441895,
      "learning_rate": 2e-07,
      "loss": 0.2155,
      "step": 720
    },
    {
      "epoch": 0.42481996071870226,
      "grad_norm": 8.596692085266113,
      "learning_rate": 2e-07,
      "loss": 0.2143,
      "step": 730
    },
    {
      "epoch": 0.4306394122353968,
      "grad_norm": 18.07278823852539,
      "learning_rate": 2e-07,
      "loss": 0.1929,
      "step": 740
    },
    {
      "epoch": 0.43645886375209136,
      "grad_norm": 11.786556243896484,
      "learning_rate": 2e-07,
      "loss": 0.2149,
      "step": 750
    },
    {
      "epoch": 0.4422783152687859,
      "grad_norm": 5.36111307144165,
      "learning_rate": 2e-07,
      "loss": 0.201,
      "step": 760
    },
    {
      "epoch": 0.44809776678548047,
      "grad_norm": 17.974634170532227,
      "learning_rate": 2e-07,
      "loss": 0.2216,
      "step": 770
    },
    {
      "epoch": 0.453917218302175,
      "grad_norm": 7.572098731994629,
      "learning_rate": 2e-07,
      "loss": 0.2137,
      "step": 780
    },
    {
      "epoch": 0.45973666981886957,
      "grad_norm": 10.07806396484375,
      "learning_rate": 2e-07,
      "loss": 0.2011,
      "step": 790
    },
    {
      "epoch": 0.4655561213355641,
      "grad_norm": 11.195019721984863,
      "learning_rate": 2e-07,
      "loss": 0.1976,
      "step": 800
    },
    {
      "epoch": 0.4713755728522587,
      "grad_norm": 13.922369956970215,
      "learning_rate": 2e-07,
      "loss": 0.1755,
      "step": 810
    },
    {
      "epoch": 0.4771950243689532,
      "grad_norm": 14.419588088989258,
      "learning_rate": 2e-07,
      "loss": 0.2261,
      "step": 820
    },
    {
      "epoch": 0.4830144758856478,
      "grad_norm": 7.6839070320129395,
      "learning_rate": 2e-07,
      "loss": 0.1957,
      "step": 830
    },
    {
      "epoch": 0.4888339274023423,
      "grad_norm": 18.723840713500977,
      "learning_rate": 2e-07,
      "loss": 0.1964,
      "step": 840
    },
    {
      "epoch": 0.4946533789190369,
      "grad_norm": 11.51264476776123,
      "learning_rate": 2e-07,
      "loss": 0.2429,
      "step": 850
    },
    {
      "epoch": 0.5004728304357314,
      "grad_norm": 14.401782989501953,
      "learning_rate": 2e-07,
      "loss": 0.2077,
      "step": 860
    },
    {
      "epoch": 0.506292281952426,
      "grad_norm": 7.909298896789551,
      "learning_rate": 2e-07,
      "loss": 0.1981,
      "step": 870
    },
    {
      "epoch": 0.5121117334691205,
      "grad_norm": 9.616816520690918,
      "learning_rate": 2e-07,
      "loss": 0.2059,
      "step": 880
    },
    {
      "epoch": 0.5179311849858151,
      "grad_norm": 14.229058265686035,
      "learning_rate": 2e-07,
      "loss": 0.1838,
      "step": 890
    },
    {
      "epoch": 0.5237506365025096,
      "grad_norm": 8.7423734664917,
      "learning_rate": 2e-07,
      "loss": 0.1883,
      "step": 900
    },
    {
      "epoch": 0.5295700880192042,
      "grad_norm": 7.891780853271484,
      "learning_rate": 2e-07,
      "loss": 0.2177,
      "step": 910
    },
    {
      "epoch": 0.5353895395358987,
      "grad_norm": 15.503663063049316,
      "learning_rate": 2e-07,
      "loss": 0.2028,
      "step": 920
    },
    {
      "epoch": 0.5412089910525933,
      "grad_norm": 10.924641609191895,
      "learning_rate": 2e-07,
      "loss": 0.1923,
      "step": 930
    },
    {
      "epoch": 0.5470284425692878,
      "grad_norm": 14.79828929901123,
      "learning_rate": 2e-07,
      "loss": 0.2322,
      "step": 940
    },
    {
      "epoch": 0.5528478940859824,
      "grad_norm": 11.492549896240234,
      "learning_rate": 2e-07,
      "loss": 0.2306,
      "step": 950
    },
    {
      "epoch": 0.558667345602677,
      "grad_norm": 5.280726909637451,
      "learning_rate": 2e-07,
      "loss": 0.1846,
      "step": 960
    },
    {
      "epoch": 0.5644867971193716,
      "grad_norm": 7.49058198928833,
      "learning_rate": 2e-07,
      "loss": 0.186,
      "step": 970
    },
    {
      "epoch": 0.570306248636066,
      "grad_norm": 17.624052047729492,
      "learning_rate": 2e-07,
      "loss": 0.2145,
      "step": 980
    },
    {
      "epoch": 0.5761257001527607,
      "grad_norm": 11.165759086608887,
      "learning_rate": 2e-07,
      "loss": 0.201,
      "step": 990
    },
    {
      "epoch": 0.5819451516694552,
      "grad_norm": 7.842067718505859,
      "learning_rate": 2e-07,
      "loss": 0.1968,
      "step": 1000
    },
    {
      "epoch": 0.5877646031861498,
      "grad_norm": 9.594118118286133,
      "learning_rate": 2e-07,
      "loss": 0.2136,
      "step": 1010
    },
    {
      "epoch": 0.5935840547028443,
      "grad_norm": 14.04518985748291,
      "learning_rate": 2e-07,
      "loss": 0.2075,
      "step": 1020
    },
    {
      "epoch": 0.5994035062195389,
      "grad_norm": 15.331314086914062,
      "learning_rate": 2e-07,
      "loss": 0.2023,
      "step": 1030
    },
    {
      "epoch": 0.6052229577362334,
      "grad_norm": 8.51887321472168,
      "learning_rate": 2e-07,
      "loss": 0.1713,
      "step": 1040
    },
    {
      "epoch": 0.611042409252928,
      "grad_norm": 11.08820629119873,
      "learning_rate": 2e-07,
      "loss": 0.1868,
      "step": 1050
    },
    {
      "epoch": 0.6168618607696225,
      "grad_norm": 12.212711334228516,
      "learning_rate": 2e-07,
      "loss": 0.1876,
      "step": 1060
    },
    {
      "epoch": 0.6226813122863171,
      "grad_norm": 11.730500221252441,
      "learning_rate": 2e-07,
      "loss": 0.1934,
      "step": 1070
    },
    {
      "epoch": 0.6285007638030116,
      "grad_norm": 9.89484977722168,
      "learning_rate": 2e-07,
      "loss": 0.2097,
      "step": 1080
    },
    {
      "epoch": 0.6343202153197062,
      "grad_norm": 7.291867256164551,
      "learning_rate": 2e-07,
      "loss": 0.1582,
      "step": 1090
    },
    {
      "epoch": 0.6401396668364007,
      "grad_norm": 7.9038920402526855,
      "learning_rate": 2e-07,
      "loss": 0.21,
      "step": 1100
    },
    {
      "epoch": 0.6459591183530953,
      "grad_norm": 18.028404235839844,
      "learning_rate": 2e-07,
      "loss": 0.2293,
      "step": 1110
    },
    {
      "epoch": 0.6517785698697898,
      "grad_norm": 12.41182804107666,
      "learning_rate": 2e-07,
      "loss": 0.2048,
      "step": 1120
    },
    {
      "epoch": 0.6575980213864844,
      "grad_norm": 11.216751098632812,
      "learning_rate": 2e-07,
      "loss": 0.1903,
      "step": 1130
    },
    {
      "epoch": 0.6634174729031789,
      "grad_norm": 13.232751846313477,
      "learning_rate": 2e-07,
      "loss": 0.19,
      "step": 1140
    },
    {
      "epoch": 0.6692369244198735,
      "grad_norm": 9.882608413696289,
      "learning_rate": 2e-07,
      "loss": 0.1936,
      "step": 1150
    },
    {
      "epoch": 0.675056375936568,
      "grad_norm": 9.839518547058105,
      "learning_rate": 2e-07,
      "loss": 0.2019,
      "step": 1160
    },
    {
      "epoch": 0.6808758274532626,
      "grad_norm": 12.619885444641113,
      "learning_rate": 2e-07,
      "loss": 0.2124,
      "step": 1170
    },
    {
      "epoch": 0.6866952789699571,
      "grad_norm": 12.874058723449707,
      "learning_rate": 2e-07,
      "loss": 0.1875,
      "step": 1180
    },
    {
      "epoch": 0.6925147304866517,
      "grad_norm": 13.787298202514648,
      "learning_rate": 2e-07,
      "loss": 0.2071,
      "step": 1190
    },
    {
      "epoch": 0.6983341820033462,
      "grad_norm": 11.749211311340332,
      "learning_rate": 2e-07,
      "loss": 0.1957,
      "step": 1200
    },
    {
      "epoch": 0.7041536335200408,
      "grad_norm": 12.892156600952148,
      "learning_rate": 2e-07,
      "loss": 0.1748,
      "step": 1210
    },
    {
      "epoch": 0.7099730850367353,
      "grad_norm": 14.410128593444824,
      "learning_rate": 2e-07,
      "loss": 0.1934,
      "step": 1220
    },
    {
      "epoch": 0.7157925365534299,
      "grad_norm": 11.821023941040039,
      "learning_rate": 2e-07,
      "loss": 0.2113,
      "step": 1230
    },
    {
      "epoch": 0.7216119880701244,
      "grad_norm": 7.469109058380127,
      "learning_rate": 2e-07,
      "loss": 0.177,
      "step": 1240
    },
    {
      "epoch": 0.727431439586819,
      "grad_norm": 11.144964218139648,
      "learning_rate": 2e-07,
      "loss": 0.2026,
      "step": 1250
    },
    {
      "epoch": 0.7332508911035135,
      "grad_norm": 8.659164428710938,
      "learning_rate": 2e-07,
      "loss": 0.1866,
      "step": 1260
    },
    {
      "epoch": 0.7390703426202081,
      "grad_norm": 12.093481063842773,
      "learning_rate": 2e-07,
      "loss": 0.1819,
      "step": 1270
    },
    {
      "epoch": 0.7448897941369026,
      "grad_norm": 9.76320743560791,
      "learning_rate": 2e-07,
      "loss": 0.1826,
      "step": 1280
    },
    {
      "epoch": 0.7507092456535972,
      "grad_norm": 9.821404457092285,
      "learning_rate": 2e-07,
      "loss": 0.1916,
      "step": 1290
    },
    {
      "epoch": 0.7565286971702917,
      "grad_norm": 13.206873893737793,
      "learning_rate": 2e-07,
      "loss": 0.2037,
      "step": 1300
    },
    {
      "epoch": 0.7623481486869863,
      "grad_norm": 11.560912132263184,
      "learning_rate": 2e-07,
      "loss": 0.179,
      "step": 1310
    },
    {
      "epoch": 0.7681676002036808,
      "grad_norm": 14.256608009338379,
      "learning_rate": 2e-07,
      "loss": 0.1879,
      "step": 1320
    },
    {
      "epoch": 0.7739870517203754,
      "grad_norm": 17.668615341186523,
      "learning_rate": 2e-07,
      "loss": 0.1772,
      "step": 1330
    },
    {
      "epoch": 0.7798065032370699,
      "grad_norm": 11.932788848876953,
      "learning_rate": 2e-07,
      "loss": 0.1657,
      "step": 1340
    },
    {
      "epoch": 0.7856259547537645,
      "grad_norm": 8.989192008972168,
      "learning_rate": 2e-07,
      "loss": 0.18,
      "step": 1350
    },
    {
      "epoch": 0.791445406270459,
      "grad_norm": 8.768953323364258,
      "learning_rate": 2e-07,
      "loss": 0.2002,
      "step": 1360
    },
    {
      "epoch": 0.7972648577871536,
      "grad_norm": 16.538890838623047,
      "learning_rate": 2e-07,
      "loss": 0.1693,
      "step": 1370
    },
    {
      "epoch": 0.8030843093038481,
      "grad_norm": 7.928899765014648,
      "learning_rate": 2e-07,
      "loss": 0.1926,
      "step": 1380
    },
    {
      "epoch": 0.8089037608205427,
      "grad_norm": 11.980950355529785,
      "learning_rate": 2e-07,
      "loss": 0.2042,
      "step": 1390
    },
    {
      "epoch": 0.8147232123372372,
      "grad_norm": 14.4302978515625,
      "learning_rate": 2e-07,
      "loss": 0.1797,
      "step": 1400
    },
    {
      "epoch": 0.8205426638539318,
      "grad_norm": 11.229952812194824,
      "learning_rate": 2e-07,
      "loss": 0.17,
      "step": 1410
    },
    {
      "epoch": 0.8263621153706263,
      "grad_norm": 11.317793846130371,
      "learning_rate": 2e-07,
      "loss": 0.1728,
      "step": 1420
    },
    {
      "epoch": 0.8321815668873209,
      "grad_norm": 10.537130355834961,
      "learning_rate": 2e-07,
      "loss": 0.1787,
      "step": 1430
    },
    {
      "epoch": 0.8380010184040154,
      "grad_norm": 11.120368003845215,
      "learning_rate": 2e-07,
      "loss": 0.1621,
      "step": 1440
    },
    {
      "epoch": 0.84382046992071,
      "grad_norm": 13.397139549255371,
      "learning_rate": 2e-07,
      "loss": 0.1639,
      "step": 1450
    },
    {
      "epoch": 0.8496399214374045,
      "grad_norm": 12.438237190246582,
      "learning_rate": 2e-07,
      "loss": 0.1769,
      "step": 1460
    },
    {
      "epoch": 0.8554593729540991,
      "grad_norm": 11.550435066223145,
      "learning_rate": 2e-07,
      "loss": 0.2065,
      "step": 1470
    },
    {
      "epoch": 0.8612788244707936,
      "grad_norm": 6.573584079742432,
      "learning_rate": 2e-07,
      "loss": 0.2021,
      "step": 1480
    },
    {
      "epoch": 0.8670982759874882,
      "grad_norm": 6.305631637573242,
      "learning_rate": 2e-07,
      "loss": 0.1811,
      "step": 1490
    },
    {
      "epoch": 0.8729177275041827,
      "grad_norm": 9.760597229003906,
      "learning_rate": 2e-07,
      "loss": 0.1937,
      "step": 1500
    },
    {
      "epoch": 0.8787371790208773,
      "grad_norm": 13.0894193649292,
      "learning_rate": 2e-07,
      "loss": 0.176,
      "step": 1510
    },
    {
      "epoch": 0.8845566305375718,
      "grad_norm": 7.848855495452881,
      "learning_rate": 2e-07,
      "loss": 0.1773,
      "step": 1520
    },
    {
      "epoch": 0.8903760820542664,
      "grad_norm": 14.45218563079834,
      "learning_rate": 2e-07,
      "loss": 0.2096,
      "step": 1530
    },
    {
      "epoch": 0.8961955335709609,
      "grad_norm": 7.301393032073975,
      "learning_rate": 2e-07,
      "loss": 0.1656,
      "step": 1540
    },
    {
      "epoch": 0.9020149850876555,
      "grad_norm": 10.425517082214355,
      "learning_rate": 2e-07,
      "loss": 0.1802,
      "step": 1550
    },
    {
      "epoch": 0.90783443660435,
      "grad_norm": 12.547024726867676,
      "learning_rate": 2e-07,
      "loss": 0.1922,
      "step": 1560
    },
    {
      "epoch": 0.9136538881210446,
      "grad_norm": 12.041275978088379,
      "learning_rate": 2e-07,
      "loss": 0.1827,
      "step": 1570
    },
    {
      "epoch": 0.9194733396377391,
      "grad_norm": 10.960613250732422,
      "learning_rate": 2e-07,
      "loss": 0.2234,
      "step": 1580
    },
    {
      "epoch": 0.9252927911544337,
      "grad_norm": 11.155454635620117,
      "learning_rate": 2e-07,
      "loss": 0.1615,
      "step": 1590
    },
    {
      "epoch": 0.9311122426711282,
      "grad_norm": 17.65553092956543,
      "learning_rate": 2e-07,
      "loss": 0.1892,
      "step": 1600
    },
    {
      "epoch": 0.9369316941878228,
      "grad_norm": 18.907163619995117,
      "learning_rate": 2e-07,
      "loss": 0.1915,
      "step": 1610
    },
    {
      "epoch": 0.9427511457045173,
      "grad_norm": 14.12991714477539,
      "learning_rate": 2e-07,
      "loss": 0.2156,
      "step": 1620
    },
    {
      "epoch": 0.948570597221212,
      "grad_norm": 9.105369567871094,
      "learning_rate": 2e-07,
      "loss": 0.1808,
      "step": 1630
    },
    {
      "epoch": 0.9543900487379064,
      "grad_norm": 10.135030746459961,
      "learning_rate": 2e-07,
      "loss": 0.1842,
      "step": 1640
    },
    {
      "epoch": 0.9602095002546011,
      "grad_norm": 33.01081466674805,
      "learning_rate": 2e-07,
      "loss": 0.1757,
      "step": 1650
    },
    {
      "epoch": 0.9660289517712956,
      "grad_norm": 7.512867450714111,
      "learning_rate": 2e-07,
      "loss": 0.1591,
      "step": 1660
    },
    {
      "epoch": 0.9718484032879902,
      "grad_norm": 7.911075592041016,
      "learning_rate": 2e-07,
      "loss": 0.1885,
      "step": 1670
    },
    {
      "epoch": 0.9776678548046847,
      "grad_norm": 4.568904399871826,
      "learning_rate": 2e-07,
      "loss": 0.1929,
      "step": 1680
    },
    {
      "epoch": 0.9834873063213793,
      "grad_norm": 14.508501052856445,
      "learning_rate": 2e-07,
      "loss": 0.2354,
      "step": 1690
    },
    {
      "epoch": 0.9893067578380738,
      "grad_norm": 9.957372665405273,
      "learning_rate": 2e-07,
      "loss": 0.1523,
      "step": 1700
    },
    {
      "epoch": 0.9951262093547684,
      "grad_norm": 8.227477073669434,
      "learning_rate": 2e-07,
      "loss": 0.1816,
      "step": 1710
    },
    {
      "epoch": 0.999781770568124,
      "step": 1718,
      "total_flos": 6.783175773021798e+16,
      "train_loss": 0.28378574687193936,
      "train_runtime": 68908.423,
      "train_samples_per_second": 0.798,
      "train_steps_per_second": 0.025
    }
  ],
  "logging_steps": 10,
  "max_steps": 1718,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.783175773021798e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
|
|