| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 8.834996162701458, |
| "eval_steps": 2000.0, |
| "global_step": 1440, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.06139677666922486, |
| "grad_norm": 0.1343451291322708, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.293, |
| "num_tokens": 7752366.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.12279355333844973, |
| "grad_norm": 0.14936372637748718, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.2405, |
| "num_tokens": 15390227.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.1841903300076746, |
| "grad_norm": 0.10722821950912476, |
| "learning_rate": 3e-06, |
| "loss": 0.2648, |
| "num_tokens": 23123703.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.24558710667689945, |
| "grad_norm": 0.0795913115143776, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.3012, |
| "num_tokens": 30930492.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.3069838833461243, |
| "grad_norm": 0.11849294602870941, |
| "learning_rate": 5e-06, |
| "loss": 0.2342, |
| "num_tokens": 38599104.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.3683806600153492, |
| "grad_norm": 0.13234864175319672, |
| "learning_rate": 6e-06, |
| "loss": 0.2306, |
| "num_tokens": 46329582.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.4297774366845741, |
| "grad_norm": 0.12520423531532288, |
| "learning_rate": 7e-06, |
| "loss": 0.2753, |
| "num_tokens": 54163228.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.4911742133537989, |
| "grad_norm": 0.1749039590358734, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.2848, |
| "num_tokens": 61755924.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.5525709900230238, |
| "grad_norm": 0.13144172728061676, |
| "learning_rate": 9e-06, |
| "loss": 0.2499, |
| "num_tokens": 69324388.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.6139677666922486, |
| "grad_norm": 0.1269148290157318, |
| "learning_rate": 1e-05, |
| "loss": 0.2941, |
| "num_tokens": 76977972.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.6753645433614736, |
| "grad_norm": 0.13263040781021118, |
| "learning_rate": 9.998945997517957e-06, |
| "loss": 0.3423, |
| "num_tokens": 84625449.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.7367613200306984, |
| "grad_norm": 0.12442048639059067, |
| "learning_rate": 9.99578443444032e-06, |
| "loss": 0.2522, |
| "num_tokens": 92380333.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.7981580966999232, |
| "grad_norm": 0.12848921120166779, |
| "learning_rate": 9.990516643685222e-06, |
| "loss": 0.2305, |
| "num_tokens": 100278549.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.8595548733691482, |
| "grad_norm": 0.18896037340164185, |
| "learning_rate": 9.983144846158472e-06, |
| "loss": 0.2682, |
| "num_tokens": 107896424.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.920951650038373, |
| "grad_norm": 0.10803721845149994, |
| "learning_rate": 9.973672149817232e-06, |
| "loss": 0.2951, |
| "num_tokens": 115687353.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.9823484267075978, |
| "grad_norm": 0.1490592062473297, |
| "learning_rate": 9.96210254835968e-06, |
| "loss": 0.2354, |
| "num_tokens": 123478527.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.0429777436684573, |
| "grad_norm": 0.11919503659009933, |
| "learning_rate": 9.948440919541277e-06, |
| "loss": 0.313, |
| "num_tokens": 131158765.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.1043745203376822, |
| "grad_norm": 0.14419515430927277, |
| "learning_rate": 9.932693023118299e-06, |
| "loss": 0.3636, |
| "num_tokens": 138879586.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.1657712970069072, |
| "grad_norm": 0.13056258857250214, |
| "learning_rate": 9.91486549841951e-06, |
| "loss": 0.237, |
| "num_tokens": 146732371.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.2271680736761321, |
| "grad_norm": 0.1365155279636383, |
| "learning_rate": 9.894965861547023e-06, |
| "loss": 0.2511, |
| "num_tokens": 154459979.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.2885648503453568, |
| "grad_norm": 0.2808520495891571, |
| "learning_rate": 9.873002502207502e-06, |
| "loss": 0.2508, |
| "num_tokens": 162175757.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.3499616270145818, |
| "grad_norm": 0.0974242314696312, |
| "learning_rate": 9.848984680175049e-06, |
| "loss": 0.2109, |
| "num_tokens": 169696281.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.4113584036838067, |
| "grad_norm": 0.1405002772808075, |
| "learning_rate": 9.822922521387277e-06, |
| "loss": 0.2564, |
| "num_tokens": 177488034.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.4727551803530314, |
| "grad_norm": 0.17119945585727692, |
| "learning_rate": 9.794827013676206e-06, |
| "loss": 0.2494, |
| "num_tokens": 185380889.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.5341519570222564, |
| "grad_norm": 0.11872359365224838, |
| "learning_rate": 9.764710002135784e-06, |
| "loss": 0.3133, |
| "num_tokens": 193120160.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.5955487336914813, |
| "grad_norm": 0.14314331114292145, |
| "learning_rate": 9.732584184127973e-06, |
| "loss": 0.3011, |
| "num_tokens": 200841296.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.656945510360706, |
| "grad_norm": 0.1306375414133072, |
| "learning_rate": 9.698463103929542e-06, |
| "loss": 0.2421, |
| "num_tokens": 208630495.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.718342287029931, |
| "grad_norm": 0.18562458455562592, |
| "learning_rate": 9.66236114702178e-06, |
| "loss": 0.2501, |
| "num_tokens": 216297417.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.779739063699156, |
| "grad_norm": 0.3968285322189331, |
| "learning_rate": 9.62429353402556e-06, |
| "loss": 0.2719, |
| "num_tokens": 224014081.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.8411358403683806, |
| "grad_norm": 0.13298219442367554, |
| "learning_rate": 9.584276314284316e-06, |
| "loss": 0.2728, |
| "num_tokens": 231641746.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.9025326170376056, |
| "grad_norm": 0.11644992977380753, |
| "learning_rate": 9.542326359097619e-06, |
| "loss": 0.2557, |
| "num_tokens": 239183635.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.9639293937068305, |
| "grad_norm": 0.13967359066009521, |
| "learning_rate": 9.498461354608228e-06, |
| "loss": 0.24, |
| "num_tokens": 246880516.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.02455871066769, |
| "grad_norm": 0.10428839921951294, |
| "learning_rate": 9.452699794345583e-06, |
| "loss": 0.2238, |
| "num_tokens": 254662159.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.0859554873369146, |
| "grad_norm": 0.1438775211572647, |
| "learning_rate": 9.405060971428924e-06, |
| "loss": 0.228, |
| "num_tokens": 262332176.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 2.1473522640061398, |
| "grad_norm": 0.13422533869743347, |
| "learning_rate": 9.355564970433288e-06, |
| "loss": 0.2599, |
| "num_tokens": 270068159.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.2087490406753645, |
| "grad_norm": 0.18186450004577637, |
| "learning_rate": 9.30423265892184e-06, |
| "loss": 0.2758, |
| "num_tokens": 277876746.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.2701458173445896, |
| "grad_norm": 0.1439521163702011, |
| "learning_rate": 9.251085678648072e-06, |
| "loss": 0.2618, |
| "num_tokens": 285680808.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.3315425940138144, |
| "grad_norm": 0.11770835518836975, |
| "learning_rate": 9.196146436431635e-06, |
| "loss": 0.2424, |
| "num_tokens": 293359797.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 2.392939370683039, |
| "grad_norm": 0.11337260156869888, |
| "learning_rate": 9.13943809471159e-06, |
| "loss": 0.2129, |
| "num_tokens": 301088235.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 2.4543361473522642, |
| "grad_norm": 0.11012344807386398, |
| "learning_rate": 9.08098456178111e-06, |
| "loss": 0.2094, |
| "num_tokens": 308725104.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.515732924021489, |
| "grad_norm": 0.1827051341533661, |
| "learning_rate": 9.020810481707709e-06, |
| "loss": 0.2676, |
| "num_tokens": 316685360.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 2.5771297006907137, |
| "grad_norm": 0.16157183051109314, |
| "learning_rate": 8.958941223943292e-06, |
| "loss": 0.27, |
| "num_tokens": 324401853.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 2.638526477359939, |
| "grad_norm": 0.15233761072158813, |
| "learning_rate": 8.895402872628352e-06, |
| "loss": 0.3185, |
| "num_tokens": 332036172.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 2.6999232540291636, |
| "grad_norm": 0.18302637338638306, |
| "learning_rate": 8.83022221559489e-06, |
| "loss": 0.2625, |
| "num_tokens": 339756434.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 2.7613200306983883, |
| "grad_norm": 0.12365231662988663, |
| "learning_rate": 8.763426733072624e-06, |
| "loss": 0.2228, |
| "num_tokens": 347501900.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.8227168073676134, |
| "grad_norm": 0.14065682888031006, |
| "learning_rate": 8.695044586103297e-06, |
| "loss": 0.2668, |
| "num_tokens": 355111306.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 2.884113584036838, |
| "grad_norm": 0.12397003918886185, |
| "learning_rate": 8.625104604667965e-06, |
| "loss": 0.256, |
| "num_tokens": 362659417.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 2.945510360706063, |
| "grad_norm": 0.11920394748449326, |
| "learning_rate": 8.553636275532236e-06, |
| "loss": 0.252, |
| "num_tokens": 370477140.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 3.0061396776669227, |
| "grad_norm": 0.15792995691299438, |
| "learning_rate": 8.480669729814635e-06, |
| "loss": 0.2199, |
| "num_tokens": 378142328.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 3.0675364543361474, |
| "grad_norm": 0.27516841888427734, |
| "learning_rate": 8.40623573028327e-06, |
| "loss": 0.2155, |
| "num_tokens": 385845709.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 3.128933231005372, |
| "grad_norm": 0.12260135263204575, |
| "learning_rate": 8.330365658386252e-06, |
| "loss": 0.2946, |
| "num_tokens": 393595072.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 3.1903300076745973, |
| "grad_norm": 0.14132475852966309, |
| "learning_rate": 8.25309150102121e-06, |
| "loss": 0.2236, |
| "num_tokens": 401446593.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 3.251726784343822, |
| "grad_norm": 0.12539486587047577, |
| "learning_rate": 8.174445837049614e-06, |
| "loss": 0.3032, |
| "num_tokens": 409151914.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 3.3131235610130467, |
| "grad_norm": 0.11866062134504318, |
| "learning_rate": 8.094461823561473e-06, |
| "loss": 0.1683, |
| "num_tokens": 416802356.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 3.374520337682272, |
| "grad_norm": 0.13061948120594025, |
| "learning_rate": 8.013173181896283e-06, |
| "loss": 0.1884, |
| "num_tokens": 424608578.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 3.4359171143514966, |
| "grad_norm": 0.17994067072868347, |
| "learning_rate": 7.930614183426074e-06, |
| "loss": 0.2907, |
| "num_tokens": 432256378.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 3.4973138910207213, |
| "grad_norm": 0.1164034903049469, |
| "learning_rate": 7.846819635106569e-06, |
| "loss": 0.2326, |
| "num_tokens": 440160328.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 3.5587106676899465, |
| "grad_norm": 0.11783929914236069, |
| "learning_rate": 7.76182486480253e-06, |
| "loss": 0.2441, |
| "num_tokens": 447898314.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 3.620107444359171, |
| "grad_norm": 0.144779771566391, |
| "learning_rate": 7.675665706393502e-06, |
| "loss": 0.2743, |
| "num_tokens": 455623259.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 3.681504221028396, |
| "grad_norm": 0.14519008994102478, |
| "learning_rate": 7.588378484666214e-06, |
| "loss": 0.2549, |
| "num_tokens": 463234578.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 3.742900997697621, |
| "grad_norm": 0.10034901648759842, |
| "learning_rate": 7.500000000000001e-06, |
| "loss": 0.2239, |
| "num_tokens": 470964210.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 3.804297774366846, |
| "grad_norm": 0.19951561093330383, |
| "learning_rate": 7.4105675128517456e-06, |
| "loss": 0.2149, |
| "num_tokens": 478675230.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 3.8656945510360705, |
| "grad_norm": 0.171798974275589, |
| "learning_rate": 7.320118728046818e-06, |
| "loss": 0.2186, |
| "num_tokens": 486384447.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 3.9270913277052957, |
| "grad_norm": 0.12792909145355225, |
| "learning_rate": 7.2286917788826926e-06, |
| "loss": 0.2247, |
| "num_tokens": 493980451.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 3.9884881043745204, |
| "grad_norm": 0.17786537110805511, |
| "learning_rate": 7.136325211051905e-06, |
| "loss": 0.2348, |
| "num_tokens": 501677989.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 4.04911742133538, |
| "grad_norm": 0.1922304332256317, |
| "learning_rate": 7.043057966391158e-06, |
| "loss": 0.2142, |
| "num_tokens": 509247605.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 4.110514198004605, |
| "grad_norm": 0.0999738946557045, |
| "learning_rate": 6.948929366463397e-06, |
| "loss": 0.2415, |
| "num_tokens": 516999508.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 4.171910974673829, |
| "grad_norm": 0.15080419182777405, |
| "learning_rate": 6.8539790959798045e-06, |
| "loss": 0.215, |
| "num_tokens": 524765050.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 4.233307751343054, |
| "grad_norm": 0.14418524503707886, |
| "learning_rate": 6.758247186068684e-06, |
| "loss": 0.2395, |
| "num_tokens": 532419727.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 4.2947045280122795, |
| "grad_norm": 0.20315948128700256, |
| "learning_rate": 6.6617739973982985e-06, |
| "loss": 0.1999, |
| "num_tokens": 540203866.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 4.356101304681504, |
| "grad_norm": 0.13794706761837006, |
| "learning_rate": 6.5646002031607726e-06, |
| "loss": 0.2243, |
| "num_tokens": 547694716.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 4.417498081350729, |
| "grad_norm": 0.1133931577205658, |
| "learning_rate": 6.466766771924231e-06, |
| "loss": 0.2181, |
| "num_tokens": 555331030.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 4.478894858019954, |
| "grad_norm": 0.11515721678733826, |
| "learning_rate": 6.368314950360416e-06, |
| "loss": 0.205, |
| "num_tokens": 563196922.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 4.540291634689179, |
| "grad_norm": 0.13237498700618744, |
| "learning_rate": 6.269286245855039e-06, |
| "loss": 0.2082, |
| "num_tokens": 570961435.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 4.601688411358404, |
| "grad_norm": 0.1634393036365509, |
| "learning_rate": 6.169722409008244e-06, |
| "loss": 0.2295, |
| "num_tokens": 578522917.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 4.663085188027629, |
| "grad_norm": 0.14259153604507446, |
| "learning_rate": 6.0696654160324875e-06, |
| "loss": 0.2191, |
| "num_tokens": 586368851.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 4.724481964696853, |
| "grad_norm": 0.15294960141181946, |
| "learning_rate": 5.9691574510553505e-06, |
| "loss": 0.2633, |
| "num_tokens": 594103147.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 4.785878741366078, |
| "grad_norm": 0.15502986311912537, |
| "learning_rate": 5.8682408883346535e-06, |
| "loss": 0.1844, |
| "num_tokens": 601860501.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 4.847275518035303, |
| "grad_norm": 0.11382922530174255, |
| "learning_rate": 5.766958274393428e-06, |
| "loss": 0.202, |
| "num_tokens": 609508900.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 4.9086722947045285, |
| "grad_norm": 0.08602210134267807, |
| "learning_rate": 5.66535231008227e-06, |
| "loss": 0.2037, |
| "num_tokens": 617381309.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 4.970069071373753, |
| "grad_norm": 0.12180744856595993, |
| "learning_rate": 5.5634658325766066e-06, |
| "loss": 0.2697, |
| "num_tokens": 625196889.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 5.030698388334613, |
| "grad_norm": 0.11605013906955719, |
| "learning_rate": 5.46134179731651e-06, |
| "loss": 0.2298, |
| "num_tokens": 632748509.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 5.092095165003837, |
| "grad_norm": 0.14186152815818787, |
| "learning_rate": 5.359023259896638e-06, |
| "loss": 0.2598, |
| "num_tokens": 640449280.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 5.153491941673062, |
| "grad_norm": 0.17204760015010834, |
| "learning_rate": 5.2565533579139484e-06, |
| "loss": 0.2417, |
| "num_tokens": 648111545.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 5.214888718342287, |
| "grad_norm": 0.13311974704265594, |
| "learning_rate": 5.153975292780852e-06, |
| "loss": 0.2073, |
| "num_tokens": 656000810.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 5.276285495011512, |
| "grad_norm": 0.12501160800457, |
| "learning_rate": 5.05133231151145e-06, |
| "loss": 0.172, |
| "num_tokens": 663847591.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 5.337682271680737, |
| "grad_norm": 0.1829407662153244, |
| "learning_rate": 4.948667688488552e-06, |
| "loss": 0.189, |
| "num_tokens": 671577237.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 5.399079048349962, |
| "grad_norm": 0.06771395355463028, |
| "learning_rate": 4.846024707219149e-06, |
| "loss": 0.1776, |
| "num_tokens": 679342427.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 5.460475825019186, |
| "grad_norm": 0.1194373071193695, |
| "learning_rate": 4.7434466420860515e-06, |
| "loss": 0.1533, |
| "num_tokens": 687084512.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 5.521872601688411, |
| "grad_norm": 0.18959811329841614, |
| "learning_rate": 4.640976740103363e-06, |
| "loss": 0.1898, |
| "num_tokens": 694781917.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 5.583269378357636, |
| "grad_norm": 0.18240872025489807, |
| "learning_rate": 4.53865820268349e-06, |
| "loss": 0.1944, |
| "num_tokens": 702455444.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 5.6446661550268615, |
| "grad_norm": 0.11243653297424316, |
| "learning_rate": 4.436534167423395e-06, |
| "loss": 0.2385, |
| "num_tokens": 710082886.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 5.706062931696086, |
| "grad_norm": 0.18290652334690094, |
| "learning_rate": 4.334647689917734e-06, |
| "loss": 0.2426, |
| "num_tokens": 717779503.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 5.767459708365311, |
| "grad_norm": 0.1503763198852539, |
| "learning_rate": 4.233041725606573e-06, |
| "loss": 0.1905, |
| "num_tokens": 725305489.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 5.828856485034535, |
| "grad_norm": 0.21133235096931458, |
| "learning_rate": 4.131759111665349e-06, |
| "loss": 0.2697, |
| "num_tokens": 733143147.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 5.89025326170376, |
| "grad_norm": 0.16622580587863922, |
| "learning_rate": 4.03084254894465e-06, |
| "loss": 0.2006, |
| "num_tokens": 740902933.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 5.951650038372986, |
| "grad_norm": 0.16907618939876556, |
| "learning_rate": 3.930334583967514e-06, |
| "loss": 0.1796, |
| "num_tokens": 748631899.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 6.012279355333845, |
| "grad_norm": 0.11626313626766205, |
| "learning_rate": 3.8302775909917585e-06, |
| "loss": 0.2062, |
| "num_tokens": 756270769.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 6.07367613200307, |
| "grad_norm": 0.15304428339004517, |
| "learning_rate": 3.730713754144961e-06, |
| "loss": 0.3051, |
| "num_tokens": 764034264.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 6.135072908672295, |
| "grad_norm": 0.13653458654880524, |
| "learning_rate": 3.6316850496395863e-06, |
| "loss": 0.2186, |
| "num_tokens": 771807750.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 6.19646968534152, |
| "grad_norm": 0.128171905875206, |
| "learning_rate": 3.5332332280757706e-06, |
| "loss": 0.22, |
| "num_tokens": 779403239.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 6.257866462010744, |
| "grad_norm": 0.14104680716991425, |
| "learning_rate": 3.4353997968392295e-06, |
| "loss": 0.2039, |
| "num_tokens": 787121903.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 6.319263238679969, |
| "grad_norm": 0.2058696150779724, |
| "learning_rate": 3.3382260026017027e-06, |
| "loss": 0.1801, |
| "num_tokens": 794810789.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 6.380660015349195, |
| "grad_norm": 0.11931730061769485, |
| "learning_rate": 3.241752813931316e-06, |
| "loss": 0.1968, |
| "num_tokens": 802512965.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 6.442056792018419, |
| "grad_norm": 0.11417070776224136, |
| "learning_rate": 3.1460209040201967e-06, |
| "loss": 0.1711, |
| "num_tokens": 810183997.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 6.503453568687644, |
| "grad_norm": 0.1443619281053543, |
| "learning_rate": 3.0510706335366034e-06, |
| "loss": 0.1694, |
| "num_tokens": 818043919.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 6.564850345356868, |
| "grad_norm": 0.13925348222255707, |
| "learning_rate": 2.956942033608843e-06, |
| "loss": 0.2639, |
| "num_tokens": 825607330.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 6.6262471220260935, |
| "grad_norm": 0.15820568799972534, |
| "learning_rate": 2.863674788948097e-06, |
| "loss": 0.1922, |
| "num_tokens": 833304152.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 6.687643898695319, |
| "grad_norm": 0.17013588547706604, |
| "learning_rate": 2.771308221117309e-06, |
| "loss": 0.1434, |
| "num_tokens": 841048537.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 6.749040675364544, |
| "grad_norm": 0.12619027495384216, |
| "learning_rate": 2.6798812719531843e-06, |
| "loss": 0.1994, |
| "num_tokens": 848766284.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 6.810437452033768, |
| "grad_norm": 0.12287342548370361, |
| "learning_rate": 2.5894324871482557e-06, |
| "loss": 0.1693, |
| "num_tokens": 856562254.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 6.871834228702993, |
| "grad_norm": 0.1615433394908905, |
| "learning_rate": 2.5000000000000015e-06, |
| "loss": 0.1454, |
| "num_tokens": 864377910.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 6.9332310053722175, |
| "grad_norm": 0.3870552182197571, |
| "learning_rate": 2.411621515333788e-06, |
| "loss": 0.191, |
| "num_tokens": 872108030.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 6.994627782041443, |
| "grad_norm": 0.17497745156288147, |
| "learning_rate": 2.324334293606499e-06, |
| "loss": 0.1893, |
| "num_tokens": 879761913.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 7.0552570990023025, |
| "grad_norm": 0.12139171361923218, |
| "learning_rate": 2.238175135197471e-06, |
| "loss": 0.1567, |
| "num_tokens": 887504352.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 7.116653875671528, |
| "grad_norm": 0.08999204635620117, |
| "learning_rate": 2.1531803648934333e-06, |
| "loss": 0.1625, |
| "num_tokens": 895097892.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 7.178050652340752, |
| "grad_norm": 0.08897876739501953, |
| "learning_rate": 2.069385816573928e-06, |
| "loss": 0.2311, |
| "num_tokens": 902848147.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 7.239447429009977, |
| "grad_norm": 0.21378548443317413, |
| "learning_rate": 1.9868268181037186e-06, |
| "loss": 0.1423, |
| "num_tokens": 910458382.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 7.300844205679202, |
| "grad_norm": 0.1900954693555832, |
| "learning_rate": 1.9055381764385272e-06, |
| "loss": 0.2398, |
| "num_tokens": 918226444.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 7.3622409823484265, |
| "grad_norm": 0.15378384292125702, |
| "learning_rate": 1.8255541629503865e-06, |
| "loss": 0.195, |
| "num_tokens": 925917841.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 7.423637759017652, |
| "grad_norm": 0.14738328754901886, |
| "learning_rate": 1.746908498978791e-06, |
| "loss": 0.178, |
| "num_tokens": 933651661.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 7.485034535686877, |
| "grad_norm": 0.11289914697408676, |
| "learning_rate": 1.6696343416137495e-06, |
| "loss": 0.1942, |
| "num_tokens": 941360552.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 7.546431312356101, |
| "grad_norm": 0.1341191977262497, |
| "learning_rate": 1.5937642697167288e-06, |
| "loss": 0.2068, |
| "num_tokens": 949058805.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 7.607828089025326, |
| "grad_norm": 0.13751080632209778, |
| "learning_rate": 1.5193302701853674e-06, |
| "loss": 0.1909, |
| "num_tokens": 956782180.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 7.669224865694551, |
| "grad_norm": 0.12948212027549744, |
| "learning_rate": 1.4463637244677648e-06, |
| "loss": 0.2166, |
| "num_tokens": 964547747.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 7.730621642363776, |
| "grad_norm": 0.19808124005794525, |
| "learning_rate": 1.374895395332037e-06, |
| "loss": 0.1593, |
| "num_tokens": 972262764.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 7.792018419033001, |
| "grad_norm": 0.15310564637184143, |
| "learning_rate": 1.3049554138967052e-06, |
| "loss": 0.1817, |
| "num_tokens": 979976870.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 7.853415195702226, |
| "grad_norm": 0.09420406818389893, |
| "learning_rate": 1.2365732669273778e-06, |
| "loss": 0.171, |
| "num_tokens": 987702995.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 7.91481197237145, |
| "grad_norm": 0.15969103574752808, |
| "learning_rate": 1.1697777844051105e-06, |
| "loss": 0.1672, |
| "num_tokens": 995495385.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 7.9762087490406754, |
| "grad_norm": 0.14377442002296448, |
| "learning_rate": 1.1045971273716476e-06, |
| "loss": 0.1721, |
| "num_tokens": 1003279772.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 8.036838066001534, |
| "grad_norm": 0.14576008915901184, |
| "learning_rate": 1.0410587760567104e-06, |
| "loss": 0.1598, |
| "num_tokens": 1010897625.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 8.09823484267076, |
| "grad_norm": 0.16741247475147247, |
| "learning_rate": 9.791895182922911e-07, |
| "loss": 0.158, |
| "num_tokens": 1018639336.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 8.159631619339985, |
| "grad_norm": 0.11017988622188568, |
| "learning_rate": 9.190154382188921e-07, |
| "loss": 0.1905, |
| "num_tokens": 1026303694.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 8.22102839600921, |
| "grad_norm": 0.13501618802547455, |
| "learning_rate": 8.605619052884106e-07, |
| "loss": 0.2215, |
| "num_tokens": 1033995723.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 8.282425172678435, |
| "grad_norm": 0.1380506455898285, |
| "learning_rate": 8.03853563568367e-07, |
| "loss": 0.2105, |
| "num_tokens": 1041828748.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 8.343821949347658, |
| "grad_norm": 0.10789493471384048, |
| "learning_rate": 7.489143213519301e-07, |
| "loss": 0.1776, |
| "num_tokens": 1049624316.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 8.405218726016884, |
| "grad_norm": 0.14292606711387634, |
| "learning_rate": 6.957673410781617e-07, |
| "loss": 0.1514, |
| "num_tokens": 1057358883.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 8.466615502686109, |
| "grad_norm": 0.16883572936058044, |
| "learning_rate": 6.444350295667112e-07, |
| "loss": 0.1597, |
| "num_tokens": 1065075389.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 8.528012279355334, |
| "grad_norm": 0.1803523451089859, |
| "learning_rate": 5.949390285710777e-07, |
| "loss": 0.2488, |
| "num_tokens": 1072877315.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 8.589409056024559, |
| "grad_norm": 0.16701538860797882, |
| "learning_rate": 5.473002056544191e-07, |
| "loss": 0.1594, |
| "num_tokens": 1080563869.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 8.650805832693784, |
| "grad_norm": 0.1643553078174591, |
| "learning_rate": 5.015386453917742e-07, |
| "loss": 0.1802, |
| "num_tokens": 1088207914.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 8.712202609363008, |
| "grad_norm": 0.13831958174705505, |
| "learning_rate": 4.576736409023813e-07, |
| "loss": 0.1725, |
| "num_tokens": 1096010265.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 8.773599386032233, |
| "grad_norm": 0.16193532943725586, |
| "learning_rate": 4.15723685715686e-07, |
| "loss": 0.1718, |
| "num_tokens": 1103755705.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 8.834996162701458, |
| "grad_norm": 0.10267972201108932, |
| "learning_rate": 3.7570646597444196e-07, |
| "loss": 0.1459, |
| "num_tokens": 1111399056.0, |
| "step": 1440 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1630, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 60, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.198652494383048e+19, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|