| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.9989717223650385, |
| "eval_steps": 500, |
| "global_step": 972, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.02056555269922879, |
| "grad_norm": 9.233743238441296, |
| "learning_rate": 1.9994839090452616e-05, |
| "loss": 1.5713, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.04113110539845758, |
| "grad_norm": 13.69020758087521, |
| "learning_rate": 1.9979236966675828e-05, |
| "loss": 1.3565, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.061696658097686374, |
| "grad_norm": 9.2629201307749, |
| "learning_rate": 1.9953209739827946e-05, |
| "loss": 1.3132, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.08226221079691516, |
| "grad_norm": 5.836967088114798, |
| "learning_rate": 1.9916784600016132e-05, |
| "loss": 1.2708, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.10282776349614396, |
| "grad_norm": 9.00423601885076, |
| "learning_rate": 1.9869999599832804e-05, |
| "loss": 1.2658, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.12339331619537275, |
| "grad_norm": 13.166982152661006, |
| "learning_rate": 1.981290361460287e-05, |
| "loss": 1.2531, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.14395886889460155, |
| "grad_norm": 10.728661259090346, |
| "learning_rate": 1.974555629132469e-05, |
| "loss": 1.2339, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.16452442159383032, |
| "grad_norm": 9.294779587967374, |
| "learning_rate": 1.9668027986358082e-05, |
| "loss": 1.2372, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.18508997429305912, |
| "grad_norm": 6.388693353299931, |
| "learning_rate": 1.9580399691924484e-05, |
| "loss": 1.2298, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.20565552699228792, |
| "grad_norm": 5.685294719278735, |
| "learning_rate": 1.9482762951496056e-05, |
| "loss": 1.2312, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.2262210796915167, |
| "grad_norm": 6.685866232367591, |
| "learning_rate": 1.9375219764162096e-05, |
| "loss": 1.2316, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.2467866323907455, |
| "grad_norm": 7.688018717119547, |
| "learning_rate": 1.92578824780727e-05, |
| "loss": 1.2337, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.26735218508997427, |
| "grad_norm": 3.884064710067138, |
| "learning_rate": 1.913087367307095e-05, |
| "loss": 1.2136, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.2879177377892031, |
| "grad_norm": 9.444686410104936, |
| "learning_rate": 1.8994326032636318e-05, |
| "loss": 1.2072, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.30848329048843187, |
| "grad_norm": 4.864822081558493, |
| "learning_rate": 1.8848382205272924e-05, |
| "loss": 1.2017, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.32904884318766064, |
| "grad_norm": 5.9852253238798605, |
| "learning_rate": 1.869319465548762e-05, |
| "loss": 1.208, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.3496143958868895, |
| "grad_norm": 6.2970547396781225, |
| "learning_rate": 1.852892550451345e-05, |
| "loss": 1.2012, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.37017994858611825, |
| "grad_norm": 7.846752995277368, |
| "learning_rate": 1.835574636094494e-05, |
| "loss": 1.2035, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.390745501285347, |
| "grad_norm": 9.795450293111358, |
| "learning_rate": 1.8173838141462145e-05, |
| "loss": 1.2147, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.41131105398457585, |
| "grad_norm": 7.782933189444834, |
| "learning_rate": 1.798339088183071e-05, |
| "loss": 1.2126, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.4318766066838046, |
| "grad_norm": 5.494144367822917, |
| "learning_rate": 1.7784603538375453e-05, |
| "loss": 1.2089, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.4524421593830334, |
| "grad_norm": 6.609017931196138, |
| "learning_rate": 1.7577683780134756e-05, |
| "loss": 1.1879, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.4730077120822622, |
| "grad_norm": 8.906035242084766, |
| "learning_rate": 1.7362847771913035e-05, |
| "loss": 1.2097, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.493573264781491, |
| "grad_norm": 8.648775336727859, |
| "learning_rate": 1.714031994845782e-05, |
| "loss": 1.187, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.5141388174807198, |
| "grad_norm": 5.843126331449411, |
| "learning_rate": 1.6910332779997378e-05, |
| "loss": 1.1835, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5347043701799485, |
| "grad_norm": 7.57208189767232, |
| "learning_rate": 1.6673126529383905e-05, |
| "loss": 1.1906, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.5552699228791774, |
| "grad_norm": 5.872037189234157, |
| "learning_rate": 1.642894900109584e-05, |
| "loss": 1.1736, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.5758354755784062, |
| "grad_norm": 5.59549822016061, |
| "learning_rate": 1.6178055282361642e-05, |
| "loss": 1.19, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.596401028277635, |
| "grad_norm": 7.1037529678120785, |
| "learning_rate": 1.5920707476675446e-05, |
| "loss": 1.1851, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.6169665809768637, |
| "grad_norm": 4.7250598751432875, |
| "learning_rate": 1.565717442998292e-05, |
| "loss": 1.1824, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6375321336760925, |
| "grad_norm": 7.603461950437508, |
| "learning_rate": 1.5387731449823474e-05, |
| "loss": 1.1543, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.6580976863753213, |
| "grad_norm": 7.812378658902085, |
| "learning_rate": 1.5112660017722122e-05, |
| "loss": 1.1683, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.6786632390745502, |
| "grad_norm": 6.866611481508661, |
| "learning_rate": 1.4832247495131566e-05, |
| "loss": 1.1643, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.699228791773779, |
| "grad_norm": 8.447272563997627, |
| "learning_rate": 1.45467868232316e-05, |
| "loss": 1.1679, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.7197943444730077, |
| "grad_norm": 8.29660722849121, |
| "learning_rate": 1.4256576216899494e-05, |
| "loss": 1.1605, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.7403598971722365, |
| "grad_norm": 6.408869669950844, |
| "learning_rate": 1.3961918853171073e-05, |
| "loss": 1.1681, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.7609254498714653, |
| "grad_norm": 7.257507857539118, |
| "learning_rate": 1.3663122554517917e-05, |
| "loss": 1.1545, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.781491002570694, |
| "grad_norm": 6.745599564457566, |
| "learning_rate": 1.3360499467271552e-05, |
| "loss": 1.167, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.8020565552699229, |
| "grad_norm": 4.372254672475978, |
| "learning_rate": 1.3054365735530666e-05, |
| "loss": 1.1706, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.8226221079691517, |
| "grad_norm": 5.9583586806734585, |
| "learning_rate": 1.2745041170891827e-05, |
| "loss": 1.1512, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.8431876606683805, |
| "grad_norm": 8.218488729311929, |
| "learning_rate": 1.243284891834894e-05, |
| "loss": 1.161, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.8637532133676092, |
| "grad_norm": 9.05020466841952, |
| "learning_rate": 1.211811511871033e-05, |
| "loss": 1.1499, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.884318766066838, |
| "grad_norm": 7.133782169539759, |
| "learning_rate": 1.1801168567886159e-05, |
| "loss": 1.1428, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.9048843187660668, |
| "grad_norm": 10.061681353175718, |
| "learning_rate": 1.1482340373402128e-05, |
| "loss": 1.1548, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.9254498714652957, |
| "grad_norm": 5.964595532826752, |
| "learning_rate": 1.1161963608498254e-05, |
| "loss": 1.1375, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.9460154241645244, |
| "grad_norm": 7.682863713931749, |
| "learning_rate": 1.0840372964174148e-05, |
| "loss": 1.1441, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.9665809768637532, |
| "grad_norm": 6.093064442303913, |
| "learning_rate": 1.051790439954422e-05, |
| "loss": 1.1374, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.987146529562982, |
| "grad_norm": 2.859419774137469, |
| "learning_rate": 1.0194894790868113e-05, |
| "loss": 1.135, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.0077120822622108, |
| "grad_norm": 5.504925974002832, |
| "learning_rate": 9.871681579623028e-06, |
| "loss": 1.1268, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.0282776349614395, |
| "grad_norm": 6.006410935514135, |
| "learning_rate": 9.548602419985584e-06, |
| "loss": 1.0911, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.0488431876606683, |
| "grad_norm": 3.2675854341672608, |
| "learning_rate": 9.225994826091431e-06, |
| "loss": 1.0816, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.069408740359897, |
| "grad_norm": 6.801925989140796, |
| "learning_rate": 8.904195819441222e-06, |
| "loss": 1.0833, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.089974293059126, |
| "grad_norm": 3.6643836420434663, |
| "learning_rate": 8.583541576821191e-06, |
| "loss": 1.081, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.1105398457583548, |
| "grad_norm": 6.747377788645208, |
| "learning_rate": 8.264367079106194e-06, |
| "loss": 1.0793, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.1311053984575836, |
| "grad_norm": 6.372537375904327, |
| "learning_rate": 7.947005761312097e-06, |
| "loss": 1.0979, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.1516709511568124, |
| "grad_norm": 6.925714108622525, |
| "learning_rate": 7.6317891642631e-06, |
| "loss": 1.0868, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.1722365038560412, |
| "grad_norm": 7.332135497496814, |
| "learning_rate": 7.319046588237864e-06, |
| "loss": 1.0613, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.19280205655527, |
| "grad_norm": 3.4981756709446454, |
| "learning_rate": 7.009104748956304e-06, |
| "loss": 1.0801, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.2133676092544987, |
| "grad_norm": 5.6660043695937, |
| "learning_rate": 6.7022874362664155e-06, |
| "loss": 1.0838, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.2339331619537275, |
| "grad_norm": 6.6393615146162634, |
| "learning_rate": 6.398915175887698e-06, |
| "loss": 1.0692, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.2544987146529563, |
| "grad_norm": 9.01722626725686, |
| "learning_rate": 6.099304894564544e-06, |
| "loss": 1.093, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.275064267352185, |
| "grad_norm": 7.465032128492785, |
| "learning_rate": 5.8037695889794e-06, |
| "loss": 1.0781, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.2956298200514138, |
| "grad_norm": 8.243642393853053, |
| "learning_rate": 5.512617998771598e-06, |
| "loss": 1.0833, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.3161953727506428, |
| "grad_norm": 3.1779811319825577, |
| "learning_rate": 5.226154284003411e-06, |
| "loss": 1.0715, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.3367609254498714, |
| "grad_norm": 4.120350391737107, |
| "learning_rate": 4.944677707410315e-06, |
| "loss": 1.0829, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.3573264781491003, |
| "grad_norm": 7.105813120974067, |
| "learning_rate": 4.668482321767371e-06, |
| "loss": 1.0865, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.3778920308483291, |
| "grad_norm": 8.035377582845337, |
| "learning_rate": 4.397856662698368e-06, |
| "loss": 1.0533, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.398457583547558, |
| "grad_norm": 8.049531770624485, |
| "learning_rate": 4.133083447248599e-06, |
| "loss": 1.0745, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.4190231362467867, |
| "grad_norm": 7.225962565044245, |
| "learning_rate": 3.874439278536187e-06, |
| "loss": 1.0899, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.4395886889460154, |
| "grad_norm": 7.270761540137814, |
| "learning_rate": 3.6221943567905283e-06, |
| "loss": 1.0784, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.4601542416452442, |
| "grad_norm": 8.705361491327107, |
| "learning_rate": 3.3766121970796716e-06, |
| "loss": 1.0819, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.480719794344473, |
| "grad_norm": 4.183940499981808, |
| "learning_rate": 3.1379493540215677e-06, |
| "loss": 1.069, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.5012853470437018, |
| "grad_norm": 6.398726679627311, |
| "learning_rate": 2.906455153766744e-06, |
| "loss": 1.0785, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.5218508997429305, |
| "grad_norm": 13.601116552168905, |
| "learning_rate": 2.6823714335324237e-06, |
| "loss": 1.057, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.5424164524421595, |
| "grad_norm": 5.5895207138975, |
| "learning_rate": 2.46593228896017e-06, |
| "loss": 1.0553, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.562982005141388, |
| "grad_norm": 6.267057233279022, |
| "learning_rate": 2.257363829560986e-06, |
| "loss": 1.0542, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.583547557840617, |
| "grad_norm": 4.378216644983502, |
| "learning_rate": 2.0568839425033906e-06, |
| "loss": 1.0799, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.6041131105398456, |
| "grad_norm": 7.255472699627192, |
| "learning_rate": 1.864702064991173e-06, |
| "loss": 1.0571, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.6246786632390746, |
| "grad_norm": 6.289474192540149, |
| "learning_rate": 1.6810189654686715e-06, |
| "loss": 1.0472, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.6452442159383034, |
| "grad_norm": 6.589217658649451, |
| "learning_rate": 1.5060265338821123e-06, |
| "loss": 1.0703, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.6658097686375322, |
| "grad_norm": 4.865701816340032, |
| "learning_rate": 1.3399075812161488e-06, |
| "loss": 1.055, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.686375321336761, |
| "grad_norm": 8.65763359081179, |
| "learning_rate": 1.1828356485149927e-06, |
| "loss": 1.0622, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.7069408740359897, |
| "grad_norm": 6.174972638755989, |
| "learning_rate": 1.0349748255876536e-06, |
| "loss": 1.0526, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.7275064267352185, |
| "grad_norm": 4.986705584850829, |
| "learning_rate": 8.964795795867176e-07, |
| "loss": 1.051, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.7480719794344473, |
| "grad_norm": 9.202122011281354, |
| "learning_rate": 7.67494593639686e-07, |
| "loss": 1.0574, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.7686375321336762, |
| "grad_norm": 8.058395431378568, |
| "learning_rate": 6.481546157014996e-07, |
| "loss": 1.0589, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.7892030848329048, |
| "grad_norm": 5.3354582297297135, |
| "learning_rate": 5.385843177861261e-07, |
| "loss": 1.0578, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.8097686375321338, |
| "grad_norm": 3.8239017444437415, |
| "learning_rate": 4.388981657242819e-07, |
| "loss": 1.0663, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.8303341902313623, |
| "grad_norm": 10.032769787716335, |
| "learning_rate": 3.4920029958333656e-07, |
| "loss": 1.0671, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.8508997429305913, |
| "grad_norm": 4.449303124886201, |
| "learning_rate": 2.695844248743318e-07, |
| "loss": 1.0573, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.87146529562982, |
| "grad_norm": 6.5875403409786735, |
| "learning_rate": 2.0013371465976816e-07, |
| "loss": 1.063, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.8920308483290489, |
| "grad_norm": 6.1087849946753545, |
| "learning_rate": 1.409207226644227e-07, |
| "loss": 1.0703, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.9125964010282777, |
| "grad_norm": 9.357948853303002, |
| "learning_rate": 9.200730747996211e-08, |
| "loss": 1.0615, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.9331619537275064, |
| "grad_norm": 6.746455746388351, |
| "learning_rate": 5.344456794255881e-08, |
| "loss": 1.0591, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.9537275064267352, |
| "grad_norm": 4.307020198331092, |
| "learning_rate": 2.5272789750980797e-08, |
| "loss": 1.0591, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.974293059125964, |
| "grad_norm": 2.368072060390636, |
| "learning_rate": 7.521403380956748e-09, |
| "loss": 1.0602, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.9948586118251928, |
| "grad_norm": 6.97337278897093, |
| "learning_rate": 2.089533397653387e-10, |
| "loss": 1.0495, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.9989717223650385, |
| "step": 972, |
| "total_flos": 9.034546879177687e+18, |
| "train_loss": 1.1365475546675945, |
| "train_runtime": 15552.1986, |
| "train_samples_per_second": 32.015, |
| "train_steps_per_second": 0.062 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 972, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "total_flos": 9.034546879177687e+18, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|