{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.7086247086247086,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.018648018648018648,
      "grad_norm": 6.3322858810424805,
      "learning_rate": 0.0,
      "loss": 0.7991,
      "step": 1
    },
    {
      "epoch": 0.037296037296037296,
      "grad_norm": 6.591444492340088,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.8816,
      "step": 2
    },
    {
      "epoch": 0.055944055944055944,
      "grad_norm": 6.131351947784424,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.7863,
      "step": 3
    },
    {
      "epoch": 0.07459207459207459,
      "grad_norm": 3.621307373046875,
      "learning_rate": 1e-05,
      "loss": 0.7604,
      "step": 4
    },
    {
      "epoch": 0.09324009324009325,
      "grad_norm": 5.652075290679932,
      "learning_rate": 9.99965389153533e-06,
      "loss": 0.7269,
      "step": 5
    },
    {
      "epoch": 0.11188811188811189,
      "grad_norm": 5.4281415939331055,
      "learning_rate": 9.998615614057743e-06,
      "loss": 0.7568,
      "step": 6
    },
    {
      "epoch": 0.13053613053613053,
      "grad_norm": 5.2610087394714355,
      "learning_rate": 9.996885311309892e-06,
      "loss": 0.7266,
      "step": 7
    },
    {
      "epoch": 0.14918414918414918,
      "grad_norm": 4.351637840270996,
      "learning_rate": 9.994463222840748e-06,
      "loss": 0.6749,
      "step": 8
    },
    {
      "epoch": 0.16783216783216784,
      "grad_norm": 3.5025124549865723,
      "learning_rate": 9.991349683972435e-06,
      "loss": 0.689,
      "step": 9
    },
    {
      "epoch": 0.1864801864801865,
      "grad_norm": 2.4112985134124756,
      "learning_rate": 9.987545125753818e-06,
      "loss": 0.7442,
      "step": 10
    },
    {
      "epoch": 0.20512820512820512,
      "grad_norm": 2.289846658706665,
      "learning_rate": 9.983050074900824e-06,
      "loss": 0.6741,
      "step": 11
    },
    {
      "epoch": 0.22377622377622378,
      "grad_norm": 2.045738697052002,
      "learning_rate": 9.977865153723508e-06,
      "loss": 0.6719,
      "step": 12
    },
    {
      "epoch": 0.24242424242424243,
      "grad_norm": 1.7808794975280762,
      "learning_rate": 9.971991080039912e-06,
      "loss": 0.6307,
      "step": 13
    },
    {
      "epoch": 0.26107226107226106,
      "grad_norm": 1.6070201396942139,
      "learning_rate": 9.965428667076687e-06,
      "loss": 0.665,
      "step": 14
    },
    {
      "epoch": 0.27972027972027974,
      "grad_norm": 1.4720741510391235,
      "learning_rate": 9.958178823356503e-06,
      "loss": 0.6371,
      "step": 15
    },
    {
      "epoch": 0.29836829836829837,
      "grad_norm": 1.2774218320846558,
      "learning_rate": 9.950242552572272e-06,
      "loss": 0.6312,
      "step": 16
    },
    {
      "epoch": 0.317016317016317,
      "grad_norm": 1.4246975183486938,
      "learning_rate": 9.941620953448195e-06,
      "loss": 0.6642,
      "step": 17
    },
    {
      "epoch": 0.3356643356643357,
      "grad_norm": 1.3494893312454224,
      "learning_rate": 9.932315219587641e-06,
      "loss": 0.5713,
      "step": 18
    },
    {
      "epoch": 0.3543123543123543,
      "grad_norm": 1.2985285520553589,
      "learning_rate": 9.922326639307918e-06,
      "loss": 0.5967,
      "step": 19
    },
    {
      "epoch": 0.372960372960373,
      "grad_norm": 1.5050538778305054,
      "learning_rate": 9.911656595461899e-06,
      "loss": 0.63,
      "step": 20
    },
    {
      "epoch": 0.3916083916083916,
      "grad_norm": 1.3601627349853516,
      "learning_rate": 9.900306565246579e-06,
      "loss": 0.5996,
      "step": 21
    },
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 1.293419599533081,
      "learning_rate": 9.888278119998573e-06,
      "loss": 0.6239,
      "step": 22
    },
    {
      "epoch": 0.4289044289044289,
      "grad_norm": 1.2258663177490234,
      "learning_rate": 9.875572924976568e-06,
      "loss": 0.6268,
      "step": 23
    },
    {
      "epoch": 0.44755244755244755,
      "grad_norm": 1.1782615184783936,
      "learning_rate": 9.86219273913078e-06,
      "loss": 0.5997,
      "step": 24
    },
    {
      "epoch": 0.4662004662004662,
      "grad_norm": 1.2980071306228638,
      "learning_rate": 9.848139414859441e-06,
      "loss": 0.5644,
      "step": 25
    },
    {
      "epoch": 0.48484848484848486,
      "grad_norm": 1.2712095975875854,
      "learning_rate": 9.833414897752346e-06,
      "loss": 0.5805,
      "step": 26
    },
    {
      "epoch": 0.5034965034965035,
      "grad_norm": 1.3099210262298584,
      "learning_rate": 9.818021226321502e-06,
      "loss": 0.5912,
      "step": 27
    },
    {
      "epoch": 0.5221445221445221,
      "grad_norm": 1.2401714324951172,
      "learning_rate": 9.801960531718898e-06,
      "loss": 0.6011,
      "step": 28
    },
    {
      "epoch": 0.5407925407925408,
      "grad_norm": 1.2797614336013794,
      "learning_rate": 9.785235037441473e-06,
      "loss": 0.6409,
      "step": 29
    },
    {
      "epoch": 0.5594405594405595,
      "grad_norm": 1.1702829599380493,
      "learning_rate": 9.767847059023292e-06,
      "loss": 0.5244,
      "step": 30
    },
    {
      "epoch": 0.578088578088578,
      "grad_norm": 1.1308348178863525,
      "learning_rate": 9.749799003714954e-06,
      "loss": 0.5703,
      "step": 31
    },
    {
      "epoch": 0.5967365967365967,
      "grad_norm": 1.2901357412338257,
      "learning_rate": 9.731093370150349e-06,
      "loss": 0.5943,
      "step": 32
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 1.243963360786438,
      "learning_rate": 9.71173274800072e-06,
      "loss": 0.5791,
      "step": 33
    },
    {
      "epoch": 0.634032634032634,
      "grad_norm": 1.122418999671936,
      "learning_rate": 9.691719817616148e-06,
      "loss": 0.5494,
      "step": 34
    },
    {
      "epoch": 0.6526806526806527,
      "grad_norm": 1.1197413206100464,
      "learning_rate": 9.671057349654481e-06,
      "loss": 0.5844,
      "step": 35
    },
    {
      "epoch": 0.6713286713286714,
      "grad_norm": 1.1654632091522217,
      "learning_rate": 9.649748204697741e-06,
      "loss": 0.5525,
      "step": 36
    },
    {
      "epoch": 0.6899766899766899,
      "grad_norm": 1.2135071754455566,
      "learning_rate": 9.627795332856107e-06,
      "loss": 0.5611,
      "step": 37
    },
    {
      "epoch": 0.7086247086247086,
      "grad_norm": 1.0855522155761719,
      "learning_rate": 9.605201773359485e-06,
      "loss": 0.5426,
      "step": 38
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 1.2061638832092285,
      "learning_rate": 9.581970654136752e-06,
      "loss": 0.5942,
      "step": 39
    },
    {
      "epoch": 0.745920745920746,
      "grad_norm": 1.0460582971572876,
      "learning_rate": 9.55810519138271e-06,
      "loss": 0.5781,
      "step": 40
    },
    {
      "epoch": 0.7645687645687645,
      "grad_norm": 1.1728343963623047,
      "learning_rate": 9.533608689112827e-06,
      "loss": 0.5508,
      "step": 41
    },
    {
      "epoch": 0.7832167832167832,
      "grad_norm": 1.396593689918518,
      "learning_rate": 9.508484538705823e-06,
      "loss": 0.5841,
      "step": 42
    },
    {
      "epoch": 0.8018648018648019,
      "grad_norm": 1.1621497869491577,
      "learning_rate": 9.482736218434144e-06,
      "loss": 0.5211,
      "step": 43
    },
    {
      "epoch": 0.8205128205128205,
      "grad_norm": 1.2855342626571655,
      "learning_rate": 9.45636729298243e-06,
      "loss": 0.5505,
      "step": 44
    },
    {
      "epoch": 0.8391608391608392,
      "grad_norm": 1.1451700925827026,
      "learning_rate": 9.429381412954e-06,
      "loss": 0.5283,
      "step": 45
    },
    {
      "epoch": 0.8578088578088578,
      "grad_norm": 1.2317947149276733,
      "learning_rate": 9.401782314365458e-06,
      "loss": 0.554,
      "step": 46
    },
    {
      "epoch": 0.8764568764568764,
      "grad_norm": 1.2228690385818481,
      "learning_rate": 9.37357381812946e-06,
      "loss": 0.5231,
      "step": 47
    },
    {
      "epoch": 0.8951048951048951,
      "grad_norm": 1.3035013675689697,
      "learning_rate": 9.344759829525734e-06,
      "loss": 0.5618,
      "step": 48
    },
    {
      "epoch": 0.9137529137529138,
      "grad_norm": 1.2204902172088623,
      "learning_rate": 9.315344337660422e-06,
      "loss": 0.5535,
      "step": 49
    },
    {
      "epoch": 0.9324009324009324,
      "grad_norm": 1.211380958557129,
      "learning_rate": 9.285331414913816e-06,
      "loss": 0.5824,
      "step": 50
    },
    {
      "epoch": 0.951048951048951,
      "grad_norm": 1.1931360960006714,
      "learning_rate": 9.254725216376562e-06,
      "loss": 0.5421,
      "step": 51
    },
    {
      "epoch": 0.9696969696969697,
      "grad_norm": 1.2671467065811157,
      "learning_rate": 9.223529979274411e-06,
      "loss": 0.5926,
      "step": 52
    },
    {
      "epoch": 0.9883449883449883,
      "grad_norm": 1.2529419660568237,
      "learning_rate": 9.191750022381613e-06,
      "loss": 0.5775,
      "step": 53
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.2529419660568237,
      "learning_rate": 9.159389745423003e-06,
      "loss": 0.5212,
      "step": 54
    },
    {
      "epoch": 1.0186480186480187,
      "grad_norm": 1.69382905960083,
      "learning_rate": 9.126453628464889e-06,
      "loss": 0.5111,
      "step": 55
    },
    {
      "epoch": 1.0372960372960374,
      "grad_norm": 1.2325468063354492,
      "learning_rate": 9.09294623129482e-06,
      "loss": 0.5528,
      "step": 56
    },
    {
      "epoch": 1.055944055944056,
      "grad_norm": 1.4443434476852417,
      "learning_rate": 9.058872192790314e-06,
      "loss": 0.5364,
      "step": 57
    },
    {
      "epoch": 1.0745920745920745,
      "grad_norm": 1.1799763441085815,
      "learning_rate": 9.02423623027663e-06,
      "loss": 0.4809,
      "step": 58
    },
    {
      "epoch": 1.0932400932400932,
      "grad_norm": 1.2783596515655518,
      "learning_rate": 8.98904313887369e-06,
      "loss": 0.5252,
      "step": 59
    },
    {
      "epoch": 1.1118881118881119,
      "grad_norm": 1.164589762687683,
      "learning_rate": 8.953297790832231e-06,
      "loss": 0.4953,
      "step": 60
    },
    {
      "epoch": 1.1305361305361306,
      "grad_norm": 1.184706687927246,
      "learning_rate": 8.917005134859263e-06,
      "loss": 0.4995,
      "step": 61
    },
    {
      "epoch": 1.1491841491841492,
      "grad_norm": 1.1743718385696411,
      "learning_rate": 8.88017019543296e-06,
      "loss": 0.4566,
      "step": 62
    },
    {
      "epoch": 1.167832167832168,
      "grad_norm": 1.2495801448822021,
      "learning_rate": 8.842798072107055e-06,
      "loss": 0.5244,
      "step": 63
    },
    {
      "epoch": 1.1864801864801864,
      "grad_norm": 1.192482590675354,
      "learning_rate": 8.804893938804839e-06,
      "loss": 0.4859,
      "step": 64
    },
    {
      "epoch": 1.205128205128205,
      "grad_norm": 1.2100673913955688,
      "learning_rate": 8.766463043102864e-06,
      "loss": 0.5466,
      "step": 65
    },
    {
      "epoch": 1.2237762237762237,
      "grad_norm": 1.2848899364471436,
      "learning_rate": 8.727510705504453e-06,
      "loss": 0.503,
      "step": 66
    },
    {
      "epoch": 1.2424242424242424,
      "grad_norm": 1.185261845588684,
      "learning_rate": 8.688042318703111e-06,
      "loss": 0.5007,
      "step": 67
    },
    {
      "epoch": 1.2610722610722611,
      "grad_norm": 1.252984881401062,
      "learning_rate": 8.648063346835943e-06,
      "loss": 0.5121,
      "step": 68
    },
    {
      "epoch": 1.2797202797202798,
      "grad_norm": 1.2732371091842651,
      "learning_rate": 8.607579324727175e-06,
      "loss": 0.5247,
      "step": 69
    },
    {
      "epoch": 1.2983682983682985,
      "grad_norm": 1.2724429368972778,
      "learning_rate": 8.566595857121902e-06,
      "loss": 0.4986,
      "step": 70
    },
    {
      "epoch": 1.317016317016317,
      "grad_norm": 1.2770615816116333,
      "learning_rate": 8.525118617910144e-06,
      "loss": 0.464,
      "step": 71
    },
    {
      "epoch": 1.3356643356643356,
      "grad_norm": 1.1614543199539185,
      "learning_rate": 8.483153349341336e-06,
      "loss": 0.5128,
      "step": 72
    },
    {
      "epoch": 1.3543123543123543,
      "grad_norm": 1.2303462028503418,
      "learning_rate": 8.440705861229344e-06,
      "loss": 0.4872,
      "step": 73
    },
    {
      "epoch": 1.372960372960373,
      "grad_norm": 1.168981671333313,
      "learning_rate": 8.397782030148147e-06,
      "loss": 0.5009,
      "step": 74
    },
    {
      "epoch": 1.3916083916083917,
      "grad_norm": 1.0755895376205444,
      "learning_rate": 8.354387798618254e-06,
      "loss": 0.5135,
      "step": 75
    },
    {
      "epoch": 1.4102564102564101,
      "grad_norm": 1.1931865215301514,
      "learning_rate": 8.310529174284004e-06,
      "loss": 0.4942,
      "step": 76
    },
    {
      "epoch": 1.428904428904429,
      "grad_norm": 1.2469918727874756,
      "learning_rate": 8.266212229081846e-06,
      "loss": 0.5237,
      "step": 77
    },
    {
      "epoch": 1.4475524475524475,
      "grad_norm": 1.1931370496749878,
      "learning_rate": 8.221443098399733e-06,
      "loss": 0.5069,
      "step": 78
    },
    {
      "epoch": 1.4662004662004662,
      "grad_norm": 1.099647045135498,
      "learning_rate": 8.176227980227693e-06,
      "loss": 0.4646,
      "step": 79
    },
    {
      "epoch": 1.4848484848484849,
      "grad_norm": 1.2228244543075562,
      "learning_rate": 8.130573134299782e-06,
      "loss": 0.5151,
      "step": 80
    },
    {
      "epoch": 1.5034965034965035,
      "grad_norm": 1.2358263731002808,
      "learning_rate": 8.084484881227449e-06,
      "loss": 0.5119,
      "step": 81
    },
    {
      "epoch": 1.5221445221445222,
      "grad_norm": 1.1055835485458374,
      "learning_rate": 8.037969601624495e-06,
      "loss": 0.4502,
      "step": 82
    },
    {
      "epoch": 1.5407925407925407,
      "grad_norm": 1.2238577604293823,
      "learning_rate": 7.99103373522373e-06,
      "loss": 0.4787,
      "step": 83
    },
    {
      "epoch": 1.5594405594405596,
      "grad_norm": 1.2454148530960083,
      "learning_rate": 7.943683779985412e-06,
      "loss": 0.4767,
      "step": 84
    },
    {
      "epoch": 1.578088578088578,
      "grad_norm": 1.131277084350586,
      "learning_rate": 7.895926291197667e-06,
      "loss": 0.4717,
      "step": 85
    },
    {
      "epoch": 1.5967365967365967,
      "grad_norm": 1.131995439529419,
      "learning_rate": 7.847767880568944e-06,
      "loss": 0.4875,
      "step": 86
    },
    {
      "epoch": 1.6153846153846154,
      "grad_norm": 1.0890105962753296,
      "learning_rate": 7.799215215312667e-06,
      "loss": 0.4749,
      "step": 87
    },
    {
      "epoch": 1.6340326340326339,
      "grad_norm": 1.1363167762756348,
      "learning_rate": 7.750275017224208e-06,
      "loss": 0.5297,
      "step": 88
    },
    {
      "epoch": 1.6526806526806528,
      "grad_norm": 1.196660041809082,
      "learning_rate": 7.700954061750295e-06,
      "loss": 0.504,
      "step": 89
    },
    {
      "epoch": 1.6713286713286712,
      "grad_norm": 1.1384004354476929,
      "learning_rate": 7.651259177050996e-06,
      "loss": 0.498,
      "step": 90
    },
    {
      "epoch": 1.68997668997669,
      "grad_norm": 1.1040617227554321,
      "learning_rate": 7.601197243054411e-06,
      "loss": 0.504,
      "step": 91
    },
    {
      "epoch": 1.7086247086247086,
      "grad_norm": 1.1692686080932617,
      "learning_rate": 7.5507751905041885e-06,
      "loss": 0.4742,
      "step": 92
    },
    {
      "epoch": 1.7272727272727273,
      "grad_norm": 1.197273850440979,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.4948,
      "step": 93
    },
    {
      "epoch": 1.745920745920746,
      "grad_norm": 1.1838762760162354,
      "learning_rate": 7.4488787010311425e-06,
      "loss": 0.4882,
      "step": 94
    },
    {
      "epoch": 1.7645687645687644,
      "grad_norm": 1.074876070022583,
      "learning_rate": 7.3974183710033334e-06,
      "loss": 0.4994,
      "step": 95
    },
    {
      "epoch": 1.7832167832167833,
      "grad_norm": 1.1607307195663452,
      "learning_rate": 7.345626134258897e-06,
      "loss": 0.498,
      "step": 96
    },
    {
      "epoch": 1.8018648018648018,
      "grad_norm": 1.1693527698516846,
      "learning_rate": 7.293509161090453e-06,
      "loss": 0.4706,
      "step": 97
    },
    {
      "epoch": 1.8205128205128205,
      "grad_norm": 1.1076033115386963,
      "learning_rate": 7.241074666748228e-06,
      "loss": 0.5055,
      "step": 98
    },
    {
      "epoch": 1.8391608391608392,
      "grad_norm": 1.1932028532028198,
      "learning_rate": 7.188329910441154e-06,
      "loss": 0.5006,
      "step": 99
    },
    {
      "epoch": 1.8578088578088578,
      "grad_norm": 1.2250460386276245,
      "learning_rate": 7.135282194331881e-06,
      "loss": 0.479,
      "step": 100
    },
    {
      "epoch": 1.8764568764568765,
      "grad_norm": 1.3001635074615479,
      "learning_rate": 7.0819388625258385e-06,
      "loss": 0.5101,
      "step": 101
    },
    {
      "epoch": 1.895104895104895,
      "grad_norm": 1.3003392219543457,
      "learning_rate": 7.028307300054499e-06,
      "loss": 0.4875,
      "step": 102
    },
    {
      "epoch": 1.913752913752914,
      "grad_norm": 1.1021766662597656,
      "learning_rate": 6.974394931852957e-06,
      "loss": 0.4848,
      "step": 103
    },
    {
      "epoch": 1.9324009324009324,
      "grad_norm": 1.3179751634597778,
      "learning_rate": 6.920209221732007e-06,
      "loss": 0.5254,
      "step": 104
    },
    {
      "epoch": 1.951048951048951,
      "grad_norm": 1.2668633460998535,
      "learning_rate": 6.865757671344827e-06,
      "loss": 0.5057,
      "step": 105
    },
    {
      "epoch": 1.9696969696969697,
      "grad_norm": 1.1570608615875244,
      "learning_rate": 6.811047819148413e-06,
      "loss": 0.4837,
      "step": 106
    },
    {
      "epoch": 1.9883449883449882,
      "grad_norm": 1.1735329627990723,
      "learning_rate": 6.756087239359948e-06,
      "loss": 0.5096,
      "step": 107
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.3861793279647827,
      "learning_rate": 6.700883540908185e-06,
      "loss": 0.4528,
      "step": 108
    },
    {
      "epoch": 2.0186480186480185,
      "grad_norm": 1.4102225303649902,
      "learning_rate": 6.64544436638005e-06,
      "loss": 0.4672,
      "step": 109
    },
    {
      "epoch": 2.0372960372960374,
      "grad_norm": 1.3145784139633179,
      "learning_rate": 6.589777390962575e-06,
      "loss": 0.471,
      "step": 110
    },
    {
      "epoch": 2.055944055944056,
      "grad_norm": 1.2322523593902588,
      "learning_rate": 6.53389032138032e-06,
      "loss": 0.4443,
      "step": 111
    },
    {
      "epoch": 2.0745920745920747,
      "grad_norm": 1.2904436588287354,
      "learning_rate": 6.477790894828422e-06,
      "loss": 0.4656,
      "step": 112
    },
    {
      "epoch": 2.093240093240093,
      "grad_norm": 1.1724885702133179,
      "learning_rate": 6.421486877901436e-06,
      "loss": 0.4061,
      "step": 113
    },
    {
      "epoch": 2.111888111888112,
      "grad_norm": 1.1402873992919922,
      "learning_rate": 6.364986065518106e-06,
      "loss": 0.4268,
      "step": 114
    },
    {
      "epoch": 2.1305361305361306,
      "grad_norm": 1.2183982133865356,
      "learning_rate": 6.308296279842204e-06,
      "loss": 0.4333,
      "step": 115
    },
    {
      "epoch": 2.149184149184149,
      "grad_norm": 1.2880562543869019,
      "learning_rate": 6.2514253691996e-06,
      "loss": 0.4599,
      "step": 116
    },
    {
      "epoch": 2.167832167832168,
      "grad_norm": 1.1045494079589844,
      "learning_rate": 6.194381206991723e-06,
      "loss": 0.4359,
      "step": 117
    },
    {
      "epoch": 2.1864801864801864,
      "grad_norm": 1.0571209192276,
      "learning_rate": 6.1371716906055336e-06,
      "loss": 0.4506,
      "step": 118
    },
    {
      "epoch": 2.2051282051282053,
      "grad_norm": 1.2100480794906616,
      "learning_rate": 6.079804740320181e-06,
      "loss": 0.4629,
      "step": 119
    },
    {
      "epoch": 2.2237762237762237,
      "grad_norm": 1.2123056650161743,
      "learning_rate": 6.022288298210502e-06,
      "loss": 0.4294,
      "step": 120
    },
    {
      "epoch": 2.242424242424242,
      "grad_norm": 1.1131305694580078,
      "learning_rate": 5.964630327047485e-06,
      "loss": 0.4652,
      "step": 121
    },
    {
      "epoch": 2.261072261072261,
      "grad_norm": 1.0908523797988892,
      "learning_rate": 5.906838809195879e-06,
      "loss": 0.4552,
      "step": 122
    },
    {
      "epoch": 2.2797202797202796,
      "grad_norm": 1.1521565914154053,
      "learning_rate": 5.848921745509094e-06,
      "loss": 0.4443,
      "step": 123
    },
    {
      "epoch": 2.2983682983682985,
      "grad_norm": 1.1898850202560425,
      "learning_rate": 5.790887154221521e-06,
      "loss": 0.4831,
      "step": 124
    },
    {
      "epoch": 2.317016317016317,
      "grad_norm": 1.1984978914260864,
      "learning_rate": 5.7327430698384775e-06,
      "loss": 0.4533,
      "step": 125
    },
    {
      "epoch": 2.335664335664336,
      "grad_norm": 1.1915768384933472,
      "learning_rate": 5.674497542023875e-06,
      "loss": 0.4171,
      "step": 126
    },
    {
      "epoch": 2.3543123543123543,
      "grad_norm": 1.220482349395752,
      "learning_rate": 5.616158634485793e-06,
      "loss": 0.4745,
      "step": 127
    },
    {
      "epoch": 2.3729603729603728,
      "grad_norm": 1.2805721759796143,
      "learning_rate": 5.557734423860122e-06,
      "loss": 0.4597,
      "step": 128
    },
    {
      "epoch": 2.3916083916083917,
      "grad_norm": 1.1252267360687256,
      "learning_rate": 5.499232998592399e-06,
      "loss": 0.43,
      "step": 129
    },
    {
      "epoch": 2.41025641025641,
      "grad_norm": 1.2377195358276367,
      "learning_rate": 5.44066245781801e-06,
      "loss": 0.4456,
      "step": 130
    },
    {
      "epoch": 2.428904428904429,
      "grad_norm": 1.0829073190689087,
      "learning_rate": 5.382030910240936e-06,
      "loss": 0.4475,
      "step": 131
    },
    {
      "epoch": 2.4475524475524475,
      "grad_norm": 1.1754248142242432,
      "learning_rate": 5.3233464730111426e-06,
      "loss": 0.4638,
      "step": 132
    },
    {
      "epoch": 2.4662004662004664,
      "grad_norm": 1.1847262382507324,
      "learning_rate": 5.2646172706008154e-06,
      "loss": 0.4748,
      "step": 133
    },
    {
      "epoch": 2.484848484848485,
      "grad_norm": 1.2288694381713867,
      "learning_rate": 5.20585143367959e-06,
      "loss": 0.4063,
      "step": 134
    },
    {
      "epoch": 2.5034965034965033,
      "grad_norm": 1.1340534687042236,
      "learning_rate": 5.147057097988898e-06,
      "loss": 0.4805,
      "step": 135
    },
    {
      "epoch": 2.5221445221445222,
      "grad_norm": 1.1196503639221191,
      "learning_rate": 5.088242403215644e-06,
      "loss": 0.4487,
      "step": 136
    },
    {
      "epoch": 2.5407925407925407,
      "grad_norm": 1.1831355094909668,
      "learning_rate": 5.029415491865311e-06,
      "loss": 0.4206,
      "step": 137
    },
    {
      "epoch": 2.5594405594405596,
      "grad_norm": 1.148800253868103,
      "learning_rate": 4.97058450813469e-06,
      "loss": 0.46,
      "step": 138
    },
    {
      "epoch": 2.578088578088578,
      "grad_norm": 1.1630257368087769,
      "learning_rate": 4.911757596784358e-06,
      "loss": 0.4357,
      "step": 139
    },
    {
      "epoch": 2.596736596736597,
      "grad_norm": 1.2669141292572021,
      "learning_rate": 4.8529429020111035e-06,
      "loss": 0.4658,
      "step": 140
    },
    {
      "epoch": 2.6153846153846154,
      "grad_norm": 1.153731107711792,
      "learning_rate": 4.794148566320412e-06,
      "loss": 0.4333,
      "step": 141
    },
    {
      "epoch": 2.634032634032634,
      "grad_norm": 1.1950958967208862,
      "learning_rate": 4.7353827293991845e-06,
      "loss": 0.4582,
      "step": 142
    },
    {
      "epoch": 2.652680652680653,
      "grad_norm": 1.1688624620437622,
      "learning_rate": 4.676653526988858e-06,
      "loss": 0.4364,
      "step": 143
    },
    {
      "epoch": 2.6713286713286712,
      "grad_norm": 1.208266019821167,
      "learning_rate": 4.617969089759066e-06,
      "loss": 0.4465,
      "step": 144
    },
    {
      "epoch": 2.6899766899766897,
      "grad_norm": 1.1561684608459473,
      "learning_rate": 4.559337542181993e-06,
      "loss": 0.4337,
      "step": 145
    },
    {
      "epoch": 2.7086247086247086,
      "grad_norm": 1.1300450563430786,
      "learning_rate": 4.500767001407604e-06,
      "loss": 0.4652,
      "step": 146
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 1.2024763822555542,
      "learning_rate": 4.4422655761398785e-06,
      "loss": 0.4433,
      "step": 147
    },
    {
      "epoch": 2.745920745920746,
      "grad_norm": 1.1854138374328613,
      "learning_rate": 4.383841365514208e-06,
      "loss": 0.4056,
      "step": 148
    },
    {
      "epoch": 2.7645687645687644,
      "grad_norm": 1.153937816619873,
      "learning_rate": 4.325502457976126e-06,
      "loss": 0.4267,
      "step": 149
    },
    {
      "epoch": 2.7832167832167833,
      "grad_norm": 1.0862733125686646,
      "learning_rate": 4.267256930161523e-06,
      "loss": 0.4348,
      "step": 150
    },
    {
      "epoch": 2.801864801864802,
      "grad_norm": 1.2374823093414307,
      "learning_rate": 4.209112845778481e-06,
      "loss": 0.4199,
      "step": 151
    },
    {
      "epoch": 2.8205128205128203,
      "grad_norm": 1.1854605674743652,
      "learning_rate": 4.151078254490908e-06,
      "loss": 0.4336,
      "step": 152
    },
    {
      "epoch": 2.839160839160839,
      "grad_norm": 1.1627620458602905,
      "learning_rate": 4.09316119080412e-06,
      "loss": 0.4224,
      "step": 153
    },
    {
      "epoch": 2.857808857808858,
      "grad_norm": 1.1857858896255493,
      "learning_rate": 4.035369672952516e-06,
      "loss": 0.4475,
      "step": 154
    },
    {
      "epoch": 2.8764568764568765,
      "grad_norm": 1.1136168241500854,
      "learning_rate": 3.977711701789499e-06,
      "loss": 0.45,
      "step": 155
    },
    {
      "epoch": 2.895104895104895,
      "grad_norm": 1.1074330806732178,
      "learning_rate": 3.920195259679822e-06,
      "loss": 0.4446,
      "step": 156
    },
    {
      "epoch": 2.913752913752914,
      "grad_norm": 1.1580936908721924,
      "learning_rate": 3.862828309394469e-06,
      "loss": 0.4632,
      "step": 157
    },
    {
      "epoch": 2.9324009324009324,
      "grad_norm": 1.1213862895965576,
      "learning_rate": 3.805618793008279e-06,
      "loss": 0.4525,
      "step": 158
    },
    {
      "epoch": 2.951048951048951,
      "grad_norm": 1.1242083311080933,
      "learning_rate": 3.7485746308004013e-06,
      "loss": 0.4262,
      "step": 159
    },
    {
      "epoch": 2.9696969696969697,
      "grad_norm": 1.1000770330429077,
      "learning_rate": 3.6917037201577977e-06,
      "loss": 0.4468,
      "step": 160
    },
    {
      "epoch": 2.988344988344988,
      "grad_norm": 1.1469992399215698,
      "learning_rate": 3.635013934481895e-06,
      "loss": 0.4818,
      "step": 161
    },
    {
      "epoch": 3.0,
      "grad_norm": 1.1469992399215698,
      "learning_rate": 3.578513122098566e-06,
      "loss": 0.4544,
      "step": 162
    },
    {
      "epoch": 3.0186480186480185,
      "grad_norm": 1.7396659851074219,
      "learning_rate": 3.5222091051715803e-06,
      "loss": 0.4208,
      "step": 163
    },
    {
      "epoch": 3.0372960372960374,
      "grad_norm": 1.1009488105773926,
      "learning_rate": 3.466109678619681e-06,
      "loss": 0.3786,
      "step": 164
    },
    {
      "epoch": 3.055944055944056,
      "grad_norm": 1.1437153816223145,
      "learning_rate": 3.4102226090374246e-06,
      "loss": 0.4494,
      "step": 165
    },
    {
      "epoch": 3.0745920745920747,
      "grad_norm": 1.1509310007095337,
      "learning_rate": 3.35455563361995e-06,
      "loss": 0.3992,
      "step": 166
    },
    {
      "epoch": 3.093240093240093,
      "grad_norm": 1.0771594047546387,
      "learning_rate": 3.2991164590918162e-06,
      "loss": 0.4229,
      "step": 167
    },
    {
      "epoch": 3.111888111888112,
      "grad_norm": 1.0693249702453613,
      "learning_rate": 3.2439127606400546e-06,
      "loss": 0.4083,
      "step": 168
    },
    {
      "epoch": 3.1305361305361306,
      "grad_norm": 1.0992814302444458,
      "learning_rate": 3.1889521808515888e-06,
      "loss": 0.4571,
      "step": 169
    },
    {
      "epoch": 3.149184149184149,
      "grad_norm": 1.1360324621200562,
      "learning_rate": 3.1342423286551756e-06,
      "loss": 0.4196,
      "step": 170
    },
    {
      "epoch": 3.167832167832168,
      "grad_norm": 1.167336344718933,
      "learning_rate": 3.0797907782679944e-06,
      "loss": 0.4165,
      "step": 171
    },
    {
      "epoch": 3.1864801864801864,
      "grad_norm": 1.1829248666763306,
      "learning_rate": 3.0256050681470446e-06,
      "loss": 0.4764,
      "step": 172
    },
    {
      "epoch": 3.2051282051282053,
      "grad_norm": 1.150373101234436,
      "learning_rate": 2.971692699945502e-06,
      "loss": 0.4224,
      "step": 173
    },
    {
      "epoch": 3.2237762237762237,
      "grad_norm": 1.1187318563461304,
      "learning_rate": 2.9180611374741623e-06,
      "loss": 0.4192,
      "step": 174
    },
    {
      "epoch": 3.242424242424242,
      "grad_norm": 1.096235990524292,
      "learning_rate": 2.8647178056681197e-06,
      "loss": 0.4162,
      "step": 175
    },
    {
      "epoch": 3.261072261072261,
      "grad_norm": 1.1855800151824951,
      "learning_rate": 2.8116700895588473e-06,
      "loss": 0.41,
      "step": 176
    },
    {
      "epoch": 3.2797202797202796,
      "grad_norm": 1.1241765022277832,
      "learning_rate": 2.7589253332517736e-06,
      "loss": 0.425,
      "step": 177
    },
    {
      "epoch": 3.2983682983682985,
      "grad_norm": 1.1213021278381348,
      "learning_rate": 2.706490838909547e-06,
      "loss": 0.414,
      "step": 178
    },
    {
      "epoch": 3.317016317016317,
      "grad_norm": 1.0516644716262817,
      "learning_rate": 2.6543738657411033e-06,
      "loss": 0.4102,
      "step": 179
    },
    {
      "epoch": 3.335664335664336,
      "grad_norm": 1.085395097732544,
      "learning_rate": 2.6025816289966703e-06,
      "loss": 0.4476,
      "step": 180
    },
    {
      "epoch": 3.3543123543123543,
      "grad_norm": 1.1523784399032593,
      "learning_rate": 2.5511212989688587e-06,
      "loss": 0.4453,
      "step": 181
    },
    {
      "epoch": 3.3729603729603728,
      "grad_norm": 1.1129071712493896,
      "learning_rate": 2.5000000000000015e-06,
      "loss": 0.4151,
      "step": 182
    },
    {
      "epoch": 3.3916083916083917,
      "grad_norm": 1.0871580839157104,
      "learning_rate": 2.449224809495815e-06,
      "loss": 0.4252,
      "step": 183
    },
    {
      "epoch": 3.41025641025641,
      "grad_norm": 1.0709385871887207,
      "learning_rate": 2.3988027569455895e-06,
      "loss": 0.3925,
      "step": 184
    },
    {
      "epoch": 3.428904428904429,
      "grad_norm": 1.0848338603973389,
      "learning_rate": 2.348740822949006e-06,
      "loss": 0.4265,
      "step": 185
    },
    {
      "epoch": 3.4475524475524475,
      "grad_norm": 1.0750283002853394,
      "learning_rate": 2.2990459382497086e-06,
      "loss": 0.3992,
      "step": 186
    },
    {
      "epoch": 3.4662004662004664,
      "grad_norm": 1.1117126941680908,
      "learning_rate": 2.2497249827757933e-06,
      "loss": 0.3911,
      "step": 187
    },
    {
      "epoch": 3.484848484848485,
      "grad_norm": 1.1823192834854126,
      "learning_rate": 2.2007847846873342e-06,
      "loss": 0.4162,
      "step": 188
    },
    {
      "epoch": 3.5034965034965033,
      "grad_norm": 1.1932240724563599,
      "learning_rate": 2.1522321194310577e-06,
      "loss": 0.3889,
      "step": 189
    },
    {
      "epoch": 3.5221445221445222,
      "grad_norm": 1.0959529876708984,
      "learning_rate": 2.1040737088023323e-06,
      "loss": 0.4305,
      "step": 190
    },
    {
      "epoch": 3.5407925407925407,
      "grad_norm": 1.1720889806747437,
      "learning_rate": 2.056316220014588e-06,
      "loss": 0.4321,
      "step": 191
    },
    {
      "epoch": 3.5594405594405596,
      "grad_norm": 1.157222032546997,
      "learning_rate": 2.0089662647762716e-06,
      "loss": 0.4036,
      "step": 192
    },
    {
      "epoch": 3.578088578088578,
      "grad_norm": 1.0834124088287354,
      "learning_rate": 1.962030398375506e-06,
      "loss": 0.4039,
      "step": 193
    },
    {
      "epoch": 3.596736596736597,
      "grad_norm": 1.0103845596313477,
      "learning_rate": 1.915515118772555e-06,
      "loss": 0.4508,
      "step": 194
    },
    {
      "epoch": 3.6153846153846154,
      "grad_norm": 1.094143033027649,
      "learning_rate": 1.8694268657002197e-06,
      "loss": 0.4324,
      "step": 195
    },
    {
      "epoch": 3.634032634032634,
      "grad_norm": 1.1231107711791992,
      "learning_rate": 1.8237720197723075e-06,
      "loss": 0.4036,
      "step": 196
    },
    {
      "epoch": 3.652680652680653,
      "grad_norm": 1.1039270162582397,
      "learning_rate": 1.7785569016002686e-06,
      "loss": 0.3793,
      "step": 197
    },
    {
      "epoch": 3.6713286713286712,
      "grad_norm": 1.1073729991912842,
      "learning_rate": 1.7337877709181527e-06,
      "loss": 0.4372,
      "step": 198
    },
    {
      "epoch": 3.6899766899766897,
      "grad_norm": 1.109791874885559,
      "learning_rate": 1.689470825715998e-06,
      "loss": 0.4416,
      "step": 199
    },
    {
      "epoch": 3.7086247086247086,
      "grad_norm": 1.1324268579483032,
      "learning_rate": 1.6456122013817477e-06,
      "loss": 0.4021,
      "step": 200
    }
  ],
  "logging_steps": 1,
  "max_steps": 270,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 20,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.053262545813504e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}