| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 500, |
| "global_step": 81965, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.018301052310507854, |
| "grad_norm": 0.5670120716094971, |
| "learning_rate": 0.00019999343324994024, |
| "loss": 1.7725, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.03660210462101571, |
| "grad_norm": 0.5167950987815857, |
| "learning_rate": 0.00019997364594600192, |
| "loss": 1.5681, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.05490315693152356, |
| "grad_norm": 0.46956542134284973, |
| "learning_rate": 0.00019994064063090904, |
| "loss": 1.5176, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.07320420924203141, |
| "grad_norm": 0.5100629329681396, |
| "learning_rate": 0.00019989442166845785, |
| "loss": 1.4843, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.09150526155253927, |
| "grad_norm": 0.4842604398727417, |
| "learning_rate": 0.0001998349951694864, |
| "loss": 1.4558, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.10980631386304712, |
| "grad_norm": 0.4316524863243103, |
| "learning_rate": 0.00019976236899106663, |
| "loss": 1.4411, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.12810736617355498, |
| "grad_norm": 0.4475869834423065, |
| "learning_rate": 0.0001996765527354655, |
| "loss": 1.4211, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.14640841848406283, |
| "grad_norm": 0.4860438406467438, |
| "learning_rate": 0.00019957755774887542, |
| "loss": 1.4096, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.16470947079457068, |
| "grad_norm": 0.4574320316314697, |
| "learning_rate": 0.00019946539711991412, |
| "loss": 1.3946, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.18301052310507854, |
| "grad_norm": 0.44823363423347473, |
| "learning_rate": 0.0001993400856778942, |
| "loss": 1.3861, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.2013115754155864, |
| "grad_norm": 0.4297441840171814, |
| "learning_rate": 0.00019920163999086237, |
| "loss": 1.3743, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.21961262772609424, |
| "grad_norm": 0.4372228980064392, |
| "learning_rate": 0.00019905007836340904, |
| "loss": 1.3683, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.2379136800366021, |
| "grad_norm": 0.43532341718673706, |
| "learning_rate": 0.00019888542083424797, |
| "loss": 1.36, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.25621473234710995, |
| "grad_norm": 0.4318517744541168, |
| "learning_rate": 0.00019870768917356713, |
| "loss": 1.353, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.27451578465761783, |
| "grad_norm": 0.41758111119270325, |
| "learning_rate": 0.00019851690688015006, |
| "loss": 1.3469, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.29281683696812566, |
| "grad_norm": 0.46511879563331604, |
| "learning_rate": 0.00019831309917826929, |
| "loss": 1.3392, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.31111788927863354, |
| "grad_norm": 0.4342522621154785, |
| "learning_rate": 0.00019809629301435096, |
| "loss": 1.3298, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.32941894158914137, |
| "grad_norm": 0.4393858313560486, |
| "learning_rate": 0.00019786651705341253, |
| "loss": 1.3247, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.34771999389964925, |
| "grad_norm": 0.4274674654006958, |
| "learning_rate": 0.00019762380167527238, |
| "loss": 1.3227, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.3660210462101571, |
| "grad_norm": 0.4522903561592102, |
| "learning_rate": 0.0001973681789705334, |
| "loss": 1.3152, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.38432209852066496, |
| "grad_norm": 0.4059693515300751, |
| "learning_rate": 0.00019709968273634018, |
| "loss": 1.3169, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.4026231508311728, |
| "grad_norm": 0.39100533723831177, |
| "learning_rate": 0.00019681834847191042, |
| "loss": 1.3093, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.42092420314168066, |
| "grad_norm": 0.4144134819507599, |
| "learning_rate": 0.00019652421337384136, |
| "loss": 1.3032, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.4392252554521885, |
| "grad_norm": 0.4192708432674408, |
| "learning_rate": 0.00019621731633119192, |
| "loss": 1.2983, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.45752630776269637, |
| "grad_norm": 0.4011496305465698, |
| "learning_rate": 0.0001958976979203409, |
| "loss": 1.2906, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.4758273600732042, |
| "grad_norm": 0.4050943851470947, |
| "learning_rate": 0.00019556540039962234, |
| "loss": 1.2897, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.4941284123837121, |
| "grad_norm": 0.40949124097824097, |
| "learning_rate": 0.0001952204677037382, |
| "loss": 1.2891, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.5124294646942199, |
| "grad_norm": 0.42695119976997375, |
| "learning_rate": 0.0001948629454379494, |
| "loss": 1.2822, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.5307305170047277, |
| "grad_norm": 0.37948158383369446, |
| "learning_rate": 0.00019449288087204658, |
| "loss": 1.2793, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.5490315693152357, |
| "grad_norm": 0.40617385506629944, |
| "learning_rate": 0.0001941103229340998, |
| "loss": 1.2742, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.5673326216257435, |
| "grad_norm": 0.43047332763671875, |
| "learning_rate": 0.00019371532220398976, |
| "loss": 1.2726, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.5856336739362513, |
| "grad_norm": 0.38262712955474854, |
| "learning_rate": 0.0001933079309067205, |
| "loss": 1.2728, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.6039347262467591, |
| "grad_norm": 0.4016256630420685, |
| "learning_rate": 0.0001928882029055143, |
| "loss": 1.2647, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.6222357785572671, |
| "grad_norm": 0.3772490918636322, |
| "learning_rate": 0.00019245619369469012, |
| "loss": 1.2606, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.6405368308677749, |
| "grad_norm": 0.3964807689189911, |
| "learning_rate": 0.0001920119603923266, |
| "loss": 1.2595, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.6588378831782827, |
| "grad_norm": 0.3914461135864258, |
| "learning_rate": 0.00019155556173271006, |
| "loss": 1.2549, |
| "step": 10800 |
| }, |
| { |
| "epoch": 0.6771389354887906, |
| "grad_norm": 0.4277031123638153, |
| "learning_rate": 0.000191087058058569, |
| "loss": 1.2555, |
| "step": 11100 |
| }, |
| { |
| "epoch": 0.6954399877992985, |
| "grad_norm": 0.3928915560245514, |
| "learning_rate": 0.00019060651131309582, |
| "loss": 1.2481, |
| "step": 11400 |
| }, |
| { |
| "epoch": 0.7137410401098063, |
| "grad_norm": 0.3531055450439453, |
| "learning_rate": 0.00019011398503175723, |
| "loss": 1.2525, |
| "step": 11700 |
| }, |
| { |
| "epoch": 0.7320420924203141, |
| "grad_norm": 0.37548962235450745, |
| "learning_rate": 0.00018960954433389345, |
| "loss": 1.2425, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.750343144730822, |
| "grad_norm": 0.3959772288799286, |
| "learning_rate": 0.00018909325591410906, |
| "loss": 1.2407, |
| "step": 12300 |
| }, |
| { |
| "epoch": 0.7686441970413299, |
| "grad_norm": 0.36781710386276245, |
| "learning_rate": 0.0001885651880334545, |
| "loss": 1.2417, |
| "step": 12600 |
| }, |
| { |
| "epoch": 0.7869452493518377, |
| "grad_norm": 0.3856872022151947, |
| "learning_rate": 0.00018802541051040112, |
| "loss": 1.2391, |
| "step": 12900 |
| }, |
| { |
| "epoch": 0.8052463016623456, |
| "grad_norm": 0.3833692669868469, |
| "learning_rate": 0.0001874739947116101, |
| "loss": 1.2377, |
| "step": 13200 |
| }, |
| { |
| "epoch": 0.8235473539728534, |
| "grad_norm": 0.40677550435066223, |
| "learning_rate": 0.00018691101354249688, |
| "loss": 1.2286, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.8418484062833613, |
| "grad_norm": 0.39465758204460144, |
| "learning_rate": 0.0001863365414375916, |
| "loss": 1.2324, |
| "step": 13800 |
| }, |
| { |
| "epoch": 0.8601494585938692, |
| "grad_norm": 0.3750082552433014, |
| "learning_rate": 0.00018575065435069807, |
| "loss": 1.2285, |
| "step": 14100 |
| }, |
| { |
| "epoch": 0.878450510904377, |
| "grad_norm": 0.3967496454715729, |
| "learning_rate": 0.0001851534297448515, |
| "loss": 1.2264, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.8967515632148848, |
| "grad_norm": 0.376779705286026, |
| "learning_rate": 0.00018454494658207658, |
| "loss": 1.2224, |
| "step": 14700 |
| }, |
| { |
| "epoch": 0.9150526155253927, |
| "grad_norm": 0.41239818930625916, |
| "learning_rate": 0.00018392528531294762, |
| "loss": 1.2209, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.9333536678359006, |
| "grad_norm": 0.41411134600639343, |
| "learning_rate": 0.00018329452786595177, |
| "loss": 1.2207, |
| "step": 15300 |
| }, |
| { |
| "epoch": 0.9516547201464084, |
| "grad_norm": 0.3956342339515686, |
| "learning_rate": 0.00018265275763665683, |
| "loss": 1.2169, |
| "step": 15600 |
| }, |
| { |
| "epoch": 0.9699557724569162, |
| "grad_norm": 0.4099271893501282, |
| "learning_rate": 0.00018200005947668522, |
| "loss": 1.2138, |
| "step": 15900 |
| }, |
| { |
| "epoch": 0.9882568247674242, |
| "grad_norm": 0.37904131412506104, |
| "learning_rate": 0.00018133651968249502, |
| "loss": 1.2126, |
| "step": 16200 |
| }, |
| { |
| "epoch": 1.006527375324081, |
| "grad_norm": 0.37347930669784546, |
| "learning_rate": 0.00018066222598397077, |
| "loss": 1.2007, |
| "step": 16500 |
| }, |
| { |
| "epoch": 1.024828427634589, |
| "grad_norm": 0.35225772857666016, |
| "learning_rate": 0.0001799772675328237, |
| "loss": 1.1819, |
| "step": 16800 |
| }, |
| { |
| "epoch": 1.0431294799450967, |
| "grad_norm": 0.3745623528957367, |
| "learning_rate": 0.00017928173489080505, |
| "loss": 1.1873, |
| "step": 17100 |
| }, |
| { |
| "epoch": 1.0614305322556048, |
| "grad_norm": 0.36279329657554626, |
| "learning_rate": 0.00017857572001773214, |
| "loss": 1.1799, |
| "step": 17400 |
| }, |
| { |
| "epoch": 1.0797315845661126, |
| "grad_norm": 0.3755429685115814, |
| "learning_rate": 0.00017785931625933008, |
| "loss": 1.1822, |
| "step": 17700 |
| }, |
| { |
| "epoch": 1.0980326368766204, |
| "grad_norm": 0.3589393198490143, |
| "learning_rate": 0.0001771326183348899, |
| "loss": 1.1854, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.1163336891871283, |
| "grad_norm": 0.3789694309234619, |
| "learning_rate": 0.00017639572232474538, |
| "loss": 1.1815, |
| "step": 18300 |
| }, |
| { |
| "epoch": 1.134634741497636, |
| "grad_norm": 0.41688501834869385, |
| "learning_rate": 0.00017564872565756977, |
| "loss": 1.1809, |
| "step": 18600 |
| }, |
| { |
| "epoch": 1.152935793808144, |
| "grad_norm": 0.3584393560886383, |
| "learning_rate": 0.00017489172709749413, |
| "loss": 1.1768, |
| "step": 18900 |
| }, |
| { |
| "epoch": 1.1712368461186518, |
| "grad_norm": 0.3694947063922882, |
| "learning_rate": 0.00017412482673104953, |
| "loss": 1.1799, |
| "step": 19200 |
| }, |
| { |
| "epoch": 1.1895378984291596, |
| "grad_norm": 0.356913298368454, |
| "learning_rate": 0.0001733481259539338, |
| "loss": 1.1714, |
| "step": 19500 |
| }, |
| { |
| "epoch": 1.2078389507396676, |
| "grad_norm": 0.3630887567996979, |
| "learning_rate": 0.00017256172745760563, |
| "loss": 1.1733, |
| "step": 19800 |
| }, |
| { |
| "epoch": 1.2261400030501755, |
| "grad_norm": 0.3733890950679779, |
| "learning_rate": 0.00017176573521570726, |
| "loss": 1.1756, |
| "step": 20100 |
| }, |
| { |
| "epoch": 1.2444410553606833, |
| "grad_norm": 0.37580791115760803, |
| "learning_rate": 0.00017096025447031757, |
| "loss": 1.1697, |
| "step": 20400 |
| }, |
| { |
| "epoch": 1.262742107671191, |
| "grad_norm": 0.3889394700527191, |
| "learning_rate": 0.00017014539171803756, |
| "loss": 1.172, |
| "step": 20700 |
| }, |
| { |
| "epoch": 1.281043159981699, |
| "grad_norm": 0.3594827950000763, |
| "learning_rate": 0.00016932125469590976, |
| "loss": 1.1716, |
| "step": 21000 |
| }, |
| { |
| "epoch": 1.2993442122922068, |
| "grad_norm": 0.3551196753978729, |
| "learning_rate": 0.00016848795236717385, |
| "loss": 1.1666, |
| "step": 21300 |
| }, |
| { |
| "epoch": 1.3176452646027146, |
| "grad_norm": 0.3702225387096405, |
| "learning_rate": 0.00016764559490686028, |
| "loss": 1.1725, |
| "step": 21600 |
| }, |
| { |
| "epoch": 1.3359463169132226, |
| "grad_norm": 0.37975847721099854, |
| "learning_rate": 0.00016679429368722317, |
| "loss": 1.1658, |
| "step": 21900 |
| }, |
| { |
| "epoch": 1.3542473692237302, |
| "grad_norm": 0.3720037341117859, |
| "learning_rate": 0.00016593416126301543, |
| "loss": 1.1665, |
| "step": 22200 |
| }, |
| { |
| "epoch": 1.3725484215342383, |
| "grad_norm": 0.3632305860519409, |
| "learning_rate": 0.00016506531135660732, |
| "loss": 1.1651, |
| "step": 22500 |
| }, |
| { |
| "epoch": 1.390849473844746, |
| "grad_norm": 0.3983381688594818, |
| "learning_rate": 0.00016418785884295055, |
| "loss": 1.1633, |
| "step": 22800 |
| }, |
| { |
| "epoch": 1.409150526155254, |
| "grad_norm": 0.35853487253189087, |
| "learning_rate": 0.00016330191973439027, |
| "loss": 1.1624, |
| "step": 23100 |
| }, |
| { |
| "epoch": 1.4274515784657618, |
| "grad_norm": 0.3824533224105835, |
| "learning_rate": 0.00016240761116532624, |
| "loss": 1.1624, |
| "step": 23400 |
| }, |
| { |
| "epoch": 1.4457526307762696, |
| "grad_norm": 0.38290977478027344, |
| "learning_rate": 0.00016150505137672624, |
| "loss": 1.158, |
| "step": 23700 |
| }, |
| { |
| "epoch": 1.4640536830867774, |
| "grad_norm": 0.34818851947784424, |
| "learning_rate": 0.0001605943597004926, |
| "loss": 1.1637, |
| "step": 24000 |
| }, |
| { |
| "epoch": 1.4823547353972852, |
| "grad_norm": 0.3592260181903839, |
| "learning_rate": 0.00015967565654368478, |
| "loss": 1.1571, |
| "step": 24300 |
| }, |
| { |
| "epoch": 1.5006557877077933, |
| "grad_norm": 0.36368805170059204, |
| "learning_rate": 0.00015874906337259978, |
| "loss": 1.1514, |
| "step": 24600 |
| }, |
| { |
| "epoch": 1.518956840018301, |
| "grad_norm": 0.3715674877166748, |
| "learning_rate": 0.00015781470269671263, |
| "loss": 1.154, |
| "step": 24900 |
| }, |
| { |
| "epoch": 1.537257892328809, |
| "grad_norm": 0.38933447003364563, |
| "learning_rate": 0.0001568726980524785, |
| "loss": 1.1521, |
| "step": 25200 |
| }, |
| { |
| "epoch": 1.5555589446393168, |
| "grad_norm": 0.4079601466655731, |
| "learning_rate": 0.00015592317398699948, |
| "loss": 1.1548, |
| "step": 25500 |
| }, |
| { |
| "epoch": 1.5738599969498246, |
| "grad_norm": 0.40320438146591187, |
| "learning_rate": 0.0001549662560415578, |
| "loss": 1.152, |
| "step": 25800 |
| }, |
| { |
| "epoch": 1.5921610492603324, |
| "grad_norm": 0.3639344573020935, |
| "learning_rate": 0.00015400207073501703, |
| "loss": 1.1452, |
| "step": 26100 |
| }, |
| { |
| "epoch": 1.6104621015708402, |
| "grad_norm": 0.3776426315307617, |
| "learning_rate": 0.0001530307455470946, |
| "loss": 1.1494, |
| "step": 26400 |
| }, |
| { |
| "epoch": 1.6287631538813483, |
| "grad_norm": 0.3482027053833008, |
| "learning_rate": 0.00015205240890150701, |
| "loss": 1.148, |
| "step": 26700 |
| }, |
| { |
| "epoch": 1.647064206191856, |
| "grad_norm": 0.380161851644516, |
| "learning_rate": 0.0001510671901489905, |
| "loss": 1.1487, |
| "step": 27000 |
| }, |
| { |
| "epoch": 1.665365258502364, |
| "grad_norm": 0.3611465394496918, |
| "learning_rate": 0.00015007521955019847, |
| "loss": 1.1437, |
| "step": 27300 |
| }, |
| { |
| "epoch": 1.6836663108128718, |
| "grad_norm": 0.3532879650592804, |
| "learning_rate": 0.00014907662825847968, |
| "loss": 1.146, |
| "step": 27600 |
| }, |
| { |
| "epoch": 1.7019673631233796, |
| "grad_norm": 0.37363260984420776, |
| "learning_rate": 0.00014807154830253732, |
| "loss": 1.1423, |
| "step": 27900 |
| }, |
| { |
| "epoch": 1.7202684154338874, |
| "grad_norm": 0.35999348759651184, |
| "learning_rate": 0.00014706011256897304, |
| "loss": 1.1439, |
| "step": 28200 |
| }, |
| { |
| "epoch": 1.7385694677443952, |
| "grad_norm": 0.35193225741386414, |
| "learning_rate": 0.0001460424547847175, |
| "loss": 1.1422, |
| "step": 28500 |
| }, |
| { |
| "epoch": 1.7568705200549033, |
| "grad_norm": 0.35045188665390015, |
| "learning_rate": 0.0001450187094993493, |
| "loss": 1.1405, |
| "step": 28800 |
| }, |
| { |
| "epoch": 1.775171572365411, |
| "grad_norm": 0.36350497603416443, |
| "learning_rate": 0.00014398901206730591, |
| "loss": 1.1395, |
| "step": 29100 |
| }, |
| { |
| "epoch": 1.793472624675919, |
| "grad_norm": 0.3509625792503357, |
| "learning_rate": 0.0001429534986299875, |
| "loss": 1.1366, |
| "step": 29400 |
| }, |
| { |
| "epoch": 1.8117736769864268, |
| "grad_norm": 0.38187241554260254, |
| "learning_rate": 0.00014191230609775715, |
| "loss": 1.1386, |
| "step": 29700 |
| }, |
| { |
| "epoch": 1.8300747292969346, |
| "grad_norm": 0.3755728304386139, |
| "learning_rate": 0.0001408655721318392, |
| "loss": 1.136, |
| "step": 30000 |
| }, |
| { |
| "epoch": 1.8483757816074424, |
| "grad_norm": 0.37514305114746094, |
| "learning_rate": 0.00013981343512611848, |
| "loss": 1.1395, |
| "step": 30300 |
| }, |
| { |
| "epoch": 1.8666768339179503, |
| "grad_norm": 0.38384050130844116, |
| "learning_rate": 0.00013875603418884243, |
| "loss": 1.1337, |
| "step": 30600 |
| }, |
| { |
| "epoch": 1.8849778862284583, |
| "grad_norm": 0.38143786787986755, |
| "learning_rate": 0.00013769350912422893, |
| "loss": 1.1325, |
| "step": 30900 |
| }, |
| { |
| "epoch": 1.903278938538966, |
| "grad_norm": 0.3801436424255371, |
| "learning_rate": 0.00013662600041398215, |
| "loss": 1.1358, |
| "step": 31200 |
| }, |
| { |
| "epoch": 1.921579990849474, |
| "grad_norm": 0.37413644790649414, |
| "learning_rate": 0.00013555364919871877, |
| "loss": 1.1319, |
| "step": 31500 |
| }, |
| { |
| "epoch": 1.9398810431599816, |
| "grad_norm": 0.36143454909324646, |
| "learning_rate": 0.00013447659725930713, |
| "loss": 1.1303, |
| "step": 31800 |
| }, |
| { |
| "epoch": 1.9581820954704896, |
| "grad_norm": 0.3567048907279968, |
| "learning_rate": 0.00013339498699812142, |
| "loss": 1.1292, |
| "step": 32100 |
| }, |
| { |
| "epoch": 1.9764831477809974, |
| "grad_norm": 0.4003380835056305, |
| "learning_rate": 0.00013230896142021425, |
| "loss": 1.1278, |
| "step": 32400 |
| }, |
| { |
| "epoch": 1.9947842000915053, |
| "grad_norm": 0.37288859486579895, |
| "learning_rate": 0.00013121866411440917, |
| "loss": 1.1252, |
| "step": 32700 |
| }, |
| { |
| "epoch": 2.013054750648162, |
| "grad_norm": 0.37710532546043396, |
| "learning_rate": 0.000130124239234316, |
| "loss": 1.1003, |
| "step": 33000 |
| }, |
| { |
| "epoch": 2.0313558029586702, |
| "grad_norm": 0.36639609932899475, |
| "learning_rate": 0.0001290258314792716, |
| "loss": 1.0967, |
| "step": 33300 |
| }, |
| { |
| "epoch": 2.049656855269178, |
| "grad_norm": 0.3679460883140564, |
| "learning_rate": 0.0001279235860752084, |
| "loss": 1.1011, |
| "step": 33600 |
| }, |
| { |
| "epoch": 2.067957907579686, |
| "grad_norm": 0.40204325318336487, |
| "learning_rate": 0.00012681764875545362, |
| "loss": 1.0981, |
| "step": 33900 |
| }, |
| { |
| "epoch": 2.0862589598901935, |
| "grad_norm": 0.37128251791000366, |
| "learning_rate": 0.0001257081657414608, |
| "loss": 1.0963, |
| "step": 34200 |
| }, |
| { |
| "epoch": 2.1045600122007015, |
| "grad_norm": 0.35975438356399536, |
| "learning_rate": 0.00012459528372347722, |
| "loss": 1.0977, |
| "step": 34500 |
| }, |
| { |
| "epoch": 2.1228610645112096, |
| "grad_norm": 0.3734683394432068, |
| "learning_rate": 0.0001234791498411495, |
| "loss": 1.0962, |
| "step": 34800 |
| }, |
| { |
| "epoch": 2.141162116821717, |
| "grad_norm": 0.36970898509025574, |
| "learning_rate": 0.0001223599116640693, |
| "loss": 1.0969, |
| "step": 35100 |
| }, |
| { |
| "epoch": 2.1594631691322252, |
| "grad_norm": 0.41932040452957153, |
| "learning_rate": 0.00012123771717226238, |
| "loss": 1.0945, |
| "step": 35400 |
| }, |
| { |
| "epoch": 2.177764221442733, |
| "grad_norm": 0.38500991463661194, |
| "learning_rate": 0.00012011271473662365, |
| "loss": 1.0975, |
| "step": 35700 |
| }, |
| { |
| "epoch": 2.196065273753241, |
| "grad_norm": 0.36856573820114136, |
| "learning_rate": 0.00011898505309930006, |
| "loss": 1.095, |
| "step": 36000 |
| }, |
| { |
| "epoch": 2.2143663260637485, |
| "grad_norm": 0.38829776644706726, |
| "learning_rate": 0.00011785488135402492, |
| "loss": 1.0961, |
| "step": 36300 |
| }, |
| { |
| "epoch": 2.2326673783742566, |
| "grad_norm": 0.3822610080242157, |
| "learning_rate": 0.00011672234892640525, |
| "loss": 1.0971, |
| "step": 36600 |
| }, |
| { |
| "epoch": 2.2509684306847646, |
| "grad_norm": 0.3844328820705414, |
| "learning_rate": 0.00011558760555416565, |
| "loss": 1.0945, |
| "step": 36900 |
| }, |
| { |
| "epoch": 2.269269482995272, |
| "grad_norm": 0.3492465317249298, |
| "learning_rate": 0.00011445080126735061, |
| "loss": 1.0938, |
| "step": 37200 |
| }, |
| { |
| "epoch": 2.2875705353057803, |
| "grad_norm": 0.4076479971408844, |
| "learning_rate": 0.00011331208636848844, |
| "loss": 1.09, |
| "step": 37500 |
| }, |
| { |
| "epoch": 2.305871587616288, |
| "grad_norm": 0.36309176683425903, |
| "learning_rate": 0.00011217161141271879, |
| "loss": 1.0918, |
| "step": 37800 |
| }, |
| { |
| "epoch": 2.324172639926796, |
| "grad_norm": 0.36070308089256287, |
| "learning_rate": 0.00011102952718788731, |
| "loss": 1.0947, |
| "step": 38100 |
| }, |
| { |
| "epoch": 2.3424736922373035, |
| "grad_norm": 0.36666443943977356, |
| "learning_rate": 0.00010988598469460896, |
| "loss": 1.0904, |
| "step": 38400 |
| }, |
| { |
| "epoch": 2.3607747445478116, |
| "grad_norm": 0.3801022171974182, |
| "learning_rate": 0.00010874113512630368, |
| "loss": 1.0907, |
| "step": 38700 |
| }, |
| { |
| "epoch": 2.379075796858319, |
| "grad_norm": 0.3615148067474365, |
| "learning_rate": 0.00010759512984920626, |
| "loss": 1.0893, |
| "step": 39000 |
| }, |
| { |
| "epoch": 2.397376849168827, |
| "grad_norm": 0.3498951196670532, |
| "learning_rate": 0.00010644812038235344, |
| "loss": 1.0907, |
| "step": 39300 |
| }, |
| { |
| "epoch": 2.4156779014793353, |
| "grad_norm": 0.38336920738220215, |
| "learning_rate": 0.0001053002583775509, |
| "loss": 1.0945, |
| "step": 39600 |
| }, |
| { |
| "epoch": 2.433978953789843, |
| "grad_norm": 0.3785784840583801, |
| "learning_rate": 0.00010415169559932263, |
| "loss": 1.091, |
| "step": 39900 |
| }, |
| { |
| "epoch": 2.452280006100351, |
| "grad_norm": 0.3536151945590973, |
| "learning_rate": 0.00010300258390484522, |
| "loss": 1.0857, |
| "step": 40200 |
| }, |
| { |
| "epoch": 2.4705810584108585, |
| "grad_norm": 0.35834431648254395, |
| "learning_rate": 0.00010185307522387033, |
| "loss": 1.0873, |
| "step": 40500 |
| }, |
| { |
| "epoch": 2.4888821107213666, |
| "grad_norm": 0.36664196848869324, |
| "learning_rate": 0.00010070332153863707, |
| "loss": 1.0886, |
| "step": 40800 |
| }, |
| { |
| "epoch": 2.5071831630318746, |
| "grad_norm": 0.35749050974845886, |
| "learning_rate": 9.955347486377786e-05, |
| "loss": 1.0866, |
| "step": 41100 |
| }, |
| { |
| "epoch": 2.525484215342382, |
| "grad_norm": 0.40087947249412537, |
| "learning_rate": 9.840368722621967e-05, |
| "loss": 1.0797, |
| "step": 41400 |
| }, |
| { |
| "epoch": 2.54378526765289, |
| "grad_norm": 0.3594193160533905, |
| "learning_rate": 9.725411064508392e-05, |
| "loss": 1.0879, |
| "step": 41700 |
| }, |
| { |
| "epoch": 2.562086319963398, |
| "grad_norm": 0.3470621705055237, |
| "learning_rate": 9.610489711158714e-05, |
| "loss": 1.0832, |
| "step": 42000 |
| }, |
| { |
| "epoch": 2.580387372273906, |
| "grad_norm": 0.37977200746536255, |
| "learning_rate": 9.495619856894563e-05, |
| "loss": 1.0842, |
| "step": 42300 |
| }, |
| { |
| "epoch": 2.5986884245844135, |
| "grad_norm": 0.3905890882015228, |
| "learning_rate": 9.380816689228608e-05, |
| "loss": 1.0811, |
| "step": 42600 |
| }, |
| { |
| "epoch": 2.6169894768949216, |
| "grad_norm": 0.3852068781852722, |
| "learning_rate": 9.266095386856565e-05, |
| "loss": 1.0832, |
| "step": 42900 |
| }, |
| { |
| "epoch": 2.635290529205429, |
| "grad_norm": 0.40047767758369446, |
| "learning_rate": 9.15147111765032e-05, |
| "loss": 1.0792, |
| "step": 43200 |
| }, |
| { |
| "epoch": 2.653591581515937, |
| "grad_norm": 0.40440091490745544, |
| "learning_rate": 9.03695903665254e-05, |
| "loss": 1.0802, |
| "step": 43500 |
| }, |
| { |
| "epoch": 2.6718926338264453, |
| "grad_norm": 0.3432023823261261, |
| "learning_rate": 8.922574284072916e-05, |
| "loss": 1.08, |
| "step": 43800 |
| }, |
| { |
| "epoch": 2.690193686136953, |
| "grad_norm": 0.3821187913417816, |
| "learning_rate": 8.80833198328643e-05, |
| "loss": 1.0775, |
| "step": 44100 |
| }, |
| { |
| "epoch": 2.7084947384474605, |
| "grad_norm": 0.3733247220516205, |
| "learning_rate": 8.694247238833797e-05, |
| "loss": 1.0762, |
| "step": 44400 |
| }, |
| { |
| "epoch": 2.7267957907579685, |
| "grad_norm": 0.35147610306739807, |
| "learning_rate": 8.58033513442444e-05, |
| "loss": 1.0747, |
| "step": 44700 |
| }, |
| { |
| "epoch": 2.7450968430684766, |
| "grad_norm": 0.37748807668685913, |
| "learning_rate": 8.466610730942169e-05, |
| "loss": 1.0738, |
| "step": 45000 |
| }, |
| { |
| "epoch": 2.763397895378984, |
| "grad_norm": 0.39242950081825256, |
| "learning_rate": 8.353089064453948e-05, |
| "loss": 1.0734, |
| "step": 45300 |
| }, |
| { |
| "epoch": 2.781698947689492, |
| "grad_norm": 0.36078205704689026, |
| "learning_rate": 8.239785144221857e-05, |
| "loss": 1.0727, |
| "step": 45600 |
| }, |
| { |
| "epoch": 2.8, |
| "grad_norm": 0.36267706751823425, |
| "learning_rate": 8.126713950718679e-05, |
| "loss": 1.0705, |
| "step": 45900 |
| }, |
| { |
| "epoch": 2.818301052310508, |
| "grad_norm": 0.38176020979881287, |
| "learning_rate": 8.013890433647228e-05, |
| "loss": 1.0718, |
| "step": 46200 |
| }, |
| { |
| "epoch": 2.836602104621016, |
| "grad_norm": 0.37327638268470764, |
| "learning_rate": 7.901329509963807e-05, |
| "loss": 1.0708, |
| "step": 46500 |
| }, |
| { |
| "epoch": 2.8549031569315235, |
| "grad_norm": 0.38468560576438904, |
| "learning_rate": 7.78904606190593e-05, |
| "loss": 1.0717, |
| "step": 46800 |
| }, |
| { |
| "epoch": 2.8732042092420316, |
| "grad_norm": 0.3524203896522522, |
| "learning_rate": 7.677054935024696e-05, |
| "loss": 1.0714, |
| "step": 47100 |
| }, |
| { |
| "epoch": 2.891505261552539, |
| "grad_norm": 0.3709242641925812, |
| "learning_rate": 7.565370936221959e-05, |
| "loss": 1.0725, |
| "step": 47400 |
| }, |
| { |
| "epoch": 2.9098063138630472, |
| "grad_norm": 0.3623937666416168, |
| "learning_rate": 7.454008831792668e-05, |
| "loss": 1.0665, |
| "step": 47700 |
| }, |
| { |
| "epoch": 2.928107366173555, |
| "grad_norm": 0.3631395995616913, |
| "learning_rate": 7.3429833454725e-05, |
| "loss": 1.0725, |
| "step": 48000 |
| }, |
| { |
| "epoch": 2.946408418484063, |
| "grad_norm": 0.38803574442863464, |
| "learning_rate": 7.2323091564912e-05, |
| "loss": 1.0679, |
| "step": 48300 |
| }, |
| { |
| "epoch": 2.9647094707945705, |
| "grad_norm": 0.3702128529548645, |
| "learning_rate": 7.122000897631738e-05, |
| "loss": 1.0687, |
| "step": 48600 |
| }, |
| { |
| "epoch": 2.9830105231050785, |
| "grad_norm": 0.3717169165611267, |
| "learning_rate": 7.012073153295662e-05, |
| "loss": 1.0668, |
| "step": 48900 |
| }, |
| { |
| "epoch": 3.0012810736617355, |
| "grad_norm": 0.40531572699546814, |
| "learning_rate": 6.902540457574798e-05, |
| "loss": 1.0633, |
| "step": 49200 |
| }, |
| { |
| "epoch": 3.0195821259722435, |
| "grad_norm": 0.38881915807724, |
| "learning_rate": 6.793417292329643e-05, |
| "loss": 1.0336, |
| "step": 49500 |
| }, |
| { |
| "epoch": 3.037883178282751, |
| "grad_norm": 0.368335485458374, |
| "learning_rate": 6.684718085274634e-05, |
| "loss": 1.0392, |
| "step": 49800 |
| }, |
| { |
| "epoch": 3.056184230593259, |
| "grad_norm": 0.3770321309566498, |
| "learning_rate": 6.576457208070615e-05, |
| "loss": 1.0399, |
| "step": 50100 |
| }, |
| { |
| "epoch": 3.0744852829037668, |
| "grad_norm": 0.3865681290626526, |
| "learning_rate": 6.468648974424633e-05, |
| "loss": 1.0398, |
| "step": 50400 |
| }, |
| { |
| "epoch": 3.092786335214275, |
| "grad_norm": 0.3610497713088989, |
| "learning_rate": 6.361307638197522e-05, |
| "loss": 1.0418, |
| "step": 50700 |
| }, |
| { |
| "epoch": 3.111087387524783, |
| "grad_norm": 0.3791823089122772, |
| "learning_rate": 6.254447391519271e-05, |
| "loss": 1.0391, |
| "step": 51000 |
| }, |
| { |
| "epoch": 3.1293884398352905, |
| "grad_norm": 0.3856153190135956, |
| "learning_rate": 6.148082362912652e-05, |
| "loss": 1.0363, |
| "step": 51300 |
| }, |
| { |
| "epoch": 3.1476894921457985, |
| "grad_norm": 0.37801018357276917, |
| "learning_rate": 6.042226615425186e-05, |
| "loss": 1.0412, |
| "step": 51600 |
| }, |
| { |
| "epoch": 3.165990544456306, |
| "grad_norm": 0.3763614594936371, |
| "learning_rate": 5.936894144769827e-05, |
| "loss": 1.0403, |
| "step": 51900 |
| }, |
| { |
| "epoch": 3.184291596766814, |
| "grad_norm": 0.36206382513046265, |
| "learning_rate": 5.8320988774744946e-05, |
| "loss": 1.0377, |
| "step": 52200 |
| }, |
| { |
| "epoch": 3.2025926490773218, |
| "grad_norm": 0.4072185456752777, |
| "learning_rate": 5.727854669040803e-05, |
| "loss": 1.0417, |
| "step": 52500 |
| }, |
| { |
| "epoch": 3.22089370138783, |
| "grad_norm": 0.39182448387145996, |
| "learning_rate": 5.6241753021121335e-05, |
| "loss": 1.0394, |
| "step": 52800 |
| }, |
| { |
| "epoch": 3.239194753698338, |
| "grad_norm": 0.41504260897636414, |
| "learning_rate": 5.521074484651374e-05, |
| "loss": 1.0366, |
| "step": 53100 |
| }, |
| { |
| "epoch": 3.2574958060088455, |
| "grad_norm": 0.3729825019836426, |
| "learning_rate": 5.418565848128518e-05, |
| "loss": 1.0385, |
| "step": 53400 |
| }, |
| { |
| "epoch": 3.2757968583193535, |
| "grad_norm": 0.36718717217445374, |
| "learning_rate": 5.316662945718396e-05, |
| "loss": 1.0376, |
| "step": 53700 |
| }, |
| { |
| "epoch": 3.294097910629861, |
| "grad_norm": 0.36935436725616455, |
| "learning_rate": 5.2153792505087115e-05, |
| "loss": 1.0365, |
| "step": 54000 |
| }, |
| { |
| "epoch": 3.312398962940369, |
| "grad_norm": 0.38036707043647766, |
| "learning_rate": 5.114728153718731e-05, |
| "loss": 1.0333, |
| "step": 54300 |
| }, |
| { |
| "epoch": 3.3307000152508768, |
| "grad_norm": 0.3772546350955963, |
| "learning_rate": 5.0147229629287406e-05, |
| "loss": 1.0384, |
| "step": 54600 |
| }, |
| { |
| "epoch": 3.349001067561385, |
| "grad_norm": 0.38388633728027344, |
| "learning_rate": 4.9153769003205944e-05, |
| "loss": 1.0385, |
| "step": 54900 |
| }, |
| { |
| "epoch": 3.3673021198718924, |
| "grad_norm": 0.3748055696487427, |
| "learning_rate": 4.816703100929538e-05, |
| "loss": 1.0354, |
| "step": 55200 |
| }, |
| { |
| "epoch": 3.3856031721824005, |
| "grad_norm": 0.40391644835472107, |
| "learning_rate": 4.718714610907581e-05, |
| "loss": 1.0394, |
| "step": 55500 |
| }, |
| { |
| "epoch": 3.4039042244929085, |
| "grad_norm": 0.3871014416217804, |
| "learning_rate": 4.6214243857985686e-05, |
| "loss": 1.0348, |
| "step": 55800 |
| }, |
| { |
| "epoch": 3.422205276803416, |
| "grad_norm": 0.3747030198574066, |
| "learning_rate": 4.524845288825298e-05, |
| "loss": 1.031, |
| "step": 56100 |
| }, |
| { |
| "epoch": 3.440506329113924, |
| "grad_norm": 0.38700729608535767, |
| "learning_rate": 4.428990089188769e-05, |
| "loss": 1.0341, |
| "step": 56400 |
| }, |
| { |
| "epoch": 3.458807381424432, |
| "grad_norm": 0.37710437178611755, |
| "learning_rate": 4.333871460379951e-05, |
| "loss": 1.0351, |
| "step": 56700 |
| }, |
| { |
| "epoch": 3.47710843373494, |
| "grad_norm": 0.3593015968799591, |
| "learning_rate": 4.239501978504117e-05, |
| "loss": 1.0369, |
| "step": 57000 |
| }, |
| { |
| "epoch": 3.4954094860454474, |
| "grad_norm": 0.386823445558548, |
| "learning_rate": 4.145894120618123e-05, |
| "loss": 1.0336, |
| "step": 57300 |
| }, |
| { |
| "epoch": 3.5137105383559555, |
| "grad_norm": 0.3797110915184021, |
| "learning_rate": 4.053060263080738e-05, |
| "loss": 1.0304, |
| "step": 57600 |
| }, |
| { |
| "epoch": 3.532011590666463, |
| "grad_norm": 0.35242992639541626, |
| "learning_rate": 3.961012679916307e-05, |
| "loss": 1.0276, |
| "step": 57900 |
| }, |
| { |
| "epoch": 3.550312642976971, |
| "grad_norm": 0.36182209849357605, |
| "learning_rate": 3.869763541191944e-05, |
| "loss": 1.0287, |
| "step": 58200 |
| }, |
| { |
| "epoch": 3.568613695287479, |
| "grad_norm": 0.3675349950790405, |
| "learning_rate": 3.779324911408475e-05, |
| "loss": 1.0321, |
| "step": 58500 |
| }, |
| { |
| "epoch": 3.586914747597987, |
| "grad_norm": 0.38415876030921936, |
| "learning_rate": 3.6897087479053104e-05, |
| "loss": 1.0336, |
| "step": 58800 |
| }, |
| { |
| "epoch": 3.605215799908495, |
| "grad_norm": 0.3762219548225403, |
| "learning_rate": 3.600926899279532e-05, |
| "loss": 1.0326, |
| "step": 59100 |
| }, |
| { |
| "epoch": 3.6235168522190024, |
| "grad_norm": 0.3666684031486511, |
| "learning_rate": 3.512991103819303e-05, |
| "loss": 1.0288, |
| "step": 59400 |
| }, |
| { |
| "epoch": 3.6418179045295105, |
| "grad_norm": 0.39124396443367004, |
| "learning_rate": 3.425912987951914e-05, |
| "loss": 1.0279, |
| "step": 59700 |
| }, |
| { |
| "epoch": 3.6601189568400185, |
| "grad_norm": 0.3636305332183838, |
| "learning_rate": 3.339704064706577e-05, |
| "loss": 1.0327, |
| "step": 60000 |
| }, |
| { |
| "epoch": 3.678420009150526, |
| "grad_norm": 0.40503907203674316, |
| "learning_rate": 3.2543757321922563e-05, |
| "loss": 1.0313, |
| "step": 60300 |
| }, |
| { |
| "epoch": 3.6967210614610337, |
| "grad_norm": 0.3835071623325348, |
| "learning_rate": 3.169939272090634e-05, |
| "loss": 1.0306, |
| "step": 60600 |
| }, |
| { |
| "epoch": 3.715022113771542, |
| "grad_norm": 0.4188929796218872, |
| "learning_rate": 3.08640584816454e-05, |
| "loss": 1.0305, |
| "step": 60900 |
| }, |
| { |
| "epoch": 3.73332316608205, |
| "grad_norm": 0.3759515583515167, |
| "learning_rate": 3.0037865047818903e-05, |
| "loss": 1.0295, |
| "step": 61200 |
| }, |
| { |
| "epoch": 3.7516242183925574, |
| "grad_norm": 0.36246317625045776, |
| "learning_rate": 2.922092165455508e-05, |
| "loss": 1.027, |
| "step": 61500 |
| }, |
| { |
| "epoch": 3.7699252707030655, |
| "grad_norm": 0.3859165608882904, |
| "learning_rate": 2.8413336313988303e-05, |
| "loss": 1.026, |
| "step": 61800 |
| }, |
| { |
| "epoch": 3.788226323013573, |
| "grad_norm": 0.3533560037612915, |
| "learning_rate": 2.7615215800978523e-05, |
| "loss": 1.0293, |
| "step": 62100 |
| }, |
| { |
| "epoch": 3.806527375324081, |
| "grad_norm": 0.3984517455101013, |
| "learning_rate": 2.6826665638993876e-05, |
| "loss": 1.0265, |
| "step": 62400 |
| }, |
| { |
| "epoch": 3.824828427634589, |
| "grad_norm": 0.3726932108402252, |
| "learning_rate": 2.6047790086158952e-05, |
| "loss": 1.0249, |
| "step": 62700 |
| }, |
| { |
| "epoch": 3.843129479945097, |
| "grad_norm": 0.3885975778102875, |
| "learning_rate": 2.5278692121470326e-05, |
| "loss": 1.0242, |
| "step": 63000 |
| }, |
| { |
| "epoch": 3.861430532255605, |
| "grad_norm": 0.3631696105003357, |
| "learning_rate": 2.4519473431181272e-05, |
| "loss": 1.026, |
| "step": 63300 |
| }, |
| { |
| "epoch": 3.8797315845661124, |
| "grad_norm": 0.38549143075942993, |
| "learning_rate": 2.3770234395357115e-05, |
| "loss": 1.0255, |
| "step": 63600 |
| }, |
| { |
| "epoch": 3.8980326368766205, |
| "grad_norm": 0.3577282428741455, |
| "learning_rate": 2.303107407460373e-05, |
| "loss": 1.0255, |
| "step": 63900 |
| }, |
| { |
| "epoch": 3.916333689187128, |
| "grad_norm": 0.36772647500038147, |
| "learning_rate": 2.2302090196970048e-05, |
| "loss": 1.0257, |
| "step": 64200 |
| }, |
| { |
| "epoch": 3.934634741497636, |
| "grad_norm": 0.38493648171424866, |
| "learning_rate": 2.1583379145027016e-05, |
| "loss": 1.0258, |
| "step": 64500 |
| }, |
| { |
| "epoch": 3.9529357938081437, |
| "grad_norm": 0.3904314339160919, |
| "learning_rate": 2.0875035943124422e-05, |
| "loss": 1.0267, |
| "step": 64800 |
| }, |
| { |
| "epoch": 3.971236846118652, |
| "grad_norm": 0.40153247117996216, |
| "learning_rate": 2.0177154244827255e-05, |
| "loss": 1.0263, |
| "step": 65100 |
| }, |
| { |
| "epoch": 3.98953789842916, |
| "grad_norm": 0.35689347982406616, |
| "learning_rate": 1.94898263205332e-05, |
| "loss": 1.0265, |
| "step": 65400 |
| }, |
| { |
| "epoch": 4.007808448985816, |
| "grad_norm": 0.37626588344573975, |
| "learning_rate": 1.881314304527323e-05, |
| "loss": 1.0123, |
| "step": 65700 |
| }, |
| { |
| "epoch": 4.026109501296324, |
| "grad_norm": 0.39857444167137146, |
| "learning_rate": 1.8147193886696467e-05, |
| "loss": 1.0024, |
| "step": 66000 |
| }, |
| { |
| "epoch": 4.044410553606832, |
| "grad_norm": 0.38053014874458313, |
| "learning_rate": 1.7492066893241354e-05, |
| "loss": 1.0055, |
| "step": 66300 |
| }, |
| { |
| "epoch": 4.0627116059173405, |
| "grad_norm": 0.3763318359851837, |
| "learning_rate": 1.684784868249417e-05, |
| "loss": 1.0008, |
| "step": 66600 |
| }, |
| { |
| "epoch": 4.0810126582278485, |
| "grad_norm": 0.35103437304496765, |
| "learning_rate": 1.6214624429737058e-05, |
| "loss": 1.0064, |
| "step": 66900 |
| }, |
| { |
| "epoch": 4.099313710538356, |
| "grad_norm": 0.349678099155426, |
| "learning_rate": 1.5592477856686416e-05, |
| "loss": 1.0025, |
| "step": 67200 |
| }, |
| { |
| "epoch": 4.117614762848864, |
| "grad_norm": 0.3956117033958435, |
| "learning_rate": 1.4981491220423727e-05, |
| "loss": 1.0055, |
| "step": 67500 |
| }, |
| { |
| "epoch": 4.135915815159372, |
| "grad_norm": 0.36628103256225586, |
| "learning_rate": 1.4381745302519978e-05, |
| "loss": 1.0037, |
| "step": 67800 |
| }, |
| { |
| "epoch": 4.15421686746988, |
| "grad_norm": 0.3932549059391022, |
| "learning_rate": 1.379331939835502e-05, |
| "loss": 1.0023, |
| "step": 68100 |
| }, |
| { |
| "epoch": 4.172517919780387, |
| "grad_norm": 0.3831135332584381, |
| "learning_rate": 1.3216291306633643e-05, |
| "loss": 1.0042, |
| "step": 68400 |
| }, |
| { |
| "epoch": 4.190818972090895, |
| "grad_norm": 0.37050631642341614, |
| "learning_rate": 1.2650737319099316e-05, |
| "loss": 1.0035, |
| "step": 68700 |
| }, |
| { |
| "epoch": 4.209120024401403, |
| "grad_norm": 0.3810383081436157, |
| "learning_rate": 1.2096732210447382e-05, |
| "loss": 1.0034, |
| "step": 69000 |
| }, |
| { |
| "epoch": 4.227421076711911, |
| "grad_norm": 0.37233009934425354, |
| "learning_rate": 1.155434922843861e-05, |
| "loss": 1.0029, |
| "step": 69300 |
| }, |
| { |
| "epoch": 4.245722129022419, |
| "grad_norm": 0.36925238370895386, |
| "learning_rate": 1.1023660084214916e-05, |
| "loss": 1.0032, |
| "step": 69600 |
| }, |
| { |
| "epoch": 4.264023181332926, |
| "grad_norm": 0.37379172444343567, |
| "learning_rate": 1.0504734942817851e-05, |
| "loss": 1.001, |
| "step": 69900 |
| }, |
| { |
| "epoch": 4.282324233643434, |
| "grad_norm": 0.39091187715530396, |
| "learning_rate": 9.997642413912001e-06, |
| "loss": 1.0, |
| "step": 70200 |
| }, |
| { |
| "epoch": 4.3006252859539424, |
| "grad_norm": 0.39929676055908203, |
| "learning_rate": 9.502449542713431e-06, |
| "loss": 1.0039, |
| "step": 70500 |
| }, |
| { |
| "epoch": 4.3189263382644505, |
| "grad_norm": 0.38398900628089905, |
| "learning_rate": 9.019221801125677e-06, |
| "loss": 1.005, |
| "step": 70800 |
| }, |
| { |
| "epoch": 4.337227390574958, |
| "grad_norm": 0.37324637174606323, |
| "learning_rate": 8.548023079083045e-06, |
| "loss": 1.0001, |
| "step": 71100 |
| }, |
| { |
| "epoch": 4.355528442885466, |
| "grad_norm": 0.3978380262851715, |
| "learning_rate": 8.088915676103681e-06, |
| "loss": 1.0033, |
| "step": 71400 |
| }, |
| { |
| "epoch": 4.373829495195974, |
| "grad_norm": 0.3560939431190491, |
| "learning_rate": 7.641960293052442e-06, |
| "loss": 1.004, |
| "step": 71700 |
| }, |
| { |
| "epoch": 4.392130547506482, |
| "grad_norm": 0.3844592571258545, |
| "learning_rate": 7.207216024115415e-06, |
| "loss": 1.001, |
| "step": 72000 |
| }, |
| { |
| "epoch": 4.41043159981699, |
| "grad_norm": 0.3786323070526123, |
| "learning_rate": 6.784740348986785e-06, |
| "loss": 1.0043, |
| "step": 72300 |
| }, |
| { |
| "epoch": 4.428732652127497, |
| "grad_norm": 0.3794030249118805, |
| "learning_rate": 6.374589125269203e-06, |
| "loss": 1.0005, |
| "step": 72600 |
| }, |
| { |
| "epoch": 4.447033704438005, |
| "grad_norm": 0.376708984375, |
| "learning_rate": 5.976816581088418e-06, |
| "loss": 1.0042, |
| "step": 72900 |
| }, |
| { |
| "epoch": 4.465334756748513, |
| "grad_norm": 0.3753119707107544, |
| "learning_rate": 5.591475307923744e-06, |
| "loss": 0.9998, |
| "step": 73200 |
| }, |
| { |
| "epoch": 4.483635809059021, |
| "grad_norm": 0.39294296503067017, |
| "learning_rate": 5.21861625365444e-06, |
| "loss": 1.002, |
| "step": 73500 |
| }, |
| { |
| "epoch": 4.501936861369529, |
| "grad_norm": 0.386958509683609, |
| "learning_rate": 4.8582887158238e-06, |
| "loss": 1.0032, |
| "step": 73800 |
| }, |
| { |
| "epoch": 4.520237913680036, |
| "grad_norm": 0.3635695278644562, |
| "learning_rate": 4.510540335121205e-06, |
| "loss": 1.0002, |
| "step": 74100 |
| }, |
| { |
| "epoch": 4.538538965990544, |
| "grad_norm": 0.40328043699264526, |
| "learning_rate": 4.175417089083378e-06, |
| "loss": 0.9995, |
| "step": 74400 |
| }, |
| { |
| "epoch": 4.5568400183010525, |
| "grad_norm": 0.39885058999061584, |
| "learning_rate": 3.852963286015376e-06, |
| "loss": 1.0049, |
| "step": 74700 |
| }, |
| { |
| "epoch": 4.5751410706115605, |
| "grad_norm": 0.3917011022567749, |
| "learning_rate": 3.5432215591324702e-06, |
| "loss": 0.9994, |
| "step": 75000 |
| }, |
| { |
| "epoch": 4.593442122922068, |
| "grad_norm": 0.36823955178260803, |
| "learning_rate": 3.2462328609232707e-06, |
| "loss": 0.998, |
| "step": 75300 |
| }, |
| { |
| "epoch": 4.611743175232576, |
| "grad_norm": 0.3946107029914856, |
| "learning_rate": 2.962036457735329e-06, |
| "loss": 1.0011, |
| "step": 75600 |
| }, |
| { |
| "epoch": 4.630044227543084, |
| "grad_norm": 0.37494683265686035, |
| "learning_rate": 2.6906699245834554e-06, |
| "loss": 1.0024, |
| "step": 75900 |
| }, |
| { |
| "epoch": 4.648345279853592, |
| "grad_norm": 0.3627372980117798, |
| "learning_rate": 2.4321691401817725e-06, |
| "loss": 1.0021, |
| "step": 76200 |
| }, |
| { |
| "epoch": 4.666646332164099, |
| "grad_norm": 0.3681999742984772, |
| "learning_rate": 2.1865682821999966e-06, |
| "loss": 1.0038, |
| "step": 76500 |
| }, |
| { |
| "epoch": 4.684947384474607, |
| "grad_norm": 0.38293156027793884, |
| "learning_rate": 1.9538998227446424e-06, |
| "loss": 0.9986, |
| "step": 76800 |
| }, |
| { |
| "epoch": 4.703248436785115, |
| "grad_norm": 0.3759300410747528, |
| "learning_rate": 1.7341945240657243e-06, |
| "loss": 1.0026, |
| "step": 77100 |
| }, |
| { |
| "epoch": 4.721549489095623, |
| "grad_norm": 0.4103819727897644, |
| "learning_rate": 1.5274814344895749e-06, |
| "loss": 0.9956, |
| "step": 77400 |
| }, |
| { |
| "epoch": 4.739850541406131, |
| "grad_norm": 0.38952457904815674, |
| "learning_rate": 1.3337878845781082e-06, |
| "loss": 0.9961, |
| "step": 77700 |
| }, |
| { |
| "epoch": 4.758151593716638, |
| "grad_norm": 0.370206356048584, |
| "learning_rate": 1.1531394835154308e-06, |
| "loss": 1.0005, |
| "step": 78000 |
| }, |
| { |
| "epoch": 4.776452646027146, |
| "grad_norm": 0.3750290274620056, |
| "learning_rate": 9.85560115721862e-07, |
| "loss": 0.9981, |
| "step": 78300 |
| }, |
| { |
| "epoch": 4.794753698337654, |
| "grad_norm": 0.3763698637485504, |
| "learning_rate": 8.310719376960596e-07, |
| "loss": 1.0021, |
| "step": 78600 |
| }, |
| { |
| "epoch": 4.8130547506481625, |
| "grad_norm": 0.3856047987937927, |
| "learning_rate": 6.896953750856083e-07, |
| "loss": 0.9989, |
| "step": 78900 |
| }, |
| { |
| "epoch": 4.8313558029586705, |
| "grad_norm": 0.3992738425731659, |
| "learning_rate": 5.61449119986468e-07, |
| "loss": 1.0031, |
| "step": 79200 |
| }, |
| { |
| "epoch": 4.849656855269178, |
| "grad_norm": 0.39547818899154663, |
| "learning_rate": 4.463501284715399e-07, |
| "loss": 1.0039, |
| "step": 79500 |
| }, |
| { |
| "epoch": 4.867957907579686, |
| "grad_norm": 0.39359623193740845, |
| "learning_rate": 3.4441361834886e-07, |
| "loss": 0.9978, |
| "step": 79800 |
| }, |
| { |
| "epoch": 4.886258959890194, |
| "grad_norm": 0.3566698431968689, |
| "learning_rate": 2.5565306714960825e-07, |
| "loss": 0.9969, |
| "step": 80100 |
| }, |
| { |
| "epoch": 4.904560012200702, |
| "grad_norm": 0.38897261023521423, |
| "learning_rate": 1.800802103461008e-07, |
| "loss": 1.0021, |
| "step": 80400 |
| }, |
| { |
| "epoch": 4.92286106451121, |
| "grad_norm": 0.38005581498146057, |
| "learning_rate": 1.1770503980024216e-07, |
| "loss": 1.0041, |
| "step": 80700 |
| }, |
| { |
| "epoch": 4.941162116821717, |
| "grad_norm": 0.39922693371772766, |
| "learning_rate": 6.853580244243762e-08, |
| "loss": 1.0011, |
| "step": 81000 |
| }, |
| { |
| "epoch": 4.959463169132225, |
| "grad_norm": 0.37051552534103394, |
| "learning_rate": 3.257899918120977e-08, |
| "loss": 1.002, |
| "step": 81300 |
| }, |
| { |
| "epoch": 4.977764221442733, |
| "grad_norm": 0.3841174840927124, |
| "learning_rate": 9.839384043719423e-09, |
| "loss": 1.0027, |
| "step": 81600 |
| }, |
| { |
| "epoch": 4.996065273753241, |
| "grad_norm": 0.3595673441886902, |
| "learning_rate": 3.1996354714625144e-10, |
| "loss": 1.0007, |
| "step": 81900 |
| }, |
| { |
| "epoch": 5.0, |
| "step": 81965, |
| "total_flos": 1.7441017798508093e+19, |
| "train_loss": 1.117615111251009, |
| "train_runtime": 146190.8611, |
| "train_samples_per_second": 143.527, |
| "train_steps_per_second": 0.561 |
| } |
| ], |
| "logging_steps": 300, |
| "max_steps": 81965, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.7441017798508093e+19, |
| "train_batch_size": 128, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|