| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9937769267592147, |
| "eval_steps": 500, |
| "global_step": 1044, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.028721876495931067, |
| "grad_norm": 3.700932765736394, |
| "learning_rate": 9.523809523809525e-07, |
| "loss": 0.93, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.057443752991862135, |
| "grad_norm": 1.734082511082376, |
| "learning_rate": 1.904761904761905e-06, |
| "loss": 0.8877, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0861656294877932, |
| "grad_norm": 1.024208832618697, |
| "learning_rate": 2.8571428571428573e-06, |
| "loss": 0.8525, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.11488750598372427, |
| "grad_norm": 0.7469304469128593, |
| "learning_rate": 3.80952380952381e-06, |
| "loss": 0.7877, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.14360938247965535, |
| "grad_norm": 0.586040626713746, |
| "learning_rate": 4.761904761904762e-06, |
| "loss": 0.7286, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.1723312589755864, |
| "grad_norm": 0.54841652853263, |
| "learning_rate": 5.7142857142857145e-06, |
| "loss": 0.7166, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.20105313547151749, |
| "grad_norm": 0.5034517147897491, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 0.6823, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.22977501196744854, |
| "grad_norm": 0.49774064773427634, |
| "learning_rate": 7.61904761904762e-06, |
| "loss": 0.6824, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2584968884633796, |
| "grad_norm": 0.552315492890169, |
| "learning_rate": 8.571428571428571e-06, |
| "loss": 0.6856, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.2872187649593107, |
| "grad_norm": 0.5309414818094527, |
| "learning_rate": 9.523809523809525e-06, |
| "loss": 0.658, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.31594064145524176, |
| "grad_norm": 0.5385308790861494, |
| "learning_rate": 9.999300418283908e-06, |
| "loss": 0.6423, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.3446625179511728, |
| "grad_norm": 0.5526511778665282, |
| "learning_rate": 9.993704939095376e-06, |
| "loss": 0.6408, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.37338439444710386, |
| "grad_norm": 0.6179734532307238, |
| "learning_rate": 9.982520243472044e-06, |
| "loss": 0.6505, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.40210627094303497, |
| "grad_norm": 0.5239737338093862, |
| "learning_rate": 9.965758849911774e-06, |
| "loss": 0.6161, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.430828147438966, |
| "grad_norm": 0.5619528495039146, |
| "learning_rate": 9.943439518645193e-06, |
| "loss": 0.6308, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.4595500239348971, |
| "grad_norm": 0.4729313140856091, |
| "learning_rate": 9.915587230638269e-06, |
| "loss": 0.6316, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.48827190043082813, |
| "grad_norm": 0.4851042681486466, |
| "learning_rate": 9.882233159632297e-06, |
| "loss": 0.6273, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5169937769267592, |
| "grad_norm": 0.4990097338675921, |
| "learning_rate": 9.843414637252615e-06, |
| "loss": 0.6351, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5457156534226902, |
| "grad_norm": 0.4721715881606927, |
| "learning_rate": 9.79917511122509e-06, |
| "loss": 0.6174, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.5744375299186214, |
| "grad_norm": 0.49937447981211275, |
| "learning_rate": 9.749564096747148e-06, |
| "loss": 0.619, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6031594064145525, |
| "grad_norm": 0.5273757739930043, |
| "learning_rate": 9.694637121067764e-06, |
| "loss": 0.6212, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.6318812829104835, |
| "grad_norm": 0.5688431120752676, |
| "learning_rate": 9.63445566133846e-06, |
| "loss": 0.6272, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.6606031594064146, |
| "grad_norm": 0.5845365795446384, |
| "learning_rate": 9.569087075804842e-06, |
| "loss": 0.6313, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.6893250359023456, |
| "grad_norm": 0.5495325867850296, |
| "learning_rate": 9.498604528415731e-06, |
| "loss": 0.625, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.7180469123982767, |
| "grad_norm": 0.5031448369891197, |
| "learning_rate": 9.423086906934228e-06, |
| "loss": 0.6248, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.7467687888942077, |
| "grad_norm": 0.4900365577299398, |
| "learning_rate": 9.342618734642395e-06, |
| "loss": 0.6142, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.7754906653901388, |
| "grad_norm": 0.5315649843783232, |
| "learning_rate": 9.257290075738365e-06, |
| "loss": 0.6016, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.8042125418860699, |
| "grad_norm": 0.5256710163299946, |
| "learning_rate": 9.16719643453177e-06, |
| "loss": 0.6224, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.832934418382001, |
| "grad_norm": 0.5138537637831458, |
| "learning_rate": 9.072438648550304e-06, |
| "loss": 0.6118, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.861656294877932, |
| "grad_norm": 0.5363724694729143, |
| "learning_rate": 8.973122775677078e-06, |
| "loss": 0.5975, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.8903781713738631, |
| "grad_norm": 0.5338491338258355, |
| "learning_rate": 8.869359975445085e-06, |
| "loss": 0.5976, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.9191000478697942, |
| "grad_norm": 0.5042648527170004, |
| "learning_rate": 8.761266384621599e-06, |
| "loss": 0.6042, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.9478219243657252, |
| "grad_norm": 0.6176720610638503, |
| "learning_rate": 8.648962987221837e-06, |
| "loss": 0.6096, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.9765438008616563, |
| "grad_norm": 0.48986374397556326, |
| "learning_rate": 8.532575479097294e-06, |
| "loss": 0.5895, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.0028721876495932, |
| "grad_norm": 1.3018686522828506, |
| "learning_rate": 8.412234127250353e-06, |
| "loss": 0.6163, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.0315940641455241, |
| "grad_norm": 0.5832838255643433, |
| "learning_rate": 8.288073624032634e-06, |
| "loss": 0.5642, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.0603159406414553, |
| "grad_norm": 0.5254324766786693, |
| "learning_rate": 8.160232936390239e-06, |
| "loss": 0.5657, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.0890378171373862, |
| "grad_norm": 0.49615638486827607, |
| "learning_rate": 8.02885515032467e-06, |
| "loss": 0.5665, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.1177596936333174, |
| "grad_norm": 0.5919689080906818, |
| "learning_rate": 7.894087310743468e-06, |
| "loss": 0.5658, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.1464815701292483, |
| "grad_norm": 0.5210878612928057, |
| "learning_rate": 7.756080256879837e-06, |
| "loss": 0.544, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.1752034466251795, |
| "grad_norm": 0.5194215534501085, |
| "learning_rate": 7.614988453465469e-06, |
| "loss": 0.5715, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.2039253231211107, |
| "grad_norm": 0.4945404206989309, |
| "learning_rate": 7.470969817845518e-06, |
| "loss": 0.5568, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.2326471996170416, |
| "grad_norm": 0.5434010317710215, |
| "learning_rate": 7.324185543229226e-06, |
| "loss": 0.5604, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.2613690761129728, |
| "grad_norm": 0.49174162188431164, |
| "learning_rate": 7.174799918274018e-06, |
| "loss": 0.5594, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.2900909526089037, |
| "grad_norm": 0.442685394222738, |
| "learning_rate": 7.022980143205046e-06, |
| "loss": 0.5738, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.3188128291048349, |
| "grad_norm": 0.5397145615923644, |
| "learning_rate": 6.868896142675903e-06, |
| "loss": 0.5744, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.3475347056007658, |
| "grad_norm": 0.49715490338776275, |
| "learning_rate": 6.712720375580057e-06, |
| "loss": 0.5738, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.376256582096697, |
| "grad_norm": 0.46659441221308456, |
| "learning_rate": 6.554627642025807e-06, |
| "loss": 0.5623, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.4049784585926282, |
| "grad_norm": 0.5399824841470794, |
| "learning_rate": 6.394794887690838e-06, |
| "loss": 0.5652, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.433700335088559, |
| "grad_norm": 0.46213954264438495, |
| "learning_rate": 6.233401005775339e-06, |
| "loss": 0.5628, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.4624222115844903, |
| "grad_norm": 0.4959443093326465, |
| "learning_rate": 6.070626636775349e-06, |
| "loss": 0.5649, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.4911440880804212, |
| "grad_norm": 0.5041986397517726, |
| "learning_rate": 5.906653966300444e-06, |
| "loss": 0.5722, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.5198659645763524, |
| "grad_norm": 0.4083574928666145, |
| "learning_rate": 5.741666521162055e-06, |
| "loss": 0.5484, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.5485878410722833, |
| "grad_norm": 0.44781946959219954, |
| "learning_rate": 5.575848963960621e-06, |
| "loss": 0.5593, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.5773097175682145, |
| "grad_norm": 0.5286251349151584, |
| "learning_rate": 5.4093868864015405e-06, |
| "loss": 0.5844, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.6060315940641456, |
| "grad_norm": 0.5023049460335103, |
| "learning_rate": 5.24246660157119e-06, |
| "loss": 0.5621, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.6347534705600766, |
| "grad_norm": 0.4704671038611138, |
| "learning_rate": 5.075274935405554e-06, |
| "loss": 0.5718, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.6634753470560075, |
| "grad_norm": 0.5090581682145832, |
| "learning_rate": 4.90799901758484e-06, |
| "loss": 0.5633, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.6921972235519387, |
| "grad_norm": 0.45798134505233995, |
| "learning_rate": 4.74082607208812e-06, |
| "loss": 0.5656, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.7209191000478699, |
| "grad_norm": 0.48981735934307813, |
| "learning_rate": 4.573943207642452e-06, |
| "loss": 0.5606, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.7496409765438008, |
| "grad_norm": 0.4699165430870941, |
| "learning_rate": 4.407537208300957e-06, |
| "loss": 0.5614, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.778362853039732, |
| "grad_norm": 0.47494342485042146, |
| "learning_rate": 4.241794324384334e-06, |
| "loss": 0.5522, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.8070847295356631, |
| "grad_norm": 0.5178845713340307, |
| "learning_rate": 4.076900064019721e-06, |
| "loss": 0.5595, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.835806606031594, |
| "grad_norm": 0.5219813023574859, |
| "learning_rate": 3.91303898551028e-06, |
| "loss": 0.564, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.864528482527525, |
| "grad_norm": 0.4574407490031541, |
| "learning_rate": 3.7503944907678543e-06, |
| "loss": 0.5637, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.8932503590234562, |
| "grad_norm": 0.47969146853460126, |
| "learning_rate": 3.5891486200399413e-06, |
| "loss": 0.5576, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.9219722355193873, |
| "grad_norm": 0.4131894295273494, |
| "learning_rate": 3.429481848160702e-06, |
| "loss": 0.5502, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.9506941120153183, |
| "grad_norm": 0.41395383387812457, |
| "learning_rate": 3.2715728825540525e-06, |
| "loss": 0.5461, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.9794159885112494, |
| "grad_norm": 0.45923582804755875, |
| "learning_rate": 3.1155984632149565e-06, |
| "loss": 0.5651, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.0057443752991864, |
| "grad_norm": 0.4977440233074294, |
| "learning_rate": 2.961733164892744e-06, |
| "loss": 0.5437, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.0344662517951173, |
| "grad_norm": 0.48802663653181644, |
| "learning_rate": 2.8101492016979027e-06, |
| "loss": 0.5382, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.0631881282910483, |
| "grad_norm": 0.4673676186684926, |
| "learning_rate": 2.6610162343510183e-06, |
| "loss": 0.5221, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.091910004786979, |
| "grad_norm": 0.39392152443459205, |
| "learning_rate": 2.5145011802895835e-06, |
| "loss": 0.5362, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.1206318812829106, |
| "grad_norm": 0.40247433394922916, |
| "learning_rate": 2.370768026845276e-06, |
| "loss": 0.5258, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.1493537577788415, |
| "grad_norm": 0.38912377413533783, |
| "learning_rate": 2.2299776477007073e-06, |
| "loss": 0.5269, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.1780756342747725, |
| "grad_norm": 0.44606591149003666, |
| "learning_rate": 2.0922876228311833e-06, |
| "loss": 0.5183, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.206797510770704, |
| "grad_norm": 0.42032893014171974, |
| "learning_rate": 1.957852062132924e-06, |
| "loss": 0.5275, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.235519387266635, |
| "grad_norm": 0.501692538983492, |
| "learning_rate": 1.8268214329351797e-06, |
| "loss": 0.5319, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.2642412637625657, |
| "grad_norm": 0.4398732872682357, |
| "learning_rate": 1.6993423915893241e-06, |
| "loss": 0.5394, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.2929631402584967, |
| "grad_norm": 0.4426749294047621, |
| "learning_rate": 1.575557619323353e-06, |
| "loss": 0.5323, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.321685016754428, |
| "grad_norm": 0.44560301173068867, |
| "learning_rate": 1.4556056625455922e-06, |
| "loss": 0.5308, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.350406893250359, |
| "grad_norm": 0.47993419658063147, |
| "learning_rate": 1.3396207777762732e-06, |
| "loss": 0.5143, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.37912876974629, |
| "grad_norm": 0.4643782762709979, |
| "learning_rate": 1.2277327813806123e-06, |
| "loss": 0.5341, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.4078506462422213, |
| "grad_norm": 0.4648802469670571, |
| "learning_rate": 1.1200669042715163e-06, |
| "loss": 0.5395, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.4365725227381523, |
| "grad_norm": 0.4388112871344682, |
| "learning_rate": 1.0167436517445777e-06, |
| "loss": 0.5198, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.4652943992340832, |
| "grad_norm": 0.4561716886355634, |
| "learning_rate": 9.178786686022417e-07, |
| "loss": 0.5347, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.494016275730014, |
| "grad_norm": 0.4516982561274727, |
| "learning_rate": 8.235826097180566e-07, |
| "loss": 0.5358, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.5227381522259456, |
| "grad_norm": 0.4293278109349354, |
| "learning_rate": 7.339610161859618e-07, |
| "loss": 0.5363, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.5514600287218765, |
| "grad_norm": 0.4418535368694303, |
| "learning_rate": 6.49114197193137e-07, |
| "loss": 0.5229, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.5801819052178074, |
| "grad_norm": 0.5092110974871947, |
| "learning_rate": 5.691371177487215e-07, |
| "loss": 0.5211, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.608903781713739, |
| "grad_norm": 0.40169505212972373, |
| "learning_rate": 4.941192923939769e-07, |
| "loss": 0.522, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.6376256582096698, |
| "grad_norm": 0.42628243978147184, |
| "learning_rate": 4.2414468501293217e-07, |
| "loss": 0.5351, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.6663475347056007, |
| "grad_norm": 0.4205893212947504, |
| "learning_rate": 3.5929161485559694e-07, |
| "loss": 0.5359, |
| "step": 930 |
| }, |
| { |
| "epoch": 2.6950694112015317, |
| "grad_norm": 0.43814866165359284, |
| "learning_rate": 2.9963266887894526e-07, |
| "loss": 0.5324, |
| "step": 940 |
| }, |
| { |
| "epoch": 2.723791287697463, |
| "grad_norm": 0.4397081557598701, |
| "learning_rate": 2.4523462050379864e-07, |
| "loss": 0.5487, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.752513164193394, |
| "grad_norm": 0.4086230507321953, |
| "learning_rate": 1.9615835487849677e-07, |
| "loss": 0.5116, |
| "step": 960 |
| }, |
| { |
| "epoch": 2.781235040689325, |
| "grad_norm": 0.38229316714447675, |
| "learning_rate": 1.5245880073305963e-07, |
| "loss": 0.5352, |
| "step": 970 |
| }, |
| { |
| "epoch": 2.8099569171852563, |
| "grad_norm": 0.43419896659737084, |
| "learning_rate": 1.1418486890006574e-07, |
| "loss": 0.5208, |
| "step": 980 |
| }, |
| { |
| "epoch": 2.8386787936811873, |
| "grad_norm": 0.4572008563089457, |
| "learning_rate": 8.137939757108526e-08, |
| "loss": 0.5408, |
| "step": 990 |
| }, |
| { |
| "epoch": 2.867400670177118, |
| "grad_norm": 0.40361529979394384, |
| "learning_rate": 5.4079104349929465e-08, |
| "loss": 0.5343, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.896122546673049, |
| "grad_norm": 0.42876437242652693, |
| "learning_rate": 3.231454515638221e-08, |
| "loss": 0.5194, |
| "step": 1010 |
| }, |
| { |
| "epoch": 2.9248444231689805, |
| "grad_norm": 0.4906146123977682, |
| "learning_rate": 1.6110080026414123e-08, |
| "loss": 0.533, |
| "step": 1020 |
| }, |
| { |
| "epoch": 2.9535662996649115, |
| "grad_norm": 0.44570381041730334, |
| "learning_rate": 5.483845847151226e-09, |
| "loss": 0.5284, |
| "step": 1030 |
| }, |
| { |
| "epoch": 2.9822881761608424, |
| "grad_norm": 0.41654981591864015, |
| "learning_rate": 4.4773605712089554e-10, |
| "loss": 0.513, |
| "step": 1040 |
| }, |
| { |
| "epoch": 2.9937769267592147, |
| "step": 1044, |
| "total_flos": 372760519114752.0, |
| "train_loss": 0.5842336475620782, |
| "train_runtime": 42367.2333, |
| "train_samples_per_second": 1.183, |
| "train_steps_per_second": 0.025 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1044, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 372760519114752.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |