{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.105875, "eval_steps": 500, "global_step": 2800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00125, "grad_norm": 0.3465445339679718, "learning_rate": 7.4204999999999995e-06, "loss": 2.7873512268066407, "step": 10 }, { "epoch": 0.0025, "grad_norm": 0.32606860995292664, "learning_rate": 1.56655e-05, "loss": 2.760304069519043, "step": 20 }, { "epoch": 0.00375, "grad_norm": 0.33225658535957336, "learning_rate": 2.3910499999999997e-05, "loss": 2.7759071350097657, "step": 30 }, { "epoch": 0.005, "grad_norm": 0.31996211409568787, "learning_rate": 3.21555e-05, "loss": 2.7292430877685545, "step": 40 }, { "epoch": 0.00625, "grad_norm": 0.3153120279312134, "learning_rate": 4.04005e-05, "loss": 2.733371353149414, "step": 50 }, { "epoch": 0.0075, "grad_norm": 0.3135412037372589, "learning_rate": 4.8645499999999994e-05, "loss": 2.7492229461669924, "step": 60 }, { "epoch": 0.00875, "grad_norm": 0.3155956268310547, "learning_rate": 5.6890499999999993e-05, "loss": 2.7486228942871094, "step": 70 }, { "epoch": 0.01, "grad_norm": 0.3149002194404602, "learning_rate": 6.51355e-05, "loss": 2.760879898071289, "step": 80 }, { "epoch": 0.01125, "grad_norm": 0.3194095492362976, "learning_rate": 7.33805e-05, "loss": 2.734035873413086, "step": 90 }, { "epoch": 0.0125, "grad_norm": 0.3121851980686188, "learning_rate": 8.16255e-05, "loss": 2.7368759155273437, "step": 100 }, { "epoch": 0.01375, "grad_norm": 0.3184032440185547, "learning_rate": 8.98705e-05, "loss": 2.736837387084961, "step": 110 }, { "epoch": 0.015, "grad_norm": 0.3170839250087738, "learning_rate": 9.81155e-05, "loss": 2.7051807403564454, "step": 120 }, { "epoch": 0.01625, "grad_norm": 0.3180184066295624, "learning_rate": 0.0001063605, "loss": 2.7603172302246093, "step": 130 }, { "epoch": 0.0175, "grad_norm": 0.31468942761421204, "learning_rate": 0.00011460549999999999, "loss": 2.7113197326660154, "step": 140 }, { "epoch": 0.01875, "grad_norm": 0.31729385256767273, "learning_rate": 0.00012285049999999999, "loss": 2.7222190856933595, "step": 150 }, { "epoch": 0.02, "grad_norm": 0.3197655975818634, "learning_rate": 0.0001310955, "loss": 2.7241241455078127, "step": 160 }, { "epoch": 0.02125, "grad_norm": 0.3256337642669678, "learning_rate": 0.00013934049999999998, "loss": 2.7403392791748047, "step": 170 }, { "epoch": 0.0225, "grad_norm": 0.3098828196525574, "learning_rate": 0.0001475855, "loss": 2.7496837615966796, "step": 180 }, { "epoch": 0.02375, "grad_norm": 0.3134927749633789, "learning_rate": 0.00015583049999999998, "loss": 2.750768280029297, "step": 190 }, { "epoch": 0.025, "grad_norm": 0.3353506922721863, "learning_rate": 0.0001640755, "loss": 2.7532047271728515, "step": 200 }, { "epoch": 0.02625, "grad_norm": 0.3203900456428528, "learning_rate": 0.0001648994583038516, "loss": 2.7489036560058593, "step": 210 }, { "epoch": 0.0275, "grad_norm": 0.3266359865665436, "learning_rate": 0.00016489758578309418, "loss": 2.7316030502319335, "step": 220 }, { "epoch": 0.02875, "grad_norm": 0.31027814745903015, "learning_rate": 0.00016489437578049018, "loss": 2.7714206695556642, "step": 230 }, { "epoch": 0.03, "grad_norm": 0.325736939907074, "learning_rate": 0.0001648898283481129, "loss": 2.7439931869506835, "step": 240 }, { "epoch": 0.03125, "grad_norm": 0.30528295040130615, "learning_rate": 0.00016488394355973176, "loss": 2.766144943237305, "step": 250 }, { "epoch": 0.0325, "grad_norm": 0.31271758675575256, "learning_rate": 0.000164876721510811, "loss": 2.7013065338134767, "step": 260 }, { "epoch": 0.03375, "grad_norm": 0.3198724687099457, "learning_rate": 0.0001648681623185082, "loss": 2.7379714965820314, "step": 270 }, { "epoch": 0.035, "grad_norm": 0.33557799458503723, "learning_rate": 0.00016485826612167237, "loss": 2.76102352142334, "step": 280 }, { "epoch": 0.03625, "grad_norm": 0.3212190568447113, "learning_rate": 0.00016484703308084162, "loss": 2.7475757598876953, "step": 290 }, { "epoch": 0.0375, "grad_norm": 0.31533411145210266, "learning_rate": 0.00016483446337824071, "loss": 2.747650718688965, "step": 300 }, { "epoch": 0.03875, "grad_norm": 0.33507952094078064, "learning_rate": 0.00016482055721777798, "loss": 2.739873504638672, "step": 310 }, { "epoch": 0.04, "grad_norm": 0.31843748688697815, "learning_rate": 0.00016480531482504198, "loss": 2.7478389739990234, "step": 320 }, { "epoch": 0.04125, "grad_norm": 0.3268890380859375, "learning_rate": 0.00016478873644729805, "loss": 2.7712429046630858, "step": 330 }, { "epoch": 0.0425, "grad_norm": 0.315518319606781, "learning_rate": 0.00016477082235348404, "loss": 2.7189746856689454, "step": 340 }, { "epoch": 0.04375, "grad_norm": 0.31345367431640625, "learning_rate": 0.0001647515728342061, "loss": 2.7359670639038085, "step": 350 }, { "epoch": 0.045, "grad_norm": 0.325610876083374, "learning_rate": 0.0001647309882017339, "loss": 2.748139572143555, "step": 360 }, { "epoch": 0.04625, "grad_norm": 0.3194393813610077, "learning_rate": 0.00016470906878999564, "loss": 2.7462692260742188, "step": 370 }, { "epoch": 0.0475, "grad_norm": 0.3070792555809021, "learning_rate": 0.0001646858149545726, "loss": 2.757720184326172, "step": 380 }, { "epoch": 0.04875, "grad_norm": 0.32982465624809265, "learning_rate": 0.00016466122707269328, "loss": 2.7279708862304686, "step": 390 }, { "epoch": 0.05, "grad_norm": 0.309640496969223, "learning_rate": 0.0001646353055432274, "loss": 2.724739837646484, "step": 400 }, { "epoch": 0.05125, "grad_norm": 0.31954118609428406, "learning_rate": 0.00016460805078667945, "loss": 2.7295236587524414, "step": 410 }, { "epoch": 0.0525, "grad_norm": 0.30906999111175537, "learning_rate": 0.00016457946324518165, "loss": 2.734362030029297, "step": 420 }, { "epoch": 0.05375, "grad_norm": 0.3177924156188965, "learning_rate": 0.00016454954338248712, "loss": 2.7312435150146483, "step": 430 }, { "epoch": 0.055, "grad_norm": 0.3104606866836548, "learning_rate": 0.00016451829168396203, "loss": 2.7339248657226562, "step": 440 }, { "epoch": 0.05625, "grad_norm": 0.31935980916023254, "learning_rate": 0.0001644857086565779, "loss": 2.762462043762207, "step": 450 }, { "epoch": 0.0575, "grad_norm": 0.321206659078598, "learning_rate": 0.0001644517948289035, "loss": 2.7401878356933596, "step": 460 }, { "epoch": 0.05875, "grad_norm": 0.31553006172180176, "learning_rate": 0.00016441655075109576, "loss": 2.7154884338378906, "step": 470 }, { "epoch": 0.06, "grad_norm": 0.3156311810016632, "learning_rate": 0.0001643799769948916, "loss": 2.731028747558594, "step": 480 }, { "epoch": 0.06125, "grad_norm": 0.31830424070358276, "learning_rate": 0.00016434207415359802, "loss": 2.748556137084961, "step": 490 }, { "epoch": 0.0625, "grad_norm": 0.3151983320713043, "learning_rate": 0.0001643028428420828, "loss": 2.7336639404296874, "step": 500 }, { "epoch": 0.06375, "grad_norm": 0.32100728154182434, "learning_rate": 0.00016426228369676436, "loss": 2.733713150024414, "step": 510 }, { "epoch": 0.065, "grad_norm": 0.3150577247142792, "learning_rate": 0.00016422039737560163, "loss": 2.747536849975586, "step": 520 }, { "epoch": 0.06625, "grad_norm": 0.32159915566444397, "learning_rate": 0.0001641771845580832, "loss": 2.7145294189453124, "step": 530 }, { "epoch": 0.0675, "grad_norm": 0.32887545228004456, "learning_rate": 0.0001641326459452163, "loss": 2.7391708374023436, "step": 540 }, { "epoch": 0.06875, "grad_norm": 0.3189705014228821, "learning_rate": 0.00016408678225951563, "loss": 2.724725341796875, "step": 550 }, { "epoch": 0.07, "grad_norm": 0.3386867940425873, "learning_rate": 0.0001640395942449914, "loss": 2.7544118881225588, "step": 560 }, { "epoch": 0.07125, "grad_norm": 0.30630990862846375, "learning_rate": 0.00016399108266713735, "loss": 2.746489715576172, "step": 570 }, { "epoch": 0.0725, "grad_norm": 0.3187973201274872, "learning_rate": 0.00016394124831291837, "loss": 2.7217391967773437, "step": 580 }, { "epoch": 0.07375, "grad_norm": 0.316847562789917, "learning_rate": 0.00016389009199075774, "loss": 2.7319801330566404, "step": 590 }, { "epoch": 0.075, "grad_norm": 0.3210533559322357, "learning_rate": 0.00016383761453052384, "loss": 2.7253528594970704, "step": 600 }, { "epoch": 0.07625, "grad_norm": 0.30917614698410034, "learning_rate": 0.00016378381678351702, "loss": 2.7291168212890624, "step": 610 }, { "epoch": 0.0775, "grad_norm": 0.3088016211986542, "learning_rate": 0.0001637286996224554, "loss": 2.696218490600586, "step": 620 }, { "epoch": 0.07875, "grad_norm": 0.32467445731163025, "learning_rate": 0.0001636722639414611, "loss": 2.7149139404296876, "step": 630 }, { "epoch": 0.08, "grad_norm": 0.3138329088687897, "learning_rate": 0.0001636145106560454, "loss": 2.73681755065918, "step": 640 }, { "epoch": 0.08125, "grad_norm": 0.3167824447154999, "learning_rate": 0.0001635554407030941, "loss": 2.7229454040527346, "step": 650 }, { "epoch": 0.0825, "grad_norm": 0.3144330680370331, "learning_rate": 0.0001634950550408522, "loss": 2.6987558364868165, "step": 660 }, { "epoch": 0.08375, "grad_norm": 0.311829149723053, "learning_rate": 0.00016343335464890846, "loss": 2.706182861328125, "step": 670 }, { "epoch": 0.085, "grad_norm": 0.3265558183193207, "learning_rate": 0.00016337034052817947, "loss": 2.7086441040039064, "step": 680 }, { "epoch": 0.08625, "grad_norm": 0.3073708117008209, "learning_rate": 0.00016330601370089334, "loss": 2.7448238372802733, "step": 690 }, { "epoch": 0.0875, "grad_norm": 0.30871179699897766, "learning_rate": 0.0001632403752105732, "loss": 2.7313838958740235, "step": 700 }, { "epoch": 0.08875, "grad_norm": 0.31331929564476013, "learning_rate": 0.00016317342612202036, "loss": 2.7109472274780275, "step": 710 }, { "epoch": 0.09, "grad_norm": 0.3069651424884796, "learning_rate": 0.0001631051675212967, "loss": 2.698355865478516, "step": 720 }, { "epoch": 0.09125, "grad_norm": 0.3077262341976166, "learning_rate": 0.00016303560051570746, "loss": 2.707406997680664, "step": 730 }, { "epoch": 0.0925, "grad_norm": 0.3193919062614441, "learning_rate": 0.00016296472623378308, "loss": 2.709014129638672, "step": 740 }, { "epoch": 0.09375, "grad_norm": 0.31028079986572266, "learning_rate": 0.0001628925458252608, "loss": 2.7283496856689453, "step": 750 }, { "epoch": 0.095, "grad_norm": 0.3203696310520172, "learning_rate": 0.00016281906046106622, "loss": 2.723176193237305, "step": 760 }, { "epoch": 0.09625, "grad_norm": 0.31216055154800415, "learning_rate": 0.0001627442713332942, "loss": 2.740637016296387, "step": 770 }, { "epoch": 0.0975, "grad_norm": 0.3120918869972229, "learning_rate": 0.00016266817965518942, "loss": 2.720622444152832, "step": 780 }, { "epoch": 0.09875, "grad_norm": 0.3088921308517456, "learning_rate": 0.00016259078666112692, "loss": 2.714591217041016, "step": 790 }, { "epoch": 0.1, "grad_norm": 0.30949750542640686, "learning_rate": 0.00016251209360659192, "loss": 2.7191795349121093, "step": 800 }, { "epoch": 0.10125, "grad_norm": 0.32115787267684937, "learning_rate": 0.00016243210176815944, "loss": 2.6966245651245115, "step": 810 }, { "epoch": 0.1025, "grad_norm": 0.307424396276474, "learning_rate": 0.00016235081244347373, "loss": 2.730236625671387, "step": 820 }, { "epoch": 0.10375, "grad_norm": 0.31429022550582886, "learning_rate": 0.00016226822695122704, "loss": 2.691334533691406, "step": 830 }, { "epoch": 0.105, "grad_norm": 0.30951419472694397, "learning_rate": 0.00016218434663113843, "loss": 2.690280532836914, "step": 840 }, { "epoch": 0.10625, "grad_norm": 0.31636205315589905, "learning_rate": 0.00016209917284393176, "loss": 2.7146608352661135, "step": 850 }, { "epoch": 0.1075, "grad_norm": 0.31698304414749146, "learning_rate": 0.00016201270697131396, "loss": 2.739955520629883, "step": 860 }, { "epoch": 0.10875, "grad_norm": 0.30591675639152527, "learning_rate": 0.00016192495041595235, "loss": 2.725113868713379, "step": 870 }, { "epoch": 0.11, "grad_norm": 0.3064011037349701, "learning_rate": 0.00016183590460145194, "loss": 2.7186939239501955, "step": 880 }, { "epoch": 0.11125, "grad_norm": 0.30639246106147766, "learning_rate": 0.00016174557097233246, "loss": 2.713937187194824, "step": 890 }, { "epoch": 0.1125, "grad_norm": 0.3199147880077362, "learning_rate": 0.00016165395099400478, "loss": 2.7232639312744142, "step": 900 }, { "epoch": 0.11375, "grad_norm": 0.3103027045726776, "learning_rate": 0.00016156104615274719, "loss": 2.7207107543945312, "step": 910 }, { "epoch": 0.115, "grad_norm": 0.32256069779396057, "learning_rate": 0.0001614668579556813, "loss": 2.7164112091064454, "step": 920 }, { "epoch": 0.11625, "grad_norm": 0.31795644760131836, "learning_rate": 0.0001613713879307476, "loss": 2.704681396484375, "step": 930 }, { "epoch": 0.1175, "grad_norm": 0.32402339577674866, "learning_rate": 0.00016127463762668064, "loss": 2.733686065673828, "step": 940 }, { "epoch": 0.11875, "grad_norm": 0.33603930473327637, "learning_rate": 0.00016117660861298395, "loss": 2.736924743652344, "step": 950 }, { "epoch": 0.12, "grad_norm": 0.325527161359787, "learning_rate": 0.0001610773024799045, "loss": 2.7135137557983398, "step": 960 }, { "epoch": 0.12125, "grad_norm": 0.31715628504753113, "learning_rate": 0.000160976720838407, "loss": 2.702963638305664, "step": 970 }, { "epoch": 0.1225, "grad_norm": 0.3281555771827698, "learning_rate": 0.0001608748653201477, "loss": 2.718802261352539, "step": 980 }, { "epoch": 0.12375, "grad_norm": 0.3280923366546631, "learning_rate": 0.00016077173757744805, "loss": 2.722803497314453, "step": 990 }, { "epoch": 0.125, "grad_norm": 0.3167899549007416, "learning_rate": 0.00016066733928326755, "loss": 2.7145980834960937, "step": 1000 }, { "epoch": 0.12625, "grad_norm": 0.3199998438358307, "learning_rate": 0.0001605616721311771, "loss": 2.713690185546875, "step": 1010 }, { "epoch": 0.1275, "grad_norm": 0.33201882243156433, "learning_rate": 0.00016045473783533111, "loss": 2.7083156585693358, "step": 1020 }, { "epoch": 0.12875, "grad_norm": 0.321409672498703, "learning_rate": 0.00016034653813043993, "loss": 2.6916542053222656, "step": 1030 }, { "epoch": 0.13, "grad_norm": 0.3114752769470215, "learning_rate": 0.00016023707477174167, "loss": 2.7114416122436524, "step": 1040 }, { "epoch": 0.13125, "grad_norm": 0.3244589567184448, "learning_rate": 0.0001601263495349736, "loss": 2.678660202026367, "step": 1050 }, { "epoch": 0.1325, "grad_norm": 0.3137204945087433, "learning_rate": 0.0001600143642163435, "loss": 2.7046539306640627, "step": 1060 }, { "epoch": 0.13375, "grad_norm": 0.3140222430229187, "learning_rate": 0.0001599011206325005, "loss": 2.7146488189697267, "step": 1070 }, { "epoch": 0.135, "grad_norm": 0.31908106803894043, "learning_rate": 0.0001597866206205054, "loss": 2.713479995727539, "step": 1080 }, { "epoch": 0.13625, "grad_norm": 0.3061647415161133, "learning_rate": 0.00015967086603780128, "loss": 2.714076805114746, "step": 1090 }, { "epoch": 0.1375, "grad_norm": 0.3262089490890503, "learning_rate": 0.00015955385876218297, "loss": 2.709738540649414, "step": 1100 }, { "epoch": 0.13875, "grad_norm": 0.3090061545372009, "learning_rate": 0.0001594356006917667, "loss": 2.682490921020508, "step": 1110 }, { "epoch": 0.14, "grad_norm": 0.3089563548564911, "learning_rate": 0.00015931609374495955, "loss": 2.707094192504883, "step": 1120 }, { "epoch": 0.14125, "grad_norm": 0.3150913417339325, "learning_rate": 0.00015919533986042794, "loss": 2.6884944915771483, "step": 1130 }, { "epoch": 0.1425, "grad_norm": 0.3184945285320282, "learning_rate": 0.00015907334099706644, "loss": 2.668732833862305, "step": 1140 }, { "epoch": 0.14375, "grad_norm": 0.3181245028972626, "learning_rate": 0.00015895009913396594, "loss": 2.699263000488281, "step": 1150 }, { "epoch": 0.145, "grad_norm": 0.3286084234714508, "learning_rate": 0.00015882561627038154, "loss": 2.6974639892578125, "step": 1160 }, { "epoch": 0.14625, "grad_norm": 0.30604103207588196, "learning_rate": 0.00015869989442570008, "loss": 2.691238212585449, "step": 1170 }, { "epoch": 0.1475, "grad_norm": 0.31512096524238586, "learning_rate": 0.0001585729356394074, "loss": 2.6900882720947266, "step": 1180 }, { "epoch": 0.14875, "grad_norm": 0.324313759803772, "learning_rate": 0.0001584447419710553, "loss": 2.6862293243408204, "step": 1190 }, { "epoch": 0.15, "grad_norm": 0.32386448979377747, "learning_rate": 0.00015831531550022804, "loss": 2.7286815643310547, "step": 1200 }, { "epoch": 0.15125, "grad_norm": 0.3133200705051422, "learning_rate": 0.0001581846583265087, "loss": 2.697834014892578, "step": 1210 }, { "epoch": 0.1525, "grad_norm": 0.30789715051651, "learning_rate": 0.00015805277256944507, "loss": 2.6866151809692385, "step": 1220 }, { "epoch": 0.15375, "grad_norm": 0.3052247166633606, "learning_rate": 0.00015791966036851529, "loss": 2.7111629486083983, "step": 1230 }, { "epoch": 0.155, "grad_norm": 0.312637597322464, "learning_rate": 0.00015778532388309308, "loss": 2.6961734771728514, "step": 1240 }, { "epoch": 0.15625, "grad_norm": 0.3095453977584839, "learning_rate": 0.0001576497652924128, "loss": 2.6890350341796876, "step": 1250 }, { "epoch": 0.1575, "grad_norm": 0.31984061002731323, "learning_rate": 0.00015751298679553402, "loss": 2.6957382202148437, "step": 1260 }, { "epoch": 0.15875, "grad_norm": 0.3066132366657257, "learning_rate": 0.00015737499061130596, "loss": 2.721820068359375, "step": 1270 }, { "epoch": 0.16, "grad_norm": 0.31295251846313477, "learning_rate": 0.00015723577897833128, "loss": 2.688478469848633, "step": 1280 }, { "epoch": 0.16125, "grad_norm": 0.326561838388443, "learning_rate": 0.00015709535415493002, "loss": 2.72012939453125, "step": 1290 }, { "epoch": 0.1625, "grad_norm": 0.31419870257377625, "learning_rate": 0.0001569537184191028, "loss": 2.697279167175293, "step": 1300 }, { "epoch": 0.16375, "grad_norm": 0.3069676160812378, "learning_rate": 0.00015681087406849395, "loss": 2.6784629821777344, "step": 1310 }, { "epoch": 0.165, "grad_norm": 0.3102596402168274, "learning_rate": 0.00015666682342035414, "loss": 2.7019378662109377, "step": 1320 }, { "epoch": 0.16625, "grad_norm": 0.33090364933013916, "learning_rate": 0.00015652156881150288, "loss": 2.698979949951172, "step": 1330 }, { "epoch": 0.1675, "grad_norm": 0.3196777105331421, "learning_rate": 0.00015637511259829055, "loss": 2.670425796508789, "step": 1340 }, { "epoch": 0.16875, "grad_norm": 0.3207469582557678, "learning_rate": 0.0001562274571565603, "loss": 2.687581443786621, "step": 1350 }, { "epoch": 0.17, "grad_norm": 0.30899399518966675, "learning_rate": 0.00015607860488160927, "loss": 2.703385925292969, "step": 1360 }, { "epoch": 0.17125, "grad_norm": 0.32463735342025757, "learning_rate": 0.00015592855818815003, "loss": 2.7129638671875, "step": 1370 }, { "epoch": 0.1725, "grad_norm": 0.29863590002059937, "learning_rate": 0.00015577731951027114, "loss": 2.6898262023925783, "step": 1380 }, { "epoch": 0.17375, "grad_norm": 0.30260539054870605, "learning_rate": 0.00015562489130139783, "loss": 2.696180725097656, "step": 1390 }, { "epoch": 0.175, "grad_norm": 0.30247101187705994, "learning_rate": 0.0001554712760342521, "loss": 2.667018508911133, "step": 1400 }, { "epoch": 0.17625, "grad_norm": 0.3163856565952301, "learning_rate": 0.0001553164762008128, "loss": 2.7117942810058593, "step": 1410 }, { "epoch": 0.1775, "grad_norm": 0.31918948888778687, "learning_rate": 0.0001551604943122748, "loss": 2.6868515014648438, "step": 1420 }, { "epoch": 0.17875, "grad_norm": 0.3069145083427429, "learning_rate": 0.00015500333289900878, "loss": 2.665867042541504, "step": 1430 }, { "epoch": 0.18, "grad_norm": 0.3310893774032593, "learning_rate": 0.00015484499451051976, "loss": 2.6680227279663087, "step": 1440 }, { "epoch": 0.18125, "grad_norm": 0.32211220264434814, "learning_rate": 0.00015468548171540595, "loss": 2.7012916564941407, "step": 1450 }, { "epoch": 0.1825, "grad_norm": 0.3143543303012848, "learning_rate": 0.00015452479710131699, "loss": 2.711798667907715, "step": 1460 }, { "epoch": 0.18375, "grad_norm": 0.33350202441215515, "learning_rate": 0.00015436294327491207, "loss": 2.692435455322266, "step": 1470 }, { "epoch": 0.185, "grad_norm": 0.3231949508190155, "learning_rate": 0.00015419992286181756, "loss": 2.6712711334228514, "step": 1480 }, { "epoch": 0.18625, "grad_norm": 0.3143308758735657, "learning_rate": 0.00015403573850658438, "loss": 2.6955425262451174, "step": 1490 }, { "epoch": 0.1875, "grad_norm": 0.3118044137954712, "learning_rate": 0.0001538703928726452, "loss": 2.6801069259643553, "step": 1500 }, { "epoch": 0.18875, "grad_norm": 0.3099926710128784, "learning_rate": 0.00015370388864227133, "loss": 2.669751739501953, "step": 1510 }, { "epoch": 0.19, "grad_norm": 0.31752023100852966, "learning_rate": 0.0001535362285165288, "loss": 2.6963922500610353, "step": 1520 }, { "epoch": 0.19125, "grad_norm": 0.3166843056678772, "learning_rate": 0.00015336741521523506, "loss": 2.6759317398071287, "step": 1530 }, { "epoch": 0.1925, "grad_norm": 0.30386143922805786, "learning_rate": 0.0001531974514769145, "loss": 2.663748359680176, "step": 1540 }, { "epoch": 0.19375, "grad_norm": 0.3149690628051758, "learning_rate": 0.0001530263400587541, "loss": 2.672575759887695, "step": 1550 }, { "epoch": 0.195, "grad_norm": 0.32157933712005615, "learning_rate": 0.0001528540837365589, "loss": 2.7002744674682617, "step": 1560 }, { "epoch": 0.19625, "grad_norm": 0.31378722190856934, "learning_rate": 0.0001526806853047066, "loss": 2.7025676727294923, "step": 1570 }, { "epoch": 0.1975, "grad_norm": 0.313424676656723, "learning_rate": 0.00015250614757610258, "loss": 2.7100372314453125, "step": 1580 }, { "epoch": 0.19875, "grad_norm": 0.32746565341949463, "learning_rate": 0.00015233047338213414, "loss": 2.721282196044922, "step": 1590 }, { "epoch": 0.2, "grad_norm": 0.3191785216331482, "learning_rate": 0.00015215366557262444, "loss": 2.6832775115966796, "step": 1600 }, { "epoch": 0.20125, "grad_norm": 0.3307384252548218, "learning_rate": 0.00015197572701578654, "loss": 2.683314323425293, "step": 1610 }, { "epoch": 0.2025, "grad_norm": 0.3074938952922821, "learning_rate": 0.00015179666059817658, "loss": 2.6983566284179688, "step": 1620 }, { "epoch": 0.20375, "grad_norm": 0.31642141938209534, "learning_rate": 0.00015161646922464713, "loss": 2.67681770324707, "step": 1630 }, { "epoch": 0.205, "grad_norm": 0.3204726576805115, "learning_rate": 0.0001514351558183001, "loss": 2.673402786254883, "step": 1640 }, { "epoch": 0.20625, "grad_norm": 0.31102851033210754, "learning_rate": 0.00015125272332043916, "loss": 2.6676706314086913, "step": 1650 }, { "epoch": 0.2075, "grad_norm": 0.31576183438301086, "learning_rate": 0.00015106917469052215, "loss": 2.691006088256836, "step": 1660 }, { "epoch": 0.20875, "grad_norm": 0.3049616515636444, "learning_rate": 0.00015088451290611304, "loss": 2.6852401733398437, "step": 1670 }, { "epoch": 0.21, "grad_norm": 0.32038211822509766, "learning_rate": 0.00015069874096283362, "loss": 2.6850494384765624, "step": 1680 }, { "epoch": 0.21125, "grad_norm": 0.31499341130256653, "learning_rate": 0.00015051186187431495, "loss": 2.685712432861328, "step": 1690 }, { "epoch": 0.2125, "grad_norm": 0.3252309262752533, "learning_rate": 0.0001503238786721483, "loss": 2.6800838470458985, "step": 1700 }, { "epoch": 0.21375, "grad_norm": 0.33030372858047485, "learning_rate": 0.00015013479440583626, "loss": 2.6957000732421874, "step": 1710 }, { "epoch": 0.215, "grad_norm": 0.31104838848114014, "learning_rate": 0.00014994461214274302, "loss": 2.6724735260009767, "step": 1720 }, { "epoch": 0.21625, "grad_norm": 0.31927284598350525, "learning_rate": 0.00014975333496804468, "loss": 2.6581308364868166, "step": 1730 }, { "epoch": 0.2175, "grad_norm": 0.3242516815662384, "learning_rate": 0.00014956096598467932, "loss": 2.6579944610595705, "step": 1740 }, { "epoch": 0.21875, "grad_norm": 0.3098279535770416, "learning_rate": 0.00014936750831329645, "loss": 2.6656078338623046, "step": 1750 }, { "epoch": 0.22, "grad_norm": 0.309610515832901, "learning_rate": 0.0001491729650922066, "loss": 2.6563575744628904, "step": 1760 }, { "epoch": 0.22125, "grad_norm": 0.31657662987709045, "learning_rate": 0.00014897733947733031, "loss": 2.6570175170898436, "step": 1770 }, { "epoch": 0.2225, "grad_norm": 0.31096142530441284, "learning_rate": 0.00014878063464214683, "loss": 2.6638370513916017, "step": 1780 }, { "epoch": 0.22375, "grad_norm": 0.3048711121082306, "learning_rate": 0.00014858285377764284, "loss": 2.6526607513427733, "step": 1790 }, { "epoch": 0.225, "grad_norm": 0.32042643427848816, "learning_rate": 0.0001483840000922606, "loss": 2.6601219177246094, "step": 1800 }, { "epoch": 0.22625, "grad_norm": 0.324494332075119, "learning_rate": 0.00014818407681184585, "loss": 2.6538795471191405, "step": 1810 }, { "epoch": 0.2275, "grad_norm": 0.3241287171840668, "learning_rate": 0.00014798308717959552, "loss": 2.678963851928711, "step": 1820 }, { "epoch": 0.22875, "grad_norm": 0.31064486503601074, "learning_rate": 0.00014778103445600512, "loss": 2.6616994857788088, "step": 1830 }, { "epoch": 0.23, "grad_norm": 0.31154972314834595, "learning_rate": 0.0001475779219188159, "loss": 2.6822179794311523, "step": 1840 }, { "epoch": 0.23125, "grad_norm": 0.32366329431533813, "learning_rate": 0.00014737375286296158, "loss": 2.689762496948242, "step": 1850 }, { "epoch": 0.2325, "grad_norm": 0.3157241642475128, "learning_rate": 0.00014716853060051493, "loss": 2.6725814819335936, "step": 1860 }, { "epoch": 0.23375, "grad_norm": 0.31811729073524475, "learning_rate": 0.0001469622584606341, "loss": 2.6730297088623045, "step": 1870 }, { "epoch": 0.235, "grad_norm": 0.3240484893321991, "learning_rate": 0.00014675493978950855, "loss": 2.6649261474609376, "step": 1880 }, { "epoch": 0.23625, "grad_norm": 0.3145361542701721, "learning_rate": 0.0001465465779503048, "loss": 2.6716739654541017, "step": 1890 }, { "epoch": 0.2375, "grad_norm": 0.30439531803131104, "learning_rate": 0.0001463371763231118, "loss": 2.6668254852294924, "step": 1900 }, { "epoch": 0.23875, "grad_norm": 0.3104805052280426, "learning_rate": 0.00014612673830488625, "loss": 2.6472827911376955, "step": 1910 }, { "epoch": 0.24, "grad_norm": 0.3249180316925049, "learning_rate": 0.00014591526730939734, "loss": 2.6549278259277345, "step": 1920 }, { "epoch": 0.24125, "grad_norm": 0.31549057364463806, "learning_rate": 0.00014570276676717145, "loss": 2.672433853149414, "step": 1930 }, { "epoch": 0.2425, "grad_norm": 0.32735103368759155, "learning_rate": 0.00014548924012543646, "loss": 2.6650619506835938, "step": 1940 }, { "epoch": 0.24375, "grad_norm": 0.3208616375923157, "learning_rate": 0.00014527469084806585, "loss": 2.6924251556396483, "step": 1950 }, { "epoch": 1.000875, "grad_norm": 0.3361559808254242, "learning_rate": 0.00014505912241552255, "loss": 2.918643760681152, "step": 1960 }, { "epoch": 1.002125, "grad_norm": 0.32232168316841125, "learning_rate": 0.00014484253832480244, "loss": 2.6152179718017576, "step": 1970 }, { "epoch": 1.003375, "grad_norm": 0.32902058959007263, "learning_rate": 0.0001446249420893775, "loss": 2.6433155059814455, "step": 1980 }, { "epoch": 1.004625, "grad_norm": 0.31211215257644653, "learning_rate": 0.0001444063372391391, "loss": 2.5884145736694335, "step": 1990 }, { "epoch": 1.005875, "grad_norm": 0.32412853837013245, "learning_rate": 0.00014418672732034043, "loss": 2.5942047119140623, "step": 2000 }, { "epoch": 1.007125, "grad_norm": 0.32079222798347473, "learning_rate": 0.0001439661158955392, "loss": 2.5999183654785156, "step": 2010 }, { "epoch": 1.008375, "grad_norm": 0.3363247811794281, "learning_rate": 0.00014374450654353968, "loss": 2.5693603515625, "step": 2020 }, { "epoch": 1.009625, "grad_norm": 0.3330596685409546, "learning_rate": 0.00014352190285933487, "loss": 2.577710723876953, "step": 2030 }, { "epoch": 1.010875, "grad_norm": 0.31830593943595886, "learning_rate": 0.00014329830845404782, "loss": 2.580182647705078, "step": 2040 }, { "epoch": 1.012125, "grad_norm": 0.3276713490486145, "learning_rate": 0.00014307372695487343, "loss": 2.5742984771728517, "step": 2050 }, { "epoch": 1.013375, "grad_norm": 0.32609084248542786, "learning_rate": 0.00014284816200501937, "loss": 2.5697860717773438, "step": 2060 }, { "epoch": 1.014625, "grad_norm": 0.32425832748413086, "learning_rate": 0.00014262161726364707, "loss": 2.5537353515625, "step": 2070 }, { "epoch": 1.015875, "grad_norm": 0.3417907953262329, "learning_rate": 0.00014239409640581238, "loss": 2.5780372619628906, "step": 2080 }, { "epoch": 1.017125, "grad_norm": 0.3302324116230011, "learning_rate": 0.0001421656031224058, "loss": 2.5682140350341798, "step": 2090 }, { "epoch": 1.018375, "grad_norm": 0.33167314529418945, "learning_rate": 0.00014193614112009283, "loss": 2.545709228515625, "step": 2100 }, { "epoch": 1.019625, "grad_norm": 0.3396015763282776, "learning_rate": 0.00014170571412125367, "loss": 2.544954299926758, "step": 2110 }, { "epoch": 1.020875, "grad_norm": 0.33836308121681213, "learning_rate": 0.00014147432586392297, "loss": 2.5545772552490233, "step": 2120 }, { "epoch": 1.022125, "grad_norm": 0.3312232196331024, "learning_rate": 0.00014124198010172898, "loss": 2.559113883972168, "step": 2130 }, { "epoch": 1.023375, "grad_norm": 0.33059218525886536, "learning_rate": 0.00014100868060383292, "loss": 2.533283805847168, "step": 2140 }, { "epoch": 1.024625, "grad_norm": 0.32571902871131897, "learning_rate": 0.00014077443115486767, "loss": 2.551566314697266, "step": 2150 }, { "epoch": 1.025875, "grad_norm": 0.3243643045425415, "learning_rate": 0.00014053923555487638, "loss": 2.564662551879883, "step": 2160 }, { "epoch": 1.027125, "grad_norm": 0.31755268573760986, "learning_rate": 0.0001403030976192509, "loss": 2.522117042541504, "step": 2170 }, { "epoch": 1.028375, "grad_norm": 0.34630700945854187, "learning_rate": 0.00014006602117866982, "loss": 2.529287910461426, "step": 2180 }, { "epoch": 1.029625, "grad_norm": 0.33032891154289246, "learning_rate": 0.0001398280100790363, "loss": 2.521525192260742, "step": 2190 }, { "epoch": 1.030875, "grad_norm": 0.3408825993537903, "learning_rate": 0.0001395890681814159, "loss": 2.5370689392089845, "step": 2200 }, { "epoch": 1.032125, "grad_norm": 0.3269711434841156, "learning_rate": 0.0001393491993619736, "loss": 2.5100967407226564, "step": 2210 }, { "epoch": 1.033375, "grad_norm": 0.32242265343666077, "learning_rate": 0.0001391084075119112, "loss": 2.5302288055419924, "step": 2220 }, { "epoch": 1.034625, "grad_norm": 0.3222724199295044, "learning_rate": 0.000138866696537404, "loss": 2.517455291748047, "step": 2230 }, { "epoch": 1.035875, "grad_norm": 0.3199198246002197, "learning_rate": 0.0001386240703595377, "loss": 2.5055145263671874, "step": 2240 }, { "epoch": 1.037125, "grad_norm": 0.33094459772109985, "learning_rate": 0.0001383805329142444, "loss": 2.5067977905273438, "step": 2250 }, { "epoch": 1.038375, "grad_norm": 0.33781564235687256, "learning_rate": 0.00013813608815223914, "loss": 2.4964527130126952, "step": 2260 }, { "epoch": 1.039625, "grad_norm": 0.34187182784080505, "learning_rate": 0.00013789074003895557, "loss": 2.4876964569091795, "step": 2270 }, { "epoch": 1.040875, "grad_norm": 0.3467255234718323, "learning_rate": 0.00013764449255448166, "loss": 2.527250862121582, "step": 2280 }, { "epoch": 1.042125, "grad_norm": 0.34287887811660767, "learning_rate": 0.00013739734969349526, "loss": 2.5136051177978516, "step": 2290 }, { "epoch": 1.043375, "grad_norm": 0.3415592908859253, "learning_rate": 0.0001371493154651991, "loss": 2.5083173751831054, "step": 2300 }, { "epoch": 1.044625, "grad_norm": 0.34434187412261963, "learning_rate": 0.00013690039389325595, "loss": 2.491905403137207, "step": 2310 }, { "epoch": 1.045875, "grad_norm": 0.35805854201316833, "learning_rate": 0.0001366505890157232, "loss": 2.509074401855469, "step": 2320 }, { "epoch": 1.047125, "grad_norm": 0.3360929787158966, "learning_rate": 0.00013639990488498738, "loss": 2.5023418426513673, "step": 2330 }, { "epoch": 1.048375, "grad_norm": 0.33336424827575684, "learning_rate": 0.00013614834556769853, "loss": 2.5313945770263673, "step": 2340 }, { "epoch": 1.049625, "grad_norm": 0.3515946567058563, "learning_rate": 0.00013589591514470408, "loss": 2.49786491394043, "step": 2350 }, { "epoch": 1.050875, "grad_norm": 0.3500429391860962, "learning_rate": 0.00013564261771098268, "loss": 2.501786804199219, "step": 2360 }, { "epoch": 1.052125, "grad_norm": 0.3275656998157501, "learning_rate": 0.00013538845737557796, "loss": 2.511077117919922, "step": 2370 }, { "epoch": 1.053375, "grad_norm": 0.3502793610095978, "learning_rate": 0.00013513343826153157, "loss": 2.4827537536621094, "step": 2380 }, { "epoch": 1.054625, "grad_norm": 0.3351482152938843, "learning_rate": 0.0001348775645058165, "loss": 2.5033424377441404, "step": 2390 }, { "epoch": 1.055875, "grad_norm": 0.3501179814338684, "learning_rate": 0.00013462084025927, "loss": 2.4896453857421874, "step": 2400 }, { "epoch": 1.057125, "grad_norm": 0.3435039222240448, "learning_rate": 0.00013436326968652593, "loss": 2.5125568389892576, "step": 2410 }, { "epoch": 1.058375, "grad_norm": 0.34035417437553406, "learning_rate": 0.00013410485696594768, "loss": 2.4909286499023438, "step": 2420 }, { "epoch": 1.059625, "grad_norm": 0.33129122853279114, "learning_rate": 0.00013384560628956, "loss": 2.556411361694336, "step": 2430 }, { "epoch": 1.060875, "grad_norm": 0.3542681932449341, "learning_rate": 0.0001335855218629812, "loss": 2.469993782043457, "step": 2440 }, { "epoch": 1.062125, "grad_norm": 0.3372875154018402, "learning_rate": 0.00013332460790535473, "loss": 2.4866575241088866, "step": 2450 }, { "epoch": 1.063375, "grad_norm": 0.3469390273094177, "learning_rate": 0.000133062868649281, "loss": 2.4757783889770506, "step": 2460 }, { "epoch": 1.064625, "grad_norm": 0.3474292457103729, "learning_rate": 0.0001328003083407486, "loss": 2.4724506378173827, "step": 2470 }, { "epoch": 1.065875, "grad_norm": 0.3638366758823395, "learning_rate": 0.0001325369312390653, "loss": 2.5047348022460936, "step": 2480 }, { "epoch": 1.067125, "grad_norm": 0.33961430191993713, "learning_rate": 0.0001322727416167891, "loss": 2.50977783203125, "step": 2490 }, { "epoch": 1.068375, "grad_norm": 0.34472352266311646, "learning_rate": 0.00013200774375965883, "loss": 2.4912172317504884, "step": 2500 }, { "epoch": 1.069625, "grad_norm": 0.3468291461467743, "learning_rate": 0.00013174194196652477, "loss": 2.4802589416503906, "step": 2510 }, { "epoch": 1.070875, "grad_norm": 0.3530188500881195, "learning_rate": 0.00013147534054927878, "loss": 2.4657310485839843, "step": 2520 }, { "epoch": 1.072125, "grad_norm": 0.3488495647907257, "learning_rate": 0.00013120794383278438, "loss": 2.4873979568481444, "step": 2530 }, { "epoch": 1.073375, "grad_norm": 0.3713776171207428, "learning_rate": 0.0001309397561548066, "loss": 2.47833137512207, "step": 2540 }, { "epoch": 1.074625, "grad_norm": 0.33978715538978577, "learning_rate": 0.00013067078186594156, "loss": 2.4833608627319337, "step": 2550 }, { "epoch": 1.075875, "grad_norm": 0.34602415561676025, "learning_rate": 0.000130401025329546, "loss": 2.50838623046875, "step": 2560 }, { "epoch": 1.077125, "grad_norm": 0.33973225951194763, "learning_rate": 0.00013013049092166652, "loss": 2.4615432739257814, "step": 2570 }, { "epoch": 1.078375, "grad_norm": 0.35132381319999695, "learning_rate": 0.00012985918303096833, "loss": 2.4790775299072267, "step": 2580 }, { "epoch": 1.079625, "grad_norm": 0.3466247618198395, "learning_rate": 0.00012958710605866436, "loss": 2.4747478485107424, "step": 2590 }, { "epoch": 1.080875, "grad_norm": 0.3456020951271057, "learning_rate": 0.00012931426441844374, "loss": 2.5099910736083983, "step": 2600 }, { "epoch": 1.082125, "grad_norm": 0.3543621301651001, "learning_rate": 0.00012904066253640017, "loss": 2.4894287109375, "step": 2610 }, { "epoch": 1.083375, "grad_norm": 0.3563762605190277, "learning_rate": 0.00012876630485096017, "loss": 2.476998138427734, "step": 2620 }, { "epoch": 1.084625, "grad_norm": 0.35365182161331177, "learning_rate": 0.000128491195812811, "loss": 2.479985809326172, "step": 2630 }, { "epoch": 1.085875, "grad_norm": 0.3529748320579529, "learning_rate": 0.00012821533988482863, "loss": 2.4728267669677733, "step": 2640 }, { "epoch": 1.087125, "grad_norm": 0.33992883563041687, "learning_rate": 0.00012793874154200515, "loss": 2.4903228759765623, "step": 2650 }, { "epoch": 1.088375, "grad_norm": 0.3493664562702179, "learning_rate": 0.00012766140527137627, "loss": 2.4863492965698244, "step": 2660 }, { "epoch": 1.089625, "grad_norm": 0.3460827171802521, "learning_rate": 0.00012738333557194855, "loss": 2.449415588378906, "step": 2670 }, { "epoch": 1.090875, "grad_norm": 0.34357935190200806, "learning_rate": 0.00012710453695462633, "loss": 2.463718795776367, "step": 2680 }, { "epoch": 1.092125, "grad_norm": 0.3472927510738373, "learning_rate": 0.00012682501394213866, "loss": 2.445463943481445, "step": 2690 }, { "epoch": 1.093375, "grad_norm": 0.350157767534256, "learning_rate": 0.00012654477106896584, "loss": 2.4972408294677733, "step": 2700 }, { "epoch": 1.094625, "grad_norm": 0.34597107768058777, "learning_rate": 0.00012626381288126593, "loss": 2.487579917907715, "step": 2710 }, { "epoch": 1.095875, "grad_norm": 0.3472701609134674, "learning_rate": 0.00012598214393680097, "loss": 2.493511199951172, "step": 2720 }, { "epoch": 1.097125, "grad_norm": 0.35611027479171753, "learning_rate": 0.00012569976880486298, "loss": 2.4602516174316404, "step": 2730 }, { "epoch": 1.098375, "grad_norm": 0.34450188279151917, "learning_rate": 0.00012541669206620002, "loss": 2.47379093170166, "step": 2740 }, { "epoch": 1.099625, "grad_norm": 0.3402315080165863, "learning_rate": 0.0001251329183129416, "loss": 2.4741775512695314, "step": 2750 }, { "epoch": 1.100875, "grad_norm": 0.36293742060661316, "learning_rate": 0.00012484845214852453, "loss": 2.478403854370117, "step": 2760 }, { "epoch": 1.102125, "grad_norm": 0.34437844157218933, "learning_rate": 0.00012456329818761794, "loss": 2.489897918701172, "step": 2770 }, { "epoch": 1.103375, "grad_norm": 0.34425976872444153, "learning_rate": 0.0001242774610560485, "loss": 2.484636688232422, "step": 2780 }, { "epoch": 1.104625, "grad_norm": 0.3487798273563385, "learning_rate": 0.00012399094539072557, "loss": 2.4807788848876955, "step": 2790 }, { "epoch": 1.105875, "grad_norm": 0.36608222126960754, "learning_rate": 0.00012370375583956562, "loss": 2.498831939697266, "step": 2800 } ], "logging_steps": 10, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.7508190343633306e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }