| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.105875, |
| "eval_steps": 500, |
| "global_step": 2800, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00125, |
| "grad_norm": 0.3465445339679718, |
| "learning_rate": 7.4204999999999995e-06, |
| "loss": 2.7873512268066407, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 0.32606860995292664, |
| "learning_rate": 1.56655e-05, |
| "loss": 2.760304069519043, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.00375, |
| "grad_norm": 0.33225658535957336, |
| "learning_rate": 2.3910499999999997e-05, |
| "loss": 2.7759071350097657, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 0.31996211409568787, |
| "learning_rate": 3.21555e-05, |
| "loss": 2.7292430877685545, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.00625, |
| "grad_norm": 0.3153120279312134, |
| "learning_rate": 4.04005e-05, |
| "loss": 2.733371353149414, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 0.3135412037372589, |
| "learning_rate": 4.8645499999999994e-05, |
| "loss": 2.7492229461669924, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.00875, |
| "grad_norm": 0.3155956268310547, |
| "learning_rate": 5.6890499999999993e-05, |
| "loss": 2.7486228942871094, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.3149002194404602, |
| "learning_rate": 6.51355e-05, |
| "loss": 2.760879898071289, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.01125, |
| "grad_norm": 0.3194095492362976, |
| "learning_rate": 7.33805e-05, |
| "loss": 2.734035873413086, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 0.3121851980686188, |
| "learning_rate": 8.16255e-05, |
| "loss": 2.7368759155273437, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.01375, |
| "grad_norm": 0.3184032440185547, |
| "learning_rate": 8.98705e-05, |
| "loss": 2.736837387084961, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 0.3170839250087738, |
| "learning_rate": 9.81155e-05, |
| "loss": 2.7051807403564454, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.01625, |
| "grad_norm": 0.3180184066295624, |
| "learning_rate": 0.0001063605, |
| "loss": 2.7603172302246093, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.0175, |
| "grad_norm": 0.31468942761421204, |
| "learning_rate": 0.00011460549999999999, |
| "loss": 2.7113197326660154, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.01875, |
| "grad_norm": 0.31729385256767273, |
| "learning_rate": 0.00012285049999999999, |
| "loss": 2.7222190856933595, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.3197655975818634, |
| "learning_rate": 0.0001310955, |
| "loss": 2.7241241455078127, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.02125, |
| "grad_norm": 0.3256337642669678, |
| "learning_rate": 0.00013934049999999998, |
| "loss": 2.7403392791748047, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0225, |
| "grad_norm": 0.3098828196525574, |
| "learning_rate": 0.0001475855, |
| "loss": 2.7496837615966796, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.02375, |
| "grad_norm": 0.3134927749633789, |
| "learning_rate": 0.00015583049999999998, |
| "loss": 2.750768280029297, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 0.3353506922721863, |
| "learning_rate": 0.0001640755, |
| "loss": 2.7532047271728515, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.02625, |
| "grad_norm": 0.3203900456428528, |
| "learning_rate": 0.0001648994583038516, |
| "loss": 2.7489036560058593, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.0275, |
| "grad_norm": 0.3266359865665436, |
| "learning_rate": 0.00016489758578309418, |
| "loss": 2.7316030502319335, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.02875, |
| "grad_norm": 0.31027814745903015, |
| "learning_rate": 0.00016489437578049018, |
| "loss": 2.7714206695556642, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.325736939907074, |
| "learning_rate": 0.0001648898283481129, |
| "loss": 2.7439931869506835, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.03125, |
| "grad_norm": 0.30528295040130615, |
| "learning_rate": 0.00016488394355973176, |
| "loss": 2.766144943237305, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.0325, |
| "grad_norm": 0.31271758675575256, |
| "learning_rate": 0.000164876721510811, |
| "loss": 2.7013065338134767, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.03375, |
| "grad_norm": 0.3198724687099457, |
| "learning_rate": 0.0001648681623185082, |
| "loss": 2.7379714965820314, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 0.33557799458503723, |
| "learning_rate": 0.00016485826612167237, |
| "loss": 2.76102352142334, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.03625, |
| "grad_norm": 0.3212190568447113, |
| "learning_rate": 0.00016484703308084162, |
| "loss": 2.7475757598876953, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.0375, |
| "grad_norm": 0.31533411145210266, |
| "learning_rate": 0.00016483446337824071, |
| "loss": 2.747650718688965, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.03875, |
| "grad_norm": 0.33507952094078064, |
| "learning_rate": 0.00016482055721777798, |
| "loss": 2.739873504638672, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.31843748688697815, |
| "learning_rate": 0.00016480531482504198, |
| "loss": 2.7478389739990234, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.04125, |
| "grad_norm": 0.3268890380859375, |
| "learning_rate": 0.00016478873644729805, |
| "loss": 2.7712429046630858, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.0425, |
| "grad_norm": 0.315518319606781, |
| "learning_rate": 0.00016477082235348404, |
| "loss": 2.7189746856689454, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.04375, |
| "grad_norm": 0.31345367431640625, |
| "learning_rate": 0.0001647515728342061, |
| "loss": 2.7359670639038085, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 0.325610876083374, |
| "learning_rate": 0.0001647309882017339, |
| "loss": 2.748139572143555, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.04625, |
| "grad_norm": 0.3194393813610077, |
| "learning_rate": 0.00016470906878999564, |
| "loss": 2.7462692260742188, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.0475, |
| "grad_norm": 0.3070792555809021, |
| "learning_rate": 0.0001646858149545726, |
| "loss": 2.757720184326172, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.04875, |
| "grad_norm": 0.32982465624809265, |
| "learning_rate": 0.00016466122707269328, |
| "loss": 2.7279708862304686, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.309640496969223, |
| "learning_rate": 0.0001646353055432274, |
| "loss": 2.724739837646484, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.05125, |
| "grad_norm": 0.31954118609428406, |
| "learning_rate": 0.00016460805078667945, |
| "loss": 2.7295236587524414, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.0525, |
| "grad_norm": 0.30906999111175537, |
| "learning_rate": 0.00016457946324518165, |
| "loss": 2.734362030029297, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.05375, |
| "grad_norm": 0.3177924156188965, |
| "learning_rate": 0.00016454954338248712, |
| "loss": 2.7312435150146483, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 0.3104606866836548, |
| "learning_rate": 0.00016451829168396203, |
| "loss": 2.7339248657226562, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.05625, |
| "grad_norm": 0.31935980916023254, |
| "learning_rate": 0.0001644857086565779, |
| "loss": 2.762462043762207, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.0575, |
| "grad_norm": 0.321206659078598, |
| "learning_rate": 0.0001644517948289035, |
| "loss": 2.7401878356933596, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.05875, |
| "grad_norm": 0.31553006172180176, |
| "learning_rate": 0.00016441655075109576, |
| "loss": 2.7154884338378906, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.3156311810016632, |
| "learning_rate": 0.0001643799769948916, |
| "loss": 2.731028747558594, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.06125, |
| "grad_norm": 0.31830424070358276, |
| "learning_rate": 0.00016434207415359802, |
| "loss": 2.748556137084961, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 0.3151983320713043, |
| "learning_rate": 0.0001643028428420828, |
| "loss": 2.7336639404296874, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.06375, |
| "grad_norm": 0.32100728154182434, |
| "learning_rate": 0.00016426228369676436, |
| "loss": 2.733713150024414, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 0.3150577247142792, |
| "learning_rate": 0.00016422039737560163, |
| "loss": 2.747536849975586, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.06625, |
| "grad_norm": 0.32159915566444397, |
| "learning_rate": 0.0001641771845580832, |
| "loss": 2.7145294189453124, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.0675, |
| "grad_norm": 0.32887545228004456, |
| "learning_rate": 0.0001641326459452163, |
| "loss": 2.7391708374023436, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.06875, |
| "grad_norm": 0.3189705014228821, |
| "learning_rate": 0.00016408678225951563, |
| "loss": 2.724725341796875, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.3386867940425873, |
| "learning_rate": 0.0001640395942449914, |
| "loss": 2.7544118881225588, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.07125, |
| "grad_norm": 0.30630990862846375, |
| "learning_rate": 0.00016399108266713735, |
| "loss": 2.746489715576172, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.0725, |
| "grad_norm": 0.3187973201274872, |
| "learning_rate": 0.00016394124831291837, |
| "loss": 2.7217391967773437, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.07375, |
| "grad_norm": 0.316847562789917, |
| "learning_rate": 0.00016389009199075774, |
| "loss": 2.7319801330566404, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 0.3210533559322357, |
| "learning_rate": 0.00016383761453052384, |
| "loss": 2.7253528594970704, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.07625, |
| "grad_norm": 0.30917614698410034, |
| "learning_rate": 0.00016378381678351702, |
| "loss": 2.7291168212890624, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.0775, |
| "grad_norm": 0.3088016211986542, |
| "learning_rate": 0.0001637286996224554, |
| "loss": 2.696218490600586, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.07875, |
| "grad_norm": 0.32467445731163025, |
| "learning_rate": 0.0001636722639414611, |
| "loss": 2.7149139404296876, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.3138329088687897, |
| "learning_rate": 0.0001636145106560454, |
| "loss": 2.73681755065918, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.08125, |
| "grad_norm": 0.3167824447154999, |
| "learning_rate": 0.0001635554407030941, |
| "loss": 2.7229454040527346, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.0825, |
| "grad_norm": 0.3144330680370331, |
| "learning_rate": 0.0001634950550408522, |
| "loss": 2.6987558364868165, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.08375, |
| "grad_norm": 0.311829149723053, |
| "learning_rate": 0.00016343335464890846, |
| "loss": 2.706182861328125, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 0.3265558183193207, |
| "learning_rate": 0.00016337034052817947, |
| "loss": 2.7086441040039064, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.08625, |
| "grad_norm": 0.3073708117008209, |
| "learning_rate": 0.00016330601370089334, |
| "loss": 2.7448238372802733, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.0875, |
| "grad_norm": 0.30871179699897766, |
| "learning_rate": 0.0001632403752105732, |
| "loss": 2.7313838958740235, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.08875, |
| "grad_norm": 0.31331929564476013, |
| "learning_rate": 0.00016317342612202036, |
| "loss": 2.7109472274780275, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.3069651424884796, |
| "learning_rate": 0.0001631051675212967, |
| "loss": 2.698355865478516, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.09125, |
| "grad_norm": 0.3077262341976166, |
| "learning_rate": 0.00016303560051570746, |
| "loss": 2.707406997680664, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.0925, |
| "grad_norm": 0.3193919062614441, |
| "learning_rate": 0.00016296472623378308, |
| "loss": 2.709014129638672, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.09375, |
| "grad_norm": 0.31028079986572266, |
| "learning_rate": 0.0001628925458252608, |
| "loss": 2.7283496856689453, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 0.3203696310520172, |
| "learning_rate": 0.00016281906046106622, |
| "loss": 2.723176193237305, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.09625, |
| "grad_norm": 0.31216055154800415, |
| "learning_rate": 0.0001627442713332942, |
| "loss": 2.740637016296387, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.0975, |
| "grad_norm": 0.3120918869972229, |
| "learning_rate": 0.00016266817965518942, |
| "loss": 2.720622444152832, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.09875, |
| "grad_norm": 0.3088921308517456, |
| "learning_rate": 0.00016259078666112692, |
| "loss": 2.714591217041016, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.30949750542640686, |
| "learning_rate": 0.00016251209360659192, |
| "loss": 2.7191795349121093, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.10125, |
| "grad_norm": 0.32115787267684937, |
| "learning_rate": 0.00016243210176815944, |
| "loss": 2.6966245651245115, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.1025, |
| "grad_norm": 0.307424396276474, |
| "learning_rate": 0.00016235081244347373, |
| "loss": 2.730236625671387, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.10375, |
| "grad_norm": 0.31429022550582886, |
| "learning_rate": 0.00016226822695122704, |
| "loss": 2.691334533691406, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 0.30951419472694397, |
| "learning_rate": 0.00016218434663113843, |
| "loss": 2.690280532836914, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.10625, |
| "grad_norm": 0.31636205315589905, |
| "learning_rate": 0.00016209917284393176, |
| "loss": 2.7146608352661135, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.1075, |
| "grad_norm": 0.31698304414749146, |
| "learning_rate": 0.00016201270697131396, |
| "loss": 2.739955520629883, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.10875, |
| "grad_norm": 0.30591675639152527, |
| "learning_rate": 0.00016192495041595235, |
| "loss": 2.725113868713379, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.3064011037349701, |
| "learning_rate": 0.00016183590460145194, |
| "loss": 2.7186939239501955, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.11125, |
| "grad_norm": 0.30639246106147766, |
| "learning_rate": 0.00016174557097233246, |
| "loss": 2.713937187194824, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.1125, |
| "grad_norm": 0.3199147880077362, |
| "learning_rate": 0.00016165395099400478, |
| "loss": 2.7232639312744142, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.11375, |
| "grad_norm": 0.3103027045726776, |
| "learning_rate": 0.00016156104615274719, |
| "loss": 2.7207107543945312, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 0.32256069779396057, |
| "learning_rate": 0.0001614668579556813, |
| "loss": 2.7164112091064454, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.11625, |
| "grad_norm": 0.31795644760131836, |
| "learning_rate": 0.0001613713879307476, |
| "loss": 2.704681396484375, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.1175, |
| "grad_norm": 0.32402339577674866, |
| "learning_rate": 0.00016127463762668064, |
| "loss": 2.733686065673828, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.11875, |
| "grad_norm": 0.33603930473327637, |
| "learning_rate": 0.00016117660861298395, |
| "loss": 2.736924743652344, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.325527161359787, |
| "learning_rate": 0.0001610773024799045, |
| "loss": 2.7135137557983398, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.12125, |
| "grad_norm": 0.31715628504753113, |
| "learning_rate": 0.000160976720838407, |
| "loss": 2.702963638305664, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.1225, |
| "grad_norm": 0.3281555771827698, |
| "learning_rate": 0.0001608748653201477, |
| "loss": 2.718802261352539, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.12375, |
| "grad_norm": 0.3280923366546631, |
| "learning_rate": 0.00016077173757744805, |
| "loss": 2.722803497314453, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 0.3167899549007416, |
| "learning_rate": 0.00016066733928326755, |
| "loss": 2.7145980834960937, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.12625, |
| "grad_norm": 0.3199998438358307, |
| "learning_rate": 0.0001605616721311771, |
| "loss": 2.713690185546875, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.1275, |
| "grad_norm": 0.33201882243156433, |
| "learning_rate": 0.00016045473783533111, |
| "loss": 2.7083156585693358, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.12875, |
| "grad_norm": 0.321409672498703, |
| "learning_rate": 0.00016034653813043993, |
| "loss": 2.6916542053222656, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.3114752769470215, |
| "learning_rate": 0.00016023707477174167, |
| "loss": 2.7114416122436524, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.13125, |
| "grad_norm": 0.3244589567184448, |
| "learning_rate": 0.0001601263495349736, |
| "loss": 2.678660202026367, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.1325, |
| "grad_norm": 0.3137204945087433, |
| "learning_rate": 0.0001600143642163435, |
| "loss": 2.7046539306640627, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.13375, |
| "grad_norm": 0.3140222430229187, |
| "learning_rate": 0.0001599011206325005, |
| "loss": 2.7146488189697267, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 0.31908106803894043, |
| "learning_rate": 0.0001597866206205054, |
| "loss": 2.713479995727539, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.13625, |
| "grad_norm": 0.3061647415161133, |
| "learning_rate": 0.00015967086603780128, |
| "loss": 2.714076805114746, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.1375, |
| "grad_norm": 0.3262089490890503, |
| "learning_rate": 0.00015955385876218297, |
| "loss": 2.709738540649414, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.13875, |
| "grad_norm": 0.3090061545372009, |
| "learning_rate": 0.0001594356006917667, |
| "loss": 2.682490921020508, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.3089563548564911, |
| "learning_rate": 0.00015931609374495955, |
| "loss": 2.707094192504883, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.14125, |
| "grad_norm": 0.3150913417339325, |
| "learning_rate": 0.00015919533986042794, |
| "loss": 2.6884944915771483, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.1425, |
| "grad_norm": 0.3184945285320282, |
| "learning_rate": 0.00015907334099706644, |
| "loss": 2.668732833862305, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.14375, |
| "grad_norm": 0.3181245028972626, |
| "learning_rate": 0.00015895009913396594, |
| "loss": 2.699263000488281, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 0.3286084234714508, |
| "learning_rate": 0.00015882561627038154, |
| "loss": 2.6974639892578125, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.14625, |
| "grad_norm": 0.30604103207588196, |
| "learning_rate": 0.00015869989442570008, |
| "loss": 2.691238212585449, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.1475, |
| "grad_norm": 0.31512096524238586, |
| "learning_rate": 0.0001585729356394074, |
| "loss": 2.6900882720947266, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.14875, |
| "grad_norm": 0.324313759803772, |
| "learning_rate": 0.0001584447419710553, |
| "loss": 2.6862293243408204, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.32386448979377747, |
| "learning_rate": 0.00015831531550022804, |
| "loss": 2.7286815643310547, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.15125, |
| "grad_norm": 0.3133200705051422, |
| "learning_rate": 0.0001581846583265087, |
| "loss": 2.697834014892578, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.1525, |
| "grad_norm": 0.30789715051651, |
| "learning_rate": 0.00015805277256944507, |
| "loss": 2.6866151809692385, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.15375, |
| "grad_norm": 0.3052247166633606, |
| "learning_rate": 0.00015791966036851529, |
| "loss": 2.7111629486083983, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 0.312637597322464, |
| "learning_rate": 0.00015778532388309308, |
| "loss": 2.6961734771728514, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.15625, |
| "grad_norm": 0.3095453977584839, |
| "learning_rate": 0.0001576497652924128, |
| "loss": 2.6890350341796876, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.1575, |
| "grad_norm": 0.31984061002731323, |
| "learning_rate": 0.00015751298679553402, |
| "loss": 2.6957382202148437, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.15875, |
| "grad_norm": 0.3066132366657257, |
| "learning_rate": 0.00015737499061130596, |
| "loss": 2.721820068359375, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.31295251846313477, |
| "learning_rate": 0.00015723577897833128, |
| "loss": 2.688478469848633, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.16125, |
| "grad_norm": 0.326561838388443, |
| "learning_rate": 0.00015709535415493002, |
| "loss": 2.72012939453125, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.1625, |
| "grad_norm": 0.31419870257377625, |
| "learning_rate": 0.0001569537184191028, |
| "loss": 2.697279167175293, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.16375, |
| "grad_norm": 0.3069676160812378, |
| "learning_rate": 0.00015681087406849395, |
| "loss": 2.6784629821777344, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 0.3102596402168274, |
| "learning_rate": 0.00015666682342035414, |
| "loss": 2.7019378662109377, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.16625, |
| "grad_norm": 0.33090364933013916, |
| "learning_rate": 0.00015652156881150288, |
| "loss": 2.698979949951172, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.1675, |
| "grad_norm": 0.3196777105331421, |
| "learning_rate": 0.00015637511259829055, |
| "loss": 2.670425796508789, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.16875, |
| "grad_norm": 0.3207469582557678, |
| "learning_rate": 0.0001562274571565603, |
| "loss": 2.687581443786621, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.30899399518966675, |
| "learning_rate": 0.00015607860488160927, |
| "loss": 2.703385925292969, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.17125, |
| "grad_norm": 0.32463735342025757, |
| "learning_rate": 0.00015592855818815003, |
| "loss": 2.7129638671875, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.1725, |
| "grad_norm": 0.29863590002059937, |
| "learning_rate": 0.00015577731951027114, |
| "loss": 2.6898262023925783, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.17375, |
| "grad_norm": 0.30260539054870605, |
| "learning_rate": 0.00015562489130139783, |
| "loss": 2.696180725097656, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 0.30247101187705994, |
| "learning_rate": 0.0001554712760342521, |
| "loss": 2.667018508911133, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.17625, |
| "grad_norm": 0.3163856565952301, |
| "learning_rate": 0.0001553164762008128, |
| "loss": 2.7117942810058593, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.1775, |
| "grad_norm": 0.31918948888778687, |
| "learning_rate": 0.0001551604943122748, |
| "loss": 2.6868515014648438, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.17875, |
| "grad_norm": 0.3069145083427429, |
| "learning_rate": 0.00015500333289900878, |
| "loss": 2.665867042541504, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.3310893774032593, |
| "learning_rate": 0.00015484499451051976, |
| "loss": 2.6680227279663087, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.18125, |
| "grad_norm": 0.32211220264434814, |
| "learning_rate": 0.00015468548171540595, |
| "loss": 2.7012916564941407, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.1825, |
| "grad_norm": 0.3143543303012848, |
| "learning_rate": 0.00015452479710131699, |
| "loss": 2.711798667907715, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.18375, |
| "grad_norm": 0.33350202441215515, |
| "learning_rate": 0.00015436294327491207, |
| "loss": 2.692435455322266, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 0.3231949508190155, |
| "learning_rate": 0.00015419992286181756, |
| "loss": 2.6712711334228514, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.18625, |
| "grad_norm": 0.3143308758735657, |
| "learning_rate": 0.00015403573850658438, |
| "loss": 2.6955425262451174, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.1875, |
| "grad_norm": 0.3118044137954712, |
| "learning_rate": 0.0001538703928726452, |
| "loss": 2.6801069259643553, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.18875, |
| "grad_norm": 0.3099926710128784, |
| "learning_rate": 0.00015370388864227133, |
| "loss": 2.669751739501953, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.31752023100852966, |
| "learning_rate": 0.0001535362285165288, |
| "loss": 2.6963922500610353, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.19125, |
| "grad_norm": 0.3166843056678772, |
| "learning_rate": 0.00015336741521523506, |
| "loss": 2.6759317398071287, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.1925, |
| "grad_norm": 0.30386143922805786, |
| "learning_rate": 0.0001531974514769145, |
| "loss": 2.663748359680176, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.19375, |
| "grad_norm": 0.3149690628051758, |
| "learning_rate": 0.0001530263400587541, |
| "loss": 2.672575759887695, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 0.32157933712005615, |
| "learning_rate": 0.0001528540837365589, |
| "loss": 2.7002744674682617, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.19625, |
| "grad_norm": 0.31378722190856934, |
| "learning_rate": 0.0001526806853047066, |
| "loss": 2.7025676727294923, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.1975, |
| "grad_norm": 0.313424676656723, |
| "learning_rate": 0.00015250614757610258, |
| "loss": 2.7100372314453125, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.19875, |
| "grad_norm": 0.32746565341949463, |
| "learning_rate": 0.00015233047338213414, |
| "loss": 2.721282196044922, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.3191785216331482, |
| "learning_rate": 0.00015215366557262444, |
| "loss": 2.6832775115966796, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.20125, |
| "grad_norm": 0.3307384252548218, |
| "learning_rate": 0.00015197572701578654, |
| "loss": 2.683314323425293, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.2025, |
| "grad_norm": 0.3074938952922821, |
| "learning_rate": 0.00015179666059817658, |
| "loss": 2.6983566284179688, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.20375, |
| "grad_norm": 0.31642141938209534, |
| "learning_rate": 0.00015161646922464713, |
| "loss": 2.67681770324707, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.205, |
| "grad_norm": 0.3204726576805115, |
| "learning_rate": 0.0001514351558183001, |
| "loss": 2.673402786254883, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.20625, |
| "grad_norm": 0.31102851033210754, |
| "learning_rate": 0.00015125272332043916, |
| "loss": 2.6676706314086913, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.2075, |
| "grad_norm": 0.31576183438301086, |
| "learning_rate": 0.00015106917469052215, |
| "loss": 2.691006088256836, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.20875, |
| "grad_norm": 0.3049616515636444, |
| "learning_rate": 0.00015088451290611304, |
| "loss": 2.6852401733398437, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.32038211822509766, |
| "learning_rate": 0.00015069874096283362, |
| "loss": 2.6850494384765624, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.21125, |
| "grad_norm": 0.31499341130256653, |
| "learning_rate": 0.00015051186187431495, |
| "loss": 2.685712432861328, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.2125, |
| "grad_norm": 0.3252309262752533, |
| "learning_rate": 0.0001503238786721483, |
| "loss": 2.6800838470458985, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.21375, |
| "grad_norm": 0.33030372858047485, |
| "learning_rate": 0.00015013479440583626, |
| "loss": 2.6957000732421874, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.215, |
| "grad_norm": 0.31104838848114014, |
| "learning_rate": 0.00014994461214274302, |
| "loss": 2.6724735260009767, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.21625, |
| "grad_norm": 0.31927284598350525, |
| "learning_rate": 0.00014975333496804468, |
| "loss": 2.6581308364868166, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.2175, |
| "grad_norm": 0.3242516815662384, |
| "learning_rate": 0.00014956096598467932, |
| "loss": 2.6579944610595705, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.21875, |
| "grad_norm": 0.3098279535770416, |
| "learning_rate": 0.00014936750831329645, |
| "loss": 2.6656078338623046, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.309610515832901, |
| "learning_rate": 0.0001491729650922066, |
| "loss": 2.6563575744628904, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.22125, |
| "grad_norm": 0.31657662987709045, |
| "learning_rate": 0.00014897733947733031, |
| "loss": 2.6570175170898436, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.2225, |
| "grad_norm": 0.31096142530441284, |
| "learning_rate": 0.00014878063464214683, |
| "loss": 2.6638370513916017, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.22375, |
| "grad_norm": 0.3048711121082306, |
| "learning_rate": 0.00014858285377764284, |
| "loss": 2.6526607513427733, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.225, |
| "grad_norm": 0.32042643427848816, |
| "learning_rate": 0.0001483840000922606, |
| "loss": 2.6601219177246094, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.22625, |
| "grad_norm": 0.324494332075119, |
| "learning_rate": 0.00014818407681184585, |
| "loss": 2.6538795471191405, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.2275, |
| "grad_norm": 0.3241287171840668, |
| "learning_rate": 0.00014798308717959552, |
| "loss": 2.678963851928711, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.22875, |
| "grad_norm": 0.31064486503601074, |
| "learning_rate": 0.00014778103445600512, |
| "loss": 2.6616994857788088, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.31154972314834595, |
| "learning_rate": 0.0001475779219188159, |
| "loss": 2.6822179794311523, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.23125, |
| "grad_norm": 0.32366329431533813, |
| "learning_rate": 0.00014737375286296158, |
| "loss": 2.689762496948242, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.2325, |
| "grad_norm": 0.3157241642475128, |
| "learning_rate": 0.00014716853060051493, |
| "loss": 2.6725814819335936, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.23375, |
| "grad_norm": 0.31811729073524475, |
| "learning_rate": 0.0001469622584606341, |
| "loss": 2.6730297088623045, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.235, |
| "grad_norm": 0.3240484893321991, |
| "learning_rate": 0.00014675493978950855, |
| "loss": 2.6649261474609376, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.23625, |
| "grad_norm": 0.3145361542701721, |
| "learning_rate": 0.0001465465779503048, |
| "loss": 2.6716739654541017, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.2375, |
| "grad_norm": 0.30439531803131104, |
| "learning_rate": 0.0001463371763231118, |
| "loss": 2.6668254852294924, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.23875, |
| "grad_norm": 0.3104805052280426, |
| "learning_rate": 0.00014612673830488625, |
| "loss": 2.6472827911376955, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.3249180316925049, |
| "learning_rate": 0.00014591526730939734, |
| "loss": 2.6549278259277345, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.24125, |
| "grad_norm": 0.31549057364463806, |
| "learning_rate": 0.00014570276676717145, |
| "loss": 2.672433853149414, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.2425, |
| "grad_norm": 0.32735103368759155, |
| "learning_rate": 0.00014548924012543646, |
| "loss": 2.6650619506835938, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.24375, |
| "grad_norm": 0.3208616375923157, |
| "learning_rate": 0.00014527469084806585, |
| "loss": 2.6924251556396483, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.000875, |
| "grad_norm": 0.3361559808254242, |
| "learning_rate": 0.00014505912241552255, |
| "loss": 2.918643760681152, |
| "step": 1960 |
| }, |
| { |
| "epoch": 1.002125, |
| "grad_norm": 0.32232168316841125, |
| "learning_rate": 0.00014484253832480244, |
| "loss": 2.6152179718017576, |
| "step": 1970 |
| }, |
| { |
| "epoch": 1.003375, |
| "grad_norm": 0.32902058959007263, |
| "learning_rate": 0.0001446249420893775, |
| "loss": 2.6433155059814455, |
| "step": 1980 |
| }, |
| { |
| "epoch": 1.004625, |
| "grad_norm": 0.31211215257644653, |
| "learning_rate": 0.0001444063372391391, |
| "loss": 2.5884145736694335, |
| "step": 1990 |
| }, |
| { |
| "epoch": 1.005875, |
| "grad_norm": 0.32412853837013245, |
| "learning_rate": 0.00014418672732034043, |
| "loss": 2.5942047119140623, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.007125, |
| "grad_norm": 0.32079222798347473, |
| "learning_rate": 0.0001439661158955392, |
| "loss": 2.5999183654785156, |
| "step": 2010 |
| }, |
| { |
| "epoch": 1.008375, |
| "grad_norm": 0.3363247811794281, |
| "learning_rate": 0.00014374450654353968, |
| "loss": 2.5693603515625, |
| "step": 2020 |
| }, |
| { |
| "epoch": 1.009625, |
| "grad_norm": 0.3330596685409546, |
| "learning_rate": 0.00014352190285933487, |
| "loss": 2.577710723876953, |
| "step": 2030 |
| }, |
| { |
| "epoch": 1.010875, |
| "grad_norm": 0.31830593943595886, |
| "learning_rate": 0.00014329830845404782, |
| "loss": 2.580182647705078, |
| "step": 2040 |
| }, |
| { |
| "epoch": 1.012125, |
| "grad_norm": 0.3276713490486145, |
| "learning_rate": 0.00014307372695487343, |
| "loss": 2.5742984771728517, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.013375, |
| "grad_norm": 0.32609084248542786, |
| "learning_rate": 0.00014284816200501937, |
| "loss": 2.5697860717773438, |
| "step": 2060 |
| }, |
| { |
| "epoch": 1.014625, |
| "grad_norm": 0.32425832748413086, |
| "learning_rate": 0.00014262161726364707, |
| "loss": 2.5537353515625, |
| "step": 2070 |
| }, |
| { |
| "epoch": 1.015875, |
| "grad_norm": 0.3417907953262329, |
| "learning_rate": 0.00014239409640581238, |
| "loss": 2.5780372619628906, |
| "step": 2080 |
| }, |
| { |
| "epoch": 1.017125, |
| "grad_norm": 0.3302324116230011, |
| "learning_rate": 0.0001421656031224058, |
| "loss": 2.5682140350341798, |
| "step": 2090 |
| }, |
| { |
| "epoch": 1.018375, |
| "grad_norm": 0.33167314529418945, |
| "learning_rate": 0.00014193614112009283, |
| "loss": 2.545709228515625, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.019625, |
| "grad_norm": 0.3396015763282776, |
| "learning_rate": 0.00014170571412125367, |
| "loss": 2.544954299926758, |
| "step": 2110 |
| }, |
| { |
| "epoch": 1.020875, |
| "grad_norm": 0.33836308121681213, |
| "learning_rate": 0.00014147432586392297, |
| "loss": 2.5545772552490233, |
| "step": 2120 |
| }, |
| { |
| "epoch": 1.022125, |
| "grad_norm": 0.3312232196331024, |
| "learning_rate": 0.00014124198010172898, |
| "loss": 2.559113883972168, |
| "step": 2130 |
| }, |
| { |
| "epoch": 1.023375, |
| "grad_norm": 0.33059218525886536, |
| "learning_rate": 0.00014100868060383292, |
| "loss": 2.533283805847168, |
| "step": 2140 |
| }, |
| { |
| "epoch": 1.024625, |
| "grad_norm": 0.32571902871131897, |
| "learning_rate": 0.00014077443115486767, |
| "loss": 2.551566314697266, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.025875, |
| "grad_norm": 0.3243643045425415, |
| "learning_rate": 0.00014053923555487638, |
| "loss": 2.564662551879883, |
| "step": 2160 |
| }, |
| { |
| "epoch": 1.027125, |
| "grad_norm": 0.31755268573760986, |
| "learning_rate": 0.0001403030976192509, |
| "loss": 2.522117042541504, |
| "step": 2170 |
| }, |
| { |
| "epoch": 1.028375, |
| "grad_norm": 0.34630700945854187, |
| "learning_rate": 0.00014006602117866982, |
| "loss": 2.529287910461426, |
| "step": 2180 |
| }, |
| { |
| "epoch": 1.029625, |
| "grad_norm": 0.33032891154289246, |
| "learning_rate": 0.0001398280100790363, |
| "loss": 2.521525192260742, |
| "step": 2190 |
| }, |
| { |
| "epoch": 1.030875, |
| "grad_norm": 0.3408825993537903, |
| "learning_rate": 0.0001395890681814159, |
| "loss": 2.5370689392089845, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.032125, |
| "grad_norm": 0.3269711434841156, |
| "learning_rate": 0.0001393491993619736, |
| "loss": 2.5100967407226564, |
| "step": 2210 |
| }, |
| { |
| "epoch": 1.033375, |
| "grad_norm": 0.32242265343666077, |
| "learning_rate": 0.0001391084075119112, |
| "loss": 2.5302288055419924, |
| "step": 2220 |
| }, |
| { |
| "epoch": 1.034625, |
| "grad_norm": 0.3222724199295044, |
| "learning_rate": 0.000138866696537404, |
| "loss": 2.517455291748047, |
| "step": 2230 |
| }, |
| { |
| "epoch": 1.035875, |
| "grad_norm": 0.3199198246002197, |
| "learning_rate": 0.0001386240703595377, |
| "loss": 2.5055145263671874, |
| "step": 2240 |
| }, |
| { |
| "epoch": 1.037125, |
| "grad_norm": 0.33094459772109985, |
| "learning_rate": 0.0001383805329142444, |
| "loss": 2.5067977905273438, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.038375, |
| "grad_norm": 0.33781564235687256, |
| "learning_rate": 0.00013813608815223914, |
| "loss": 2.4964527130126952, |
| "step": 2260 |
| }, |
| { |
| "epoch": 1.039625, |
| "grad_norm": 0.34187182784080505, |
| "learning_rate": 0.00013789074003895557, |
| "loss": 2.4876964569091795, |
| "step": 2270 |
| }, |
| { |
| "epoch": 1.040875, |
| "grad_norm": 0.3467255234718323, |
| "learning_rate": 0.00013764449255448166, |
| "loss": 2.527250862121582, |
| "step": 2280 |
| }, |
| { |
| "epoch": 1.042125, |
| "grad_norm": 0.34287887811660767, |
| "learning_rate": 0.00013739734969349526, |
| "loss": 2.5136051177978516, |
| "step": 2290 |
| }, |
| { |
| "epoch": 1.043375, |
| "grad_norm": 0.3415592908859253, |
| "learning_rate": 0.0001371493154651991, |
| "loss": 2.5083173751831054, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.044625, |
| "grad_norm": 0.34434187412261963, |
| "learning_rate": 0.00013690039389325595, |
| "loss": 2.491905403137207, |
| "step": 2310 |
| }, |
| { |
| "epoch": 1.045875, |
| "grad_norm": 0.35805854201316833, |
| "learning_rate": 0.0001366505890157232, |
| "loss": 2.509074401855469, |
| "step": 2320 |
| }, |
| { |
| "epoch": 1.047125, |
| "grad_norm": 0.3360929787158966, |
| "learning_rate": 0.00013639990488498738, |
| "loss": 2.5023418426513673, |
| "step": 2330 |
| }, |
| { |
| "epoch": 1.048375, |
| "grad_norm": 0.33336424827575684, |
| "learning_rate": 0.00013614834556769853, |
| "loss": 2.5313945770263673, |
| "step": 2340 |
| }, |
| { |
| "epoch": 1.049625, |
| "grad_norm": 0.3515946567058563, |
| "learning_rate": 0.00013589591514470408, |
| "loss": 2.49786491394043, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.050875, |
| "grad_norm": 0.3500429391860962, |
| "learning_rate": 0.00013564261771098268, |
| "loss": 2.501786804199219, |
| "step": 2360 |
| }, |
| { |
| "epoch": 1.052125, |
| "grad_norm": 0.3275656998157501, |
| "learning_rate": 0.00013538845737557796, |
| "loss": 2.511077117919922, |
| "step": 2370 |
| }, |
| { |
| "epoch": 1.053375, |
| "grad_norm": 0.3502793610095978, |
| "learning_rate": 0.00013513343826153157, |
| "loss": 2.4827537536621094, |
| "step": 2380 |
| }, |
| { |
| "epoch": 1.054625, |
| "grad_norm": 0.3351482152938843, |
| "learning_rate": 0.0001348775645058165, |
| "loss": 2.5033424377441404, |
| "step": 2390 |
| }, |
| { |
| "epoch": 1.055875, |
| "grad_norm": 0.3501179814338684, |
| "learning_rate": 0.00013462084025927, |
| "loss": 2.4896453857421874, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.057125, |
| "grad_norm": 0.3435039222240448, |
| "learning_rate": 0.00013436326968652593, |
| "loss": 2.5125568389892576, |
| "step": 2410 |
| }, |
| { |
| "epoch": 1.058375, |
| "grad_norm": 0.34035417437553406, |
| "learning_rate": 0.00013410485696594768, |
| "loss": 2.4909286499023438, |
| "step": 2420 |
| }, |
| { |
| "epoch": 1.059625, |
| "grad_norm": 0.33129122853279114, |
| "learning_rate": 0.00013384560628956, |
| "loss": 2.556411361694336, |
| "step": 2430 |
| }, |
| { |
| "epoch": 1.060875, |
| "grad_norm": 0.3542681932449341, |
| "learning_rate": 0.0001335855218629812, |
| "loss": 2.469993782043457, |
| "step": 2440 |
| }, |
| { |
| "epoch": 1.062125, |
| "grad_norm": 0.3372875154018402, |
| "learning_rate": 0.00013332460790535473, |
| "loss": 2.4866575241088866, |
| "step": 2450 |
| }, |
| { |
| "epoch": 1.063375, |
| "grad_norm": 0.3469390273094177, |
| "learning_rate": 0.000133062868649281, |
| "loss": 2.4757783889770506, |
| "step": 2460 |
| }, |
| { |
| "epoch": 1.064625, |
| "grad_norm": 0.3474292457103729, |
| "learning_rate": 0.0001328003083407486, |
| "loss": 2.4724506378173827, |
| "step": 2470 |
| }, |
| { |
| "epoch": 1.065875, |
| "grad_norm": 0.3638366758823395, |
| "learning_rate": 0.0001325369312390653, |
| "loss": 2.5047348022460936, |
| "step": 2480 |
| }, |
| { |
| "epoch": 1.067125, |
| "grad_norm": 0.33961430191993713, |
| "learning_rate": 0.0001322727416167891, |
| "loss": 2.50977783203125, |
| "step": 2490 |
| }, |
| { |
| "epoch": 1.068375, |
| "grad_norm": 0.34472352266311646, |
| "learning_rate": 0.00013200774375965883, |
| "loss": 2.4912172317504884, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.069625, |
| "grad_norm": 0.3468291461467743, |
| "learning_rate": 0.00013174194196652477, |
| "loss": 2.4802589416503906, |
| "step": 2510 |
| }, |
| { |
| "epoch": 1.070875, |
| "grad_norm": 0.3530188500881195, |
| "learning_rate": 0.00013147534054927878, |
| "loss": 2.4657310485839843, |
| "step": 2520 |
| }, |
| { |
| "epoch": 1.072125, |
| "grad_norm": 0.3488495647907257, |
| "learning_rate": 0.00013120794383278438, |
| "loss": 2.4873979568481444, |
| "step": 2530 |
| }, |
| { |
| "epoch": 1.073375, |
| "grad_norm": 0.3713776171207428, |
| "learning_rate": 0.0001309397561548066, |
| "loss": 2.47833137512207, |
| "step": 2540 |
| }, |
| { |
| "epoch": 1.074625, |
| "grad_norm": 0.33978715538978577, |
| "learning_rate": 0.00013067078186594156, |
| "loss": 2.4833608627319337, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.075875, |
| "grad_norm": 0.34602415561676025, |
| "learning_rate": 0.000130401025329546, |
| "loss": 2.50838623046875, |
| "step": 2560 |
| }, |
| { |
| "epoch": 1.077125, |
| "grad_norm": 0.33973225951194763, |
| "learning_rate": 0.00013013049092166652, |
| "loss": 2.4615432739257814, |
| "step": 2570 |
| }, |
| { |
| "epoch": 1.078375, |
| "grad_norm": 0.35132381319999695, |
| "learning_rate": 0.00012985918303096833, |
| "loss": 2.4790775299072267, |
| "step": 2580 |
| }, |
| { |
| "epoch": 1.079625, |
| "grad_norm": 0.3466247618198395, |
| "learning_rate": 0.00012958710605866436, |
| "loss": 2.4747478485107424, |
| "step": 2590 |
| }, |
| { |
| "epoch": 1.080875, |
| "grad_norm": 0.3456020951271057, |
| "learning_rate": 0.00012931426441844374, |
| "loss": 2.5099910736083983, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.082125, |
| "grad_norm": 0.3543621301651001, |
| "learning_rate": 0.00012904066253640017, |
| "loss": 2.4894287109375, |
| "step": 2610 |
| }, |
| { |
| "epoch": 1.083375, |
| "grad_norm": 0.3563762605190277, |
| "learning_rate": 0.00012876630485096017, |
| "loss": 2.476998138427734, |
| "step": 2620 |
| }, |
| { |
| "epoch": 1.084625, |
| "grad_norm": 0.35365182161331177, |
| "learning_rate": 0.000128491195812811, |
| "loss": 2.479985809326172, |
| "step": 2630 |
| }, |
| { |
| "epoch": 1.085875, |
| "grad_norm": 0.3529748320579529, |
| "learning_rate": 0.00012821533988482863, |
| "loss": 2.4728267669677733, |
| "step": 2640 |
| }, |
| { |
| "epoch": 1.087125, |
| "grad_norm": 0.33992883563041687, |
| "learning_rate": 0.00012793874154200515, |
| "loss": 2.4903228759765623, |
| "step": 2650 |
| }, |
| { |
| "epoch": 1.088375, |
| "grad_norm": 0.3493664562702179, |
| "learning_rate": 0.00012766140527137627, |
| "loss": 2.4863492965698244, |
| "step": 2660 |
| }, |
| { |
| "epoch": 1.089625, |
| "grad_norm": 0.3460827171802521, |
| "learning_rate": 0.00012738333557194855, |
| "loss": 2.449415588378906, |
| "step": 2670 |
| }, |
| { |
| "epoch": 1.090875, |
| "grad_norm": 0.34357935190200806, |
| "learning_rate": 0.00012710453695462633, |
| "loss": 2.463718795776367, |
| "step": 2680 |
| }, |
| { |
| "epoch": 1.092125, |
| "grad_norm": 0.3472927510738373, |
| "learning_rate": 0.00012682501394213866, |
| "loss": 2.445463943481445, |
| "step": 2690 |
| }, |
| { |
| "epoch": 1.093375, |
| "grad_norm": 0.350157767534256, |
| "learning_rate": 0.00012654477106896584, |
| "loss": 2.4972408294677733, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.094625, |
| "grad_norm": 0.34597107768058777, |
| "learning_rate": 0.00012626381288126593, |
| "loss": 2.487579917907715, |
| "step": 2710 |
| }, |
| { |
| "epoch": 1.095875, |
| "grad_norm": 0.3472701609134674, |
| "learning_rate": 0.00012598214393680097, |
| "loss": 2.493511199951172, |
| "step": 2720 |
| }, |
| { |
| "epoch": 1.097125, |
| "grad_norm": 0.35611027479171753, |
| "learning_rate": 0.00012569976880486298, |
| "loss": 2.4602516174316404, |
| "step": 2730 |
| }, |
| { |
| "epoch": 1.098375, |
| "grad_norm": 0.34450188279151917, |
| "learning_rate": 0.00012541669206620002, |
| "loss": 2.47379093170166, |
| "step": 2740 |
| }, |
| { |
| "epoch": 1.099625, |
| "grad_norm": 0.3402315080165863, |
| "learning_rate": 0.0001251329183129416, |
| "loss": 2.4741775512695314, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.100875, |
| "grad_norm": 0.36293742060661316, |
| "learning_rate": 0.00012484845214852453, |
| "loss": 2.478403854370117, |
| "step": 2760 |
| }, |
| { |
| "epoch": 1.102125, |
| "grad_norm": 0.34437844157218933, |
| "learning_rate": 0.00012456329818761794, |
| "loss": 2.489897918701172, |
| "step": 2770 |
| }, |
| { |
| "epoch": 1.103375, |
| "grad_norm": 0.34425976872444153, |
| "learning_rate": 0.0001242774610560485, |
| "loss": 2.484636688232422, |
| "step": 2780 |
| }, |
| { |
| "epoch": 1.104625, |
| "grad_norm": 0.3487798273563385, |
| "learning_rate": 0.00012399094539072557, |
| "loss": 2.4807788848876955, |
| "step": 2790 |
| }, |
| { |
| "epoch": 1.105875, |
| "grad_norm": 0.36608222126960754, |
| "learning_rate": 0.00012370375583956562, |
| "loss": 2.498831939697266, |
| "step": 2800 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 8000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.7508190343633306e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|