{
  "best_metric": 0.015329813584685326,
  "best_model_checkpoint": "./results_high/checkpoint-3200",
  "epoch": 4.954668733049206,
  "eval_steps": 400,
  "global_step": 3200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.030995738086013174,
      "grad_norm": 18.685022354125977,
      "learning_rate": 0.0002,
      "loss": 28.2054,
      "step": 20
    },
    {
      "epoch": 0.06199147617202635,
      "grad_norm": 3.494961738586426,
      "learning_rate": 0.0004,
      "loss": 20.2888,
      "step": 40
    },
    {
      "epoch": 0.09298721425803952,
      "grad_norm": 4.7338151931762695,
      "learning_rate": 0.0006,
      "loss": 18.4692,
      "step": 60
    },
    {
      "epoch": 0.1239829523440527,
      "grad_norm": 4.808109283447266,
      "learning_rate": 0.0008,
      "loss": 17.1405,
      "step": 80
    },
    {
      "epoch": 0.15497869043006587,
      "grad_norm": 5.874823570251465,
      "learning_rate": 0.001,
      "loss": 15.9045,
      "step": 100
    },
    {
      "epoch": 0.18597442851607904,
      "grad_norm": 5.856851577758789,
      "learning_rate": 0.0009998989386555814,
      "loss": 14.6319,
      "step": 120
    },
    {
      "epoch": 0.2169701666020922,
      "grad_norm": 5.156916618347168,
      "learning_rate": 0.0009995957954759072,
      "loss": 14.0393,
      "step": 140
    },
    {
      "epoch": 0.2479659046881054,
      "grad_norm": 5.812623023986816,
      "learning_rate": 0.0009990906930052064,
      "loss": 12.2195,
      "step": 160
    },
    {
      "epoch": 0.27896164277411856,
      "grad_norm": 4.946173667907715,
      "learning_rate": 0.000998383835428818,
      "loss": 10.9657,
      "step": 180
    },
    {
      "epoch": 0.30995738086013175,
      "grad_norm": 6.92509651184082,
      "learning_rate": 0.0009974755084906502,
      "loss": 9.8633,
      "step": 200
    },
    {
      "epoch": 0.3409531189461449,
      "grad_norm": 5.058427333831787,
      "learning_rate": 0.0009963660793776688,
      "loss": 8.7671,
      "step": 220
    },
    {
      "epoch": 0.3719488570321581,
      "grad_norm": 5.186002731323242,
      "learning_rate": 0.0009950559965714648,
      "loss": 7.8407,
      "step": 240
    },
    {
      "epoch": 0.40294459511817127,
      "grad_norm": 5.172844886779785,
      "learning_rate": 0.0009935457896669568,
      "loss": 6.9268,
      "step": 260
    },
    {
      "epoch": 0.4339403332041844,
      "grad_norm": 5.07811164855957,
      "learning_rate": 0.0009918360691583054,
      "loss": 6.2128,
      "step": 280
    },
    {
      "epoch": 0.4649360712901976,
      "grad_norm": 5.332867622375488,
      "learning_rate": 0.0009899275261921235,
      "loss": 5.8147,
      "step": 300
    },
    {
      "epoch": 0.4959318093762108,
      "grad_norm": 5.0256171226501465,
      "learning_rate": 0.000987820932288083,
      "loss": 5.2,
      "step": 320
    },
    {
      "epoch": 0.5269275474622239,
      "grad_norm": 4.997400760650635,
      "learning_rate": 0.0009855171390270323,
      "loss": 4.8953,
      "step": 340
    },
    {
      "epoch": 0.5579232855482371,
      "grad_norm": 5.24421501159668,
      "learning_rate": 0.0009830170777067485,
      "loss": 4.34,
      "step": 360
    },
    {
      "epoch": 0.5889190236342503,
      "grad_norm": 5.273136615753174,
      "learning_rate": 0.000980321758965464,
      "loss": 4.2298,
      "step": 380
    },
    {
      "epoch": 0.6199147617202635,
      "grad_norm": 5.154964923858643,
      "learning_rate": 0.0009774322723733215,
      "loss": 3.8631,
      "step": 400
    },
    {
      "epoch": 0.6199147617202635,
      "eval_loss": 0.21621239185333252,
      "eval_runtime": 1322.5637,
      "eval_samples_per_second": 6.939,
      "eval_steps_per_second": 0.868,
      "step": 400
    },
    {
      "epoch": 0.6509104998062767,
      "grad_norm": 4.869210720062256,
      "learning_rate": 0.0009743497859919196,
      "loss": 3.7908,
      "step": 420
    },
    {
      "epoch": 0.6819062378922898,
      "grad_norm": 4.612276077270508,
      "learning_rate": 0.0009710755459021297,
      "loss": 3.4467,
      "step": 440
    },
    {
      "epoch": 0.712901975978303,
      "grad_norm": 4.885288238525391,
      "learning_rate": 0.0009676108757003736,
      "loss": 3.3109,
      "step": 460
    },
    {
      "epoch": 0.7438977140643162,
      "grad_norm": 4.861220359802246,
      "learning_rate": 0.0009639571759635653,
      "loss": 3.175,
      "step": 480
    },
    {
      "epoch": 0.7748934521503293,
      "grad_norm": 4.333089351654053,
      "learning_rate": 0.0009601159236829353,
      "loss": 3.0602,
      "step": 500
    },
    {
      "epoch": 0.8058891902363425,
      "grad_norm": 4.754833221435547,
      "learning_rate": 0.0009560886716669619,
      "loss": 2.9202,
      "step": 520
    },
    {
      "epoch": 0.8368849283223557,
      "grad_norm": 4.719883918762207,
      "learning_rate": 0.0009518770479136578,
      "loss": 2.7844,
      "step": 540
    },
    {
      "epoch": 0.8678806664083688,
      "grad_norm": 4.607946872711182,
      "learning_rate": 0.0009474827549524574,
      "loss": 2.649,
      "step": 560
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 4.616612434387207,
      "learning_rate": 0.0009429075691559787,
      "loss": 2.5995,
      "step": 580
    },
    {
      "epoch": 0.9298721425803952,
      "grad_norm": 4.475257396697998,
      "learning_rate": 0.0009381533400219318,
      "loss": 2.3863,
      "step": 600
    },
    {
      "epoch": 0.9608678806664084,
      "grad_norm": 4.479520797729492,
      "learning_rate": 0.0009332219894254686,
      "loss": 2.3413,
      "step": 620
    },
    {
      "epoch": 0.9918636187524216,
      "grad_norm": 4.254257678985596,
      "learning_rate": 0.0009281155108422732,
      "loss": 2.1925,
      "step": 640
    },
    {
      "epoch": 1.0216970166602093,
      "grad_norm": 4.349820613861084,
      "learning_rate": 0.0009228359685427095,
      "loss": 2.0963,
      "step": 660
    },
    {
      "epoch": 1.0526927547462224,
      "grad_norm": 4.13181209564209,
      "learning_rate": 0.0009173854967573479,
      "loss": 2.0195,
      "step": 680
    },
    {
      "epoch": 1.0836884928322355,
      "grad_norm": 3.947432279586792,
      "learning_rate": 0.0009117662988142137,
      "loss": 1.9265,
      "step": 700
    },
    {
      "epoch": 1.1146842309182488,
      "grad_norm": 4.134565353393555,
      "learning_rate": 0.0009059806462481021,
      "loss": 1.9981,
      "step": 720
    },
    {
      "epoch": 1.1456799690042618,
      "grad_norm": 4.007950305938721,
      "learning_rate": 0.0009000308778823195,
      "loss": 1.8992,
      "step": 740
    },
    {
      "epoch": 1.1766757070902751,
      "grad_norm": 3.916898012161255,
      "learning_rate": 0.000893919398883226,
      "loss": 1.8171,
      "step": 760
    },
    {
      "epoch": 1.2076714451762882,
      "grad_norm": 4.240416049957275,
      "learning_rate": 0.0008876486797879579,
      "loss": 1.7744,
      "step": 780
    },
    {
      "epoch": 1.2386671832623015,
      "grad_norm": 3.7714171409606934,
      "learning_rate": 0.000881221255505724,
      "loss": 1.7344,
      "step": 800
    },
    {
      "epoch": 1.2386671832623015,
      "eval_loss": 0.08633554726839066,
      "eval_runtime": 1321.2599,
      "eval_samples_per_second": 6.946,
      "eval_steps_per_second": 0.869,
      "step": 800
    },
    {
      "epoch": 1.2696629213483146,
      "grad_norm": 3.9192988872528076,
      "learning_rate": 0.0008746397242930808,
      "loss": 1.7184,
      "step": 820
    },
    {
      "epoch": 1.300658659434328,
      "grad_norm": 3.8376963138580322,
      "learning_rate": 0.0008679067467035988,
      "loss": 1.7126,
      "step": 840
    },
    {
      "epoch": 1.331654397520341,
      "grad_norm": 4.027160167694092,
      "learning_rate": 0.0008610250445123471,
      "loss": 1.6277,
      "step": 860
    },
    {
      "epoch": 1.362650135606354,
      "grad_norm": 3.6905505657196045,
      "learning_rate": 0.0008539973996156264,
      "loss": 1.5709,
      "step": 880
    },
    {
      "epoch": 1.3936458736923674,
      "grad_norm": 3.442706346511841,
      "learning_rate": 0.0008468266529064025,
      "loss": 1.5135,
      "step": 900
    },
    {
      "epoch": 1.4246416117783804,
      "grad_norm": 3.874969005584717,
      "learning_rate": 0.000839515703125887,
      "loss": 1.4924,
      "step": 920
    },
    {
      "epoch": 1.4556373498643937,
      "grad_norm": 4.037222385406494,
      "learning_rate": 0.0008320675056917352,
      "loss": 1.4229,
      "step": 940
    },
    {
      "epoch": 1.4866330879504068,
      "grad_norm": 3.6256048679351807,
      "learning_rate": 0.0008244850715033315,
      "loss": 1.4245,
      "step": 960
    },
    {
      "epoch": 1.51762882603642,
      "grad_norm": 3.6055657863616943,
      "learning_rate": 0.0008167714657246485,
      "loss": 1.3645,
      "step": 980
    },
    {
      "epoch": 1.5486245641224332,
      "grad_norm": 3.3049917221069336,
      "learning_rate": 0.0008089298065451672,
      "loss": 1.3765,
      "step": 1000
    },
    {
      "epoch": 1.5796203022084463,
      "grad_norm": 3.7329185009002686,
      "learning_rate": 0.0008009632639193642,
      "loss": 1.3048,
      "step": 1020
    },
    {
      "epoch": 1.6106160402944596,
      "grad_norm": 3.6136186122894287,
      "learning_rate": 0.0007928750582852722,
      "loss": 1.2793,
      "step": 1040
    },
    {
      "epoch": 1.6416117783804727,
      "grad_norm": 3.500743865966797,
      "learning_rate": 0.0007846684592626323,
      "loss": 1.2545,
      "step": 1060
    },
    {
      "epoch": 1.6726075164664858,
      "grad_norm": 3.393615961074829,
      "learning_rate": 0.0007763467843311658,
      "loss": 1.2531,
      "step": 1080
    },
    {
      "epoch": 1.703603254552499,
      "grad_norm": 3.132471799850464,
      "learning_rate": 0.0007679133974894983,
      "loss": 1.192,
      "step": 1100
    },
    {
      "epoch": 1.7345989926385124,
      "grad_norm": 3.4565207958221436,
      "learning_rate": 0.0007593717078952787,
      "loss": 1.1406,
      "step": 1120
    },
    {
      "epoch": 1.7655947307245254,
      "grad_norm": 3.3600223064422607,
      "learning_rate": 0.0007507251684870432,
      "loss": 1.1335,
      "step": 1140
    },
    {
      "epoch": 1.7965904688105385,
      "grad_norm": 3.5565154552459717,
      "learning_rate": 0.0007419772745883799,
      "loss": 1.142,
      "step": 1160
    },
    {
      "epoch": 1.8275862068965516,
      "grad_norm": 3.155515193939209,
      "learning_rate": 0.0007331315624949624,
      "loss": 1.0728,
      "step": 1180
    },
    {
      "epoch": 1.858581944982565,
      "grad_norm": 3.3215274810791016,
      "learning_rate": 0.0007241916080450163,
      "loss": 1.0707,
      "step": 1200
    },
    {
      "epoch": 1.858581944982565,
      "eval_loss": 0.048645660281181335,
      "eval_runtime": 1327.8,
      "eval_samples_per_second": 6.911,
      "eval_steps_per_second": 0.865,
      "step": 1200
    },
    {
      "epoch": 1.8895776830685782,
      "grad_norm": 3.2068769931793213,
      "learning_rate": 0.0007151610251738044,
      "loss": 1.0147,
      "step": 1220
    },
    {
      "epoch": 1.9205734211545913,
      "grad_norm": 2.9419779777526855,
      "learning_rate": 0.0007060434644527105,
      "loss": 1.0187,
      "step": 1240
    },
    {
      "epoch": 1.9515691592406044,
      "grad_norm": 3.336106777191162,
      "learning_rate": 0.0006968426116135118,
      "loss": 1.0282,
      "step": 1260
    },
    {
      "epoch": 1.9825648973266174,
      "grad_norm": 3.152125597000122,
      "learning_rate": 0.0006875621860584389,
      "loss": 0.9777,
      "step": 1280
    },
    {
      "epoch": 2.012398295234405,
      "grad_norm": 2.975085735321045,
      "learning_rate": 0.0006782059393566253,
      "loss": 0.9048,
      "step": 1300
    },
    {
      "epoch": 2.0433940333204186,
      "grad_norm": 3.0185718536376953,
      "learning_rate": 0.000668777653727553,
      "loss": 0.9297,
      "step": 1320
    },
    {
      "epoch": 2.0743897714064317,
      "grad_norm": 2.8180739879608154,
      "learning_rate": 0.0006592811405121065,
      "loss": 0.8795,
      "step": 1340
    },
    {
      "epoch": 2.1053855094924447,
      "grad_norm": 2.9423587322235107,
      "learning_rate": 0.0006497202386318572,
      "loss": 0.8589,
      "step": 1360
    },
    {
      "epoch": 2.136381247578458,
      "grad_norm": 2.615948438644409,
      "learning_rate": 0.0006400988130371969,
      "loss": 0.8508,
      "step": 1380
    },
    {
      "epoch": 2.167376985664471,
      "grad_norm": 2.6553754806518555,
      "learning_rate": 0.0006304207531449486,
      "loss": 0.8377,
      "step": 1400
    },
    {
      "epoch": 2.1983727237504844,
      "grad_norm": 2.841156482696533,
      "learning_rate": 0.0006206899712660886,
      "loss": 0.813,
      "step": 1420
    },
    {
      "epoch": 2.2293684618364975,
      "grad_norm": 2.705960988998413,
      "learning_rate": 0.0006109104010242127,
      "loss": 0.7873,
      "step": 1440
    },
    {
      "epoch": 2.2603641999225106,
      "grad_norm": 2.7412073612213135,
      "learning_rate": 0.0006010859957653868,
      "loss": 0.7773,
      "step": 1460
    },
    {
      "epoch": 2.2913599380085237,
      "grad_norm": 2.5611062049865723,
      "learning_rate": 0.0005912207269600251,
      "loss": 0.7602,
      "step": 1480
    },
    {
      "epoch": 2.322355676094537,
      "grad_norm": 2.5611677169799805,
      "learning_rate": 0.0005813185825974419,
      "loss": 0.7453,
      "step": 1500
    },
    {
      "epoch": 2.3533514141805503,
      "grad_norm": 2.4461872577667236,
      "learning_rate": 0.0005713835655737244,
      "loss": 0.7208,
      "step": 1520
    },
    {
      "epoch": 2.3843471522665634,
      "grad_norm": 2.6245830059051514,
      "learning_rate": 0.0005614196920735821,
      "loss": 0.7292,
      "step": 1540
    },
    {
      "epoch": 2.4153428903525764,
      "grad_norm": 2.444567918777466,
      "learning_rate": 0.0005514309899468208,
      "loss": 0.6917,
      "step": 1560
    },
    {
      "epoch": 2.4463386284385895,
      "grad_norm": 2.399683952331543,
      "learning_rate": 0.0005414214970801041,
      "loss": 0.6623,
      "step": 1580
    },
    {
      "epoch": 2.477334366524603,
      "grad_norm": 2.5869252681732178,
      "learning_rate": 0.0005313952597646568,
      "loss": 0.6501,
      "step": 1600
    },
    {
      "epoch": 2.477334366524603,
      "eval_loss": 0.030375245958566666,
      "eval_runtime": 1322.7101,
      "eval_samples_per_second": 6.938,
      "eval_steps_per_second": 0.868,
      "step": 1600
    },
    {
      "epoch": 2.508330104610616,
      "grad_norm": 2.286961078643799,
      "learning_rate": 0.0005213563310605686,
      "loss": 0.6232,
      "step": 1620
    },
    {
      "epoch": 2.539325842696629,
      "grad_norm": 2.2856411933898926,
      "learning_rate": 0.0005113087691583649,
      "loss": 0.6375,
      "step": 1640
    },
    {
      "epoch": 2.5703215807826423,
      "grad_norm": 2.464272975921631,
      "learning_rate": 0.000501256635738502,
      "loss": 0.6252,
      "step": 1660
    },
    {
      "epoch": 2.601317318868656,
      "grad_norm": 2.1272389888763428,
      "learning_rate": 0.0004912039943294501,
      "loss": 0.5947,
      "step": 1680
    },
    {
      "epoch": 2.632313056954669,
      "grad_norm": 2.389514446258545,
      "learning_rate": 0.0004811549086650327,
      "loss": 0.5886,
      "step": 1700
    },
    {
      "epoch": 2.663308795040682,
      "grad_norm": 2.169243097305298,
      "learning_rate": 0.0004711134410416794,
      "loss": 0.577,
      "step": 1720
    },
    {
      "epoch": 2.694304533126695,
      "grad_norm": 2.2765533924102783,
      "learning_rate": 0.0004610836506762617,
      "loss": 0.5496,
      "step": 1740
    },
    {
      "epoch": 2.725300271212708,
      "grad_norm": 2.2728376388549805,
      "learning_rate": 0.00045106959206517425,
      "loss": 0.536,
      "step": 1760
    },
    {
      "epoch": 2.7562960092987216,
      "grad_norm": 2.093844413757324,
      "learning_rate": 0.0004410753133453222,
      "loss": 0.5255,
      "step": 1780
    },
    {
      "epoch": 2.7872917473847347,
      "grad_norm": 2.17061185836792,
      "learning_rate": 0.000431104854657681,
      "loss": 0.5153,
      "step": 1800
    },
    {
      "epoch": 2.818287485470748,
      "grad_norm": 2.016740560531616,
      "learning_rate": 0.0004211622465140887,
      "loss": 0.5014,
      "step": 1820
    },
    {
      "epoch": 2.849283223556761,
      "grad_norm": 1.8642460107803345,
      "learning_rate": 0.00041125150816792946,
      "loss": 0.5038,
      "step": 1840
    },
    {
      "epoch": 2.880278961642774,
      "grad_norm": 2.1162288188934326,
      "learning_rate": 0.00040137664598936855,
      "loss": 0.4808,
      "step": 1860
    },
    {
      "epoch": 2.9112746997287875,
      "grad_norm": 2.0367867946624756,
      "learning_rate": 0.00039154165184579736,
      "loss": 0.4777,
      "step": 1880
    },
    {
      "epoch": 2.9422704378148006,
      "grad_norm": 2.0988779067993164,
      "learning_rate": 0.0003817505014881378,
      "loss": 0.4737,
      "step": 1900
    },
    {
      "epoch": 2.9732661759008137,
      "grad_norm": 1.8902864456176758,
      "learning_rate": 0.0003720071529436637,
      "loss": 0.4467,
      "step": 1920
    },
    {
      "epoch": 3.0030995738086013,
      "grad_norm": 1.977570652961731,
      "learning_rate": 0.0003623155449159876,
      "loss": 0.4367,
      "step": 1940
    },
    {
      "epoch": 3.0340953118946143,
      "grad_norm": 1.571098804473877,
      "learning_rate": 0.00035267959519285686,
      "loss": 0.4162,
      "step": 1960
    },
    {
      "epoch": 3.065091049980628,
      "grad_norm": 1.6555993556976318,
      "learning_rate": 0.0003431031990624063,
      "loss": 0.4047,
      "step": 1980
    },
    {
      "epoch": 3.096086788066641,
      "grad_norm": 1.785839557647705,
      "learning_rate": 0.00033359022773850675,
      "loss": 0.4096,
      "step": 2000
    },
    {
      "epoch": 3.096086788066641,
      "eval_loss": 0.020494887605309486,
      "eval_runtime": 1323.6591,
      "eval_samples_per_second": 6.933,
      "eval_steps_per_second": 0.867,
      "step": 2000
    },
    {
      "epoch": 3.127082526152654,
      "grad_norm": 1.5646486282348633,
      "learning_rate": 0.00032414452679584377,
      "loss": 0.4,
      "step": 2020
    },
    {
      "epoch": 3.158078264238667,
      "grad_norm": 1.4028712511062622,
      "learning_rate": 0.0003147699146153621,
      "loss": 0.378,
      "step": 2040
    },
    {
      "epoch": 3.18907400232468,
      "grad_norm": 1.4465529918670654,
      "learning_rate": 0.00030547018084070343,
      "loss": 0.3819,
      "step": 2060
    },
    {
      "epoch": 3.2200697404106937,
      "grad_norm": 1.5283536911010742,
      "learning_rate": 0.0002962490848462596,
      "loss": 0.38,
      "step": 2080
    },
    {
      "epoch": 3.251065478496707,
      "grad_norm": 1.420145034790039,
      "learning_rate": 0.00028711035421746366,
      "loss": 0.367,
      "step": 2100
    },
    {
      "epoch": 3.28206121658272,
      "grad_norm": 1.378266453742981,
      "learning_rate": 0.00027805768324393014,
      "loss": 0.3509,
      "step": 2120
    },
    {
      "epoch": 3.313056954668733,
      "grad_norm": 1.202911138534546,
      "learning_rate": 0.00026909473142605524,
      "loss": 0.3552,
      "step": 2140
    },
    {
      "epoch": 3.344052692754746,
      "grad_norm": 1.2163655757904053,
      "learning_rate": 0.00026022512199568205,
      "loss": 0.3461,
      "step": 2160
    },
    {
      "epoch": 3.3750484308407596,
      "grad_norm": 1.2958979606628418,
      "learning_rate": 0.0002514524404514248,
      "loss": 0.3452,
      "step": 2180
    },
    {
      "epoch": 3.4060441689267726,
      "grad_norm": 1.3039333820343018,
      "learning_rate": 0.00024278023310924675,
      "loss": 0.3358,
      "step": 2200
    },
    {
      "epoch": 3.4370399070127857,
      "grad_norm": 1.1213936805725098,
      "learning_rate": 0.00023421200566888095,
      "loss": 0.3309,
      "step": 2220
    },
    {
      "epoch": 3.468035645098799,
      "grad_norm": 1.1700717210769653,
      "learning_rate": 0.00022575122179666497,
      "loss": 0.3186,
      "step": 2240
    },
    {
      "epoch": 3.4990313831848123,
      "grad_norm": 1.2684606313705444,
      "learning_rate": 0.0002174013017253701,
      "loss": 0.3239,
      "step": 2260
    },
    {
      "epoch": 3.5300271212708254,
      "grad_norm": 0.9373674392700195,
      "learning_rate": 0.00020916562087158964,
      "loss": 0.3155,
      "step": 2280
    },
    {
      "epoch": 3.5610228593568385,
      "grad_norm": 1.221389889717102,
      "learning_rate": 0.00020104750847124077,
      "loss": 0.3133,
      "step": 2300
    },
    {
      "epoch": 3.5920185974428516,
      "grad_norm": 1.0687464475631714,
      "learning_rate": 0.00019305024623373618,
      "loss": 0.3086,
      "step": 2320
    },
    {
      "epoch": 3.6230143355288646,
      "grad_norm": 1.025343656539917,
      "learning_rate": 0.00018517706701536997,
      "loss": 0.306,
      "step": 2340
    },
    {
      "epoch": 3.654010073614878,
      "grad_norm": 0.9350593090057373,
      "learning_rate": 0.00017743115351244882,
      "loss": 0.3072,
      "step": 2360
    },
    {
      "epoch": 3.6850058117008913,
      "grad_norm": 1.0027642250061035,
      "learning_rate": 0.00016981563697470158,
      "loss": 0.2976,
      "step": 2380
    },
    {
      "epoch": 3.7160015497869043,
      "grad_norm": 0.7607480883598328,
      "learning_rate": 0.00016233359593948777,
      "loss": 0.2864,
      "step": 2400
    },
    {
      "epoch": 3.7160015497869043,
      "eval_loss": 0.016527026891708374,
      "eval_runtime": 1322.2158,
      "eval_samples_per_second": 6.941,
      "eval_steps_per_second": 0.868,
      "step": 2400
    },
    {
      "epoch": 3.7469972878729174,
      "grad_norm": 0.8243028521537781,
      "learning_rate": 0.00015498805498731144,
      "loss": 0.2848,
      "step": 2420
    },
    {
      "epoch": 3.7779930259589305,
      "grad_norm": 0.7183871269226074,
      "learning_rate": 0.00014778198351914852,
      "loss": 0.2758,
      "step": 2440
    },
    {
      "epoch": 3.808988764044944,
      "grad_norm": 0.71489417552948,
      "learning_rate": 0.00014071829455608172,
      "loss": 0.2759,
      "step": 2460
    },
    {
      "epoch": 3.839984502130957,
      "grad_norm": 0.7238665223121643,
      "learning_rate": 0.00013379984356172347,
      "loss": 0.2687,
      "step": 2480
    },
    {
      "epoch": 3.87098024021697,
      "grad_norm": 0.5687535405158997,
      "learning_rate": 0.00012702942728790896,
      "loss": 0.273,
      "step": 2500
    },
    {
      "epoch": 3.9019759783029833,
      "grad_norm": 0.7781446576118469,
      "learning_rate": 0.00012040978264412178,
      "loss": 0.2694,
      "step": 2520
    },
    {
      "epoch": 3.9329717163889963,
      "grad_norm": 0.6078444123268127,
      "learning_rate": 0.000113943585591111,
      "loss": 0.2708,
      "step": 2540
    },
    {
      "epoch": 3.96396745447501,
      "grad_norm": 0.48452651500701904,
      "learning_rate": 0.00010763345005914649,
      "loss": 0.2612,
      "step": 2560
    },
    {
      "epoch": 3.994963192561023,
      "grad_norm": 0.46242251992225647,
      "learning_rate": 0.0001014819268913495,
      "loss": 0.2647,
      "step": 2580
    },
    {
      "epoch": 4.02479659046881,
      "grad_norm": 0.4267343282699585,
      "learning_rate": 9.549150281252633e-05,
      "loss": 0.2441,
      "step": 2600
    },
    {
      "epoch": 4.055792328554824,
      "grad_norm": 0.40898454189300537,
      "learning_rate": 8.966459942392108e-05,
      "loss": 0.253,
      "step": 2620
    },
    {
      "epoch": 4.086788066640837,
      "grad_norm": 0.4370998740196228,
      "learning_rate": 8.400357222429472e-05,
      "loss": 0.249,
      "step": 2640
    },
    {
      "epoch": 4.11778380472685,
      "grad_norm": 0.39429062604904175,
      "learning_rate": 7.851070965772572e-05,
      "loss": 0.2449,
      "step": 2660
    },
    {
      "epoch": 4.148779542812863,
      "grad_norm": 0.3252967894077301,
      "learning_rate": 7.318823218851667e-05,
      "loss": 0.2452,
      "step": 2680
    },
    {
      "epoch": 4.179775280898877,
      "grad_norm": 0.35145244002342224,
      "learning_rate": 6.803829140358236e-05,
      "loss": 0.246,
      "step": 2700
    },
    {
      "epoch": 4.2107710189848895,
      "grad_norm": 0.447158545255661,
      "learning_rate": 6.306296914268039e-05,
      "loss": 0.249,
      "step": 2720
    },
    {
      "epoch": 4.241766757070903,
      "grad_norm": 0.3183532655239105,
      "learning_rate": 5.8264276656837145e-05,
      "loss": 0.2435,
      "step": 2740
    },
    {
      "epoch": 4.272762495156916,
      "grad_norm": 0.3511893153190613,
      "learning_rate": 5.36441537953089e-05,
      "loss": 0.2475,
      "step": 2760
    },
    {
      "epoch": 4.303758233242929,
      "grad_norm": 0.49244144558906555,
      "learning_rate": 4.920446822140673e-05,
      "loss": 0.2485,
      "step": 2780
    },
    {
      "epoch": 4.334753971328942,
      "grad_norm": 0.30061423778533936,
      "learning_rate": 4.494701465750217e-05,
      "loss": 0.2515,
      "step": 2800
    },
    {
      "epoch": 4.334753971328942,
      "eval_loss": 0.015458072535693645,
      "eval_runtime": 1323.2432,
      "eval_samples_per_second": 6.935,
      "eval_steps_per_second": 0.868,
      "step": 2800
    },
    {
      "epoch": 4.365749709414955,
      "grad_norm": 0.34871456027030945,
      "learning_rate": 4.087351415951917e-05,
      "loss": 0.2434,
      "step": 2820
    },
    {
      "epoch": 4.396745447500969,
      "grad_norm": 0.39381858706474304,
      "learning_rate": 3.698561342120499e-05,
      "loss": 0.2476,
      "step": 2840
    },
    {
      "epoch": 4.4277411855869815,
      "grad_norm": 0.38118091225624084,
      "learning_rate": 3.3284884108461864e-05,
      "loss": 0.245,
      "step": 2860
    },
    {
      "epoch": 4.458736923672995,
      "grad_norm": 0.31697818636894226,
      "learning_rate": 2.9772822224008513e-05,
      "loss": 0.2432,
      "step": 2880
    },
    {
      "epoch": 4.4897326617590085,
      "grad_norm": 0.30253851413726807,
      "learning_rate": 2.6450847502627883e-05,
      "loss": 0.2462,
      "step": 2900
    },
    {
      "epoch": 4.520728399845021,
      "grad_norm": 0.2832024097442627,
      "learning_rate": 2.3320302837245844e-05,
      "loss": 0.2445,
      "step": 2920
    },
    {
      "epoch": 4.551724137931035,
      "grad_norm": 0.259616494178772,
      "learning_rate": 2.0382453736072835e-05,
      "loss": 0.248,
      "step": 2940
    },
    {
      "epoch": 4.582719876017047,
      "grad_norm": 0.361459881067276,
      "learning_rate": 1.7638487811028614e-05,
      "loss": 0.2444,
      "step": 2960
    },
    {
      "epoch": 4.613715614103061,
      "grad_norm": 0.33997073769569397,
      "learning_rate": 1.5089514297654594e-05,
      "loss": 0.2402,
      "step": 2980
    },
    {
      "epoch": 4.644711352189074,
      "grad_norm": 0.33987364172935486,
      "learning_rate": 1.2736563606711382e-05,
      "loss": 0.2456,
      "step": 3000
    },
    {
      "epoch": 4.675707090275087,
      "grad_norm": 0.3494039475917816,
      "learning_rate": 1.0580586907639911e-05,
      "loss": 0.2415,
      "step": 3020
    },
    {
      "epoch": 4.7067028283611005,
      "grad_norm": 0.27838659286499023,
      "learning_rate": 8.622455744054958e-06,
      "loss": 0.2442,
      "step": 3040
    },
    {
      "epoch": 4.737698566447113,
      "grad_norm": 0.26378950476646423,
      "learning_rate": 6.8629616814283035e-06,
      "loss": 0.2412,
      "step": 3060
    },
    {
      "epoch": 4.768694304533127,
      "grad_norm": 0.3408554494380951,
      "learning_rate": 5.302815987101917e-06,
      "loss": 0.2405,
      "step": 3080
    },
    {
      "epoch": 4.79969004261914,
      "grad_norm": 0.3381432592868805,
      "learning_rate": 3.942649342761117e-06,
      "loss": 0.2406,
      "step": 3100
    },
    {
      "epoch": 4.830685780705153,
      "grad_norm": 0.2882293164730072,
      "learning_rate": 2.7830115894847407e-06,
      "loss": 0.2448,
      "step": 3120
    },
    {
      "epoch": 4.861681518791166,
      "grad_norm": 0.26501309871673584,
      "learning_rate": 1.8243715054744313e-06,
      "loss": 0.2427,
      "step": 3140
    },
    {
      "epoch": 4.892677256877179,
      "grad_norm": 0.25310027599334717,
      "learning_rate": 1.067116616552899e-06,
      "loss": 0.2463,
      "step": 3160
    },
    {
      "epoch": 4.9236729949631926,
      "grad_norm": 0.31968578696250916,
      "learning_rate": 5.115530395087276e-07,
      "loss": 0.2432,
      "step": 3180
    },
    {
      "epoch": 4.954668733049206,
      "grad_norm": 0.2826690375804901,
      "learning_rate": 1.5790535835003005e-07,
      "loss": 0.2409,
      "step": 3200
    },
    {
      "epoch": 4.954668733049206,
      "eval_loss": 0.015329813584685326,
      "eval_runtime": 1323.5417,
      "eval_samples_per_second": 6.934,
      "eval_steps_per_second": 0.867,
      "step": 3200
    }
  ],
  "logging_steps": 20,
  "max_steps": 3225,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 400,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 2,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.8917658856433517e+19,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}