| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 2500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004, | |
| "grad_norm": 115.85907745361328, | |
| "learning_rate": 4.997995991983968e-05, | |
| "loss": 3.7476, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 66.54586029052734, | |
| "learning_rate": 4.977955911823648e-05, | |
| "loss": 2.8414, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "grad_norm": 55.67858123779297, | |
| "learning_rate": 4.957915831663327e-05, | |
| "loss": 2.3755, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 49.31084060668945, | |
| "learning_rate": 4.937875751503006e-05, | |
| "loss": 1.9815, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 50.2120475769043, | |
| "learning_rate": 4.917835671342685e-05, | |
| "loss": 2.2454, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 33.266048431396484, | |
| "learning_rate": 4.897795591182365e-05, | |
| "loss": 2.4204, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "grad_norm": 25.996694564819336, | |
| "learning_rate": 4.877755511022044e-05, | |
| "loss": 1.4711, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 43.97441864013672, | |
| "learning_rate": 4.8577154308617234e-05, | |
| "loss": 1.697, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "grad_norm": 37.83395767211914, | |
| "learning_rate": 4.8376753507014026e-05, | |
| "loss": 1.7633, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 26.54341697692871, | |
| "learning_rate": 4.8176352705410824e-05, | |
| "loss": 1.3992, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "grad_norm": 42.20634841918945, | |
| "learning_rate": 4.797595190380762e-05, | |
| "loss": 1.6443, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 24.657821655273438, | |
| "learning_rate": 4.7775551102204415e-05, | |
| "loss": 1.8471, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "grad_norm": 38.727420806884766, | |
| "learning_rate": 4.7575150300601207e-05, | |
| "loss": 1.5293, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 31.97869873046875, | |
| "learning_rate": 4.7374749498998e-05, | |
| "loss": 1.6212, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 31.056962966918945, | |
| "learning_rate": 4.717434869739479e-05, | |
| "loss": 1.4407, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 29.63347053527832, | |
| "learning_rate": 4.697394789579159e-05, | |
| "loss": 1.1833, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.068, | |
| "grad_norm": 44.844268798828125, | |
| "learning_rate": 4.677354709418838e-05, | |
| "loss": 1.5756, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 31.4070987701416, | |
| "learning_rate": 4.657314629258517e-05, | |
| "loss": 1.4358, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.076, | |
| "grad_norm": 26.982776641845703, | |
| "learning_rate": 4.6372745490981964e-05, | |
| "loss": 1.234, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 19.802730560302734, | |
| "learning_rate": 4.617234468937876e-05, | |
| "loss": 1.3504, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.084, | |
| "grad_norm": 33.88198471069336, | |
| "learning_rate": 4.5971943887775554e-05, | |
| "loss": 1.5029, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 24.533716201782227, | |
| "learning_rate": 4.5771543086172346e-05, | |
| "loss": 1.2978, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.092, | |
| "grad_norm": 27.563339233398438, | |
| "learning_rate": 4.557114228456914e-05, | |
| "loss": 1.7014, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 29.428752899169922, | |
| "learning_rate": 4.5370741482965936e-05, | |
| "loss": 1.3845, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 20.272520065307617, | |
| "learning_rate": 4.517034068136273e-05, | |
| "loss": 1.2192, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 47.12469482421875, | |
| "learning_rate": 4.496993987975952e-05, | |
| "loss": 1.2814, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.108, | |
| "grad_norm": 18.20330238342285, | |
| "learning_rate": 4.476953907815631e-05, | |
| "loss": 1.2717, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 39.07451248168945, | |
| "learning_rate": 4.456913827655311e-05, | |
| "loss": 1.3291, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.116, | |
| "grad_norm": 50.37272644042969, | |
| "learning_rate": 4.43687374749499e-05, | |
| "loss": 1.3691, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 23.233367919921875, | |
| "learning_rate": 4.4168336673346694e-05, | |
| "loss": 1.4183, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.124, | |
| "grad_norm": 22.46800422668457, | |
| "learning_rate": 4.3967935871743486e-05, | |
| "loss": 1.1226, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 24.424856185913086, | |
| "learning_rate": 4.3767535070140284e-05, | |
| "loss": 1.3413, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.132, | |
| "grad_norm": 14.698283195495605, | |
| "learning_rate": 4.3567134268537076e-05, | |
| "loss": 1.2009, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "grad_norm": 23.5820369720459, | |
| "learning_rate": 4.336673346693387e-05, | |
| "loss": 1.0799, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 26.510631561279297, | |
| "learning_rate": 4.316633266533066e-05, | |
| "loss": 1.0801, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 18.498275756835938, | |
| "learning_rate": 4.296593186372745e-05, | |
| "loss": 1.1631, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.148, | |
| "grad_norm": 35.2937126159668, | |
| "learning_rate": 4.2765531062124256e-05, | |
| "loss": 1.3777, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "grad_norm": 40.05356216430664, | |
| "learning_rate": 4.256513026052105e-05, | |
| "loss": 1.1189, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.156, | |
| "grad_norm": 18.918344497680664, | |
| "learning_rate": 4.236472945891784e-05, | |
| "loss": 1.0721, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 20.29583168029785, | |
| "learning_rate": 4.216432865731463e-05, | |
| "loss": 0.9337, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.164, | |
| "grad_norm": 19.447803497314453, | |
| "learning_rate": 4.2081993569131834e-05, | |
| "loss": 1.2047, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "grad_norm": 26.82716178894043, | |
| "learning_rate": 4.188102893890675e-05, | |
| "loss": 1.1294, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.172, | |
| "grad_norm": 12.995594024658203, | |
| "learning_rate": 4.168006430868168e-05, | |
| "loss": 1.0033, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 21.796598434448242, | |
| "learning_rate": 4.14790996784566e-05, | |
| "loss": 0.8864, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 13.911988258361816, | |
| "learning_rate": 4.1278135048231516e-05, | |
| "loss": 0.8974, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.184, | |
| "grad_norm": 25.945011138916016, | |
| "learning_rate": 4.1077170418006434e-05, | |
| "loss": 1.261, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.188, | |
| "grad_norm": 19.943857192993164, | |
| "learning_rate": 4.087620578778135e-05, | |
| "loss": 1.0262, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 23.558696746826172, | |
| "learning_rate": 4.067524115755627e-05, | |
| "loss": 0.9572, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.196, | |
| "grad_norm": 42.70231628417969, | |
| "learning_rate": 4.047427652733119e-05, | |
| "loss": 1.044, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 16.41856575012207, | |
| "learning_rate": 4.027331189710611e-05, | |
| "loss": 1.0562, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.204, | |
| "grad_norm": 13.945029258728027, | |
| "learning_rate": 4.0072347266881035e-05, | |
| "loss": 0.9637, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 16.55429458618164, | |
| "learning_rate": 3.9871382636655953e-05, | |
| "loss": 0.8104, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.212, | |
| "grad_norm": 24.434778213500977, | |
| "learning_rate": 3.967041800643087e-05, | |
| "loss": 1.1942, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.216, | |
| "grad_norm": 20.01283836364746, | |
| "learning_rate": 3.946945337620579e-05, | |
| "loss": 0.9209, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 18.98524284362793, | |
| "learning_rate": 3.926848874598071e-05, | |
| "loss": 0.7445, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 22.44414710998535, | |
| "learning_rate": 3.906752411575563e-05, | |
| "loss": 0.9528, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.228, | |
| "grad_norm": 19.79057502746582, | |
| "learning_rate": 3.886655948553055e-05, | |
| "loss": 0.9216, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.232, | |
| "grad_norm": 17.453460693359375, | |
| "learning_rate": 3.866559485530547e-05, | |
| "loss": 0.9834, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.236, | |
| "grad_norm": 29.218969345092773, | |
| "learning_rate": 3.846463022508039e-05, | |
| "loss": 0.9945, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 17.652963638305664, | |
| "learning_rate": 3.826366559485531e-05, | |
| "loss": 0.9308, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.244, | |
| "grad_norm": 20.76468849182129, | |
| "learning_rate": 3.806270096463023e-05, | |
| "loss": 0.8735, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.248, | |
| "grad_norm": 26.41815757751465, | |
| "learning_rate": 3.786173633440515e-05, | |
| "loss": 1.2878, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.252, | |
| "grad_norm": 41.02421951293945, | |
| "learning_rate": 3.7660771704180066e-05, | |
| "loss": 1.1299, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 17.34744644165039, | |
| "learning_rate": 3.7459807073954985e-05, | |
| "loss": 0.8315, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 14.293941497802734, | |
| "learning_rate": 3.725884244372991e-05, | |
| "loss": 0.8405, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.264, | |
| "grad_norm": 15.149956703186035, | |
| "learning_rate": 3.705787781350483e-05, | |
| "loss": 1.0297, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.268, | |
| "grad_norm": 17.754810333251953, | |
| "learning_rate": 3.685691318327975e-05, | |
| "loss": 0.9322, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 21.743669509887695, | |
| "learning_rate": 3.6655948553054666e-05, | |
| "loss": 0.9069, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.276, | |
| "grad_norm": 29.161598205566406, | |
| "learning_rate": 3.6454983922829585e-05, | |
| "loss": 0.8633, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 16.16539192199707, | |
| "learning_rate": 3.6254019292604503e-05, | |
| "loss": 0.7662, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.284, | |
| "grad_norm": 25.36922264099121, | |
| "learning_rate": 3.605305466237942e-05, | |
| "loss": 0.7945, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 24.251853942871094, | |
| "learning_rate": 3.585209003215435e-05, | |
| "loss": 0.9693, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.292, | |
| "grad_norm": 15.235057830810547, | |
| "learning_rate": 3.5651125401929266e-05, | |
| "loss": 0.8969, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.296, | |
| "grad_norm": 14.464040756225586, | |
| "learning_rate": 3.5450160771704185e-05, | |
| "loss": 0.7205, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 23.044424057006836, | |
| "learning_rate": 3.5249196141479104e-05, | |
| "loss": 0.839, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 25.620925903320312, | |
| "learning_rate": 3.504823151125402e-05, | |
| "loss": 0.8887, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.308, | |
| "grad_norm": 10.347396850585938, | |
| "learning_rate": 3.484726688102894e-05, | |
| "loss": 0.645, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.312, | |
| "grad_norm": 19.114471435546875, | |
| "learning_rate": 3.464630225080386e-05, | |
| "loss": 0.8341, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.316, | |
| "grad_norm": 17.528043746948242, | |
| "learning_rate": 3.4445337620578785e-05, | |
| "loss": 0.7108, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 13.186959266662598, | |
| "learning_rate": 3.4244372990353704e-05, | |
| "loss": 0.7991, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.324, | |
| "grad_norm": 25.02106475830078, | |
| "learning_rate": 3.404340836012862e-05, | |
| "loss": 0.7284, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.328, | |
| "grad_norm": 14.035198211669922, | |
| "learning_rate": 3.384244372990354e-05, | |
| "loss": 0.7589, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.332, | |
| "grad_norm": 11.368013381958008, | |
| "learning_rate": 3.364147909967846e-05, | |
| "loss": 0.7638, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 21.951080322265625, | |
| "learning_rate": 3.344051446945338e-05, | |
| "loss": 0.7869, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 17.966073989868164, | |
| "learning_rate": 3.32395498392283e-05, | |
| "loss": 0.6792, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.344, | |
| "grad_norm": 36.02198791503906, | |
| "learning_rate": 3.3038585209003216e-05, | |
| "loss": 0.6968, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.348, | |
| "grad_norm": 32.43560791015625, | |
| "learning_rate": 3.283762057877814e-05, | |
| "loss": 0.7523, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 30.29490852355957, | |
| "learning_rate": 3.263665594855306e-05, | |
| "loss": 0.6548, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.356, | |
| "grad_norm": 8.957921981811523, | |
| "learning_rate": 3.243569131832798e-05, | |
| "loss": 0.7151, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 15.583487510681152, | |
| "learning_rate": 3.22347266881029e-05, | |
| "loss": 0.625, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.364, | |
| "grad_norm": 23.470478057861328, | |
| "learning_rate": 3.2033762057877816e-05, | |
| "loss": 0.9571, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 31.515092849731445, | |
| "learning_rate": 3.1832797427652735e-05, | |
| "loss": 0.7395, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.372, | |
| "grad_norm": 14.246073722839355, | |
| "learning_rate": 3.1631832797427654e-05, | |
| "loss": 0.6884, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.376, | |
| "grad_norm": 25.352590560913086, | |
| "learning_rate": 3.143086816720258e-05, | |
| "loss": 0.772, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 14.441354751586914, | |
| "learning_rate": 3.12299035369775e-05, | |
| "loss": 0.7406, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 29.33234405517578, | |
| "learning_rate": 3.102893890675242e-05, | |
| "loss": 0.7242, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.388, | |
| "grad_norm": 16.018104553222656, | |
| "learning_rate": 3.0827974276527335e-05, | |
| "loss": 0.9183, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "grad_norm": 14.766180992126465, | |
| "learning_rate": 3.0627009646302254e-05, | |
| "loss": 0.8971, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.396, | |
| "grad_norm": 10.733450889587402, | |
| "learning_rate": 3.042604501607717e-05, | |
| "loss": 0.5546, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 15.318602561950684, | |
| "learning_rate": 3.0225080385852088e-05, | |
| "loss": 0.7582, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.404, | |
| "grad_norm": 16.220378875732422, | |
| "learning_rate": 3.0024115755627013e-05, | |
| "loss": 0.7034, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "grad_norm": 24.005294799804688, | |
| "learning_rate": 2.9823151125401932e-05, | |
| "loss": 0.692, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.412, | |
| "grad_norm": 10.050074577331543, | |
| "learning_rate": 2.962218649517685e-05, | |
| "loss": 0.6043, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 14.405035972595215, | |
| "learning_rate": 2.942122186495177e-05, | |
| "loss": 0.779, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 21.141399383544922, | |
| "learning_rate": 2.9220257234726688e-05, | |
| "loss": 0.6058, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "grad_norm": 12.982873916625977, | |
| "learning_rate": 2.9019292604501607e-05, | |
| "loss": 0.6479, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.428, | |
| "grad_norm": 11.213690757751465, | |
| "learning_rate": 2.8818327974276526e-05, | |
| "loss": 0.5357, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 9.400264739990234, | |
| "learning_rate": 2.861736334405145e-05, | |
| "loss": 0.7004, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.436, | |
| "grad_norm": 24.078365325927734, | |
| "learning_rate": 2.841639871382637e-05, | |
| "loss": 0.574, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 10.422481536865234, | |
| "learning_rate": 2.821543408360129e-05, | |
| "loss": 0.5758, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.444, | |
| "grad_norm": 14.57651424407959, | |
| "learning_rate": 2.8014469453376207e-05, | |
| "loss": 0.6732, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 11.885628700256348, | |
| "learning_rate": 2.7813504823151126e-05, | |
| "loss": 0.5472, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.452, | |
| "grad_norm": 14.696109771728516, | |
| "learning_rate": 2.7612540192926045e-05, | |
| "loss": 0.6875, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "grad_norm": 10.769063949584961, | |
| "learning_rate": 2.7411575562700963e-05, | |
| "loss": 0.63, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 14.238389015197754, | |
| "learning_rate": 2.7210610932475882e-05, | |
| "loss": 0.6574, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 10.263813972473145, | |
| "learning_rate": 2.7009646302250807e-05, | |
| "loss": 0.6817, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.468, | |
| "grad_norm": 18.851377487182617, | |
| "learning_rate": 2.6808681672025726e-05, | |
| "loss": 0.49, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "grad_norm": 17.136259078979492, | |
| "learning_rate": 2.6607717041800645e-05, | |
| "loss": 0.6844, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.476, | |
| "grad_norm": 12.520773887634277, | |
| "learning_rate": 2.6406752411575564e-05, | |
| "loss": 0.542, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 8.42679214477539, | |
| "learning_rate": 2.6205787781350482e-05, | |
| "loss": 0.517, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.484, | |
| "grad_norm": 16.89554214477539, | |
| "learning_rate": 2.60048231511254e-05, | |
| "loss": 0.5378, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "grad_norm": 20.790712356567383, | |
| "learning_rate": 2.580385852090032e-05, | |
| "loss": 0.4598, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.492, | |
| "grad_norm": 13.592977523803711, | |
| "learning_rate": 2.5602893890675245e-05, | |
| "loss": 0.5201, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 17.89519500732422, | |
| "learning_rate": 2.5401929260450164e-05, | |
| "loss": 0.5422, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 12.296363830566406, | |
| "learning_rate": 2.5200964630225083e-05, | |
| "loss": 0.5155, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "grad_norm": 14.820162773132324, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.5141, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.508, | |
| "grad_norm": 14.70346736907959, | |
| "learning_rate": 2.479903536977492e-05, | |
| "loss": 0.5812, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 18.85811424255371, | |
| "learning_rate": 2.4598070739549842e-05, | |
| "loss": 0.5159, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.516, | |
| "grad_norm": 12.222372055053711, | |
| "learning_rate": 2.439710610932476e-05, | |
| "loss": 0.4363, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 49.18086242675781, | |
| "learning_rate": 2.419614147909968e-05, | |
| "loss": 0.7475, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.524, | |
| "grad_norm": 12.499253273010254, | |
| "learning_rate": 2.3995176848874598e-05, | |
| "loss": 0.6813, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 14.978521347045898, | |
| "learning_rate": 2.379421221864952e-05, | |
| "loss": 0.5517, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.532, | |
| "grad_norm": 16.008121490478516, | |
| "learning_rate": 2.359324758842444e-05, | |
| "loss": 0.5561, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.536, | |
| "grad_norm": 17.769926071166992, | |
| "learning_rate": 2.3392282958199358e-05, | |
| "loss": 0.5745, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 11.014586448669434, | |
| "learning_rate": 2.3191318327974276e-05, | |
| "loss": 0.4567, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 10.055672645568848, | |
| "learning_rate": 2.29903536977492e-05, | |
| "loss": 0.5071, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.548, | |
| "grad_norm": 10.68797779083252, | |
| "learning_rate": 2.2789389067524117e-05, | |
| "loss": 0.6491, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.552, | |
| "grad_norm": 9.747801780700684, | |
| "learning_rate": 2.2588424437299036e-05, | |
| "loss": 0.4888, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.556, | |
| "grad_norm": 11.837730407714844, | |
| "learning_rate": 2.2387459807073958e-05, | |
| "loss": 0.4526, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 14.05400562286377, | |
| "learning_rate": 2.2186495176848876e-05, | |
| "loss": 0.6307, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.564, | |
| "grad_norm": 12.709571838378906, | |
| "learning_rate": 2.1985530546623795e-05, | |
| "loss": 0.427, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.568, | |
| "grad_norm": 10.895342826843262, | |
| "learning_rate": 2.1784565916398714e-05, | |
| "loss": 0.4546, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.572, | |
| "grad_norm": 13.848987579345703, | |
| "learning_rate": 2.1583601286173636e-05, | |
| "loss": 0.4159, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 16.61017608642578, | |
| "learning_rate": 2.1382636655948555e-05, | |
| "loss": 0.733, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 10.283616065979004, | |
| "learning_rate": 2.1181672025723473e-05, | |
| "loss": 0.4581, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.584, | |
| "grad_norm": 11.248019218444824, | |
| "learning_rate": 2.0980707395498395e-05, | |
| "loss": 0.462, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.588, | |
| "grad_norm": 10.817742347717285, | |
| "learning_rate": 2.0779742765273314e-05, | |
| "loss": 0.451, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 10.615836143493652, | |
| "learning_rate": 2.0578778135048233e-05, | |
| "loss": 0.5714, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.596, | |
| "grad_norm": 12.22169017791748, | |
| "learning_rate": 2.037781350482315e-05, | |
| "loss": 0.8718, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 13.216890335083008, | |
| "learning_rate": 2.0176848874598074e-05, | |
| "loss": 0.3766, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.604, | |
| "grad_norm": 13.472972869873047, | |
| "learning_rate": 1.9975884244372992e-05, | |
| "loss": 0.5891, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 20.179187774658203, | |
| "learning_rate": 1.977491961414791e-05, | |
| "loss": 0.3744, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.612, | |
| "grad_norm": 12.630617141723633, | |
| "learning_rate": 1.957395498392283e-05, | |
| "loss": 0.4564, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.616, | |
| "grad_norm": 20.459508895874023, | |
| "learning_rate": 1.9372990353697752e-05, | |
| "loss": 0.4566, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 16.580251693725586, | |
| "learning_rate": 1.917202572347267e-05, | |
| "loss": 0.447, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 13.90858268737793, | |
| "learning_rate": 1.897106109324759e-05, | |
| "loss": 0.4133, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.628, | |
| "grad_norm": 10.297750473022461, | |
| "learning_rate": 1.877009646302251e-05, | |
| "loss": 0.9703, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.632, | |
| "grad_norm": 19.886884689331055, | |
| "learning_rate": 1.856913183279743e-05, | |
| "loss": 0.6253, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.636, | |
| "grad_norm": 10.709681510925293, | |
| "learning_rate": 1.836816720257235e-05, | |
| "loss": 0.4657, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 16.986331939697266, | |
| "learning_rate": 1.8167202572347267e-05, | |
| "loss": 0.3986, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.644, | |
| "grad_norm": 13.649085998535156, | |
| "learning_rate": 1.796623794212219e-05, | |
| "loss": 0.4337, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.648, | |
| "grad_norm": 7.645134449005127, | |
| "learning_rate": 1.7765273311897108e-05, | |
| "loss": 0.3901, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.652, | |
| "grad_norm": 11.727263450622559, | |
| "learning_rate": 1.7564308681672027e-05, | |
| "loss": 0.3545, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 6.705881595611572, | |
| "learning_rate": 1.736334405144695e-05, | |
| "loss": 0.3471, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 12.363304138183594, | |
| "learning_rate": 1.7162379421221868e-05, | |
| "loss": 0.4351, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.664, | |
| "grad_norm": 20.208723068237305, | |
| "learning_rate": 1.6961414790996786e-05, | |
| "loss": 0.4284, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.668, | |
| "grad_norm": 10.82363224029541, | |
| "learning_rate": 1.6760450160771705e-05, | |
| "loss": 0.3369, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 9.544486045837402, | |
| "learning_rate": 1.6559485530546627e-05, | |
| "loss": 0.4059, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.676, | |
| "grad_norm": 8.426627159118652, | |
| "learning_rate": 1.6358520900321546e-05, | |
| "loss": 0.4494, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 8.424084663391113, | |
| "learning_rate": 1.6157556270096464e-05, | |
| "loss": 0.4807, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.684, | |
| "grad_norm": 9.49954891204834, | |
| "learning_rate": 1.5956591639871383e-05, | |
| "loss": 0.3647, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 14.690208435058594, | |
| "learning_rate": 1.5755627009646305e-05, | |
| "loss": 0.3715, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.692, | |
| "grad_norm": 12.074922561645508, | |
| "learning_rate": 1.5554662379421224e-05, | |
| "loss": 0.491, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.696, | |
| "grad_norm": 13.278485298156738, | |
| "learning_rate": 1.5353697749196143e-05, | |
| "loss": 0.4185, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 12.987263679504395, | |
| "learning_rate": 1.5152733118971063e-05, | |
| "loss": 0.5613, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 6.863049030303955, | |
| "learning_rate": 1.4951768488745982e-05, | |
| "loss": 0.3245, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.708, | |
| "grad_norm": 11.087668418884277, | |
| "learning_rate": 1.47508038585209e-05, | |
| "loss": 0.4174, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.712, | |
| "grad_norm": 5.16309118270874, | |
| "learning_rate": 1.4549839228295819e-05, | |
| "loss": 0.3233, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.716, | |
| "grad_norm": 12.031776428222656, | |
| "learning_rate": 1.4348874598070741e-05, | |
| "loss": 0.3574, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 13.569413185119629, | |
| "learning_rate": 1.414790996784566e-05, | |
| "loss": 0.5619, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.724, | |
| "grad_norm": 5.905683517456055, | |
| "learning_rate": 1.3946945337620579e-05, | |
| "loss": 0.3286, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.728, | |
| "grad_norm": 10.061610221862793, | |
| "learning_rate": 1.3745980707395497e-05, | |
| "loss": 0.3899, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.732, | |
| "grad_norm": 12.293854713439941, | |
| "learning_rate": 1.354501607717042e-05, | |
| "loss": 0.4059, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 13.248871803283691, | |
| "learning_rate": 1.3344051446945338e-05, | |
| "loss": 0.4339, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 9.589434623718262, | |
| "learning_rate": 1.3143086816720257e-05, | |
| "loss": 0.4178, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.744, | |
| "grad_norm": 8.538604736328125, | |
| "learning_rate": 1.2942122186495179e-05, | |
| "loss": 0.3152, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.748, | |
| "grad_norm": 18.58129119873047, | |
| "learning_rate": 1.2741157556270097e-05, | |
| "loss": 0.4276, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 8.69501781463623, | |
| "learning_rate": 1.2540192926045016e-05, | |
| "loss": 0.4304, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.756, | |
| "grad_norm": 14.74836254119873, | |
| "learning_rate": 1.2339228295819937e-05, | |
| "loss": 0.3541, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 7.415429592132568, | |
| "learning_rate": 1.2138263665594855e-05, | |
| "loss": 0.3713, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.764, | |
| "grad_norm": 8.78702163696289, | |
| "learning_rate": 1.1937299035369776e-05, | |
| "loss": 0.3025, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 2.6222591400146484, | |
| "learning_rate": 1.1736334405144696e-05, | |
| "loss": 0.2279, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.772, | |
| "grad_norm": 8.457213401794434, | |
| "learning_rate": 1.1535369774919615e-05, | |
| "loss": 0.3841, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.776, | |
| "grad_norm": 9.097604751586914, | |
| "learning_rate": 1.1334405144694535e-05, | |
| "loss": 0.3436, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 9.933280944824219, | |
| "learning_rate": 1.1133440514469454e-05, | |
| "loss": 0.4276, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 9.58340072631836, | |
| "learning_rate": 1.0932475884244374e-05, | |
| "loss": 0.281, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.788, | |
| "grad_norm": 13.846723556518555, | |
| "learning_rate": 1.0731511254019293e-05, | |
| "loss": 0.2836, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.792, | |
| "grad_norm": 30.122060775756836, | |
| "learning_rate": 1.0530546623794213e-05, | |
| "loss": 0.3722, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.796, | |
| "grad_norm": 8.666303634643555, | |
| "learning_rate": 1.0329581993569132e-05, | |
| "loss": 0.2778, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 7.968908786773682, | |
| "learning_rate": 1.0128617363344052e-05, | |
| "loss": 0.2778, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.804, | |
| "grad_norm": 6.987481594085693, | |
| "learning_rate": 9.927652733118971e-06, | |
| "loss": 0.2778, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.808, | |
| "grad_norm": 11.547746658325195, | |
| "learning_rate": 9.726688102893891e-06, | |
| "loss": 0.3181, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.812, | |
| "grad_norm": 7.187608242034912, | |
| "learning_rate": 9.525723472668812e-06, | |
| "loss": 0.3211, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 14.975872039794922, | |
| "learning_rate": 9.32475884244373e-06, | |
| "loss": 0.2335, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 5.20744514465332, | |
| "learning_rate": 9.123794212218651e-06, | |
| "loss": 0.3012, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.824, | |
| "grad_norm": 9.876429557800293, | |
| "learning_rate": 8.92282958199357e-06, | |
| "loss": 0.3095, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.828, | |
| "grad_norm": 7.847969055175781, | |
| "learning_rate": 8.72186495176849e-06, | |
| "loss": 0.3336, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 5.847342014312744, | |
| "learning_rate": 8.520900321543409e-06, | |
| "loss": 0.3471, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.836, | |
| "grad_norm": 12.866349220275879, | |
| "learning_rate": 8.319935691318329e-06, | |
| "loss": 0.5096, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 5.676148891448975, | |
| "learning_rate": 8.118971061093248e-06, | |
| "loss": 0.3028, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.844, | |
| "grad_norm": 10.639591217041016, | |
| "learning_rate": 7.918006430868168e-06, | |
| "loss": 0.3065, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 4.760974884033203, | |
| "learning_rate": 7.717041800643089e-06, | |
| "loss": 0.2966, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.852, | |
| "grad_norm": 11.585098266601562, | |
| "learning_rate": 7.516077170418006e-06, | |
| "loss": 0.3466, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.856, | |
| "grad_norm": 4.685300827026367, | |
| "learning_rate": 7.315112540192927e-06, | |
| "loss": 0.397, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 7.504604339599609, | |
| "learning_rate": 7.1141479099678455e-06, | |
| "loss": 0.2216, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 16.2221736907959, | |
| "learning_rate": 6.913183279742766e-06, | |
| "loss": 0.3863, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.868, | |
| "grad_norm": 9.542070388793945, | |
| "learning_rate": 6.732315112540192e-06, | |
| "loss": 0.2478, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.872, | |
| "grad_norm": 12.575300216674805, | |
| "learning_rate": 6.531350482315113e-06, | |
| "loss": 0.3366, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.876, | |
| "grad_norm": 12.5275297164917, | |
| "learning_rate": 6.330385852090033e-06, | |
| "loss": 0.42, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 7.2454514503479, | |
| "learning_rate": 6.129421221864952e-06, | |
| "loss": 0.2943, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.884, | |
| "grad_norm": 5.587230682373047, | |
| "learning_rate": 5.928456591639871e-06, | |
| "loss": 0.2124, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.888, | |
| "grad_norm": 12.111313819885254, | |
| "learning_rate": 5.727491961414791e-06, | |
| "loss": 0.4733, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.892, | |
| "grad_norm": 13.287424087524414, | |
| "learning_rate": 5.526527331189711e-06, | |
| "loss": 0.3033, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 16.977447509765625, | |
| "learning_rate": 5.325562700964631e-06, | |
| "loss": 0.3256, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 6.201746463775635, | |
| "learning_rate": 5.12459807073955e-06, | |
| "loss": 0.1995, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.904, | |
| "grad_norm": 10.706707954406738, | |
| "learning_rate": 4.92363344051447e-06, | |
| "loss": 0.3303, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.908, | |
| "grad_norm": 14.989971160888672, | |
| "learning_rate": 4.7226688102893895e-06, | |
| "loss": 0.2842, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 15.570676803588867, | |
| "learning_rate": 4.521704180064309e-06, | |
| "loss": 0.212, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.916, | |
| "grad_norm": 3.9141690731048584, | |
| "learning_rate": 4.320739549839229e-06, | |
| "loss": 0.263, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 6.531361103057861, | |
| "learning_rate": 4.119774919614148e-06, | |
| "loss": 0.2503, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.924, | |
| "grad_norm": 8.718388557434082, | |
| "learning_rate": 3.918810289389068e-06, | |
| "loss": 0.2793, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 11.070220947265625, | |
| "learning_rate": 3.7178456591639876e-06, | |
| "loss": 0.2585, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.932, | |
| "grad_norm": 7.7603759765625, | |
| "learning_rate": 3.516881028938907e-06, | |
| "loss": 0.5099, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.936, | |
| "grad_norm": 15.190449714660645, | |
| "learning_rate": 3.3159163987138267e-06, | |
| "loss": 0.2601, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 9.578422546386719, | |
| "learning_rate": 3.1149517684887463e-06, | |
| "loss": 0.264, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 10.097108840942383, | |
| "learning_rate": 2.913987138263666e-06, | |
| "loss": 0.6162, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.948, | |
| "grad_norm": 8.5523042678833, | |
| "learning_rate": 2.7130225080385853e-06, | |
| "loss": 0.2649, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.952, | |
| "grad_norm": 8.127545356750488, | |
| "learning_rate": 2.512057877813505e-06, | |
| "loss": 0.4394, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.956, | |
| "grad_norm": 8.142045021057129, | |
| "learning_rate": 2.311093247588425e-06, | |
| "loss": 0.2339, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 6.302300930023193, | |
| "learning_rate": 2.1101286173633444e-06, | |
| "loss": 0.2135, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.964, | |
| "grad_norm": 4.838590621948242, | |
| "learning_rate": 1.909163987138264e-06, | |
| "loss": 0.3181, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.968, | |
| "grad_norm": 6.775151252746582, | |
| "learning_rate": 1.7081993569131833e-06, | |
| "loss": 0.2516, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.972, | |
| "grad_norm": 13.491608619689941, | |
| "learning_rate": 1.507234726688103e-06, | |
| "loss": 0.2294, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 7.067889213562012, | |
| "learning_rate": 1.3062700964630226e-06, | |
| "loss": 0.2206, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 6.7473530769348145, | |
| "learning_rate": 1.1053054662379423e-06, | |
| "loss": 0.2003, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.984, | |
| "grad_norm": 9.074109077453613, | |
| "learning_rate": 9.043408360128617e-07, | |
| "loss": 0.2343, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.988, | |
| "grad_norm": 10.73699951171875, | |
| "learning_rate": 7.033762057877814e-07, | |
| "loss": 0.6721, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 5.29847526550293, | |
| "learning_rate": 5.02411575562701e-07, | |
| "loss": 0.236, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.996, | |
| "grad_norm": 10.172593116760254, | |
| "learning_rate": 3.014469453376206e-07, | |
| "loss": 0.216, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 16.507993698120117, | |
| "learning_rate": 1.0048231511254019e-07, | |
| "loss": 0.2836, | |
| "step": 2500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.513321594098893e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |