{
  "best_global_step": 3500,
  "best_metric": 0.12329943478107452,
  "best_model_checkpoint": "./deit-ena24/checkpoint-3500",
  "epoch": 7.0,
  "eval_steps": 100,
  "global_step": 5376,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013020833333333334,
      "grad_norm": 14.326886177062988,
      "learning_rate": 0.00019970238095238098,
      "loss": 2.8758,
      "step": 10
    },
    {
      "epoch": 0.026041666666666668,
      "grad_norm": 11.887316703796387,
      "learning_rate": 0.00019933035714285715,
      "loss": 2.3464,
      "step": 20
    },
    {
      "epoch": 0.0390625,
      "grad_norm": 8.772107124328613,
      "learning_rate": 0.00019895833333333332,
      "loss": 2.1228,
      "step": 30
    },
    {
      "epoch": 0.052083333333333336,
      "grad_norm": 10.218932151794434,
      "learning_rate": 0.00019858630952380955,
      "loss": 1.794,
      "step": 40
    },
    {
      "epoch": 0.06510416666666667,
      "grad_norm": 14.392820358276367,
      "learning_rate": 0.00019821428571428572,
      "loss": 1.335,
      "step": 50
    },
    {
      "epoch": 0.078125,
      "grad_norm": 6.984030723571777,
      "learning_rate": 0.00019784226190476192,
      "loss": 1.3528,
      "step": 60
    },
    {
      "epoch": 0.09114583333333333,
      "grad_norm": 14.042470932006836,
      "learning_rate": 0.0001974702380952381,
      "loss": 1.5397,
      "step": 70
    },
    {
      "epoch": 0.10416666666666667,
      "grad_norm": 7.9572296142578125,
      "learning_rate": 0.0001970982142857143,
      "loss": 0.9913,
      "step": 80
    },
    {
      "epoch": 0.1171875,
      "grad_norm": 12.12057113647461,
      "learning_rate": 0.00019672619047619047,
      "loss": 1.0173,
      "step": 90
    },
    {
      "epoch": 0.13020833333333334,
      "grad_norm": 11.924510955810547,
      "learning_rate": 0.00019635416666666667,
      "loss": 1.396,
      "step": 100
    },
    {
      "epoch": 0.13020833333333334,
      "eval_accuracy": 0.7106870229007634,
      "eval_f1": 0.6601507828100853,
      "eval_loss": 1.0114130973815918,
      "eval_runtime": 81.0113,
      "eval_samples_per_second": 16.171,
      "eval_steps_per_second": 2.024,
      "step": 100
    },
    {
      "epoch": 0.14322916666666666,
      "grad_norm": 5.5064802169799805,
      "learning_rate": 0.00019598214285714287,
      "loss": 0.9496,
      "step": 110
    },
    {
      "epoch": 0.15625,
      "grad_norm": 9.486794471740723,
      "learning_rate": 0.00019561011904761905,
      "loss": 0.9777,
      "step": 120
    },
    {
      "epoch": 0.16927083333333334,
      "grad_norm": 4.409852981567383,
      "learning_rate": 0.00019523809523809525,
      "loss": 1.137,
      "step": 130
    },
    {
      "epoch": 0.18229166666666666,
      "grad_norm": 8.373451232910156,
      "learning_rate": 0.00019486607142857142,
      "loss": 1.1658,
      "step": 140
    },
    {
      "epoch": 0.1953125,
      "grad_norm": 11.354508399963379,
      "learning_rate": 0.00019449404761904765,
      "loss": 0.795,
      "step": 150
    },
    {
      "epoch": 0.20833333333333334,
      "grad_norm": 12.690673828125,
      "learning_rate": 0.00019412202380952382,
      "loss": 1.1273,
      "step": 160
    },
    {
      "epoch": 0.22135416666666666,
      "grad_norm": 14.219934463500977,
      "learning_rate": 0.00019375000000000002,
      "loss": 0.7724,
      "step": 170
    },
    {
      "epoch": 0.234375,
      "grad_norm": 11.321800231933594,
      "learning_rate": 0.0001934151785714286,
      "loss": 1.4944,
      "step": 180
    },
    {
      "epoch": 0.24739583333333334,
      "grad_norm": 9.014796257019043,
      "learning_rate": 0.00019304315476190476,
      "loss": 0.7343,
      "step": 190
    },
    {
      "epoch": 0.2604166666666667,
      "grad_norm": 7.218810081481934,
      "learning_rate": 0.00019267113095238096,
      "loss": 1.0428,
      "step": 200
    },
    {
      "epoch": 0.2604166666666667,
      "eval_accuracy": 0.7938931297709924,
      "eval_f1": 0.7693871519967889,
      "eval_loss": 0.7400494813919067,
      "eval_runtime": 81.6318,
      "eval_samples_per_second": 16.048,
      "eval_steps_per_second": 2.009,
      "step": 200
    },
    {
      "epoch": 0.2734375,
      "grad_norm": 8.421069145202637,
      "learning_rate": 0.00019229910714285714,
      "loss": 0.5237,
      "step": 210
    },
    {
      "epoch": 0.2864583333333333,
      "grad_norm": 13.428995132446289,
      "learning_rate": 0.00019192708333333334,
      "loss": 0.8903,
      "step": 220
    },
    {
      "epoch": 0.2994791666666667,
      "grad_norm": 19.187602996826172,
      "learning_rate": 0.00019155505952380954,
      "loss": 0.6695,
      "step": 230
    },
    {
      "epoch": 0.3125,
      "grad_norm": 15.194157600402832,
      "learning_rate": 0.00019118303571428574,
      "loss": 0.7389,
      "step": 240
    },
    {
      "epoch": 0.3255208333333333,
      "grad_norm": 8.316120147705078,
      "learning_rate": 0.0001908110119047619,
      "loss": 0.553,
      "step": 250
    },
    {
      "epoch": 0.3385416666666667,
      "grad_norm": 11.237637519836426,
      "learning_rate": 0.0001904389880952381,
      "loss": 0.8798,
      "step": 260
    },
    {
      "epoch": 0.3515625,
      "grad_norm": 5.77901554107666,
      "learning_rate": 0.00019006696428571429,
      "loss": 0.628,
      "step": 270
    },
    {
      "epoch": 0.3645833333333333,
      "grad_norm": 7.172740936279297,
      "learning_rate": 0.00018969494047619049,
      "loss": 0.739,
      "step": 280
    },
    {
      "epoch": 0.3776041666666667,
      "grad_norm": 6.8057403564453125,
      "learning_rate": 0.0001893229166666667,
      "loss": 0.5944,
      "step": 290
    },
    {
      "epoch": 0.390625,
      "grad_norm": 6.600423336029053,
      "learning_rate": 0.00018895089285714286,
      "loss": 0.6952,
      "step": 300
    },
    {
      "epoch": 0.390625,
      "eval_accuracy": 0.816030534351145,
      "eval_f1": 0.798128977659313,
      "eval_loss": 0.6129177808761597,
      "eval_runtime": 81.4399,
      "eval_samples_per_second": 16.085,
      "eval_steps_per_second": 2.014,
      "step": 300
    },
    {
      "epoch": 0.4036458333333333,
      "grad_norm": 3.195241689682007,
      "learning_rate": 0.00018857886904761906,
      "loss": 0.6764,
      "step": 310
    },
    {
      "epoch": 0.4166666666666667,
      "grad_norm": 5.749640941619873,
      "learning_rate": 0.00018820684523809523,
      "loss": 0.5771,
      "step": 320
    },
    {
      "epoch": 0.4296875,
      "grad_norm": 3.773176908493042,
      "learning_rate": 0.00018783482142857143,
      "loss": 0.5225,
      "step": 330
    },
    {
      "epoch": 0.4427083333333333,
      "grad_norm": 0.6245913505554199,
      "learning_rate": 0.00018746279761904763,
      "loss": 0.3174,
      "step": 340
    },
    {
      "epoch": 0.4557291666666667,
      "grad_norm": 19.686187744140625,
      "learning_rate": 0.00018709077380952384,
      "loss": 0.475,
      "step": 350
    },
    {
      "epoch": 0.46875,
      "grad_norm": 11.524737358093262,
      "learning_rate": 0.00018671875,
      "loss": 0.5356,
      "step": 360
    },
    {
      "epoch": 0.4817708333333333,
      "grad_norm": 5.832364082336426,
      "learning_rate": 0.00018634672619047618,
      "loss": 0.6856,
      "step": 370
    },
    {
      "epoch": 0.4947916666666667,
      "grad_norm": 6.053863048553467,
      "learning_rate": 0.00018597470238095238,
      "loss": 0.772,
      "step": 380
    },
    {
      "epoch": 0.5078125,
      "grad_norm": 8.778549194335938,
      "learning_rate": 0.00018560267857142858,
      "loss": 0.7792,
      "step": 390
    },
    {
      "epoch": 0.5208333333333334,
      "grad_norm": 5.532504081726074,
      "learning_rate": 0.00018523065476190478,
      "loss": 0.4429,
      "step": 400
    },
    {
      "epoch": 0.5208333333333334,
      "eval_accuracy": 0.8618320610687022,
      "eval_f1": 0.8171047071835743,
      "eval_loss": 0.49910008907318115,
      "eval_runtime": 81.7142,
      "eval_samples_per_second": 16.031,
      "eval_steps_per_second": 2.007,
      "step": 400
    },
    {
      "epoch": 0.5338541666666666,
      "grad_norm": 9.393836975097656,
      "learning_rate": 0.00018485863095238096,
      "loss": 0.6779,
      "step": 410
    },
    {
      "epoch": 0.546875,
      "grad_norm": 6.4658894538879395,
      "learning_rate": 0.00018448660714285716,
      "loss": 0.3908,
      "step": 420
    },
    {
      "epoch": 0.5598958333333334,
      "grad_norm": 10.486028671264648,
      "learning_rate": 0.00018411458333333333,
      "loss": 0.4656,
      "step": 430
    },
    {
      "epoch": 0.5729166666666666,
      "grad_norm": 5.66467809677124,
      "learning_rate": 0.00018374255952380953,
      "loss": 0.4538,
      "step": 440
    },
    {
      "epoch": 0.5859375,
      "grad_norm": 4.779189586639404,
      "learning_rate": 0.00018337053571428573,
      "loss": 0.6714,
      "step": 450
    },
    {
      "epoch": 0.5989583333333334,
      "grad_norm": 10.495992660522461,
      "learning_rate": 0.0001829985119047619,
      "loss": 0.6211,
      "step": 460
    },
    {
      "epoch": 0.6119791666666666,
      "grad_norm": 14.25648307800293,
      "learning_rate": 0.0001826264880952381,
      "loss": 0.764,
      "step": 470
    },
    {
      "epoch": 0.625,
      "grad_norm": 11.401782035827637,
      "learning_rate": 0.00018225446428571428,
      "loss": 0.425,
      "step": 480
    },
    {
      "epoch": 0.6380208333333334,
      "grad_norm": 1.5994138717651367,
      "learning_rate": 0.00018188244047619048,
      "loss": 0.3748,
      "step": 490
    },
    {
      "epoch": 0.6510416666666666,
      "grad_norm": 4.534453868865967,
      "learning_rate": 0.00018151041666666668,
      "loss": 0.5441,
      "step": 500
    },
    {
      "epoch": 0.6510416666666666,
      "eval_accuracy": 0.8839694656488549,
      "eval_f1": 0.8630539968188508,
      "eval_loss": 0.4392457902431488,
      "eval_runtime": 80.6437,
      "eval_samples_per_second": 16.244,
      "eval_steps_per_second": 2.034,
      "step": 500
    },
    {
      "epoch": 0.6640625,
      "grad_norm": 6.4235405921936035,
      "learning_rate": 0.00018113839285714288,
      "loss": 0.6159,
      "step": 510
    },
    {
      "epoch": 0.6770833333333334,
      "grad_norm": 13.802733421325684,
      "learning_rate": 0.00018076636904761905,
      "loss": 0.6188,
      "step": 520
    },
    {
      "epoch": 0.6901041666666666,
      "grad_norm": 5.669039726257324,
      "learning_rate": 0.00018039434523809525,
      "loss": 0.626,
      "step": 530
    },
    {
      "epoch": 0.703125,
      "grad_norm": 10.001558303833008,
      "learning_rate": 0.00018002232142857143,
      "loss": 0.3108,
      "step": 540
    },
    {
      "epoch": 0.7161458333333334,
      "grad_norm": 10.934700012207031,
      "learning_rate": 0.00017965029761904763,
      "loss": 0.444,
      "step": 550
    },
    {
      "epoch": 0.7291666666666666,
      "grad_norm": 0.13122598826885223,
      "learning_rate": 0.00017927827380952383,
      "loss": 0.5144,
      "step": 560
    },
    {
      "epoch": 0.7421875,
      "grad_norm": 16.866697311401367,
      "learning_rate": 0.00017890625,
      "loss": 0.3035,
      "step": 570
    },
    {
      "epoch": 0.7552083333333334,
      "grad_norm": 7.949023723602295,
      "learning_rate": 0.0001785342261904762,
      "loss": 0.3475,
      "step": 580
    },
    {
      "epoch": 0.7682291666666666,
      "grad_norm": 12.488534927368164,
      "learning_rate": 0.00017816220238095238,
      "loss": 0.2507,
      "step": 590
    },
    {
      "epoch": 0.78125,
      "grad_norm": 8.129867553710938,
      "learning_rate": 0.00017779017857142858,
      "loss": 0.4533,
      "step": 600
    },
    {
      "epoch": 0.78125,
      "eval_accuracy": 0.898473282442748,
      "eval_f1": 0.8765276393244534,
      "eval_loss": 0.41201263666152954,
      "eval_runtime": 81.9039,
      "eval_samples_per_second": 15.994,
      "eval_steps_per_second": 2.002,
      "step": 600
    },
    {
      "epoch": 0.7942708333333334,
      "grad_norm": 1.629770278930664,
      "learning_rate": 0.00017741815476190478,
      "loss": 0.3667,
      "step": 610
    },
    {
      "epoch": 0.8072916666666666,
      "grad_norm": 6.583140850067139,
      "learning_rate": 0.00017704613095238095,
      "loss": 0.4283,
      "step": 620
    },
    {
      "epoch": 0.8203125,
      "grad_norm": 10.798684120178223,
      "learning_rate": 0.00017667410714285715,
      "loss": 0.4709,
      "step": 630
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 0.880183219909668,
      "learning_rate": 0.00017630208333333332,
      "loss": 0.4416,
      "step": 640
    },
    {
      "epoch": 0.8463541666666666,
      "grad_norm": 6.968727111816406,
      "learning_rate": 0.00017593005952380955,
      "loss": 0.2443,
      "step": 650
    },
    {
      "epoch": 0.859375,
      "grad_norm": 8.899871826171875,
      "learning_rate": 0.00017555803571428572,
      "loss": 0.3888,
      "step": 660
    },
    {
      "epoch": 0.8723958333333334,
      "grad_norm": 0.15646734833717346,
      "learning_rate": 0.00017518601190476193,
      "loss": 0.4988,
      "step": 670
    },
    {
      "epoch": 0.8854166666666666,
      "grad_norm": 9.366886138916016,
      "learning_rate": 0.0001748139880952381,
      "loss": 0.512,
      "step": 680
    },
    {
      "epoch": 0.8984375,
      "grad_norm": 8.295921325683594,
      "learning_rate": 0.0001744419642857143,
      "loss": 0.8808,
      "step": 690
    },
    {
      "epoch": 0.9114583333333334,
      "grad_norm": 5.960359573364258,
      "learning_rate": 0.00017406994047619047,
      "loss": 0.1213,
      "step": 700
    },
    {
      "epoch": 0.9114583333333334,
      "eval_accuracy": 0.8916030534351145,
      "eval_f1": 0.8738406978318818,
      "eval_loss": 0.39530593156814575,
      "eval_runtime": 82.1784,
      "eval_samples_per_second": 15.941,
      "eval_steps_per_second": 1.996,
      "step": 700
    },
    {
      "epoch": 0.9244791666666666,
      "grad_norm": 7.695637226104736,
      "learning_rate": 0.00017369791666666667,
      "loss": 0.4612,
      "step": 710
    },
    {
      "epoch": 0.9375,
      "grad_norm": 18.374658584594727,
      "learning_rate": 0.00017332589285714287,
      "loss": 0.7359,
      "step": 720
    },
    {
      "epoch": 0.9505208333333334,
      "grad_norm": 1.4197808504104614,
      "learning_rate": 0.00017295386904761905,
      "loss": 0.4227,
      "step": 730
    },
    {
      "epoch": 0.9635416666666666,
      "grad_norm": 6.668863296508789,
      "learning_rate": 0.00017258184523809525,
      "loss": 0.3364,
      "step": 740
    },
    {
      "epoch": 0.9765625,
      "grad_norm": 1.1208908557891846,
      "learning_rate": 0.00017220982142857142,
      "loss": 0.5498,
      "step": 750
    },
    {
      "epoch": 0.9895833333333334,
      "grad_norm": 0.032729148864746094,
      "learning_rate": 0.00017183779761904765,
      "loss": 0.4161,
      "step": 760
    },
    {
      "epoch": 1.0026041666666667,
      "grad_norm": 9.334514617919922,
      "learning_rate": 0.00017146577380952382,
      "loss": 0.2128,
      "step": 770
    },
    {
      "epoch": 1.015625,
      "grad_norm": 1.7203080654144287,
      "learning_rate": 0.00017109375,
      "loss": 0.2949,
      "step": 780
    },
    {
      "epoch": 1.0286458333333333,
      "grad_norm": 0.1545121818780899,
      "learning_rate": 0.0001707217261904762,
      "loss": 0.3386,
      "step": 790
    },
    {
      "epoch": 1.0416666666666667,
      "grad_norm": 3.9173028469085693,
      "learning_rate": 0.00017034970238095237,
      "loss": 0.1151,
      "step": 800
    },
    {
      "epoch": 1.0416666666666667,
      "eval_accuracy": 0.9236641221374046,
      "eval_f1": 0.9141492016135527,
      "eval_loss": 0.31464245915412903,
      "eval_runtime": 81.3626,
      "eval_samples_per_second": 16.101,
      "eval_steps_per_second": 2.016,
      "step": 800
    },
    {
      "epoch": 1.0546875,
      "grad_norm": 9.882590293884277,
      "learning_rate": 0.0001699776785714286,
      "loss": 0.1838,
      "step": 810
    },
    {
      "epoch": 1.0677083333333333,
      "grad_norm": 6.242782115936279,
      "learning_rate": 0.00016960565476190477,
      "loss": 0.1449,
      "step": 820
    },
    {
      "epoch": 1.0807291666666667,
      "grad_norm": 8.182112693786621,
      "learning_rate": 0.00016923363095238097,
      "loss": 0.1741,
      "step": 830
    },
    {
      "epoch": 1.09375,
      "grad_norm": 1.6447360515594482,
      "learning_rate": 0.00016886160714285714,
      "loss": 0.103,
      "step": 840
    },
    {
      "epoch": 1.1067708333333333,
      "grad_norm": 0.6450613141059875,
      "learning_rate": 0.00016848958333333334,
      "loss": 0.1066,
      "step": 850
    },
    {
      "epoch": 1.1197916666666667,
      "grad_norm": 17.398643493652344,
      "learning_rate": 0.00016811755952380954,
      "loss": 0.1932,
      "step": 860
    },
    {
      "epoch": 1.1328125,
      "grad_norm": 0.9206380844116211,
      "learning_rate": 0.00016774553571428572,
      "loss": 0.318,
      "step": 870
    },
    {
      "epoch": 1.1458333333333333,
      "grad_norm": 15.930572509765625,
      "learning_rate": 0.00016737351190476192,
      "loss": 0.368,
      "step": 880
    },
    {
      "epoch": 1.1588541666666667,
      "grad_norm": 11.435195922851562,
      "learning_rate": 0.0001670014880952381,
      "loss": 0.5996,
      "step": 890
    },
    {
      "epoch": 1.171875,
      "grad_norm": 7.578676223754883,
      "learning_rate": 0.0001666294642857143,
      "loss": 0.0953,
      "step": 900
    },
    {
      "epoch": 1.171875,
      "eval_accuracy": 0.9015267175572519,
      "eval_f1": 0.8785814180999953,
      "eval_loss": 0.46558240056037903,
      "eval_runtime": 81.8401,
      "eval_samples_per_second": 16.007,
      "eval_steps_per_second": 2.004,
      "step": 900
    },
    {
      "epoch": 1.1848958333333333,
      "grad_norm": 0.44435903429985046,
      "learning_rate": 0.00016625744047619047,
      "loss": 0.1502,
      "step": 910
    },
    {
      "epoch": 1.1979166666666667,
      "grad_norm": 16.61111831665039,
      "learning_rate": 0.0001658854166666667,
      "loss": 0.2223,
      "step": 920
    },
    {
      "epoch": 1.2109375,
      "grad_norm": 13.735160827636719,
      "learning_rate": 0.00016551339285714287,
      "loss": 0.2065,
      "step": 930
    },
    {
      "epoch": 1.2239583333333333,
      "grad_norm": 16.117063522338867,
      "learning_rate": 0.00016514136904761907,
      "loss": 0.3977,
      "step": 940
    },
    {
      "epoch": 1.2369791666666667,
      "grad_norm": 9.875779151916504,
      "learning_rate": 0.00016476934523809524,
      "loss": 0.456,
      "step": 950
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.8816986083984375,
      "learning_rate": 0.0001643973214285714,
      "loss": 0.3129,
      "step": 960
    },
    {
      "epoch": 1.2630208333333333,
      "grad_norm": 3.4509758949279785,
      "learning_rate": 0.00016402529761904764,
      "loss": 0.225,
      "step": 970
    },
    {
      "epoch": 1.2760416666666667,
      "grad_norm": 0.08307598531246185,
      "learning_rate": 0.00016365327380952381,
      "loss": 0.2918,
      "step": 980
    },
    {
      "epoch": 1.2890625,
      "grad_norm": 14.681130409240723,
      "learning_rate": 0.00016328125000000001,
      "loss": 0.3139,
      "step": 990
    },
    {
      "epoch": 1.3020833333333333,
      "grad_norm": 1.1951783895492554,
      "learning_rate": 0.0001629092261904762,
      "loss": 0.1876,
      "step": 1000
    },
    {
      "epoch": 1.3020833333333333,
      "eval_accuracy": 0.916793893129771,
      "eval_f1": 0.9023348397180161,
      "eval_loss": 0.31640931963920593,
      "eval_runtime": 81.4055,
      "eval_samples_per_second": 16.092,
      "eval_steps_per_second": 2.015,
      "step": 1000
    },
    {
      "epoch": 1.3151041666666667,
      "grad_norm": 6.059875011444092,
      "learning_rate": 0.0001625372023809524,
      "loss": 0.1299,
      "step": 1010
    },
    {
      "epoch": 1.328125,
      "grad_norm": 1.8086495399475098,
      "learning_rate": 0.0001621651785714286,
      "loss": 0.3216,
      "step": 1020
    },
    {
      "epoch": 1.3411458333333333,
      "grad_norm": 7.017325401306152,
      "learning_rate": 0.00016179315476190476,
      "loss": 0.2612,
      "step": 1030
    },
    {
      "epoch": 1.3541666666666667,
      "grad_norm": 6.558719635009766,
      "learning_rate": 0.00016142113095238096,
      "loss": 0.253,
      "step": 1040
    },
    {
      "epoch": 1.3671875,
      "grad_norm": 0.298846960067749,
      "learning_rate": 0.00016104910714285714,
      "loss": 0.0666,
      "step": 1050
    },
    {
      "epoch": 1.3802083333333333,
      "grad_norm": 0.36731642484664917,
      "learning_rate": 0.00016067708333333334,
      "loss": 0.2507,
      "step": 1060
    },
    {
      "epoch": 1.3932291666666667,
      "grad_norm": 0.01786353439092636,
      "learning_rate": 0.00016030505952380954,
      "loss": 0.1932,
      "step": 1070
    },
    {
      "epoch": 1.40625,
      "grad_norm": 12.699788093566895,
      "learning_rate": 0.00015993303571428574,
      "loss": 0.055,
      "step": 1080
    },
    {
      "epoch": 1.4192708333333333,
      "grad_norm": 0.13174857199192047,
      "learning_rate": 0.0001595610119047619,
      "loss": 0.2443,
      "step": 1090
    },
    {
      "epoch": 1.4322916666666667,
      "grad_norm": 0.023631755262613297,
      "learning_rate": 0.0001591889880952381,
      "loss": 0.2368,
      "step": 1100
    },
    {
      "epoch": 1.4322916666666667,
      "eval_accuracy": 0.9305343511450381,
      "eval_f1": 0.9218890582250652,
      "eval_loss": 0.2996809780597687,
      "eval_runtime": 82.3375,
      "eval_samples_per_second": 15.91,
      "eval_steps_per_second": 1.992,
      "step": 1100
    },
    {
      "epoch": 1.4453125,
      "grad_norm": 23.099807739257812,
      "learning_rate": 0.00015881696428571429,
      "loss": 0.4036,
      "step": 1110
    },
    {
      "epoch": 1.4583333333333333,
      "grad_norm": 6.55803108215332,
      "learning_rate": 0.00015844494047619049,
      "loss": 0.1221,
      "step": 1120
    },
    {
      "epoch": 1.4713541666666667,
      "grad_norm": 0.007849316112697124,
      "learning_rate": 0.00015807291666666669,
      "loss": 0.0611,
      "step": 1130
    },
    {
      "epoch": 1.484375,
      "grad_norm": 11.71566390991211,
      "learning_rate": 0.00015770089285714286,
      "loss": 0.2897,
      "step": 1140
    },
    {
      "epoch": 1.4973958333333333,
      "grad_norm": 0.912572979927063,
      "learning_rate": 0.00015732886904761906,
      "loss": 0.3816,
      "step": 1150
    },
    {
      "epoch": 1.5104166666666665,
      "grad_norm": 13.910286903381348,
      "learning_rate": 0.00015695684523809523,
      "loss": 0.3181,
      "step": 1160
    },
    {
      "epoch": 1.5234375,
      "grad_norm": 5.9048285484313965,
      "learning_rate": 0.00015658482142857143,
      "loss": 0.2975,
      "step": 1170
    },
    {
      "epoch": 1.5364583333333335,
      "grad_norm": 0.6787735819816589,
      "learning_rate": 0.00015621279761904763,
      "loss": 0.2799,
      "step": 1180
    },
    {
      "epoch": 1.5494791666666665,
      "grad_norm": 6.673707962036133,
      "learning_rate": 0.00015584077380952383,
      "loss": 0.1498,
      "step": 1190
    },
    {
      "epoch": 1.5625,
      "grad_norm": 2.7397446632385254,
      "learning_rate": 0.00015546875,
      "loss": 0.0658,
      "step": 1200
    },
    {
      "epoch": 1.5625,
      "eval_accuracy": 0.9534351145038168,
      "eval_f1": 0.947306466301055,
      "eval_loss": 0.23241807520389557,
      "eval_runtime": 81.3852,
      "eval_samples_per_second": 16.096,
      "eval_steps_per_second": 2.015,
      "step": 1200
    },
    {
      "epoch": 1.5755208333333335,
      "grad_norm": 0.881657063961029,
      "learning_rate": 0.00015509672619047618,
      "loss": 0.1693,
      "step": 1210
    },
    {
      "epoch": 1.5885416666666665,
      "grad_norm": 12.591848373413086,
      "learning_rate": 0.00015472470238095238,
      "loss": 0.3892,
      "step": 1220
    },
    {
      "epoch": 1.6015625,
      "grad_norm": 0.16977375745773315,
      "learning_rate": 0.00015435267857142858,
      "loss": 0.1674,
      "step": 1230
    },
    {
      "epoch": 1.6145833333333335,
      "grad_norm": 0.5286293625831604,
      "learning_rate": 0.00015398065476190478,
      "loss": 0.195,
      "step": 1240
    },
    {
      "epoch": 1.6276041666666665,
      "grad_norm": 1.1396342515945435,
      "learning_rate": 0.00015360863095238096,
      "loss": 0.1445,
      "step": 1250
    },
    {
      "epoch": 1.640625,
      "grad_norm": 0.09726618230342865,
      "learning_rate": 0.00015323660714285716,
      "loss": 0.1403,
      "step": 1260
    },
    {
      "epoch": 1.6536458333333335,
      "grad_norm": 0.1769399642944336,
      "learning_rate": 0.00015286458333333333,
      "loss": 0.0544,
      "step": 1270
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.11187902092933655,
      "learning_rate": 0.00015249255952380953,
      "loss": 0.3287,
      "step": 1280
    },
    {
      "epoch": 1.6796875,
      "grad_norm": 8.724081993103027,
      "learning_rate": 0.00015212053571428573,
      "loss": 0.1115,
      "step": 1290
    },
    {
      "epoch": 1.6927083333333335,
      "grad_norm": 11.985766410827637,
      "learning_rate": 0.0001517485119047619,
      "loss": 0.0566,
      "step": 1300
    },
    {
      "epoch": 1.6927083333333335,
      "eval_accuracy": 0.917557251908397,
      "eval_f1": 0.9076922718710467,
      "eval_loss": 0.3443506956100464,
      "eval_runtime": 81.6243,
      "eval_samples_per_second": 16.049,
      "eval_steps_per_second": 2.009,
      "step": 1300
    },
    {
      "epoch": 1.7057291666666665,
      "grad_norm": 11.513726234436035,
      "learning_rate": 0.0001513764880952381,
      "loss": 0.3674,
      "step": 1310
    },
    {
      "epoch": 1.71875,
      "grad_norm": 9.249573707580566,
      "learning_rate": 0.00015100446428571428,
      "loss": 0.32,
      "step": 1320
    },
    {
      "epoch": 1.7317708333333335,
      "grad_norm": 7.928230285644531,
      "learning_rate": 0.00015063244047619048,
      "loss": 0.2356,
      "step": 1330
    },
    {
      "epoch": 1.7447916666666665,
      "grad_norm": 0.10742262750864029,
      "learning_rate": 0.00015026041666666668,
      "loss": 0.1413,
      "step": 1340
    },
    {
      "epoch": 1.7578125,
      "grad_norm": 1.2628021240234375,
      "learning_rate": 0.00014988839285714288,
      "loss": 0.0266,
      "step": 1350
    },
    {
      "epoch": 1.7708333333333335,
      "grad_norm": 1.3294028043746948,
      "learning_rate": 0.00014951636904761905,
      "loss": 0.058,
      "step": 1360
    },
    {
      "epoch": 1.7838541666666665,
      "grad_norm": 13.2035493850708,
      "learning_rate": 0.00014914434523809525,
      "loss": 0.365,
      "step": 1370
    },
    {
      "epoch": 1.796875,
      "grad_norm": 2.251077651977539,
      "learning_rate": 0.00014877232142857143,
      "loss": 0.0734,
      "step": 1380
    },
    {
      "epoch": 1.8098958333333335,
      "grad_norm": 0.023287804797291756,
      "learning_rate": 0.00014840029761904763,
      "loss": 0.0929,
      "step": 1390
    },
    {
      "epoch": 1.8229166666666665,
      "grad_norm": 8.798796653747559,
      "learning_rate": 0.00014802827380952383,
      "loss": 0.2437,
      "step": 1400
    },
    {
      "epoch": 1.8229166666666665,
      "eval_accuracy": 0.9435114503816794,
      "eval_f1": 0.9363223667968604,
      "eval_loss": 0.3032698631286621,
      "eval_runtime": 80.4388,
      "eval_samples_per_second": 16.286,
      "eval_steps_per_second": 2.039,
      "step": 1400
    },
    {
      "epoch": 1.8359375,
      "grad_norm": 9.36063003540039,
      "learning_rate": 0.00014765625,
      "loss": 0.0919,
      "step": 1410
    },
    {
      "epoch": 1.8489583333333335,
      "grad_norm": 0.35603514313697815,
      "learning_rate": 0.0001472842261904762,
      "loss": 0.0993,
      "step": 1420
    },
    {
      "epoch": 1.8619791666666665,
      "grad_norm": 14.949860572814941,
      "learning_rate": 0.00014691220238095237,
      "loss": 0.2426,
      "step": 1430
    },
    {
      "epoch": 1.875,
      "grad_norm": 5.062771797180176,
      "learning_rate": 0.00014654017857142858,
      "loss": 0.2083,
      "step": 1440
    },
    {
      "epoch": 1.8880208333333335,
      "grad_norm": 1.161504864692688,
      "learning_rate": 0.00014616815476190478,
      "loss": 0.0764,
      "step": 1450
    },
    {
      "epoch": 1.9010416666666665,
      "grad_norm": 8.974434852600098,
      "learning_rate": 0.00014579613095238095,
      "loss": 0.1749,
      "step": 1460
    },
    {
      "epoch": 1.9140625,
      "grad_norm": 2.4918391704559326,
      "learning_rate": 0.00014542410714285715,
      "loss": 0.1547,
      "step": 1470
    },
    {
      "epoch": 1.9270833333333335,
      "grad_norm": 1.259583830833435,
      "learning_rate": 0.00014505208333333332,
      "loss": 0.3021,
      "step": 1480
    },
    {
      "epoch": 1.9401041666666665,
      "grad_norm": 0.0064702448435127735,
      "learning_rate": 0.00014468005952380955,
      "loss": 0.1864,
      "step": 1490
    },
    {
      "epoch": 1.953125,
      "grad_norm": 0.37315505743026733,
      "learning_rate": 0.00014430803571428572,
      "loss": 0.1011,
      "step": 1500
    },
    {
      "epoch": 1.953125,
      "eval_accuracy": 0.9450381679389313,
      "eval_f1": 0.9330381042359441,
      "eval_loss": 0.2739551365375519,
      "eval_runtime": 81.5196,
      "eval_samples_per_second": 16.07,
      "eval_steps_per_second": 2.012,
      "step": 1500
    },
    {
      "epoch": 1.9661458333333335,
      "grad_norm": 7.149832248687744,
      "learning_rate": 0.00014393601190476192,
      "loss": 0.1863,
      "step": 1510
    },
    {
      "epoch": 1.9791666666666665,
      "grad_norm": 13.7689847946167,
      "learning_rate": 0.0001435639880952381,
      "loss": 0.1759,
      "step": 1520
    },
    {
      "epoch": 1.9921875,
      "grad_norm": 0.02422071062028408,
      "learning_rate": 0.0001431919642857143,
      "loss": 0.4282,
      "step": 1530
    },
    {
      "epoch": 2.0052083333333335,
      "grad_norm": 14.325161933898926,
      "learning_rate": 0.00014281994047619047,
      "loss": 0.1968,
      "step": 1540
    },
    {
      "epoch": 2.0182291666666665,
      "grad_norm": 2.962752103805542,
      "learning_rate": 0.00014244791666666667,
      "loss": 0.2029,
      "step": 1550
    },
    {
      "epoch": 2.03125,
      "grad_norm": 3.5664641857147217,
      "learning_rate": 0.00014207589285714287,
      "loss": 0.1359,
      "step": 1560
    },
    {
      "epoch": 2.0442708333333335,
      "grad_norm": 0.6715424656867981,
      "learning_rate": 0.00014170386904761905,
      "loss": 0.0789,
      "step": 1570
    },
    {
      "epoch": 2.0572916666666665,
      "grad_norm": 4.6800055503845215,
      "learning_rate": 0.00014133184523809525,
      "loss": 0.0545,
      "step": 1580
    },
    {
      "epoch": 2.0703125,
      "grad_norm": 1.1109752655029297,
      "learning_rate": 0.00014095982142857142,
      "loss": 0.1057,
      "step": 1590
    },
    {
      "epoch": 2.0833333333333335,
      "grad_norm": 14.279526710510254,
      "learning_rate": 0.00014058779761904765,
      "loss": 0.2987,
      "step": 1600
    },
    {
      "epoch": 2.0833333333333335,
      "eval_accuracy": 0.9488549618320611,
      "eval_f1": 0.9418813280523878,
      "eval_loss": 0.27153658866882324,
      "eval_runtime": 81.3659,
      "eval_samples_per_second": 16.1,
      "eval_steps_per_second": 2.016,
      "step": 1600
    },
    {
      "epoch": 2.0963541666666665,
      "grad_norm": 0.29308801889419556,
      "learning_rate": 0.00014021577380952382,
      "loss": 0.0385,
      "step": 1610
    },
    {
      "epoch": 2.109375,
      "grad_norm": 0.08972585946321487,
      "learning_rate": 0.00013984375,
      "loss": 0.2563,
      "step": 1620
    },
    {
      "epoch": 2.1223958333333335,
      "grad_norm": 4.163389682769775,
      "learning_rate": 0.0001394717261904762,
      "loss": 0.0729,
      "step": 1630
    },
    {
      "epoch": 2.1354166666666665,
      "grad_norm": 0.004595208913087845,
      "learning_rate": 0.00013909970238095237,
      "loss": 0.2001,
      "step": 1640
    },
    {
      "epoch": 2.1484375,
      "grad_norm": 16.369443893432617,
      "learning_rate": 0.0001387276785714286,
      "loss": 0.2567,
      "step": 1650
    },
    {
      "epoch": 2.1614583333333335,
      "grad_norm": 0.13416332006454468,
      "learning_rate": 0.00013835565476190477,
      "loss": 0.0102,
      "step": 1660
    },
    {
      "epoch": 2.1744791666666665,
| "grad_norm": 0.22717712819576263, | |
| "learning_rate": 0.00013798363095238097, | |
| "loss": 0.0984, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.1875, | |
| "grad_norm": 0.05834342539310455, | |
| "learning_rate": 0.00013761160714285714, | |
| "loss": 0.1726, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.2005208333333335, | |
| "grad_norm": 0.01719828136265278, | |
| "learning_rate": 0.00013723958333333334, | |
| "loss": 0.01, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.2135416666666665, | |
| "grad_norm": 0.015803588554263115, | |
| "learning_rate": 0.00013686755952380954, | |
| "loss": 0.0227, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.2135416666666665, | |
| "eval_accuracy": 0.9603053435114504, | |
| "eval_f1": 0.9562499649385449, | |
| "eval_loss": 0.20495426654815674, | |
| "eval_runtime": 81.7033, | |
| "eval_samples_per_second": 16.034, | |
| "eval_steps_per_second": 2.007, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.2265625, | |
| "grad_norm": 0.09097200632095337, | |
| "learning_rate": 0.00013649553571428572, | |
| "loss": 0.0364, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.2395833333333335, | |
| "grad_norm": 0.13292516767978668, | |
| "learning_rate": 0.00013612351190476192, | |
| "loss": 0.1408, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.2526041666666665, | |
| "grad_norm": 0.00954443123191595, | |
| "learning_rate": 0.0001357514880952381, | |
| "loss": 0.0485, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.265625, | |
| "grad_norm": 0.006970481481403112, | |
| "learning_rate": 0.0001353794642857143, | |
| "loss": 0.0769, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.2786458333333335, | |
| "grad_norm": 0.002196338726207614, | |
| "learning_rate": 0.00013500744047619046, | |
| "loss": 0.0844, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.2916666666666665, | |
| "grad_norm": 0.008257012814283371, | |
| "learning_rate": 0.0001346354166666667, | |
| "loss": 0.0291, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.3046875, | |
| "grad_norm": 0.0019565862603485584, | |
| "learning_rate": 0.00013426339285714287, | |
| "loss": 0.2234, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.3177083333333335, | |
| "grad_norm": 0.2584744095802307, | |
| "learning_rate": 0.00013389136904761907, | |
| "loss": 0.139, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.3307291666666665, | |
| "grad_norm": 10.60031795501709, | |
| "learning_rate": 0.00013351934523809524, | |
| "loss": 0.1197, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.34375, | |
| "grad_norm": 0.4733524024486542, | |
| "learning_rate": 0.0001331473214285714, | |
| "loss": 0.1891, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.34375, | |
| "eval_accuracy": 0.9541984732824428, | |
| "eval_f1": 0.949418067653998, | |
| "eval_loss": 0.20545873045921326, | |
| "eval_runtime": 81.2551, | |
| "eval_samples_per_second": 16.122, | |
| "eval_steps_per_second": 2.018, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.3567708333333335, | |
| "grad_norm": 0.9352707862854004, | |
| "learning_rate": 0.00013277529761904764, | |
| "loss": 0.0601, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.3697916666666665, | |
| "grad_norm": 0.035993486642837524, | |
| "learning_rate": 0.00013240327380952381, | |
| "loss": 0.0631, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.3828125, | |
| "grad_norm": 0.26035353541374207, | |
| "learning_rate": 0.00013203125000000001, | |
| "loss": 0.055, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.3958333333333335, | |
| "grad_norm": 0.1341722011566162, | |
| "learning_rate": 0.0001316592261904762, | |
| "loss": 0.0024, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.4088541666666665, | |
| "grad_norm": 0.3686296343803406, | |
| "learning_rate": 0.0001312872023809524, | |
| "loss": 0.0607, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.421875, | |
| "grad_norm": 0.2185925841331482, | |
| "learning_rate": 0.0001309151785714286, | |
| "loss": 0.179, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.4348958333333335, | |
| "grad_norm": 0.0028680674731731415, | |
| "learning_rate": 0.00013054315476190476, | |
| "loss": 0.0425, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.4479166666666665, | |
| "grad_norm": 14.099466323852539, | |
| "learning_rate": 0.00013017113095238096, | |
| "loss": 0.0506, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.4609375, | |
| "grad_norm": 6.842142105102539, | |
| "learning_rate": 0.00012979910714285714, | |
| "loss": 0.2308, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.4739583333333335, | |
| "grad_norm": 0.04054475575685501, | |
| "learning_rate": 0.00012942708333333334, | |
| "loss": 0.0325, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.4739583333333335, | |
| "eval_accuracy": 0.9625954198473282, | |
| "eval_f1": 0.9604463198147362, | |
| "eval_loss": 0.20703551173210144, | |
| "eval_runtime": 81.1143, | |
| "eval_samples_per_second": 16.15, | |
| "eval_steps_per_second": 2.022, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.4869791666666665, | |
| "grad_norm": 8.099538803100586, | |
| "learning_rate": 0.00012905505952380954, | |
| "loss": 0.2277, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 2.1970572471618652, | |
| "learning_rate": 0.00012868303571428574, | |
| "loss": 0.2454, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.5130208333333335, | |
| "grad_norm": 0.4118788540363312, | |
| "learning_rate": 0.0001283110119047619, | |
| "loss": 0.1846, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.5260416666666665, | |
| "grad_norm": 0.02326013147830963, | |
| "learning_rate": 0.0001279389880952381, | |
| "loss": 0.0227, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.5390625, | |
| "grad_norm": 1.2317097187042236, | |
| "learning_rate": 0.00012756696428571428, | |
| "loss": 0.1277, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.5520833333333335, | |
| "grad_norm": 0.013419517315924168, | |
| "learning_rate": 0.00012719494047619048, | |
| "loss": 0.0591, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.5651041666666665, | |
| "grad_norm": 0.023129452019929886, | |
| "learning_rate": 0.00012682291666666669, | |
| "loss": 0.0341, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.578125, | |
| "grad_norm": 0.15288208425045013, | |
| "learning_rate": 0.00012645089285714286, | |
| "loss": 0.1505, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.5911458333333335, | |
| "grad_norm": 1.5579731464385986, | |
| "learning_rate": 0.00012607886904761906, | |
| "loss": 0.174, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.6041666666666665, | |
| "grad_norm": 0.010995297692716122, | |
| "learning_rate": 0.00012570684523809523, | |
| "loss": 0.0407, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.6041666666666665, | |
| "eval_accuracy": 0.9610687022900763, | |
| "eval_f1": 0.9549819582767186, | |
| "eval_loss": 0.18762578070163727, | |
| "eval_runtime": 81.7819, | |
| "eval_samples_per_second": 16.018, | |
| "eval_steps_per_second": 2.005, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.6171875, | |
| "grad_norm": 0.09427079558372498, | |
| "learning_rate": 0.00012533482142857143, | |
| "loss": 0.0994, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.6302083333333335, | |
| "grad_norm": 0.2120276242494583, | |
| "learning_rate": 0.00012496279761904763, | |
| "loss": 0.0055, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.6432291666666665, | |
| "grad_norm": 18.408981323242188, | |
| "learning_rate": 0.0001245907738095238, | |
| "loss": 0.1705, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 2.65625, | |
| "grad_norm": 10.559670448303223, | |
| "learning_rate": 0.00012421875, | |
| "loss": 0.3207, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.6692708333333335, | |
| "grad_norm": 11.863653182983398, | |
| "learning_rate": 0.00012384672619047618, | |
| "loss": 0.1375, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 2.6822916666666665, | |
| "grad_norm": 0.012622825801372528, | |
| "learning_rate": 0.00012347470238095238, | |
| "loss": 0.0465, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.6953125, | |
| "grad_norm": 0.005969795864075422, | |
| "learning_rate": 0.00012310267857142858, | |
| "loss": 0.0451, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.7083333333333335, | |
| "grad_norm": 1.3794277906417847, | |
| "learning_rate": 0.00012273065476190478, | |
| "loss": 0.0026, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.7213541666666665, | |
| "grad_norm": 0.5586200952529907, | |
| "learning_rate": 0.00012235863095238096, | |
| "loss": 0.0032, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.734375, | |
| "grad_norm": 0.007813535630702972, | |
| "learning_rate": 0.00012198660714285714, | |
| "loss": 0.0112, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.734375, | |
| "eval_accuracy": 0.9748091603053435, | |
| "eval_f1": 0.9718675382934706, | |
| "eval_loss": 0.1701812744140625, | |
| "eval_runtime": 80.7927, | |
| "eval_samples_per_second": 16.214, | |
| "eval_steps_per_second": 2.03, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.7473958333333335, | |
| "grad_norm": 0.031850416213274, | |
| "learning_rate": 0.00012161458333333333, | |
| "loss": 0.1931, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.7604166666666665, | |
| "grad_norm": 0.9639140963554382, | |
| "learning_rate": 0.00012124255952380954, | |
| "loss": 0.0105, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.7734375, | |
| "grad_norm": 0.2940481901168823, | |
| "learning_rate": 0.00012087053571428573, | |
| "loss": 0.1237, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.7864583333333335, | |
| "grad_norm": 0.5645309090614319, | |
| "learning_rate": 0.00012049851190476192, | |
| "loss": 0.2693, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.7994791666666665, | |
| "grad_norm": 0.004514560103416443, | |
| "learning_rate": 0.0001201264880952381, | |
| "loss": 0.0083, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.8125, | |
| "grad_norm": 11.058444023132324, | |
| "learning_rate": 0.00011975446428571429, | |
| "loss": 0.1717, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.8255208333333335, | |
| "grad_norm": 7.021084785461426, | |
| "learning_rate": 0.00011938244047619048, | |
| "loss": 0.0942, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.8385416666666665, | |
| "grad_norm": 0.16477301716804504, | |
| "learning_rate": 0.00011901041666666668, | |
| "loss": 0.0023, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.8515625, | |
| "grad_norm": 6.9443440437316895, | |
| "learning_rate": 0.00011863839285714287, | |
| "loss": 0.0194, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.8645833333333335, | |
| "grad_norm": 7.429518699645996, | |
| "learning_rate": 0.00011826636904761905, | |
| "loss": 0.112, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.8645833333333335, | |
| "eval_accuracy": 0.9656488549618321, | |
| "eval_f1": 0.962369507098473, | |
| "eval_loss": 0.16947515308856964, | |
| "eval_runtime": 81.2806, | |
| "eval_samples_per_second": 16.117, | |
| "eval_steps_per_second": 2.018, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.8776041666666665, | |
| "grad_norm": 0.24400918185710907, | |
| "learning_rate": 0.00011789434523809524, | |
| "loss": 0.2001, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.890625, | |
| "grad_norm": 31.43627166748047, | |
| "learning_rate": 0.00011752232142857143, | |
| "loss": 0.049, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.9036458333333335, | |
| "grad_norm": 0.16998760402202606, | |
| "learning_rate": 0.00011715029761904763, | |
| "loss": 0.0652, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.9166666666666665, | |
| "grad_norm": 0.0031110956333577633, | |
| "learning_rate": 0.00011677827380952381, | |
| "loss": 0.1108, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.9296875, | |
| "grad_norm": 0.47696933150291443, | |
| "learning_rate": 0.00011640625, | |
| "loss": 0.0641, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.9427083333333335, | |
| "grad_norm": 0.011921875178813934, | |
| "learning_rate": 0.00011603422619047619, | |
| "loss": 0.0573, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.9557291666666665, | |
| "grad_norm": 0.21888484060764313, | |
| "learning_rate": 0.00011566220238095237, | |
| "loss": 0.0947, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.96875, | |
| "grad_norm": 0.09869220852851868, | |
| "learning_rate": 0.00011529017857142859, | |
| "loss": 0.1122, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.9817708333333335, | |
| "grad_norm": 0.002251110738143325, | |
| "learning_rate": 0.00011491815476190478, | |
| "loss": 0.0506, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.9947916666666665, | |
| "grad_norm": 10.688546180725098, | |
| "learning_rate": 0.00011454613095238096, | |
| "loss": 0.184, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.9947916666666665, | |
| "eval_accuracy": 0.9625954198473282, | |
| "eval_f1": 0.9589633309666014, | |
| "eval_loss": 0.20880243182182312, | |
| "eval_runtime": 80.8948, | |
| "eval_samples_per_second": 16.194, | |
| "eval_steps_per_second": 2.027, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 3.0078125, | |
| "grad_norm": 0.005016920156776905, | |
| "learning_rate": 0.00011417410714285715, | |
| "loss": 0.0091, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 3.0208333333333335, | |
| "grad_norm": 0.006470136810094118, | |
| "learning_rate": 0.00011380208333333334, | |
| "loss": 0.0253, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 3.0338541666666665, | |
| "grad_norm": 0.005450215190649033, | |
| "learning_rate": 0.00011343005952380954, | |
| "loss": 0.0229, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 3.046875, | |
| "grad_norm": 0.12995079159736633, | |
| "learning_rate": 0.00011305803571428572, | |
| "loss": 0.0369, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 3.0598958333333335, | |
| "grad_norm": 0.01697579026222229, | |
| "learning_rate": 0.00011268601190476191, | |
| "loss": 0.061, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 3.0729166666666665, | |
| "grad_norm": 0.001176235731691122, | |
| "learning_rate": 0.0001123139880952381, | |
| "loss": 0.0445, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 3.0859375, | |
| "grad_norm": 0.0572282038629055, | |
| "learning_rate": 0.00011194196428571428, | |
| "loss": 0.0461, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 3.0989583333333335, | |
| "grad_norm": 0.15515916049480438, | |
| "learning_rate": 0.00011156994047619047, | |
| "loss": 0.0025, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 3.1119791666666665, | |
| "grad_norm": 0.2916058599948883, | |
| "learning_rate": 0.00011119791666666669, | |
| "loss": 0.0205, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 3.125, | |
| "grad_norm": 5.928340911865234, | |
| "learning_rate": 0.00011082589285714287, | |
| "loss": 0.0464, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 3.125, | |
| "eval_accuracy": 0.9656488549618321, | |
| "eval_f1": 0.9613338775306285, | |
| "eval_loss": 0.18046243488788605, | |
| "eval_runtime": 81.8318, | |
| "eval_samples_per_second": 16.008, | |
| "eval_steps_per_second": 2.004, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 3.1380208333333335, | |
| "grad_norm": 0.00031931319972500205, | |
| "learning_rate": 0.00011045386904761905, | |
| "loss": 0.0839, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 3.1510416666666665, | |
| "grad_norm": 0.006044325418770313, | |
| "learning_rate": 0.00011008184523809523, | |
| "loss": 0.002, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 3.1640625, | |
| "grad_norm": 3.1625492572784424, | |
| "learning_rate": 0.00010970982142857142, | |
| "loss": 0.1109, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 3.1770833333333335, | |
| "grad_norm": 0.016439249739050865, | |
| "learning_rate": 0.00010933779761904763, | |
| "loss": 0.1323, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 3.1901041666666665, | |
| "grad_norm": 0.06573644280433655, | |
| "learning_rate": 0.00010896577380952382, | |
| "loss": 0.0047, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 3.203125, | |
| "grad_norm": 0.018080944195389748, | |
| "learning_rate": 0.00010859375000000001, | |
| "loss": 0.0344, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 3.2161458333333335, | |
| "grad_norm": 0.20206616818904877, | |
| "learning_rate": 0.0001082217261904762, | |
| "loss": 0.013, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 3.2291666666666665, | |
| "grad_norm": 0.003558138618245721, | |
| "learning_rate": 0.00010784970238095238, | |
| "loss": 0.0084, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 3.2421875, | |
| "grad_norm": 0.0043561323545873165, | |
| "learning_rate": 0.00010747767857142858, | |
| "loss": 0.0302, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 3.2552083333333335, | |
| "grad_norm": 3.4022269248962402, | |
| "learning_rate": 0.00010710565476190477, | |
| "loss": 0.0794, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 3.2552083333333335, | |
| "eval_accuracy": 0.9633587786259542, | |
| "eval_f1": 0.9607579593408613, | |
| "eval_loss": 0.2089044451713562, | |
| "eval_runtime": 81.806, | |
| "eval_samples_per_second": 16.013, | |
| "eval_steps_per_second": 2.005, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 3.2682291666666665, | |
| "grad_norm": 0.0350501574575901, | |
| "learning_rate": 0.00010673363095238096, | |
| "loss": 0.0591, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 3.28125, | |
| "grad_norm": 0.003951572813093662, | |
| "learning_rate": 0.00010636160714285714, | |
| "loss": 0.1486, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 3.2942708333333335, | |
| "grad_norm": 3.3104865550994873, | |
| "learning_rate": 0.00010598958333333333, | |
| "loss": 0.0101, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 3.3072916666666665, | |
| "grad_norm": 0.09495699405670166, | |
| "learning_rate": 0.00010561755952380954, | |
| "loss": 0.0118, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 3.3203125, | |
| "grad_norm": 0.056691791862249374, | |
| "learning_rate": 0.00010524553571428573, | |
| "loss": 0.0431, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 7.412529468536377, | |
| "learning_rate": 0.00010487351190476192, | |
| "loss": 0.2413, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 3.3463541666666665, | |
| "grad_norm": 0.3181236982345581, | |
| "learning_rate": 0.0001045014880952381, | |
| "loss": 0.0183, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 3.359375, | |
| "grad_norm": 0.8227505087852478, | |
| "learning_rate": 0.00010412946428571429, | |
| "loss": 0.0288, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 3.3723958333333335, | |
| "grad_norm": 11.943220138549805, | |
| "learning_rate": 0.00010375744047619046, | |
| "loss": 0.0515, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 3.3854166666666665, | |
| "grad_norm": 0.01074813213199377, | |
| "learning_rate": 0.00010338541666666668, | |
| "loss": 0.0033, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 3.3854166666666665, | |
| "eval_accuracy": 0.9603053435114504, | |
| "eval_f1": 0.9622549804604048, | |
| "eval_loss": 0.2128184586763382, | |
| "eval_runtime": 81.829, | |
| "eval_samples_per_second": 16.009, | |
| "eval_steps_per_second": 2.004, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 3.3984375, | |
| "grad_norm": 5.348459243774414, | |
| "learning_rate": 0.00010301339285714286, | |
| "loss": 0.2931, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 3.4114583333333335, | |
| "grad_norm": 0.07041466236114502, | |
| "learning_rate": 0.00010264136904761905, | |
| "loss": 0.0137, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 3.4244791666666665, | |
| "grad_norm": 0.13537417352199554, | |
| "learning_rate": 0.00010226934523809524, | |
| "loss": 0.0462, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 3.4375, | |
| "grad_norm": 1.6239345073699951, | |
| "learning_rate": 0.00010189732142857143, | |
| "loss": 0.0083, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 3.4505208333333335, | |
| "grad_norm": 0.0702078565955162, | |
| "learning_rate": 0.00010152529761904763, | |
| "loss": 0.1283, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 3.4635416666666665, | |
| "grad_norm": 0.022191686555743217, | |
| "learning_rate": 0.00010115327380952381, | |
| "loss": 0.0884, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 3.4765625, | |
| "grad_norm": 0.04772581532597542, | |
| "learning_rate": 0.00010078125, | |
| "loss": 0.1509, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 3.4895833333333335, | |
| "grad_norm": 0.08327681571245193, | |
| "learning_rate": 0.00010040922619047619, | |
| "loss": 0.0369, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 3.5026041666666665, | |
| "grad_norm": 0.0018887658370658755, | |
| "learning_rate": 0.00010003720238095237, | |
| "loss": 0.0024, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 3.515625, | |
| "grad_norm": 0.15314579010009766, | |
| "learning_rate": 9.966517857142857e-05, | |
| "loss": 0.0422, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 3.515625, | |
| "eval_accuracy": 0.9702290076335878, | |
| "eval_f1": 0.9700942338753799, | |
| "eval_loss": 0.13781629502773285, | |
| "eval_runtime": 81.7577, | |
| "eval_samples_per_second": 16.023, | |
| "eval_steps_per_second": 2.006, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 3.5286458333333335, | |
| "grad_norm": 0.006440164055675268, | |
| "learning_rate": 9.929315476190477e-05, | |
| "loss": 0.0047, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 3.5416666666666665, | |
| "grad_norm": 0.04540511220693588, | |
| "learning_rate": 9.892113095238096e-05, | |
| "loss": 0.0193, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 3.5546875, | |
| "grad_norm": 0.061146996915340424, | |
| "learning_rate": 9.854910714285715e-05, | |
| "loss": 0.1849, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 3.5677083333333335, | |
| "grad_norm": 0.2543550729751587, | |
| "learning_rate": 9.817708333333334e-05, | |
| "loss": 0.003, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 3.5807291666666665, | |
| "grad_norm": 0.057148758322000504, | |
| "learning_rate": 9.780505952380952e-05, | |
| "loss": 0.2103, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 3.59375, | |
| "grad_norm": 0.024192342534661293, | |
| "learning_rate": 9.743303571428571e-05, | |
| "loss": 0.0247, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 3.6067708333333335, | |
| "grad_norm": 0.0015990460524335504, | |
| "learning_rate": 9.706101190476191e-05, | |
| "loss": 0.1191, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 3.6197916666666665, | |
| "grad_norm": 0.009267842397093773, | |
| "learning_rate": 9.66889880952381e-05, | |
| "loss": 0.0816, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 3.6328125, | |
| "grad_norm": 0.10694319754838943, | |
| "learning_rate": 9.63169642857143e-05, | |
| "loss": 0.0076, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 3.6458333333333335, | |
| "grad_norm": 3.6166841983795166, | |
| "learning_rate": 9.594494047619048e-05, | |
| "loss": 0.2038, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 3.6458333333333335, | |
| "eval_accuracy": 0.9687022900763359, | |
| "eval_f1": 0.968490249375277, | |
| "eval_loss": 0.16744300723075867, | |
| "eval_runtime": 81.7922, | |
| "eval_samples_per_second": 16.016, | |
| "eval_steps_per_second": 2.005, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 3.6588541666666665, | |
| "grad_norm": 0.0016034733271226287, | |
| "learning_rate": 9.557291666666667e-05, | |
| "loss": 0.0345, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 3.671875, | |
| "grad_norm": 0.003382716327905655, | |
| "learning_rate": 9.520089285714287e-05, | |
| "loss": 0.037, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 3.6848958333333335, | |
| "grad_norm": 0.02836257964372635, | |
| "learning_rate": 9.482886904761904e-05, | |
| "loss": 0.1927, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 3.6979166666666665, | |
| "grad_norm": 5.114245414733887, | |
| "learning_rate": 9.445684523809523e-05, | |
| "loss": 0.0824, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 3.7109375, | |
| "grad_norm": 9.979170799255371, | |
| "learning_rate": 9.408482142857143e-05, | |
| "loss": 0.2195, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 3.7239583333333335, | |
| "grad_norm": 0.0006564153009094298, | |
| "learning_rate": 9.371279761904762e-05, | |
| "loss": 0.0332, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 3.7369791666666665, | |
| "grad_norm": 0.30073022842407227, | |
| "learning_rate": 9.334077380952382e-05, | |
| "loss": 0.0343, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 1.1301170587539673, | |
| "learning_rate": 9.296875e-05, | |
| "loss": 0.0483, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 3.7630208333333335, | |
| "grad_norm": 0.2611207962036133, | |
| "learning_rate": 9.25967261904762e-05, | |
| "loss": 0.0063, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 3.7760416666666665, | |
| "grad_norm": 0.0027096800040453672, | |
| "learning_rate": 9.22247023809524e-05, | |
| "loss": 0.0156, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 3.7760416666666665, | |
| "eval_accuracy": 0.9755725190839695, | |
| "eval_f1": 0.9757719352570504, | |
| "eval_loss": 0.13834095001220703, | |
| "eval_runtime": 82.0543, | |
| "eval_samples_per_second": 15.965, | |
| "eval_steps_per_second": 1.999, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 3.7890625, | |
| "grad_norm": 0.007965387776494026, | |
| "learning_rate": 9.185267857142858e-05, | |
| "loss": 0.0148, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 3.8020833333333335, | |
| "grad_norm": 0.2072007954120636, | |
| "learning_rate": 9.148065476190477e-05, | |
| "loss": 0.0876, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 3.8151041666666665, | |
| "grad_norm": 0.0005560634890571237, | |
| "learning_rate": 9.110863095238095e-05, | |
| "loss": 0.0007, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 3.828125, | |
| "grad_norm": 0.008656589314341545, | |
| "learning_rate": 9.073660714285714e-05, | |
| "loss": 0.0183, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 3.8411458333333335, | |
| "grad_norm": 2.0654067993164062, | |
| "learning_rate": 9.036458333333334e-05, | |
| "loss": 0.0039, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 3.8541666666666665, | |
| "grad_norm": 0.034493111073970795, | |
| "learning_rate": 8.999255952380953e-05, | |
| "loss": 0.0004, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 3.8671875, | |
| "grad_norm": 0.0011976959649473429, | |
| "learning_rate": 8.962053571428572e-05, | |
| "loss": 0.1166, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 3.8802083333333335, | |
| "grad_norm": 0.018453743308782578, | |
| "learning_rate": 8.924851190476192e-05, | |
| "loss": 0.0012, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 3.8932291666666665, | |
| "grad_norm": 0.03002144768834114, | |
| "learning_rate": 8.88764880952381e-05, | |
| "loss": 0.025, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 3.90625, | |
| "grad_norm": 0.009749704040586948, | |
| "learning_rate": 8.850446428571429e-05, | |
| "loss": 0.0004, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 3.90625, | |
| "eval_accuracy": 0.9732824427480916, | |
| "eval_f1": 0.9714599830496027, | |
| "eval_loss": 0.15442687273025513, | |
| "eval_runtime": 81.2184, | |
| "eval_samples_per_second": 16.129, | |
| "eval_steps_per_second": 2.019, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 3.9192708333333335, | |
| "grad_norm": 0.004181410651654005, | |
| "learning_rate": 8.813244047619048e-05, | |
| "loss": 0.0001, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 3.9322916666666665, | |
| "grad_norm": 0.0011721157934516668, | |
| "learning_rate": 8.776041666666666e-05, | |
| "loss": 0.0721, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 3.9453125, | |
| "grad_norm": 0.052945539355278015, | |
| "learning_rate": 8.738839285714286e-05, | |
| "loss": 0.0445, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 3.9583333333333335, | |
| "grad_norm": 0.0015457631088793278, | |
| "learning_rate": 8.701636904761905e-05, | |
| "loss": 0.0035, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 3.9713541666666665, | |
| "grad_norm": 0.004201805219054222, | |
| "learning_rate": 8.664434523809524e-05, | |
| "loss": 0.0395, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 3.984375, | |
| "grad_norm": 0.06384053826332092, | |
| "learning_rate": 8.627232142857144e-05, | |
| "loss": 0.0002, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 3.9973958333333335, | |
| "grad_norm": 0.0068823350593447685, | |
| "learning_rate": 8.590029761904763e-05, | |
| "loss": 0.0102, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 4.010416666666667, | |
| "grad_norm": 0.0019308422924950719, | |
| "learning_rate": 8.552827380952381e-05, | |
| "loss": 0.1089, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 4.0234375, | |
| "grad_norm": 0.004668346140533686, | |
| "learning_rate": 8.515625e-05, | |
| "loss": 0.017, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 4.036458333333333, | |
| "grad_norm": 0.029917776584625244, | |
| "learning_rate": 8.478422619047619e-05, | |
| "loss": 0.002, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 4.036458333333333, | |
| "eval_accuracy": 0.9709923664122138, | |
| "eval_f1": 0.9690154918238618, | |
| "eval_loss": 0.15522035956382751, | |
| "eval_runtime": 81.6347, | |
| "eval_samples_per_second": 16.047, | |
| "eval_steps_per_second": 2.009, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 4.049479166666667, | |
| "grad_norm": 0.0026706114877015352, | |
| "learning_rate": 8.441220238095239e-05, | |
| "loss": 0.0122, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 4.0625, | |
| "grad_norm": 0.0022576150950044394, | |
| "learning_rate": 8.404017857142857e-05, | |
| "loss": 0.033, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 4.075520833333333, | |
| "grad_norm": 0.0022961096838116646, | |
| "learning_rate": 8.366815476190477e-05, | |
| "loss": 0.0011, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 4.088541666666667, | |
| "grad_norm": 0.009144813753664494, | |
| "learning_rate": 8.329613095238096e-05, | |
| "loss": 0.1056, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 4.1015625, | |
| "grad_norm": 0.002261270536109805, | |
| "learning_rate": 8.292410714285715e-05, | |
| "loss": 0.0001, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 4.114583333333333, | |
| "grad_norm": 0.0007401434704661369, | |
| "learning_rate": 8.255208333333334e-05, | |
| "loss": 0.0542, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 4.127604166666667, | |
| "grad_norm": 0.00339931040070951, | |
| "learning_rate": 8.218005952380952e-05, | |
| "loss": 0.0664, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 4.140625, | |
| "grad_norm": 0.19860436022281647, | |
| "learning_rate": 8.180803571428571e-05, | |
| "loss": 0.0326, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 4.153645833333333, | |
| "grad_norm": 0.004367632791399956, | |
| "learning_rate": 8.143601190476191e-05, | |
| "loss": 0.0002, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 4.166666666666667, | |
| "grad_norm": 0.010312506929039955, | |
| "learning_rate": 8.10639880952381e-05, | |
| "loss": 0.0405, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 4.166666666666667, | |
| "eval_accuracy": 0.9763358778625955, | |
| "eval_f1": 0.975096407506889, | |
| "eval_loss": 0.1325622797012329, | |
| "eval_runtime": 82.0572, | |
| "eval_samples_per_second": 15.964, | |
| "eval_steps_per_second": 1.999, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 4.1796875, | |
| "grad_norm": 0.010638444684445858, | |
| "learning_rate": 8.06919642857143e-05, | |
| "loss": 0.0004, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 4.192708333333333, | |
| "grad_norm": 0.0020264596678316593, | |
| "learning_rate": 8.031994047619048e-05, | |
| "loss": 0.0505, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 4.205729166666667, | |
| "grad_norm": 0.04995977506041527, | |
| "learning_rate": 7.994791666666667e-05, | |
| "loss": 0.0018, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 4.21875, | |
| "grad_norm": 0.01499191764742136, | |
| "learning_rate": 7.957589285714286e-05, | |
| "loss": 0.0014, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 4.231770833333333, | |
| "grad_norm": 0.03724166750907898, | |
| "learning_rate": 7.920386904761904e-05, | |
| "loss": 0.0005, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 4.244791666666667, | |
| "grad_norm": 12.521564483642578, | |
| "learning_rate": 7.883184523809523e-05, | |
| "loss": 0.0248, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 4.2578125, | |
| "grad_norm": 0.004707773681730032, | |
| "learning_rate": 7.845982142857143e-05, | |
| "loss": 0.001, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 4.270833333333333, | |
| "grad_norm": 0.0035837057512253523, | |
| "learning_rate": 7.808779761904762e-05, | |
| "loss": 0.0057, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 4.283854166666667, | |
| "grad_norm": 0.2490946650505066, | |
| "learning_rate": 7.771577380952382e-05, | |
| "loss": 0.0008, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 4.296875, | |
| "grad_norm": 0.003916221205145121, | |
| "learning_rate": 7.734375e-05, | |
| "loss": 0.0031, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 4.296875, | |
| "eval_accuracy": 0.9755725190839695, | |
| "eval_f1": 0.9758721105664637, | |
| "eval_loss": 0.1436639279127121, | |
| "eval_runtime": 79.9301, | |
| "eval_samples_per_second": 16.389, | |
| "eval_steps_per_second": 2.052, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 4.309895833333333, | |
| "grad_norm": 0.09141277521848679, | |
| "learning_rate": 7.69717261904762e-05, | |
| "loss": 0.0025, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 4.322916666666667, | |
| "grad_norm": 0.00042766123078763485, | |
| "learning_rate": 7.65997023809524e-05, | |
| "loss": 0.0008, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 4.3359375, | |
| "grad_norm": 0.03846605867147446, | |
| "learning_rate": 7.622767857142857e-05, | |
| "loss": 0.0796, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 4.348958333333333, | |
| "grad_norm": 0.0002219278976554051, | |
| "learning_rate": 7.585565476190477e-05, | |
| "loss": 0.0002, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 4.361979166666667, | |
| "grad_norm": 0.0005795376491732895, | |
| "learning_rate": 7.548363095238095e-05, | |
| "loss": 0.1919, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 4.375, | |
| "grad_norm": 0.00021400375408120453, | |
| "learning_rate": 7.511160714285714e-05, | |
| "loss": 0.024, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 4.388020833333333, | |
| "grad_norm": 0.0010912902653217316, | |
| "learning_rate": 7.473958333333334e-05, | |
| "loss": 0.0019, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 4.401041666666667, | |
| "grad_norm": 0.0013969383435323834, | |
| "learning_rate": 7.436755952380953e-05, | |
| "loss": 0.0005, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 4.4140625, | |
| "grad_norm": 0.0073776631616055965, | |
| "learning_rate": 7.399553571428572e-05, | |
| "loss": 0.04, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 4.427083333333333, | |
| "grad_norm": 0.9691316485404968, | |
| "learning_rate": 7.362351190476192e-05, | |
| "loss": 0.0022, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 4.427083333333333, | |
| "eval_accuracy": 0.9793893129770992, | |
| "eval_f1": 0.9790343360825328, | |
| "eval_loss": 0.13159744441509247, | |
| "eval_runtime": 80.1372, | |
| "eval_samples_per_second": 16.347, | |
| "eval_steps_per_second": 2.046, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 4.440104166666667, | |
| "grad_norm": 0.0026658005081117153, | |
| "learning_rate": 7.32514880952381e-05, | |
| "loss": 0.0293, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 4.453125, | |
| "grad_norm": 0.0003751592885237187, | |
| "learning_rate": 7.287946428571429e-05, | |
| "loss": 0.0048, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 4.466145833333333, | |
| "grad_norm": 0.000976251030806452, | |
| "learning_rate": 7.250744047619048e-05, | |
| "loss": 0.0006, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 4.479166666666667, | |
| "grad_norm": 0.0021119702141731977, | |
| "learning_rate": 7.213541666666666e-05, | |
| "loss": 0.0003, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 4.4921875, | |
| "grad_norm": 0.000726861588191241, | |
| "learning_rate": 7.176339285714286e-05, | |
| "loss": 0.0807, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 4.505208333333333, | |
| "grad_norm": 3.3035876750946045, | |
| "learning_rate": 7.139136904761905e-05, | |
| "loss": 0.0029, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 4.518229166666667, | |
| "grad_norm": 0.0007139093359000981, | |
| "learning_rate": 7.101934523809524e-05, | |
| "loss": 0.0007, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 4.53125, | |
| "grad_norm": 0.0011296101147308946, | |
| "learning_rate": 7.064732142857144e-05, | |
| "loss": 0.0031, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 4.544270833333333, | |
| "grad_norm": 0.16946792602539062, | |
| "learning_rate": 7.027529761904763e-05, | |
| "loss": 0.1019, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 4.557291666666667, | |
| "grad_norm": 0.15916898846626282, | |
| "learning_rate": 6.990327380952381e-05, | |
| "loss": 0.0019, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 4.557291666666667, | |
| "eval_accuracy": 0.9809160305343512, | |
| "eval_f1": 0.9799064143413599, | |
| "eval_loss": 0.12329943478107452, | |
| "eval_runtime": 80.3582, | |
| "eval_samples_per_second": 16.302, | |
| "eval_steps_per_second": 2.041, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 4.5703125, | |
| "grad_norm": 0.0023877692874521017, | |
| "learning_rate": 6.953125e-05, | |
| "loss": 0.0025, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 4.583333333333333, | |
| "grad_norm": 0.0013898513279855251, | |
| "learning_rate": 6.915922619047619e-05, | |
| "loss": 0.0001, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 4.596354166666667, | |
| "grad_norm": 6.135566234588623, | |
| "learning_rate": 6.878720238095239e-05, | |
| "loss": 0.0097, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 4.609375, | |
| "grad_norm": 0.07613278925418854, | |
| "learning_rate": 6.841517857142857e-05, | |
| "loss": 0.0245, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 4.622395833333333, | |
| "grad_norm": 7.778736591339111, | |
| "learning_rate": 6.804315476190477e-05, | |
| "loss": 0.1511, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 4.635416666666667, | |
| "grad_norm": 0.009522577747702599, | |
| "learning_rate": 6.767113095238096e-05, | |
| "loss": 0.0159, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 4.6484375, | |
| "grad_norm": 0.07819275557994843, | |
| "learning_rate": 6.729910714285715e-05, | |
| "loss": 0.0005, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 4.661458333333333, | |
| "grad_norm": 0.0013676213566213846, | |
| "learning_rate": 6.692708333333333e-05, | |
| "loss": 0.0013, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 4.674479166666667, | |
| "grad_norm": 0.006399152334779501, | |
| "learning_rate": 6.655505952380952e-05, | |
| "loss": 0.0007, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 4.6875, | |
| "grad_norm": 0.11802547425031662, | |
| "learning_rate": 6.618303571428571e-05, | |
| "loss": 0.0005, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 4.6875, | |
| "eval_accuracy": 0.9770992366412213, | |
| "eval_f1": 0.9762889754093934, | |
| "eval_loss": 0.14002743363380432, | |
| "eval_runtime": 80.3556, | |
| "eval_samples_per_second": 16.303, | |
| "eval_steps_per_second": 2.041, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 4.700520833333333, | |
| "grad_norm": 0.006584244314581156, | |
| "learning_rate": 6.581101190476191e-05, | |
| "loss": 0.0081, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 4.713541666666667, | |
| "grad_norm": 0.00923603679984808, | |
| "learning_rate": 6.54389880952381e-05, | |
| "loss": 0.1082, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 4.7265625, | |
| "grad_norm": 0.0013322306331247091, | |
| "learning_rate": 6.50669642857143e-05, | |
| "loss": 0.007, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 4.739583333333333, | |
| "grad_norm": 0.008299018256366253, | |
| "learning_rate": 6.469494047619048e-05, | |
| "loss": 0.0012, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 4.752604166666667, | |
| "grad_norm": 0.0036693657748401165, | |
| "learning_rate": 6.432291666666667e-05, | |
| "loss": 0.0203, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 4.765625, | |
| "grad_norm": 0.04534276947379112, | |
| "learning_rate": 6.395089285714286e-05, | |
| "loss": 0.0003, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 4.778645833333333, | |
| "grad_norm": 0.0026455000042915344, | |
| "learning_rate": 6.357886904761904e-05, | |
| "loss": 0.0236, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 4.791666666666667, | |
| "grad_norm": 0.02503148838877678, | |
| "learning_rate": 6.320684523809523e-05, | |
| "loss": 0.0002, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 4.8046875, | |
| "grad_norm": 0.00035225218744017184, | |
| "learning_rate": 6.283482142857143e-05, | |
| "loss": 0.0016, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 4.817708333333333, | |
| "grad_norm": 0.0013643777929246426, | |
| "learning_rate": 6.246279761904762e-05, | |
| "loss": 0.0002, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 4.817708333333333, | |
| "eval_accuracy": 0.9793893129770992, | |
| "eval_f1": 0.9796748878572995, | |
| "eval_loss": 0.13390180468559265, | |
| "eval_runtime": 79.2151, | |
| "eval_samples_per_second": 16.537, | |
| "eval_steps_per_second": 2.07, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 4.830729166666667, | |
| "grad_norm": 0.016638770699501038, | |
| "learning_rate": 6.209077380952382e-05, | |
| "loss": 0.0002, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 4.84375, | |
| "grad_norm": 0.18891775608062744, | |
| "learning_rate": 6.171875e-05, | |
| "loss": 0.0008, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 4.856770833333333, | |
| "grad_norm": 0.028085488826036453, | |
| "learning_rate": 6.134672619047619e-05, | |
| "loss": 0.0002, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 4.869791666666667, | |
| "grad_norm": 0.000595388759393245, | |
| "learning_rate": 6.097470238095239e-05, | |
| "loss": 0.0734, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 4.8828125, | |
| "grad_norm": 0.011049636639654636, | |
| "learning_rate": 6.0602678571428574e-05, | |
| "loss": 0.004, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 4.895833333333333, | |
| "grad_norm": 0.008435525931417942, | |
| "learning_rate": 6.023065476190477e-05, | |
| "loss": 0.0612, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 4.908854166666667, | |
| "grad_norm": 0.0031228559091687202, | |
| "learning_rate": 5.9858630952380954e-05, | |
| "loss": 0.0278, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 4.921875, | |
| "grad_norm": 0.0127940122038126, | |
| "learning_rate": 5.948660714285714e-05, | |
| "loss": 0.0001, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 4.934895833333333, | |
| "grad_norm": 0.15452243387699127, | |
| "learning_rate": 5.911458333333334e-05, | |
| "loss": 0.0014, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 4.947916666666667, | |
| "grad_norm": 5.700990676879883, | |
| "learning_rate": 5.874255952380953e-05, | |
| "loss": 0.0304, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 4.947916666666667, | |
| "eval_accuracy": 0.9793893129770992, | |
| "eval_f1": 0.9795123034118706, | |
| "eval_loss": 0.1468745619058609, | |
| "eval_runtime": 80.6461, | |
| "eval_samples_per_second": 16.244, | |
| "eval_steps_per_second": 2.034, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 4.9609375, | |
| "grad_norm": 0.0008812499581836164, | |
| "learning_rate": 5.8370535714285715e-05, | |
| "loss": 0.0004, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 4.973958333333333, | |
| "grad_norm": 0.009404584765434265, | |
| "learning_rate": 5.799851190476191e-05, | |
| "loss": 0.0021, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 4.986979166666667, | |
| "grad_norm": 0.0016421687323600054, | |
| "learning_rate": 5.7626488095238096e-05, | |
| "loss": 0.032, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.029247036203742027, | |
| "learning_rate": 5.7254464285714296e-05, | |
| "loss": 0.0001, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 5.013020833333333, | |
| "grad_norm": 0.06536845117807388, | |
| "learning_rate": 5.688244047619048e-05, | |
| "loss": 0.0005, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 5.026041666666667, | |
| "grad_norm": 0.0019784579053521156, | |
| "learning_rate": 5.6510416666666664e-05, | |
| "loss": 0.0001, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 5.0390625, | |
| "grad_norm": 0.0014279839815571904, | |
| "learning_rate": 5.6138392857142864e-05, | |
| "loss": 0.0002, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 5.052083333333333, | |
| "grad_norm": 0.014671358279883862, | |
| "learning_rate": 5.576636904761905e-05, | |
| "loss": 0.0008, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 5.065104166666667, | |
| "grad_norm": 0.0024294396862387657, | |
| "learning_rate": 5.539434523809524e-05, | |
| "loss": 0.0002, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 5.078125, | |
| "grad_norm": 0.00397472595795989, | |
| "learning_rate": 5.502232142857143e-05, | |
| "loss": 0.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 5.078125, | |
| "eval_accuracy": 0.9763358778625955, | |
| "eval_f1": 0.9760263505554957, | |
| "eval_loss": 0.15323668718338013, | |
| "eval_runtime": 80.294, | |
| "eval_samples_per_second": 16.315, | |
| "eval_steps_per_second": 2.042, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 5.091145833333333, | |
| "grad_norm": 0.02703716978430748, | |
| "learning_rate": 5.465029761904762e-05, | |
| "loss": 0.0039, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 5.104166666666667, | |
| "grad_norm": 0.0009181397617794573, | |
| "learning_rate": 5.427827380952382e-05, | |
| "loss": 0.0002, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 5.1171875, | |
| "grad_norm": 0.006500236690044403, | |
| "learning_rate": 5.3906250000000006e-05, | |
| "loss": 0.0007, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 5.130208333333333, | |
| "grad_norm": 0.0006530183600261807, | |
| "learning_rate": 5.3534226190476186e-05, | |
| "loss": 0.0002, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 5.143229166666667, | |
| "grad_norm": 0.003845563158392906, | |
| "learning_rate": 5.3162202380952386e-05, | |
| "loss": 0.0005, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 5.15625, | |
| "grad_norm": 0.0008400387596338987, | |
| "learning_rate": 5.2790178571428573e-05, | |
| "loss": 0.0013, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 5.169270833333333, | |
| "grad_norm": 0.00313665927387774, | |
| "learning_rate": 5.241815476190477e-05, | |
| "loss": 0.0002, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 5.182291666666667, | |
| "grad_norm": 0.0030664566438645124, | |
| "learning_rate": 5.2046130952380954e-05, | |
| "loss": 0.066, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 5.1953125, | |
| "grad_norm": 0.0006601907080039382, | |
| "learning_rate": 5.167410714285714e-05, | |
| "loss": 0.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 5.208333333333333, | |
| "grad_norm": 0.0010025645606219769, | |
| "learning_rate": 5.130208333333334e-05, | |
| "loss": 0.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 5.208333333333333, | |
| "eval_accuracy": 0.9778625954198473, | |
| "eval_f1": 0.9770889116046164, | |
| "eval_loss": 0.15303944051265717, | |
| "eval_runtime": 79.3589, | |
| "eval_samples_per_second": 16.507, | |
| "eval_steps_per_second": 2.067, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 5.221354166666667, | |
| "grad_norm": 0.008759081363677979, | |
| "learning_rate": 5.093005952380953e-05, | |
| "loss": 0.0411, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 5.234375, | |
| "grad_norm": 0.0015470011858269572, | |
| "learning_rate": 5.0558035714285715e-05, | |
| "loss": 0.0303, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 5.247395833333333, | |
| "grad_norm": 0.0008398283389396966, | |
| "learning_rate": 5.018601190476191e-05, | |
| "loss": 0.0853, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 5.260416666666667, | |
| "grad_norm": 0.0009364528232254088, | |
| "learning_rate": 4.9813988095238096e-05, | |
| "loss": 0.0596, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 5.2734375, | |
| "grad_norm": 0.0247739776968956, | |
| "learning_rate": 4.944196428571429e-05, | |
| "loss": 0.0004, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 5.286458333333333, | |
| "grad_norm": 0.004827846307307482, | |
| "learning_rate": 4.9069940476190476e-05, | |
| "loss": 0.0638, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 5.299479166666667, | |
| "grad_norm": 0.00023243791656568646, | |
| "learning_rate": 4.869791666666667e-05, | |
| "loss": 0.0007, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 5.3125, | |
| "grad_norm": 0.01796838268637657, | |
| "learning_rate": 4.832589285714286e-05, | |
| "loss": 0.0055, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 5.325520833333333, | |
| "grad_norm": 0.0007179395761340857, | |
| "learning_rate": 4.795386904761905e-05, | |
| "loss": 0.0888, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 5.338541666666667, | |
| "grad_norm": 0.0310695581138134, | |
| "learning_rate": 4.758184523809524e-05, | |
| "loss": 0.0006, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 5.338541666666667, | |
| "eval_accuracy": 0.9770992366412213, | |
| "eval_f1": 0.976523202547543, | |
| "eval_loss": 0.1433548629283905, | |
| "eval_runtime": 79.725, | |
| "eval_samples_per_second": 16.431, | |
| "eval_steps_per_second": 2.057, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 5.3515625, | |
| "grad_norm": 0.02403601072728634, | |
| "learning_rate": 4.720982142857143e-05, | |
| "loss": 0.0019, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 5.364583333333333, | |
| "grad_norm": 0.0012427668552845716, | |
| "learning_rate": 4.683779761904762e-05, | |
| "loss": 0.0001, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 5.377604166666667, | |
| "grad_norm": 0.001180665334686637, | |
| "learning_rate": 4.646577380952381e-05, | |
| "loss": 0.0002, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 5.390625, | |
| "grad_norm": 0.0012769517488777637, | |
| "learning_rate": 4.609375e-05, | |
| "loss": 0.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 5.403645833333333, | |
| "grad_norm": 0.000978892669081688, | |
| "learning_rate": 4.572172619047619e-05, | |
| "loss": 0.0005, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 5.416666666666667, | |
| "grad_norm": 0.00048785540275275707, | |
| "learning_rate": 4.5349702380952386e-05, | |
| "loss": 0.0004, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 5.4296875, | |
| "grad_norm": 0.08053936809301376, | |
| "learning_rate": 4.497767857142857e-05, | |
| "loss": 0.0002, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 5.442708333333333, | |
| "grad_norm": 0.0015671910950914025, | |
| "learning_rate": 4.460565476190477e-05, | |
| "loss": 0.0001, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 5.455729166666667, | |
| "grad_norm": 0.004697693511843681, | |
| "learning_rate": 4.4233630952380954e-05, | |
| "loss": 0.0005, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 5.46875, | |
| "grad_norm": 0.0010685070883482695, | |
| "learning_rate": 4.386160714285715e-05, | |
| "loss": 0.0218, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 5.46875, | |
| "eval_accuracy": 0.9763358778625955, | |
| "eval_f1": 0.9750904497675442, | |
| "eval_loss": 0.14676247537136078, | |
| "eval_runtime": 78.9043, | |
| "eval_samples_per_second": 16.602, | |
| "eval_steps_per_second": 2.078, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 5.481770833333333, | |
| "grad_norm": 0.0027723798993974924, | |
| "learning_rate": 4.3489583333333334e-05, | |
| "loss": 0.0003, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 5.494791666666667, | |
| "grad_norm": 0.019772697240114212, | |
| "learning_rate": 4.311755952380953e-05, | |
| "loss": 0.0001, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 5.5078125, | |
| "grad_norm": 0.0006771979387849569, | |
| "learning_rate": 4.2745535714285715e-05, | |
| "loss": 0.0055, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 5.520833333333333, | |
| "grad_norm": 0.0004024896479677409, | |
| "learning_rate": 4.237351190476191e-05, | |
| "loss": 0.0035, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 5.533854166666667, | |
| "grad_norm": 0.10993379354476929, | |
| "learning_rate": 4.2001488095238096e-05, | |
| "loss": 0.0003, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 5.546875, | |
| "grad_norm": 0.0007820904138498008, | |
| "learning_rate": 4.162946428571429e-05, | |
| "loss": 0.0155, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 5.559895833333333, | |
| "grad_norm": 0.003410749603062868, | |
| "learning_rate": 4.1257440476190476e-05, | |
| "loss": 0.0001, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 5.572916666666667, | |
| "grad_norm": 0.0026554546784609556, | |
| "learning_rate": 4.088541666666667e-05, | |
| "loss": 0.0002, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 5.5859375, | |
| "grad_norm": 0.06518341600894928, | |
| "learning_rate": 4.051339285714286e-05, | |
| "loss": 0.0008, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 5.598958333333333, | |
| "grad_norm": 1.8737592697143555, | |
| "learning_rate": 4.014136904761905e-05, | |
| "loss": 0.0043, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 5.598958333333333, | |
| "eval_accuracy": 0.9778625954198473, | |
| "eval_f1": 0.9763098419010506, | |
| "eval_loss": 0.15678274631500244, | |
| "eval_runtime": 79.3625, | |
| "eval_samples_per_second": 16.507, | |
| "eval_steps_per_second": 2.066, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 5.611979166666667, | |
| "grad_norm": 0.0003776724624913186, | |
| "learning_rate": 3.976934523809524e-05, | |
| "loss": 0.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 5.625, | |
| "grad_norm": 0.0002889227762352675, | |
| "learning_rate": 3.939732142857143e-05, | |
| "loss": 0.0001, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 5.638020833333333, | |
| "grad_norm": 0.010027917101979256, | |
| "learning_rate": 3.902529761904762e-05, | |
| "loss": 0.0001, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 5.651041666666667, | |
| "grad_norm": 0.06570053845643997, | |
| "learning_rate": 3.865327380952381e-05, | |
| "loss": 0.0004, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 5.6640625, | |
| "grad_norm": 0.0006370337796397507, | |
| "learning_rate": 3.828125e-05, | |
| "loss": 0.0017, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 5.677083333333333, | |
| "grad_norm": 0.0792364776134491, | |
| "learning_rate": 3.790922619047619e-05, | |
| "loss": 0.0002, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 5.690104166666667, | |
| "grad_norm": 0.0021686868276447058, | |
| "learning_rate": 3.7537202380952386e-05, | |
| "loss": 0.0001, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 5.703125, | |
| "grad_norm": 0.001032188767567277, | |
| "learning_rate": 3.716517857142857e-05, | |
| "loss": 0.0003, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 5.716145833333333, | |
| "grad_norm": 0.00048284087097272277, | |
| "learning_rate": 3.679315476190477e-05, | |
| "loss": 0.0003, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 5.729166666666667, | |
| "grad_norm": 0.003614840330556035, | |
| "learning_rate": 3.6421130952380954e-05, | |
| "loss": 0.0246, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 5.729166666666667, | |
| "eval_accuracy": 0.9770992366412213, | |
| "eval_f1": 0.9748227385190463, | |
| "eval_loss": 0.1582217961549759, | |
| "eval_runtime": 80.0201, | |
| "eval_samples_per_second": 16.371, | |
| "eval_steps_per_second": 2.049, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 5.7421875, | |
| "grad_norm": 0.0005544687155634165, | |
| "learning_rate": 3.604910714285715e-05, | |
| "loss": 0.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 5.755208333333333, | |
| "grad_norm": 0.0011366292601451278, | |
| "learning_rate": 3.5677083333333334e-05, | |
| "loss": 0.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 5.768229166666667, | |
| "grad_norm": 0.0005046608857810497, | |
| "learning_rate": 3.530505952380953e-05, | |
| "loss": 0.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 5.78125, | |
| "grad_norm": 0.0004096832708455622, | |
| "learning_rate": 3.4933035714285715e-05, | |
| "loss": 0.0001, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 5.794270833333333, | |
| "grad_norm": 0.00013856917212251574, | |
| "learning_rate": 3.456101190476191e-05, | |
| "loss": 0.0014, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 5.807291666666667, | |
| "grad_norm": 0.001096011488698423, | |
| "learning_rate": 3.4188988095238096e-05, | |
| "loss": 0.0001, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 5.8203125, | |
| "grad_norm": 0.0001263749145437032, | |
| "learning_rate": 3.381696428571429e-05, | |
| "loss": 0.0002, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 5.833333333333333, | |
| "grad_norm": 0.0001115978229790926, | |
| "learning_rate": 3.3444940476190476e-05, | |
| "loss": 0.0029, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 5.846354166666667, | |
| "grad_norm": 0.0029253140091896057, | |
| "learning_rate": 3.307291666666667e-05, | |
| "loss": 0.0001, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 5.859375, | |
| "grad_norm": 5.999381065368652, | |
| "learning_rate": 3.270089285714286e-05, | |
| "loss": 0.0052, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 5.859375, | |
| "eval_accuracy": 0.9786259541984733, | |
| "eval_f1": 0.9773756116949442, | |
| "eval_loss": 0.14893028140068054, | |
| "eval_runtime": 79.4853, | |
| "eval_samples_per_second": 16.481, | |
| "eval_steps_per_second": 2.063, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 5.872395833333333, | |
| "grad_norm": 0.002452920190989971, | |
| "learning_rate": 3.232886904761905e-05, | |
| "loss": 0.0005, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 5.885416666666667, | |
| "grad_norm": 0.0010826429352164268, | |
| "learning_rate": 3.195684523809524e-05, | |
| "loss": 0.0749, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 5.8984375, | |
| "grad_norm": null, | |
| "learning_rate": 3.158482142857143e-05, | |
| "loss": 0.1748, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 5.911458333333333, | |
| "grad_norm": 0.0003211931325495243, | |
| "learning_rate": 3.125e-05, | |
| "loss": 0.0007, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 5.924479166666667, | |
| "grad_norm": 0.0004456841852515936, | |
| "learning_rate": 3.0877976190476194e-05, | |
| "loss": 0.0015, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 5.9375, | |
| "grad_norm": 0.0001370434183627367, | |
| "learning_rate": 3.0505952380952385e-05, | |
| "loss": 0.0022, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 5.950520833333333, | |
| "grad_norm": 0.0002913082716986537, | |
| "learning_rate": 3.013392857142857e-05, | |
| "loss": 0.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 5.963541666666667, | |
| "grad_norm": 0.001946036471053958, | |
| "learning_rate": 2.9761904761904762e-05, | |
| "loss": 0.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 5.9765625, | |
| "grad_norm": 0.0001424805959686637, | |
| "learning_rate": 2.9389880952380956e-05, | |
| "loss": 0.0018, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 5.989583333333333, | |
| "grad_norm": 0.002276055980473757, | |
| "learning_rate": 2.9017857142857146e-05, | |
| "loss": 0.0003, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 5.989583333333333, | |
| "eval_accuracy": 0.9778625954198473, | |
| "eval_f1": 0.9774822690011192, | |
| "eval_loss": 0.14993642270565033, | |
| "eval_runtime": 79.3572, | |
| "eval_samples_per_second": 16.508, | |
| "eval_steps_per_second": 2.067, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 6.002604166666667, | |
| "grad_norm": 0.0016556038754060864, | |
| "learning_rate": 2.8645833333333333e-05, | |
| "loss": 0.0003, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 6.015625, | |
| "grad_norm": 0.0016299332492053509, | |
| "learning_rate": 2.8273809523809523e-05, | |
| "loss": 0.0019, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 6.028645833333333, | |
| "grad_norm": 0.00010488690168131143, | |
| "learning_rate": 2.7901785714285717e-05, | |
| "loss": 0.005, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 6.041666666666667, | |
| "grad_norm": 0.004511065781116486, | |
| "learning_rate": 2.7529761904761907e-05, | |
| "loss": 0.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 6.0546875, | |
| "grad_norm": 0.00028204693808220327, | |
| "learning_rate": 2.7157738095238094e-05, | |
| "loss": 0.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 6.067708333333333, | |
| "grad_norm": 0.0001716447586659342, | |
| "learning_rate": 2.6785714285714288e-05, | |
| "loss": 0.0003, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 6.080729166666667, | |
| "grad_norm": 0.00024869455955922604, | |
| "learning_rate": 2.6413690476190478e-05, | |
| "loss": 0.0002, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 6.09375, | |
| "grad_norm": 0.0005961268325336277, | |
| "learning_rate": 2.604166666666667e-05, | |
| "loss": 0.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 6.106770833333333, | |
| "grad_norm": 0.0006300471723079681, | |
| "learning_rate": 2.5669642857142855e-05, | |
| "loss": 0.0005, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 6.119791666666667, | |
| "grad_norm": 0.0018155310535803437, | |
| "learning_rate": 2.529761904761905e-05, | |
| "loss": 0.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 6.119791666666667, | |
| "eval_accuracy": 0.9793893129770992, | |
| "eval_f1": 0.9786424828277345, | |
| "eval_loss": 0.14569835364818573, | |
| "eval_runtime": 79.9671, | |
| "eval_samples_per_second": 16.382, | |
| "eval_steps_per_second": 2.051, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 6.1328125, | |
| "grad_norm": 0.00018157267186325043, | |
| "learning_rate": 2.492559523809524e-05, | |
| "loss": 0.0, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 6.145833333333333, | |
| "grad_norm": 0.0024571309331804514, | |
| "learning_rate": 2.455357142857143e-05, | |
| "loss": 0.0351, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 6.158854166666667, | |
| "grad_norm": 0.0013230579206719995, | |
| "learning_rate": 2.418154761904762e-05, | |
| "loss": 0.0, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 6.171875, | |
| "grad_norm": 0.00048594691907055676, | |
| "learning_rate": 2.380952380952381e-05, | |
| "loss": 0.0001, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 6.184895833333333, | |
| "grad_norm": 0.0002501567651052028, | |
| "learning_rate": 2.34375e-05, | |
| "loss": 0.0001, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 6.197916666666667, | |
| "grad_norm": 0.0005994201637804508, | |
| "learning_rate": 2.3065476190476194e-05, | |
| "loss": 0.0, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 6.2109375, | |
| "grad_norm": 0.0007334449328482151, | |
| "learning_rate": 2.269345238095238e-05, | |
| "loss": 0.0049, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 6.223958333333333, | |
| "grad_norm": 0.0002302726061316207, | |
| "learning_rate": 2.2321428571428575e-05, | |
| "loss": 0.007, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 6.236979166666667, | |
| "grad_norm": 0.0017112663481384516, | |
| "learning_rate": 2.1949404761904762e-05, | |
| "loss": 0.0002, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 6.25, | |
| "grad_norm": 0.0005747594987042248, | |
| "learning_rate": 2.1577380952380955e-05, | |
| "loss": 0.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 6.25, | |
| "eval_accuracy": 0.9801526717557252, | |
| "eval_f1": 0.979382890840219, | |
| "eval_loss": 0.1436641961336136, | |
| "eval_runtime": 79.878, | |
| "eval_samples_per_second": 16.4, | |
| "eval_steps_per_second": 2.053, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 6.263020833333333, | |
| "grad_norm": 0.0010530534200370312, | |
| "learning_rate": 2.1205357142857142e-05, | |
| "loss": 0.0, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 6.276041666666667, | |
| "grad_norm": 0.0005772118456661701, | |
| "learning_rate": 2.0833333333333336e-05, | |
| "loss": 0.0001, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 6.2890625, | |
| "grad_norm": 0.0002731045533437282, | |
| "learning_rate": 2.0461309523809523e-05, | |
| "loss": 0.0001, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 6.302083333333333, | |
| "grad_norm": 0.0004197451926302165, | |
| "learning_rate": 2.0089285714285717e-05, | |
| "loss": 0.0, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 6.315104166666667, | |
| "grad_norm": 0.00015136833826545626, | |
| "learning_rate": 1.9717261904761904e-05, | |
| "loss": 0.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 6.328125, | |
| "grad_norm": 2.641888386278879e-05, | |
| "learning_rate": 1.9345238095238097e-05, | |
| "loss": 0.0001, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 6.341145833333333, | |
| "grad_norm": 0.5401182174682617, | |
| "learning_rate": 1.8973214285714284e-05, | |
| "loss": 0.0003, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 6.354166666666667, | |
| "grad_norm": 0.0032670239452272654, | |
| "learning_rate": 1.8601190476190478e-05, | |
| "loss": 0.0, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 6.3671875, | |
| "grad_norm": 0.00046687290887348354, | |
| "learning_rate": 1.8229166666666668e-05, | |
| "loss": 0.0, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 6.380208333333333, | |
| "grad_norm": 0.00039088240009732544, | |
| "learning_rate": 1.785714285714286e-05, | |
| "loss": 0.0048, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 6.380208333333333, | |
| "eval_accuracy": 0.9793893129770992, | |
| "eval_f1": 0.9782051312661143, | |
| "eval_loss": 0.14399699866771698, | |
| "eval_runtime": 80.366, | |
| "eval_samples_per_second": 16.3, | |
| "eval_steps_per_second": 2.041, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 6.393229166666667, | |
| "grad_norm": 0.0010453681461513042, | |
| "learning_rate": 1.748511904761905e-05, | |
| "loss": 0.0005, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 6.40625, | |
| "grad_norm": 0.006344153545796871, | |
| "learning_rate": 1.711309523809524e-05, | |
| "loss": 0.0001, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 6.419270833333333, | |
| "grad_norm": 0.0013704535085707903, | |
| "learning_rate": 1.674107142857143e-05, | |
| "loss": 0.05, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 6.432291666666667, | |
| "grad_norm": 0.0011392245069146156, | |
| "learning_rate": 1.636904761904762e-05, | |
| "loss": 0.0004, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 6.4453125, | |
| "grad_norm": 0.000693273323122412, | |
| "learning_rate": 1.599702380952381e-05, | |
| "loss": 0.106, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 6.458333333333333, | |
| "grad_norm": 0.002483117626979947, | |
| "learning_rate": 1.5625e-05, | |
| "loss": 0.0, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 6.471354166666667, | |
| "grad_norm": 0.0010965712135657668, | |
| "learning_rate": 1.5252976190476192e-05, | |
| "loss": 0.0, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 6.484375, | |
| "grad_norm": 0.0008587032789364457, | |
| "learning_rate": 1.4880952380952381e-05, | |
| "loss": 0.0068, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 6.497395833333333, | |
| "grad_norm": 0.000166285244631581, | |
| "learning_rate": 1.4508928571428573e-05, | |
| "loss": 0.0399, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 6.510416666666667, | |
| "grad_norm": 0.000623571511823684, | |
| "learning_rate": 1.4136904761904762e-05, | |
| "loss": 0.0002, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 6.510416666666667, | |
| "eval_accuracy": 0.9801526717557252, | |
| "eval_f1": 0.9792529286071088, | |
| "eval_loss": 0.14169791340827942, | |
| "eval_runtime": 79.5831, | |
| "eval_samples_per_second": 16.461, | |
| "eval_steps_per_second": 2.061, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 6.5234375, | |
| "grad_norm": 0.00023940973915159702, | |
| "learning_rate": 1.3764880952380954e-05, | |
| "loss": 0.0, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 6.536458333333333, | |
| "grad_norm": 9.644269448472187e-05, | |
| "learning_rate": 1.3392857142857144e-05, | |
| "loss": 0.0, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 6.549479166666667, | |
| "grad_norm": 0.0010055724997073412, | |
| "learning_rate": 1.3020833333333334e-05, | |
| "loss": 0.0, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 6.5625, | |
| "grad_norm": 0.001275699003599584, | |
| "learning_rate": 1.2648809523809524e-05, | |
| "loss": 0.0208, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 6.575520833333333, | |
| "grad_norm": 0.0229334719479084, | |
| "learning_rate": 1.2276785714285715e-05, | |
| "loss": 0.0002, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 6.588541666666667, | |
| "grad_norm": 0.0029182969592511654, | |
| "learning_rate": 1.1904761904761905e-05, | |
| "loss": 0.0, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 6.6015625, | |
| "grad_norm": 0.0006565605290234089, | |
| "learning_rate": 1.1532738095238097e-05, | |
| "loss": 0.0004, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 6.614583333333333, | |
| "grad_norm": 8.51297882036306e-05, | |
| "learning_rate": 1.1160714285714287e-05, | |
| "loss": 0.0013, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 6.627604166666667, | |
| "grad_norm": 0.03837132081389427, | |
| "learning_rate": 1.0788690476190478e-05, | |
| "loss": 0.0001, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 6.640625, | |
| "grad_norm": 0.0011966479942202568, | |
| "learning_rate": 1.0416666666666668e-05, | |
| "loss": 0.0001, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 6.640625, | |
| "eval_accuracy": 0.9801526717557252, | |
| "eval_f1": 0.9794155456475218, | |
| "eval_loss": 0.14272300899028778, | |
| "eval_runtime": 80.9251, | |
| "eval_samples_per_second": 16.188, | |
| "eval_steps_per_second": 2.027, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 6.653645833333333, | |
| "grad_norm": 1.9961093664169312, | |
| "learning_rate": 1.0044642857142858e-05, | |
| "loss": 0.0044, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 0.0005622489261440933, | |
| "learning_rate": 9.672619047619049e-06, | |
| "loss": 0.0001, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 6.6796875, | |
| "grad_norm": 0.00014217269199434668, | |
| "learning_rate": 9.300595238095239e-06, | |
| "loss": 0.0014, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 6.692708333333333, | |
| "grad_norm": 0.0006998235476203263, | |
| "learning_rate": 8.92857142857143e-06, | |
| "loss": 0.0002, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 6.705729166666667, | |
| "grad_norm": 0.0001875764282885939, | |
| "learning_rate": 8.55654761904762e-06, | |
| "loss": 0.0, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 6.71875, | |
| "grad_norm": 0.00016692327335476875, | |
| "learning_rate": 8.18452380952381e-06, | |
| "loss": 0.0002, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 6.731770833333333, | |
| "grad_norm": 0.00019502580107655376, | |
| "learning_rate": 7.8125e-06, | |
| "loss": 0.0, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 6.744791666666667, | |
| "grad_norm": 0.005827147047966719, | |
| "learning_rate": 7.4404761904761905e-06, | |
| "loss": 0.0, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 6.7578125, | |
| "grad_norm": 0.0037816341500729322, | |
| "learning_rate": 7.068452380952381e-06, | |
| "loss": 0.0487, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 6.770833333333333, | |
| "grad_norm": 0.001036057248711586, | |
| "learning_rate": 6.696428571428572e-06, | |
| "loss": 0.0, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 6.770833333333333, | |
| "eval_accuracy": 0.9801526717557252, | |
| "eval_f1": 0.9787148983839948, | |
| "eval_loss": 0.14222955703735352, | |
| "eval_runtime": 79.737, | |
| "eval_samples_per_second": 16.429, | |
| "eval_steps_per_second": 2.057, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 6.783854166666667, | |
| "grad_norm": 0.0004406815569382161, | |
| "learning_rate": 6.324404761904762e-06, | |
| "loss": 0.0003, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 6.796875, | |
| "grad_norm": 0.001424977439455688, | |
| "learning_rate": 5.9523809523809525e-06, | |
| "loss": 0.0001, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 6.809895833333333, | |
| "grad_norm": 0.00036442719283513725, | |
| "learning_rate": 5.580357142857144e-06, | |
| "loss": 0.0001, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 6.822916666666667, | |
| "grad_norm": 0.15314286947250366, | |
| "learning_rate": 5.208333333333334e-06, | |
| "loss": 0.0006, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 6.8359375, | |
| "grad_norm": 0.00025265634758397937, | |
| "learning_rate": 4.836309523809524e-06, | |
| "loss": 0.0008, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 6.848958333333333, | |
| "grad_norm": 0.006122451741248369, | |
| "learning_rate": 4.464285714285715e-06, | |
| "loss": 0.0001, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 6.861979166666667, | |
| "grad_norm": 0.0012435363605618477, | |
| "learning_rate": 4.092261904761905e-06, | |
| "loss": 0.0002, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 6.875, | |
| "grad_norm": 0.00042211436084471643, | |
| "learning_rate": 3.7202380952380952e-06, | |
| "loss": 0.0, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 6.888020833333333, | |
| "grad_norm": 0.0004731386434286833, | |
| "learning_rate": 3.348214285714286e-06, | |
| "loss": 0.0002, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 6.901041666666667, | |
| "grad_norm": 0.0005277880118228495, | |
| "learning_rate": 2.9761904761904763e-06, | |
| "loss": 0.0002, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 6.901041666666667, | |
| "eval_accuracy": 0.9801526717557252, | |
| "eval_f1": 0.9787148983839948, | |
| "eval_loss": 0.14254190027713776, | |
| "eval_runtime": 79.6927, | |
| "eval_samples_per_second": 16.438, | |
| "eval_steps_per_second": 2.058, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 6.9140625, | |
| "grad_norm": 0.0005409631412476301, | |
| "learning_rate": 2.604166666666667e-06, | |
| "loss": 0.0, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 6.927083333333333, | |
| "grad_norm": 0.0001622817071620375, | |
| "learning_rate": 2.2321428571428573e-06, | |
| "loss": 0.0, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 6.940104166666667, | |
| "grad_norm": 0.0003351483028382063, | |
| "learning_rate": 1.8601190476190476e-06, | |
| "loss": 0.0, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 6.953125, | |
| "grad_norm": 0.0009497393039055169, | |
| "learning_rate": 1.4880952380952381e-06, | |
| "loss": 0.0001, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 6.966145833333333, | |
| "grad_norm": 0.00034176220651715994, | |
| "learning_rate": 1.1160714285714287e-06, | |
| "loss": 0.0001, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 6.979166666666667, | |
| "grad_norm": 0.0005293432041071355, | |
| "learning_rate": 7.440476190476191e-07, | |
| "loss": 0.0001, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 6.9921875, | |
| "grad_norm": 0.0005790857248939574, | |
| "learning_rate": 3.7202380952380953e-07, | |
| "loss": 0.0485, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "step": 5376, | |
| "total_flos": 3.332347463387013e+18, | |
| "train_loss": 0.16521713621213643, | |
| "train_runtime": 7665.8969, | |
| "train_samples_per_second": 5.608, | |
| "train_steps_per_second": 0.701 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5376, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 7, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.332347463387013e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |