| { | |
| "best_metric": 0.4731413722038269, | |
| "best_model_checkpoint": "./vit-base-beans-demo-v5/checkpoint-2200", | |
| "epoch": 4.0, | |
| "eval_steps": 100, | |
| "global_step": 3568, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.2656996250152588, | |
| "learning_rate": 0.00019943946188340808, | |
| "loss": 1.7641, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.702761173248291, | |
| "learning_rate": 0.00019887892376681615, | |
| "loss": 1.5342, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.4954216480255127, | |
| "learning_rate": 0.00019831838565022422, | |
| "loss": 1.2414, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.647926926612854, | |
| "learning_rate": 0.0001977578475336323, | |
| "loss": 1.1185, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.192228078842163, | |
| "learning_rate": 0.00019719730941704039, | |
| "loss": 1.1626, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.8315672874450684, | |
| "learning_rate": 0.00019663677130044843, | |
| "loss": 1.1362, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 2.4982831478118896, | |
| "learning_rate": 0.0001960762331838565, | |
| "loss": 1.0978, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.8782095909118652, | |
| "learning_rate": 0.0001955156950672646, | |
| "loss": 1.0034, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 2.299464464187622, | |
| "learning_rate": 0.00019495515695067267, | |
| "loss": 0.9887, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 2.7226407527923584, | |
| "learning_rate": 0.0001943946188340807, | |
| "loss": 0.8874, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "eval_accuracy": 0.6571428571428571, | |
| "eval_loss": 0.9671773314476013, | |
| "eval_runtime": 129.3363, | |
| "eval_samples_per_second": 54.934, | |
| "eval_steps_per_second": 3.441, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.4987040758132935, | |
| "learning_rate": 0.0001938340807174888, | |
| "loss": 0.8796, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 1.5715798139572144, | |
| "learning_rate": 0.00019327354260089688, | |
| "loss": 0.9511, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 1.1380341053009033, | |
| "learning_rate": 0.00019271300448430495, | |
| "loss": 0.8983, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 2.5433120727539062, | |
| "learning_rate": 0.00019215246636771302, | |
| "loss": 0.8984, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 2.033275604248047, | |
| "learning_rate": 0.00019159192825112109, | |
| "loss": 0.824, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 2.4264872074127197, | |
| "learning_rate": 0.00019103139013452916, | |
| "loss": 0.8358, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 1.856844186782837, | |
| "learning_rate": 0.00019047085201793723, | |
| "loss": 0.8706, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 2.4621365070343018, | |
| "learning_rate": 0.0001899103139013453, | |
| "loss": 0.8521, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 1.4839106798171997, | |
| "learning_rate": 0.00018934977578475337, | |
| "loss": 0.8314, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 2.008270502090454, | |
| "learning_rate": 0.00018878923766816143, | |
| "loss": 0.7806, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "eval_accuracy": 0.7096410978184378, | |
| "eval_loss": 0.8030331134796143, | |
| "eval_runtime": 127.3215, | |
| "eval_samples_per_second": 55.804, | |
| "eval_steps_per_second": 3.495, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 2.759716272354126, | |
| "learning_rate": 0.00018822869955156953, | |
| "loss": 0.7414, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 2.2896201610565186, | |
| "learning_rate": 0.00018766816143497757, | |
| "loss": 0.7788, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.522684097290039, | |
| "learning_rate": 0.00018710762331838564, | |
| "loss": 0.7765, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 2.6483938694000244, | |
| "learning_rate": 0.00018654708520179374, | |
| "loss": 0.8428, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.5204224586486816, | |
| "learning_rate": 0.0001859865470852018, | |
| "loss": 0.7756, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 1.352989912033081, | |
| "learning_rate": 0.00018542600896860985, | |
| "loss": 0.8085, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 3.1435911655426025, | |
| "learning_rate": 0.00018486547085201795, | |
| "loss": 0.8336, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 1.863233208656311, | |
| "learning_rate": 0.00018430493273542602, | |
| "loss": 0.7351, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.6672592163085938, | |
| "learning_rate": 0.0001837443946188341, | |
| "loss": 0.7169, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.852493405342102, | |
| "learning_rate": 0.00018318385650224216, | |
| "loss": 0.7404, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "eval_accuracy": 0.696551724137931, | |
| "eval_loss": 0.8076898455619812, | |
| "eval_runtime": 127.361, | |
| "eval_samples_per_second": 55.786, | |
| "eval_steps_per_second": 3.494, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 2.066432476043701, | |
| "learning_rate": 0.00018262331838565023, | |
| "loss": 0.7422, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 1.5433382987976074, | |
| "learning_rate": 0.0001820627802690583, | |
| "loss": 0.7509, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 2.6319210529327393, | |
| "learning_rate": 0.00018150224215246637, | |
| "loss": 0.7168, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 1.8435016870498657, | |
| "learning_rate": 0.00018094170403587444, | |
| "loss": 0.7421, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 1.946035623550415, | |
| "learning_rate": 0.0001803811659192825, | |
| "loss": 0.7062, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.926882028579712, | |
| "learning_rate": 0.0001798206278026906, | |
| "loss": 0.7228, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 3.0743374824523926, | |
| "learning_rate": 0.00017926008968609868, | |
| "loss": 0.7722, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 1.2202589511871338, | |
| "learning_rate": 0.00017869955156950672, | |
| "loss": 0.7334, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.696330189704895, | |
| "learning_rate": 0.00017813901345291482, | |
| "loss": 0.7306, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 2.4689078330993652, | |
| "learning_rate": 0.0001775784753363229, | |
| "loss": 0.7224, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "eval_accuracy": 0.741027445460943, | |
| "eval_loss": 0.6990236043930054, | |
| "eval_runtime": 127.2157, | |
| "eval_samples_per_second": 55.85, | |
| "eval_steps_per_second": 3.498, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 1.5020831823349, | |
| "learning_rate": 0.00017701793721973096, | |
| "loss": 0.7291, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 2.5034234523773193, | |
| "learning_rate": 0.00017645739910313903, | |
| "loss": 0.6754, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.4598332643508911, | |
| "learning_rate": 0.0001758968609865471, | |
| "loss": 0.6624, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 4.073904991149902, | |
| "learning_rate": 0.00017533632286995517, | |
| "loss": 0.6682, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.7306851148605347, | |
| "learning_rate": 0.00017477578475336324, | |
| "loss": 0.6515, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 2.59877347946167, | |
| "learning_rate": 0.0001742152466367713, | |
| "loss": 0.5476, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 3.6079037189483643, | |
| "learning_rate": 0.00017365470852017938, | |
| "loss": 0.7031, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 3.1861743927001953, | |
| "learning_rate": 0.00017309417040358745, | |
| "loss": 0.7141, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 1.9290772676467896, | |
| "learning_rate": 0.00017253363228699552, | |
| "loss": 0.668, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 2.201539993286133, | |
| "learning_rate": 0.0001719730941704036, | |
| "loss": 0.6969, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_accuracy": 0.7362420830401126, | |
| "eval_loss": 0.7265785932540894, | |
| "eval_runtime": 127.1383, | |
| "eval_samples_per_second": 55.884, | |
| "eval_steps_per_second": 3.5, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 1.651583194732666, | |
| "learning_rate": 0.00017141255605381166, | |
| "loss": 0.6099, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.7216840982437134, | |
| "learning_rate": 0.00017085201793721975, | |
| "loss": 0.6174, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 1.8105279207229614, | |
| "learning_rate": 0.0001702914798206278, | |
| "loss": 0.6444, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 1.837810754776001, | |
| "learning_rate": 0.00016973094170403587, | |
| "loss": 0.6607, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 1.6770997047424316, | |
| "learning_rate": 0.00016917040358744396, | |
| "loss": 0.6268, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 1.2837833166122437, | |
| "learning_rate": 0.00016860986547085203, | |
| "loss": 0.7084, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.0523972511291504, | |
| "learning_rate": 0.00016804932735426008, | |
| "loss": 0.6271, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 1.356562852859497, | |
| "learning_rate": 0.00016748878923766817, | |
| "loss": 0.6242, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 1.5814656019210815, | |
| "learning_rate": 0.00016692825112107624, | |
| "loss": 0.564, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.757988691329956, | |
| "learning_rate": 0.0001663677130044843, | |
| "loss": 0.5929, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "eval_accuracy": 0.7589021815622801, | |
| "eval_loss": 0.6735280156135559, | |
| "eval_runtime": 126.499, | |
| "eval_samples_per_second": 56.166, | |
| "eval_steps_per_second": 3.518, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 1.8649027347564697, | |
| "learning_rate": 0.00016580717488789238, | |
| "loss": 0.6578, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 2.1447596549987793, | |
| "learning_rate": 0.00016524663677130045, | |
| "loss": 0.6224, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.7960020303726196, | |
| "learning_rate": 0.00016468609865470852, | |
| "loss": 0.6506, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 2.265737533569336, | |
| "learning_rate": 0.00016412556053811662, | |
| "loss": 0.6302, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.6668283939361572, | |
| "learning_rate": 0.00016356502242152466, | |
| "loss": 0.6518, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 2.8400986194610596, | |
| "learning_rate": 0.00016300448430493273, | |
| "loss": 0.619, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 4.842011451721191, | |
| "learning_rate": 0.00016244394618834083, | |
| "loss": 0.5984, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 2.0878114700317383, | |
| "learning_rate": 0.0001618834080717489, | |
| "loss": 0.5451, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 2.6710331439971924, | |
| "learning_rate": 0.00016132286995515694, | |
| "loss": 0.6813, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 2.501116991043091, | |
| "learning_rate": 0.00016076233183856504, | |
| "loss": 0.5556, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "eval_accuracy": 0.7463757916959888, | |
| "eval_loss": 0.6704686284065247, | |
| "eval_runtime": 126.4929, | |
| "eval_samples_per_second": 56.169, | |
| "eval_steps_per_second": 3.518, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 2.3544445037841797, | |
| "learning_rate": 0.0001602017937219731, | |
| "loss": 0.6037, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 1.7408026456832886, | |
| "learning_rate": 0.00015964125560538118, | |
| "loss": 0.7128, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 3.1222403049468994, | |
| "learning_rate": 0.00015908071748878925, | |
| "loss": 0.5976, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 1.2981232404708862, | |
| "learning_rate": 0.00015852017937219732, | |
| "loss": 0.5566, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.9470215439796448, | |
| "learning_rate": 0.0001579596412556054, | |
| "loss": 0.5446, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 1.5283540487289429, | |
| "learning_rate": 0.00015739910313901346, | |
| "loss": 0.5127, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 1.9940904378890991, | |
| "learning_rate": 0.00015683856502242153, | |
| "loss": 0.5999, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 2.0824811458587646, | |
| "learning_rate": 0.0001562780269058296, | |
| "loss": 0.5658, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 1.9540338516235352, | |
| "learning_rate": 0.0001557174887892377, | |
| "loss": 0.5329, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 1.6645208597183228, | |
| "learning_rate": 0.00015515695067264574, | |
| "loss": 0.5831, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "eval_accuracy": 0.7681914144968333, | |
| "eval_loss": 0.6300484538078308, | |
| "eval_runtime": 127.6321, | |
| "eval_samples_per_second": 55.668, | |
| "eval_steps_per_second": 3.487, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 1.9477753639221191, | |
| "learning_rate": 0.0001545964125560538, | |
| "loss": 0.5994, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.3609261512756348, | |
| "learning_rate": 0.00015403587443946188, | |
| "loss": 0.5809, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 1.7007122039794922, | |
| "learning_rate": 0.00015347533632286998, | |
| "loss": 0.6124, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 1.753017783164978, | |
| "learning_rate": 0.00015291479820627804, | |
| "loss": 0.5809, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 1.672780156135559, | |
| "learning_rate": 0.0001523542600896861, | |
| "loss": 0.5536, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.7526847124099731, | |
| "learning_rate": 0.00015179372197309418, | |
| "loss": 0.6337, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 1.4697840213775635, | |
| "learning_rate": 0.00015123318385650225, | |
| "loss": 0.5725, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 2.619101047515869, | |
| "learning_rate": 0.00015067264573991032, | |
| "loss": 0.7272, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 2.2301433086395264, | |
| "learning_rate": 0.0001501121076233184, | |
| "loss": 0.5315, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 1.2401838302612305, | |
| "learning_rate": 0.00014955156950672646, | |
| "loss": 0.3992, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "eval_accuracy": 0.7884588318085856, | |
| "eval_loss": 0.5879009366035461, | |
| "eval_runtime": 127.0026, | |
| "eval_samples_per_second": 55.944, | |
| "eval_steps_per_second": 3.504, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 0.7034109830856323, | |
| "learning_rate": 0.00014899103139013453, | |
| "loss": 0.3054, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 1.852588176727295, | |
| "learning_rate": 0.0001484304932735426, | |
| "loss": 0.4751, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 2.8306503295898438, | |
| "learning_rate": 0.00014786995515695067, | |
| "loss": 0.4143, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 2.520498037338257, | |
| "learning_rate": 0.00014730941704035874, | |
| "loss": 0.4863, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 3.4684131145477295, | |
| "learning_rate": 0.00014674887892376684, | |
| "loss": 0.4911, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 1.4476227760314941, | |
| "learning_rate": 0.00014618834080717488, | |
| "loss": 0.4375, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 2.003276824951172, | |
| "learning_rate": 0.00014562780269058295, | |
| "loss": 0.4116, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 2.1394705772399902, | |
| "learning_rate": 0.00014506726457399105, | |
| "loss": 0.3873, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 2.4954771995544434, | |
| "learning_rate": 0.00014450672645739912, | |
| "loss": 0.3475, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 2.449598550796509, | |
| "learning_rate": 0.00014394618834080716, | |
| "loss": 0.4661, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_accuracy": 0.7887403237156931, | |
| "eval_loss": 0.5782468914985657, | |
| "eval_runtime": 127.3385, | |
| "eval_samples_per_second": 55.796, | |
| "eval_steps_per_second": 3.495, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 2.011767864227295, | |
| "learning_rate": 0.00014338565022421526, | |
| "loss": 0.3975, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 1.4032831192016602, | |
| "learning_rate": 0.00014282511210762333, | |
| "loss": 0.4123, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 1.5600253343582153, | |
| "learning_rate": 0.0001422645739910314, | |
| "loss": 0.351, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 3.0084493160247803, | |
| "learning_rate": 0.00014170403587443947, | |
| "loss": 0.4234, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 1.441857933998108, | |
| "learning_rate": 0.00014114349775784754, | |
| "loss": 0.4868, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 1.2188992500305176, | |
| "learning_rate": 0.0001405829596412556, | |
| "loss": 0.423, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.6048247814178467, | |
| "learning_rate": 0.0001400224215246637, | |
| "loss": 0.4204, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 2.915587902069092, | |
| "learning_rate": 0.00013946188340807175, | |
| "loss": 0.3875, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 1.629499912261963, | |
| "learning_rate": 0.00013890134529147982, | |
| "loss": 0.4025, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 2.1824419498443604, | |
| "learning_rate": 0.00013834080717488792, | |
| "loss": 0.358, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "eval_accuracy": 0.7942294159042927, | |
| "eval_loss": 0.5690400004386902, | |
| "eval_runtime": 127.5854, | |
| "eval_samples_per_second": 55.688, | |
| "eval_steps_per_second": 3.488, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.4124354124069214, | |
| "learning_rate": 0.000137780269058296, | |
| "loss": 0.3944, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 1.3432316780090332, | |
| "learning_rate": 0.00013721973094170403, | |
| "loss": 0.4344, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 2.4002068042755127, | |
| "learning_rate": 0.0001366591928251121, | |
| "loss": 0.3457, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 4.019514560699463, | |
| "learning_rate": 0.0001360986547085202, | |
| "loss": 0.4713, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 1.3158634901046753, | |
| "learning_rate": 0.00013553811659192827, | |
| "loss": 0.3754, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 3.053358793258667, | |
| "learning_rate": 0.0001349775784753363, | |
| "loss": 0.4441, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 1.6883745193481445, | |
| "learning_rate": 0.0001344170403587444, | |
| "loss": 0.4562, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 2.3927807807922363, | |
| "learning_rate": 0.00013385650224215248, | |
| "loss": 0.4068, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 1.4922747611999512, | |
| "learning_rate": 0.00013329596412556055, | |
| "loss": 0.4174, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 2.156853199005127, | |
| "learning_rate": 0.00013273542600896862, | |
| "loss": 0.3812, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "eval_accuracy": 0.8146375791695989, | |
| "eval_loss": 0.5108710527420044, | |
| "eval_runtime": 127.9489, | |
| "eval_samples_per_second": 55.53, | |
| "eval_steps_per_second": 3.478, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 1.6083571910858154, | |
| "learning_rate": 0.00013217488789237669, | |
| "loss": 0.3351, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 2.889650583267212, | |
| "learning_rate": 0.00013161434977578476, | |
| "loss": 0.4398, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 1.9212812185287476, | |
| "learning_rate": 0.00013105381165919283, | |
| "loss": 0.3878, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 1.9533714056015015, | |
| "learning_rate": 0.0001304932735426009, | |
| "loss": 0.3447, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 3.091277837753296, | |
| "learning_rate": 0.00012993273542600897, | |
| "loss": 0.3848, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 1.8874760866165161, | |
| "learning_rate": 0.00012937219730941706, | |
| "loss": 0.5295, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 2.417236804962158, | |
| "learning_rate": 0.0001288116591928251, | |
| "loss": 0.4444, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 2.4967589378356934, | |
| "learning_rate": 0.00012825112107623318, | |
| "loss": 0.4944, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 3.3610410690307617, | |
| "learning_rate": 0.00012769058295964127, | |
| "loss": 0.3398, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 1.5874662399291992, | |
| "learning_rate": 0.00012713004484304934, | |
| "loss": 0.3535, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "eval_accuracy": 0.8147783251231527, | |
| "eval_loss": 0.5213413834571838, | |
| "eval_runtime": 127.3924, | |
| "eval_samples_per_second": 55.773, | |
| "eval_steps_per_second": 3.493, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 1.2904878854751587, | |
| "learning_rate": 0.00012656950672645739, | |
| "loss": 0.3868, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 2.38608980178833, | |
| "learning_rate": 0.00012600896860986548, | |
| "loss": 0.4082, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.6603220701217651, | |
| "learning_rate": 0.00012544843049327355, | |
| "loss": 0.3222, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.8661950826644897, | |
| "learning_rate": 0.00012488789237668162, | |
| "loss": 0.4029, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 2.12640643119812, | |
| "learning_rate": 0.0001243273542600897, | |
| "loss": 0.3691, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 2.671631097793579, | |
| "learning_rate": 0.00012376681614349776, | |
| "loss": 0.3328, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 2.103508949279785, | |
| "learning_rate": 0.00012320627802690583, | |
| "loss": 0.3475, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 1.7701743841171265, | |
| "learning_rate": 0.00012264573991031393, | |
| "loss": 0.4423, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 2.38301682472229, | |
| "learning_rate": 0.00012208520179372197, | |
| "loss": 0.426, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 2.7273807525634766, | |
| "learning_rate": 0.00012152466367713004, | |
| "loss": 0.3901, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "eval_accuracy": 0.8125263898662913, | |
| "eval_loss": 0.5261800289154053, | |
| "eval_runtime": 126.367, | |
| "eval_samples_per_second": 56.225, | |
| "eval_steps_per_second": 3.521, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 2.7706730365753174, | |
| "learning_rate": 0.00012096412556053814, | |
| "loss": 0.4756, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 2.9104268550872803, | |
| "learning_rate": 0.0001204035874439462, | |
| "loss": 0.3625, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.5548855066299438, | |
| "learning_rate": 0.00011984304932735426, | |
| "loss": 0.3352, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 2.2463669776916504, | |
| "learning_rate": 0.00011928251121076232, | |
| "loss": 0.3944, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 2.0413687229156494, | |
| "learning_rate": 0.00011872197309417042, | |
| "loss": 0.3637, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 2.267987012863159, | |
| "learning_rate": 0.00011816143497757847, | |
| "loss": 0.3517, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 1.90973699092865, | |
| "learning_rate": 0.00011760089686098654, | |
| "loss": 0.2954, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 1.5805819034576416, | |
| "learning_rate": 0.00011704035874439463, | |
| "loss": 0.3207, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 2.012744903564453, | |
| "learning_rate": 0.0001164798206278027, | |
| "loss": 0.2816, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 1.0160799026489258, | |
| "learning_rate": 0.00011591928251121075, | |
| "loss": 0.3276, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "eval_accuracy": 0.8081632653061225, | |
| "eval_loss": 0.5793688893318176, | |
| "eval_runtime": 128.0301, | |
| "eval_samples_per_second": 55.495, | |
| "eval_steps_per_second": 3.476, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 2.1283419132232666, | |
| "learning_rate": 0.00011535874439461885, | |
| "loss": 0.3697, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 2.384875774383545, | |
| "learning_rate": 0.00011479820627802691, | |
| "loss": 0.4283, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 1.6091313362121582, | |
| "learning_rate": 0.00011423766816143498, | |
| "loss": 0.2917, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 2.50388503074646, | |
| "learning_rate": 0.00011367713004484306, | |
| "loss": 0.3705, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 2.4337923526763916, | |
| "learning_rate": 0.00011311659192825113, | |
| "loss": 0.3724, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.677294373512268, | |
| "learning_rate": 0.00011255605381165919, | |
| "loss": 0.3492, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.634697675704956, | |
| "learning_rate": 0.00011199551569506727, | |
| "loss": 0.3633, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 2.2558953762054443, | |
| "learning_rate": 0.00011143497757847534, | |
| "loss": 0.3875, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 1.5888583660125732, | |
| "learning_rate": 0.00011087443946188341, | |
| "loss": 0.3323, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 2.265427827835083, | |
| "learning_rate": 0.0001103139013452915, | |
| "loss": 0.3679, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "eval_accuracy": 0.8115411681914145, | |
| "eval_loss": 0.5365468859672546, | |
| "eval_runtime": 127.8881, | |
| "eval_samples_per_second": 55.556, | |
| "eval_steps_per_second": 3.48, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 2.279982566833496, | |
| "learning_rate": 0.00010975336322869956, | |
| "loss": 0.2996, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 1.905090093612671, | |
| "learning_rate": 0.00010919282511210762, | |
| "loss": 0.3268, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 4.301815032958984, | |
| "learning_rate": 0.0001086322869955157, | |
| "loss": 0.3312, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 1.8537272214889526, | |
| "learning_rate": 0.00010807174887892377, | |
| "loss": 0.3654, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 1.3462457656860352, | |
| "learning_rate": 0.00010751121076233184, | |
| "loss": 0.346, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 2.4433109760284424, | |
| "learning_rate": 0.00010695067264573993, | |
| "loss": 0.3563, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 1.8175091743469238, | |
| "learning_rate": 0.00010639013452914798, | |
| "loss": 0.3065, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 2.724806547164917, | |
| "learning_rate": 0.00010582959641255605, | |
| "loss": 0.3949, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 1.4651908874511719, | |
| "learning_rate": 0.00010526905829596414, | |
| "loss": 0.3451, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 1.2862894535064697, | |
| "learning_rate": 0.0001047085201793722, | |
| "loss": 0.3077, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "eval_accuracy": 0.825615763546798, | |
| "eval_loss": 0.503186047077179, | |
| "eval_runtime": 127.5407, | |
| "eval_samples_per_second": 55.708, | |
| "eval_steps_per_second": 3.489, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.5572246313095093, | |
| "learning_rate": 0.00010414798206278026, | |
| "loss": 0.3638, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 3.091179847717285, | |
| "learning_rate": 0.00010358744394618836, | |
| "loss": 0.4205, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 2.097266435623169, | |
| "learning_rate": 0.00010302690582959642, | |
| "loss": 0.2865, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 3.9726853370666504, | |
| "learning_rate": 0.00010246636771300449, | |
| "loss": 0.3283, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 2.2275006771087646, | |
| "learning_rate": 0.00010190582959641257, | |
| "loss": 0.2865, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 2.571467161178589, | |
| "learning_rate": 0.00010134529147982064, | |
| "loss": 0.381, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 1.8339025974273682, | |
| "learning_rate": 0.0001007847533632287, | |
| "loss": 0.3399, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.96084463596344, | |
| "learning_rate": 0.00010022421524663677, | |
| "loss": 0.4104, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 1.5350396633148193, | |
| "learning_rate": 9.966367713004485e-05, | |
| "loss": 0.2616, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 1.11006760597229, | |
| "learning_rate": 9.910313901345292e-05, | |
| "loss": 0.1593, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "eval_accuracy": 0.8237860661505981, | |
| "eval_loss": 0.4946657121181488, | |
| "eval_runtime": 127.5397, | |
| "eval_samples_per_second": 55.708, | |
| "eval_steps_per_second": 3.489, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 1.9963308572769165, | |
| "learning_rate": 9.854260089686099e-05, | |
| "loss": 0.1422, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 1.350595235824585, | |
| "learning_rate": 9.798206278026907e-05, | |
| "loss": 0.2012, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 1.9206656217575073, | |
| "learning_rate": 9.742152466367713e-05, | |
| "loss": 0.2354, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 0.9061004519462585, | |
| "learning_rate": 9.686098654708521e-05, | |
| "loss": 0.1971, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 1.090854287147522, | |
| "learning_rate": 9.630044843049327e-05, | |
| "loss": 0.2035, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 1.798594355583191, | |
| "learning_rate": 9.573991031390135e-05, | |
| "loss": 0.2362, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 0.2856987416744232, | |
| "learning_rate": 9.517937219730942e-05, | |
| "loss": 0.1448, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 1.9062001705169678, | |
| "learning_rate": 9.461883408071749e-05, | |
| "loss": 0.1019, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 1.8595843315124512, | |
| "learning_rate": 9.405829596412556e-05, | |
| "loss": 0.1844, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 1.6841151714324951, | |
| "learning_rate": 9.349775784753365e-05, | |
| "loss": 0.2495, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "eval_accuracy": 0.8212526389866291, | |
| "eval_loss": 0.5188373327255249, | |
| "eval_runtime": 127.5963, | |
| "eval_samples_per_second": 55.683, | |
| "eval_steps_per_second": 3.488, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 2.3805599212646484, | |
| "learning_rate": 9.29372197309417e-05, | |
| "loss": 0.1725, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 1.5507524013519287, | |
| "learning_rate": 9.237668161434979e-05, | |
| "loss": 0.1951, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 4.487265586853027, | |
| "learning_rate": 9.181614349775786e-05, | |
| "loss": 0.1928, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 5.267577171325684, | |
| "learning_rate": 9.125560538116593e-05, | |
| "loss": 0.2279, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 1.9050337076187134, | |
| "learning_rate": 9.0695067264574e-05, | |
| "loss": 0.1408, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 2.7110774517059326, | |
| "learning_rate": 9.013452914798208e-05, | |
| "loss": 0.1851, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 2.4091663360595703, | |
| "learning_rate": 8.957399103139014e-05, | |
| "loss": 0.1961, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 1.0745985507965088, | |
| "learning_rate": 8.901345291479822e-05, | |
| "loss": 0.1807, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 1.907657504081726, | |
| "learning_rate": 8.845291479820629e-05, | |
| "loss": 0.1656, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 1.4196101427078247, | |
| "learning_rate": 8.789237668161436e-05, | |
| "loss": 0.1604, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_accuracy": 0.8457424349049965, | |
| "eval_loss": 0.47485658526420593, | |
| "eval_runtime": 127.5001, | |
| "eval_samples_per_second": 55.725, | |
| "eval_steps_per_second": 3.49, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 0.8819741606712341, | |
| "learning_rate": 8.733183856502243e-05, | |
| "loss": 0.1541, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 2.7008216381073, | |
| "learning_rate": 8.67713004484305e-05, | |
| "loss": 0.1754, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 1.3049780130386353, | |
| "learning_rate": 8.621076233183857e-05, | |
| "loss": 0.1474, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 1.7845088243484497, | |
| "learning_rate": 8.565022421524664e-05, | |
| "loss": 0.149, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 2.162095546722412, | |
| "learning_rate": 8.508968609865471e-05, | |
| "loss": 0.1711, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 2.4429993629455566, | |
| "learning_rate": 8.452914798206278e-05, | |
| "loss": 0.1278, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 4.209596157073975, | |
| "learning_rate": 8.396860986547086e-05, | |
| "loss": 0.2285, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 0.8332444429397583, | |
| "learning_rate": 8.340807174887892e-05, | |
| "loss": 0.1706, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 1.2180029153823853, | |
| "learning_rate": 8.2847533632287e-05, | |
| "loss": 0.1863, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 1.7199923992156982, | |
| "learning_rate": 8.228699551569507e-05, | |
| "loss": 0.1347, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "eval_accuracy": 0.8318085855031668, | |
| "eval_loss": 0.4878062307834625, | |
| "eval_runtime": 127.098, | |
| "eval_samples_per_second": 55.902, | |
| "eval_steps_per_second": 3.501, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 2.4405524730682373, | |
| "learning_rate": 8.172645739910314e-05, | |
| "loss": 0.1669, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 3.2856411933898926, | |
| "learning_rate": 8.116591928251121e-05, | |
| "loss": 0.1526, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 2.642458915710449, | |
| "learning_rate": 8.06053811659193e-05, | |
| "loss": 0.1866, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 1.8131886720657349, | |
| "learning_rate": 8.004484304932735e-05, | |
| "loss": 0.1413, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 2.279311418533325, | |
| "learning_rate": 7.948430493273543e-05, | |
| "loss": 0.1764, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 3.409904956817627, | |
| "learning_rate": 7.892376681614349e-05, | |
| "loss": 0.1697, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 1.6139248609542847, | |
| "learning_rate": 7.836322869955157e-05, | |
| "loss": 0.2257, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 1.9628515243530273, | |
| "learning_rate": 7.780269058295964e-05, | |
| "loss": 0.1869, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 2.0070972442626953, | |
| "learning_rate": 7.724215246636771e-05, | |
| "loss": 0.2005, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 1.0854668617248535, | |
| "learning_rate": 7.668161434977578e-05, | |
| "loss": 0.1723, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "eval_accuracy": 0.8441942294159043, | |
| "eval_loss": 0.4731413722038269, | |
| "eval_runtime": 127.6751, | |
| "eval_samples_per_second": 55.649, | |
| "eval_steps_per_second": 3.485, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 3.4323694705963135, | |
| "learning_rate": 7.612107623318387e-05, | |
| "loss": 0.1829, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 1.0630773305892944, | |
| "learning_rate": 7.556053811659192e-05, | |
| "loss": 0.1704, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.900248646736145, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 0.1428, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 0.8738330602645874, | |
| "learning_rate": 7.443946188340808e-05, | |
| "loss": 0.1403, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 2.873507499694824, | |
| "learning_rate": 7.387892376681615e-05, | |
| "loss": 0.1267, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 3.962599515914917, | |
| "learning_rate": 7.331838565022422e-05, | |
| "loss": 0.1783, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 1.1607624292373657, | |
| "learning_rate": 7.27578475336323e-05, | |
| "loss": 0.1067, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 3.1270833015441895, | |
| "learning_rate": 7.219730941704036e-05, | |
| "loss": 0.1542, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 4.381764888763428, | |
| "learning_rate": 7.163677130044844e-05, | |
| "loss": 0.1032, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 4.008007526397705, | |
| "learning_rate": 7.107623318385651e-05, | |
| "loss": 0.1235, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "eval_accuracy": 0.8450387051372273, | |
| "eval_loss": 0.493280827999115, | |
| "eval_runtime": 127.613, | |
| "eval_samples_per_second": 55.676, | |
| "eval_steps_per_second": 3.487, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 2.7960009574890137, | |
| "learning_rate": 7.051569506726458e-05, | |
| "loss": 0.1695, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 3.1904006004333496, | |
| "learning_rate": 6.995515695067265e-05, | |
| "loss": 0.1388, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 3.1949515342712402, | |
| "learning_rate": 6.939461883408072e-05, | |
| "loss": 0.2264, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 2.386139154434204, | |
| "learning_rate": 6.883408071748879e-05, | |
| "loss": 0.1498, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 2.6440839767456055, | |
| "learning_rate": 6.827354260089687e-05, | |
| "loss": 0.1445, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 0.2900611162185669, | |
| "learning_rate": 6.771300448430493e-05, | |
| "loss": 0.1245, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 3.771578073501587, | |
| "learning_rate": 6.715246636771301e-05, | |
| "loss": 0.1775, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 0.2707236707210541, | |
| "learning_rate": 6.659192825112108e-05, | |
| "loss": 0.1317, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 2.4165215492248535, | |
| "learning_rate": 6.603139013452915e-05, | |
| "loss": 0.1645, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 2.579758882522583, | |
| "learning_rate": 6.547085201793722e-05, | |
| "loss": 0.1752, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "eval_accuracy": 0.8501055594651654, | |
| "eval_loss": 0.47405895590782166, | |
| "eval_runtime": 127.5154, | |
| "eval_samples_per_second": 55.719, | |
| "eval_steps_per_second": 3.49, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 2.206603527069092, | |
| "learning_rate": 6.491031390134529e-05, | |
| "loss": 0.2482, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 1.586669921875, | |
| "learning_rate": 6.434977578475336e-05, | |
| "loss": 0.1746, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 2.046320676803589, | |
| "learning_rate": 6.378923766816143e-05, | |
| "loss": 0.1386, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 0.8042988181114197, | |
| "learning_rate": 6.322869955156952e-05, | |
| "loss": 0.1383, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 3.815175771713257, | |
| "learning_rate": 6.266816143497759e-05, | |
| "loss": 0.1394, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 2.830374002456665, | |
| "learning_rate": 6.210762331838566e-05, | |
| "loss": 0.1511, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 2.1348299980163574, | |
| "learning_rate": 6.154708520179373e-05, | |
| "loss": 0.1295, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 4.810758590698242, | |
| "learning_rate": 6.0986547085201795e-05, | |
| "loss": 0.1757, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 1.4163758754730225, | |
| "learning_rate": 6.042600896860987e-05, | |
| "loss": 0.0962, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 2.047985792160034, | |
| "learning_rate": 5.9865470852017935e-05, | |
| "loss": 0.1421, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "eval_accuracy": 0.8474313863476425, | |
| "eval_loss": 0.4880400598049164, | |
| "eval_runtime": 127.7239, | |
| "eval_samples_per_second": 55.628, | |
| "eval_steps_per_second": 3.484, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 1.3623876571655273, | |
| "learning_rate": 5.930493273542601e-05, | |
| "loss": 0.1455, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 1.8722714185714722, | |
| "learning_rate": 5.874439461883409e-05, | |
| "loss": 0.1755, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 2.139150619506836, | |
| "learning_rate": 5.818385650224215e-05, | |
| "loss": 0.1471, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 1.041858434677124, | |
| "learning_rate": 5.762331838565023e-05, | |
| "loss": 0.1372, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 0.6558467149734497, | |
| "learning_rate": 5.7062780269058305e-05, | |
| "loss": 0.1184, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 3.568887233734131, | |
| "learning_rate": 5.650224215246637e-05, | |
| "loss": 0.1752, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.7773478627204895, | |
| "learning_rate": 5.5941704035874445e-05, | |
| "loss": 0.1491, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 2.827122688293457, | |
| "learning_rate": 5.5381165919282515e-05, | |
| "loss": 0.1893, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 3.533275842666626, | |
| "learning_rate": 5.4820627802690585e-05, | |
| "loss": 0.1117, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 0.7163364887237549, | |
| "learning_rate": 5.426008968609866e-05, | |
| "loss": 0.1549, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "eval_accuracy": 0.8389866291344124, | |
| "eval_loss": 0.4745788276195526, | |
| "eval_runtime": 127.6611, | |
| "eval_samples_per_second": 55.655, | |
| "eval_steps_per_second": 3.486, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 2.184201717376709, | |
| "learning_rate": 5.369955156950673e-05, | |
| "loss": 0.1698, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 2.9737119674682617, | |
| "learning_rate": 5.31390134529148e-05, | |
| "loss": 0.1189, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 1.0814894437789917, | |
| "learning_rate": 5.257847533632287e-05, | |
| "loss": 0.132, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 0.9624450206756592, | |
| "learning_rate": 5.201793721973094e-05, | |
| "loss": 0.1032, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 2.6256656646728516, | |
| "learning_rate": 5.145739910313902e-05, | |
| "loss": 0.1592, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 3.0557103157043457, | |
| "learning_rate": 5.089686098654709e-05, | |
| "loss": 0.1327, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 2.0262203216552734, | |
| "learning_rate": 5.033632286995516e-05, | |
| "loss": 0.1487, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.14867419004440308, | |
| "learning_rate": 4.977578475336323e-05, | |
| "loss": 0.1005, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 0.12747132778167725, | |
| "learning_rate": 4.92152466367713e-05, | |
| "loss": 0.0543, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 3.03, | |
| "grad_norm": 0.1839972287416458, | |
| "learning_rate": 4.8654708520179374e-05, | |
| "loss": 0.0617, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 3.03, | |
| "eval_accuracy": 0.8496833216045039, | |
| "eval_loss": 0.4935864508152008, | |
| "eval_runtime": 127.5847, | |
| "eval_samples_per_second": 55.688, | |
| "eval_steps_per_second": 3.488, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 1.8856990337371826, | |
| "learning_rate": 4.8094170403587444e-05, | |
| "loss": 0.0687, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 3.05, | |
| "grad_norm": 0.39913854002952576, | |
| "learning_rate": 4.7533632286995514e-05, | |
| "loss": 0.0553, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 0.8423967957496643, | |
| "learning_rate": 4.697309417040359e-05, | |
| "loss": 0.0797, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 3.07, | |
| "grad_norm": 0.2557125687599182, | |
| "learning_rate": 4.641255605381166e-05, | |
| "loss": 0.08, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 2.2399497032165527, | |
| "learning_rate": 4.585201793721973e-05, | |
| "loss": 0.0483, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 3.09, | |
| "grad_norm": 0.04336220771074295, | |
| "learning_rate": 4.52914798206278e-05, | |
| "loss": 0.0339, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 3.11, | |
| "grad_norm": 0.29698485136032104, | |
| "learning_rate": 4.473094170403588e-05, | |
| "loss": 0.0579, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 2.0980567932128906, | |
| "learning_rate": 4.417040358744395e-05, | |
| "loss": 0.0996, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "grad_norm": 0.4327409267425537, | |
| "learning_rate": 4.360986547085202e-05, | |
| "loss": 0.0435, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 3.14, | |
| "grad_norm": 0.9807620048522949, | |
| "learning_rate": 4.3049327354260094e-05, | |
| "loss": 0.0835, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 3.14, | |
| "eval_accuracy": 0.8554539057002111, | |
| "eval_loss": 0.4977756142616272, | |
| "eval_runtime": 127.7265, | |
| "eval_samples_per_second": 55.627, | |
| "eval_steps_per_second": 3.484, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 1.283894658088684, | |
| "learning_rate": 4.2488789237668164e-05, | |
| "loss": 0.0397, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 0.820012092590332, | |
| "learning_rate": 4.1928251121076234e-05, | |
| "loss": 0.0601, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 1.781630039215088, | |
| "learning_rate": 4.1367713004484303e-05, | |
| "loss": 0.0452, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 3.18, | |
| "grad_norm": 0.20719203352928162, | |
| "learning_rate": 4.080717488789238e-05, | |
| "loss": 0.0401, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 2.106254816055298, | |
| "learning_rate": 4.024663677130045e-05, | |
| "loss": 0.0358, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 1.9900000095367432, | |
| "learning_rate": 3.968609865470852e-05, | |
| "loss": 0.0496, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 3.22, | |
| "grad_norm": 0.2951858937740326, | |
| "learning_rate": 3.91255605381166e-05, | |
| "loss": 0.0406, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 0.3538978695869446, | |
| "learning_rate": 3.8565022421524667e-05, | |
| "loss": 0.0481, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 0.9406136870384216, | |
| "learning_rate": 3.8004484304932737e-05, | |
| "loss": 0.0239, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 1.0194897651672363, | |
| "learning_rate": 3.744394618834081e-05, | |
| "loss": 0.0477, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "eval_accuracy": 0.8586910626319494, | |
| "eval_loss": 0.5344606637954712, | |
| "eval_runtime": 127.6926, | |
| "eval_samples_per_second": 55.641, | |
| "eval_steps_per_second": 3.485, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 3.26, | |
| "grad_norm": 0.11960559338331223, | |
| "learning_rate": 3.688340807174888e-05, | |
| "loss": 0.0511, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 0.57135009765625, | |
| "learning_rate": 3.632286995515695e-05, | |
| "loss": 0.0119, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "grad_norm": 0.40805578231811523, | |
| "learning_rate": 3.576233183856502e-05, | |
| "loss": 0.0735, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 3.3, | |
| "grad_norm": 3.6458218097686768, | |
| "learning_rate": 3.52017937219731e-05, | |
| "loss": 0.0713, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 0.26397281885147095, | |
| "learning_rate": 3.464125560538117e-05, | |
| "loss": 0.0248, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 3.32, | |
| "grad_norm": 0.8868299126625061, | |
| "learning_rate": 3.408071748878924e-05, | |
| "loss": 0.0734, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 0.07102204859256744, | |
| "learning_rate": 3.3520179372197316e-05, | |
| "loss": 0.0176, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 2.244887590408325, | |
| "learning_rate": 3.2959641255605386e-05, | |
| "loss": 0.0498, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "grad_norm": 0.5616236925125122, | |
| "learning_rate": 3.2399103139013456e-05, | |
| "loss": 0.0373, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 0.19669972360134125, | |
| "learning_rate": 3.1838565022421526e-05, | |
| "loss": 0.0287, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "eval_accuracy": 0.8596762843068262, | |
| "eval_loss": 0.5332924723625183, | |
| "eval_runtime": 127.3143, | |
| "eval_samples_per_second": 55.807, | |
| "eval_steps_per_second": 3.495, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 3.37, | |
| "grad_norm": 1.8116190433502197, | |
| "learning_rate": 3.12780269058296e-05, | |
| "loss": 0.0203, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 3.39, | |
| "grad_norm": 4.554844379425049, | |
| "learning_rate": 3.071748878923767e-05, | |
| "loss": 0.0199, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 1.331480622291565, | |
| "learning_rate": 3.015695067264574e-05, | |
| "loss": 0.0258, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 0.2970045208930969, | |
| "learning_rate": 2.9596412556053816e-05, | |
| "loss": 0.0311, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 1.1972055435180664, | |
| "learning_rate": 2.9035874439461886e-05, | |
| "loss": 0.0559, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 3.43, | |
| "grad_norm": 3.563384532928467, | |
| "learning_rate": 2.8475336322869956e-05, | |
| "loss": 0.0457, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 1.6148489713668823, | |
| "learning_rate": 2.7914798206278025e-05, | |
| "loss": 0.0583, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "grad_norm": 0.8646751642227173, | |
| "learning_rate": 2.7354260089686102e-05, | |
| "loss": 0.03, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 0.4915563762187958, | |
| "learning_rate": 2.6793721973094172e-05, | |
| "loss": 0.032, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 1.7134366035461426, | |
| "learning_rate": 2.6233183856502242e-05, | |
| "loss": 0.0242, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "eval_accuracy": 0.8602392681210416, | |
| "eval_loss": 0.5433253645896912, | |
| "eval_runtime": 128.4379, | |
| "eval_samples_per_second": 55.319, | |
| "eval_steps_per_second": 3.465, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 3.49, | |
| "grad_norm": 0.1170569360256195, | |
| "learning_rate": 2.567264573991032e-05, | |
| "loss": 0.0349, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 0.28987565636634827, | |
| "learning_rate": 2.511210762331839e-05, | |
| "loss": 0.0327, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 3.51, | |
| "grad_norm": 0.07998673617839813, | |
| "learning_rate": 2.455156950672646e-05, | |
| "loss": 0.0483, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 0.4792230725288391, | |
| "learning_rate": 2.3991031390134532e-05, | |
| "loss": 0.0271, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 3.53, | |
| "grad_norm": 3.571005344390869, | |
| "learning_rate": 2.3430493273542602e-05, | |
| "loss": 0.0474, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 0.4508035182952881, | |
| "learning_rate": 2.286995515695067e-05, | |
| "loss": 0.0487, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "grad_norm": 0.219608336687088, | |
| "learning_rate": 2.2309417040358745e-05, | |
| "loss": 0.0206, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 3.57, | |
| "grad_norm": 0.5323840975761414, | |
| "learning_rate": 2.1748878923766815e-05, | |
| "loss": 0.0461, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 5.022609233856201, | |
| "learning_rate": 2.1188340807174888e-05, | |
| "loss": 0.0387, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "grad_norm": 0.12102050334215164, | |
| "learning_rate": 2.062780269058296e-05, | |
| "loss": 0.0196, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "eval_accuracy": 0.8584095707248417, | |
| "eval_loss": 0.5772469639778137, | |
| "eval_runtime": 126.9569, | |
| "eval_samples_per_second": 55.964, | |
| "eval_steps_per_second": 3.505, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 0.7281592488288879, | |
| "learning_rate": 2.006726457399103e-05, | |
| "loss": 0.029, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "grad_norm": 3.789141893386841, | |
| "learning_rate": 1.9506726457399105e-05, | |
| "loss": 0.0747, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 0.8820950388908386, | |
| "learning_rate": 1.8946188340807175e-05, | |
| "loss": 0.0254, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 5.991636276245117, | |
| "learning_rate": 1.8385650224215248e-05, | |
| "loss": 0.0354, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 0.16825184226036072, | |
| "learning_rate": 1.7825112107623318e-05, | |
| "loss": 0.0229, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 0.19618666172027588, | |
| "learning_rate": 1.726457399103139e-05, | |
| "loss": 0.0737, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 0.08360274136066437, | |
| "learning_rate": 1.6704035874439464e-05, | |
| "loss": 0.0341, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 0.07565028220415115, | |
| "learning_rate": 1.6143497757847534e-05, | |
| "loss": 0.0287, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 2.6630303859710693, | |
| "learning_rate": 1.5582959641255608e-05, | |
| "loss": 0.0322, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 3.4573700428009033, | |
| "learning_rate": 1.5022421524663678e-05, | |
| "loss": 0.0297, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "eval_accuracy": 0.8595355383532723, | |
| "eval_loss": 0.5564337372779846, | |
| "eval_runtime": 127.7494, | |
| "eval_samples_per_second": 55.617, | |
| "eval_steps_per_second": 3.483, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 2.092428207397461, | |
| "learning_rate": 1.4461883408071749e-05, | |
| "loss": 0.0486, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 3.72, | |
| "grad_norm": 4.266282558441162, | |
| "learning_rate": 1.3901345291479822e-05, | |
| "loss": 0.0191, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 0.5256732106208801, | |
| "learning_rate": 1.3340807174887892e-05, | |
| "loss": 0.0101, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 3.74, | |
| "grad_norm": 2.7987325191497803, | |
| "learning_rate": 1.2780269058295966e-05, | |
| "loss": 0.0547, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 0.17688162624835968, | |
| "learning_rate": 1.2219730941704037e-05, | |
| "loss": 0.0221, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 1.9824228286743164, | |
| "learning_rate": 1.1659192825112109e-05, | |
| "loss": 0.0528, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 3.78, | |
| "grad_norm": 0.06763149797916412, | |
| "learning_rate": 1.109865470852018e-05, | |
| "loss": 0.0193, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 3.0181431770324707, | |
| "learning_rate": 1.0538116591928252e-05, | |
| "loss": 0.0777, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 1.1364251375198364, | |
| "learning_rate": 9.977578475336324e-06, | |
| "loss": 0.0376, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 0.19941140711307526, | |
| "learning_rate": 9.417040358744395e-06, | |
| "loss": 0.0457, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "eval_accuracy": 0.8512315270935961, | |
| "eval_loss": 0.5806910991668701, | |
| "eval_runtime": 128.3848, | |
| "eval_samples_per_second": 55.341, | |
| "eval_steps_per_second": 3.466, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 3.82, | |
| "grad_norm": 7.786200523376465, | |
| "learning_rate": 8.856502242152467e-06, | |
| "loss": 0.0541, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 0.42947888374328613, | |
| "learning_rate": 8.295964125560539e-06, | |
| "loss": 0.0169, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 0.17804774641990662, | |
| "learning_rate": 7.73542600896861e-06, | |
| "loss": 0.0324, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 3.86, | |
| "grad_norm": 0.34209346771240234, | |
| "learning_rate": 7.174887892376682e-06, | |
| "loss": 0.0158, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "grad_norm": 0.16426484286785126, | |
| "learning_rate": 6.614349775784753e-06, | |
| "loss": 0.0135, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 0.19225721061229706, | |
| "learning_rate": 6.053811659192826e-06, | |
| "loss": 0.0291, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 3.89, | |
| "grad_norm": 1.250550627708435, | |
| "learning_rate": 5.493273542600897e-06, | |
| "loss": 0.0214, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 0.08164811879396439, | |
| "learning_rate": 4.932735426008968e-06, | |
| "loss": 0.0259, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 0.14926199615001678, | |
| "learning_rate": 4.372197309417041e-06, | |
| "loss": 0.0299, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 0.0725017860531807, | |
| "learning_rate": 3.8116591928251122e-06, | |
| "loss": 0.016, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "eval_accuracy": 0.8617874736101337, | |
| "eval_loss": 0.5601363778114319, | |
| "eval_runtime": 127.8408, | |
| "eval_samples_per_second": 55.577, | |
| "eval_steps_per_second": 3.481, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 3.93, | |
| "grad_norm": 1.6513164043426514, | |
| "learning_rate": 3.251121076233184e-06, | |
| "loss": 0.0165, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 3.95, | |
| "grad_norm": 0.088756263256073, | |
| "learning_rate": 2.690582959641256e-06, | |
| "loss": 0.0332, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 0.26848122477531433, | |
| "learning_rate": 2.1300448430493275e-06, | |
| "loss": 0.0232, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 3.97, | |
| "grad_norm": 0.14724156260490417, | |
| "learning_rate": 1.5695067264573993e-06, | |
| "loss": 0.02, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 1.9452786445617676, | |
| "learning_rate": 1.0089686098654709e-06, | |
| "loss": 0.0433, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 3.99, | |
| "grad_norm": 0.16432423889636993, | |
| "learning_rate": 4.484304932735426e-07, | |
| "loss": 0.0311, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "step": 3568, | |
| "total_flos": 8.839521632856048e+18, | |
| "train_loss": 0.3317156129854944, | |
| "train_runtime": 7997.833, | |
| "train_samples_per_second": 14.262, | |
| "train_steps_per_second": 0.446 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3568, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 100, | |
| "total_flos": 8.839521632856048e+18, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |