| { | |
| "best_global_step": 1100, | |
| "best_metric": 2.467733144760132, | |
| "best_model_checkpoint": "./swin-ena24/checkpoint-1100", | |
| "epoch": 7.0, | |
| "eval_steps": 100, | |
| "global_step": 2779, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02518891687657431, | |
| "grad_norm": 254123.9375, | |
| "learning_rate": 0.00019935228499460238, | |
| "loss": 2.9691, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.05037783375314862, | |
| "grad_norm": 426010.46875, | |
| "learning_rate": 0.00019863260165527168, | |
| "loss": 2.8919, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.07556675062972293, | |
| "grad_norm": 438250.125, | |
| "learning_rate": 0.000197912918315941, | |
| "loss": 2.7195, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.10075566750629723, | |
| "grad_norm": 643403.375, | |
| "learning_rate": 0.00019719323497661032, | |
| "loss": 2.567, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.12594458438287154, | |
| "grad_norm": 416875.5, | |
| "learning_rate": 0.0001964735516372796, | |
| "loss": 2.6196, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.15113350125944586, | |
| "grad_norm": 412576.78125, | |
| "learning_rate": 0.0001957538682979489, | |
| "loss": 2.4359, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.17632241813602015, | |
| "grad_norm": 371569.9375, | |
| "learning_rate": 0.00019503418495861824, | |
| "loss": 2.4815, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.20151133501259447, | |
| "grad_norm": 297408.5625, | |
| "learning_rate": 0.0001943145016192875, | |
| "loss": 2.508, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.22670025188916876, | |
| "grad_norm": 405330.71875, | |
| "learning_rate": 0.00019359481827995682, | |
| "loss": 2.385, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2518891687657431, | |
| "grad_norm": 578770.75, | |
| "learning_rate": 0.00019287513494062612, | |
| "loss": 1.9888, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2518891687657431, | |
| "eval_accuracy": 0.1630859375, | |
| "eval_f1_macro": 0.08928735942932195, | |
| "eval_loss": 3.412175178527832, | |
| "eval_runtime": 11.0442, | |
| "eval_samples_per_second": 92.718, | |
| "eval_steps_per_second": 5.795, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2770780856423174, | |
| "grad_norm": 510874.125, | |
| "learning_rate": 0.00019215545160129545, | |
| "loss": 2.188, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3022670025188917, | |
| "grad_norm": 513859.9375, | |
| "learning_rate": 0.00019143576826196473, | |
| "loss": 2.1743, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.327455919395466, | |
| "grad_norm": 527887.5, | |
| "learning_rate": 0.00019071608492263404, | |
| "loss": 2.2732, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.3526448362720403, | |
| "grad_norm": 460399.375, | |
| "learning_rate": 0.00018999640158330337, | |
| "loss": 2.1564, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.3778337531486146, | |
| "grad_norm": 569056.8125, | |
| "learning_rate": 0.00018927671824397267, | |
| "loss": 1.8225, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.40302267002518893, | |
| "grad_norm": 909920.6875, | |
| "learning_rate": 0.00018855703490464195, | |
| "loss": 1.771, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.4282115869017632, | |
| "grad_norm": 494884.5, | |
| "learning_rate": 0.00018783735156531128, | |
| "loss": 1.8349, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.4534005037783375, | |
| "grad_norm": 550597.375, | |
| "learning_rate": 0.0001871176682259806, | |
| "loss": 1.5891, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.47858942065491183, | |
| "grad_norm": 462934.75, | |
| "learning_rate": 0.00018639798488664987, | |
| "loss": 1.8926, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.5037783375314862, | |
| "grad_norm": 724803.5, | |
| "learning_rate": 0.00018567830154731917, | |
| "loss": 1.6111, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5037783375314862, | |
| "eval_accuracy": 0.2578125, | |
| "eval_f1_macro": 0.15353256242114557, | |
| "eval_loss": 2.9077086448669434, | |
| "eval_runtime": 11.368, | |
| "eval_samples_per_second": 90.078, | |
| "eval_steps_per_second": 5.63, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5289672544080605, | |
| "grad_norm": 559555.8125, | |
| "learning_rate": 0.0001849586182079885, | |
| "loss": 1.6636, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5541561712846348, | |
| "grad_norm": 1021320.3125, | |
| "learning_rate": 0.0001842389348686578, | |
| "loss": 1.462, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.5793450881612091, | |
| "grad_norm": 692250.375, | |
| "learning_rate": 0.0001835192515293271, | |
| "loss": 1.5007, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.6045340050377834, | |
| "grad_norm": 774498.1875, | |
| "learning_rate": 0.00018279956818999642, | |
| "loss": 1.5122, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.6297229219143576, | |
| "grad_norm": 497269.53125, | |
| "learning_rate": 0.00018207988485066572, | |
| "loss": 1.3379, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.654911838790932, | |
| "grad_norm": 786711.4375, | |
| "learning_rate": 0.00018136020151133503, | |
| "loss": 1.5591, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.6801007556675063, | |
| "grad_norm": 561642.375, | |
| "learning_rate": 0.00018064051817200433, | |
| "loss": 1.3499, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.7052896725440806, | |
| "grad_norm": 372517.21875, | |
| "learning_rate": 0.00017992083483267364, | |
| "loss": 1.3296, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.7304785894206549, | |
| "grad_norm": 783448.4375, | |
| "learning_rate": 0.00017920115149334294, | |
| "loss": 1.2106, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.7556675062972292, | |
| "grad_norm": 748525.0, | |
| "learning_rate": 0.00017848146815401222, | |
| "loss": 1.1276, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7556675062972292, | |
| "eval_accuracy": 0.357421875, | |
| "eval_f1_macro": 0.28272669387224364, | |
| "eval_loss": 2.6503801345825195, | |
| "eval_runtime": 11.5522, | |
| "eval_samples_per_second": 88.641, | |
| "eval_steps_per_second": 5.54, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7808564231738035, | |
| "grad_norm": 481374.1875, | |
| "learning_rate": 0.00017776178481468155, | |
| "loss": 1.1727, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.8060453400503779, | |
| "grad_norm": 1038731.1875, | |
| "learning_rate": 0.00017704210147535086, | |
| "loss": 1.1783, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.8312342569269522, | |
| "grad_norm": 369614.8125, | |
| "learning_rate": 0.00017632241813602016, | |
| "loss": 1.4703, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.8564231738035264, | |
| "grad_norm": 411708.9375, | |
| "learning_rate": 0.00017560273479668947, | |
| "loss": 1.0237, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.8816120906801007, | |
| "grad_norm": 777554.375, | |
| "learning_rate": 0.00017488305145735877, | |
| "loss": 1.3113, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.906801007556675, | |
| "grad_norm": 589113.125, | |
| "learning_rate": 0.00017416336811802808, | |
| "loss": 1.4282, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.9319899244332494, | |
| "grad_norm": 537991.9375, | |
| "learning_rate": 0.00017344368477869738, | |
| "loss": 1.0802, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.9571788413098237, | |
| "grad_norm": 745339.875, | |
| "learning_rate": 0.0001727240014393667, | |
| "loss": 1.1078, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.982367758186398, | |
| "grad_norm": 515597.875, | |
| "learning_rate": 0.000172004318100036, | |
| "loss": 1.3261, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.0075566750629723, | |
| "grad_norm": 438800.96875, | |
| "learning_rate": 0.0001712846347607053, | |
| "loss": 1.0234, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.0075566750629723, | |
| "eval_accuracy": 0.390625, | |
| "eval_f1_macro": 0.31557618159267287, | |
| "eval_loss": 2.572810173034668, | |
| "eval_runtime": 11.7819, | |
| "eval_samples_per_second": 86.913, | |
| "eval_steps_per_second": 5.432, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.0327455919395465, | |
| "grad_norm": 360076.0, | |
| "learning_rate": 0.0001705649514213746, | |
| "loss": 1.0415, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.057934508816121, | |
| "grad_norm": 468955.34375, | |
| "learning_rate": 0.0001698452680820439, | |
| "loss": 0.8579, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.0831234256926952, | |
| "grad_norm": 765571.4375, | |
| "learning_rate": 0.0001691255847427132, | |
| "loss": 0.9408, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.1083123425692696, | |
| "grad_norm": 682154.8125, | |
| "learning_rate": 0.00016840590140338252, | |
| "loss": 1.0962, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.1335012594458438, | |
| "grad_norm": 861960.5, | |
| "learning_rate": 0.00016768621806405182, | |
| "loss": 0.8124, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.1586901763224182, | |
| "grad_norm": 583161.8125, | |
| "learning_rate": 0.00016696653472472113, | |
| "loss": 1.1014, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.1838790931989924, | |
| "grad_norm": 390205.40625, | |
| "learning_rate": 0.00016624685138539046, | |
| "loss": 0.8475, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.2090680100755669, | |
| "grad_norm": 293254.3125, | |
| "learning_rate": 0.00016552716804605974, | |
| "loss": 1.0482, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.234256926952141, | |
| "grad_norm": 571449.25, | |
| "learning_rate": 0.00016480748470672904, | |
| "loss": 0.8776, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.2594458438287153, | |
| "grad_norm": 362773.25, | |
| "learning_rate": 0.00016408780136739835, | |
| "loss": 0.8909, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.2594458438287153, | |
| "eval_accuracy": 0.421875, | |
| "eval_f1_macro": 0.3388453334601594, | |
| "eval_loss": 2.500704765319824, | |
| "eval_runtime": 11.4557, | |
| "eval_samples_per_second": 89.388, | |
| "eval_steps_per_second": 5.587, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.2846347607052897, | |
| "grad_norm": 351467.09375, | |
| "learning_rate": 0.00016336811802806765, | |
| "loss": 0.8244, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.309823677581864, | |
| "grad_norm": 894287.0625, | |
| "learning_rate": 0.00016264843468873696, | |
| "loss": 1.048, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.3350125944584383, | |
| "grad_norm": 474119.75, | |
| "learning_rate": 0.00016192875134940626, | |
| "loss": 0.9196, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.3602015113350125, | |
| "grad_norm": 1069011.125, | |
| "learning_rate": 0.0001612090680100756, | |
| "loss": 0.804, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.385390428211587, | |
| "grad_norm": 587531.6875, | |
| "learning_rate": 0.00016048938467074487, | |
| "loss": 0.6696, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.4105793450881612, | |
| "grad_norm": 879147.25, | |
| "learning_rate": 0.00015976970133141418, | |
| "loss": 0.7831, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.4357682619647356, | |
| "grad_norm": 219215.859375, | |
| "learning_rate": 0.0001590500179920835, | |
| "loss": 0.5681, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.4609571788413098, | |
| "grad_norm": 447798.375, | |
| "learning_rate": 0.00015833033465275279, | |
| "loss": 0.8663, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.486146095717884, | |
| "grad_norm": 708370.625, | |
| "learning_rate": 0.0001576106513134221, | |
| "loss": 0.7913, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.5113350125944585, | |
| "grad_norm": 822008.0625, | |
| "learning_rate": 0.0001568909679740914, | |
| "loss": 0.8008, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.5113350125944585, | |
| "eval_accuracy": 0.404296875, | |
| "eval_f1_macro": 0.3618779716937303, | |
| "eval_loss": 2.7039053440093994, | |
| "eval_runtime": 11.8087, | |
| "eval_samples_per_second": 86.716, | |
| "eval_steps_per_second": 5.42, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.536523929471033, | |
| "grad_norm": 467169.0625, | |
| "learning_rate": 0.00015617128463476073, | |
| "loss": 0.7902, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.561712846347607, | |
| "grad_norm": 385000.3125, | |
| "learning_rate": 0.00015545160129543, | |
| "loss": 0.7146, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.5869017632241813, | |
| "grad_norm": 366275.21875, | |
| "learning_rate": 0.0001547319179560993, | |
| "loss": 0.6558, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.6120906801007555, | |
| "grad_norm": 432902.1875, | |
| "learning_rate": 0.00015401223461676864, | |
| "loss": 0.7669, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.63727959697733, | |
| "grad_norm": 426131.71875, | |
| "learning_rate": 0.00015329255127743795, | |
| "loss": 0.7121, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.6624685138539044, | |
| "grad_norm": 426302.0, | |
| "learning_rate": 0.00015257286793810722, | |
| "loss": 0.9564, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.6876574307304786, | |
| "grad_norm": 405949.34375, | |
| "learning_rate": 0.00015185318459877656, | |
| "loss": 0.6072, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.7128463476070528, | |
| "grad_norm": 319128.53125, | |
| "learning_rate": 0.00015113350125944586, | |
| "loss": 0.5537, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.7380352644836272, | |
| "grad_norm": 405533.625, | |
| "learning_rate": 0.00015041381792011514, | |
| "loss": 0.8143, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.7632241813602016, | |
| "grad_norm": 357302.3125, | |
| "learning_rate": 0.00014969413458078447, | |
| "loss": 0.6885, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.7632241813602016, | |
| "eval_accuracy": 0.3701171875, | |
| "eval_f1_macro": 0.2926079223907959, | |
| "eval_loss": 3.1089859008789062, | |
| "eval_runtime": 11.4817, | |
| "eval_samples_per_second": 89.186, | |
| "eval_steps_per_second": 5.574, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.7884130982367759, | |
| "grad_norm": 448783.3125, | |
| "learning_rate": 0.00014897445124145378, | |
| "loss": 0.8981, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.81360201511335, | |
| "grad_norm": 663808.1875, | |
| "learning_rate": 0.00014825476790212308, | |
| "loss": 0.6493, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.8387909319899243, | |
| "grad_norm": 1166550.75, | |
| "learning_rate": 0.00014753508456279236, | |
| "loss": 0.6288, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.8639798488664987, | |
| "grad_norm": 774232.6875, | |
| "learning_rate": 0.0001468154012234617, | |
| "loss": 0.8717, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.8891687657430731, | |
| "grad_norm": 565763.0, | |
| "learning_rate": 0.000146095717884131, | |
| "loss": 0.7199, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.9143576826196473, | |
| "grad_norm": 365545.625, | |
| "learning_rate": 0.0001453760345448003, | |
| "loss": 0.721, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.9395465994962215, | |
| "grad_norm": 616748.25, | |
| "learning_rate": 0.0001446563512054696, | |
| "loss": 0.6793, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.964735516372796, | |
| "grad_norm": 503789.65625, | |
| "learning_rate": 0.0001439366678661389, | |
| "loss": 0.5703, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.9899244332493704, | |
| "grad_norm": 619143.4375, | |
| "learning_rate": 0.00014321698452680822, | |
| "loss": 0.679, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.0151133501259446, | |
| "grad_norm": 661451.3125, | |
| "learning_rate": 0.00014249730118747752, | |
| "loss": 0.839, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.0151133501259446, | |
| "eval_accuracy": 0.484375, | |
| "eval_f1_macro": 0.414907768165119, | |
| "eval_loss": 2.584502935409546, | |
| "eval_runtime": 11.5153, | |
| "eval_samples_per_second": 88.925, | |
| "eval_steps_per_second": 5.558, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.040302267002519, | |
| "grad_norm": 316634.78125, | |
| "learning_rate": 0.00014177761784814683, | |
| "loss": 0.4317, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.065491183879093, | |
| "grad_norm": 347729.625, | |
| "learning_rate": 0.00014105793450881613, | |
| "loss": 0.3792, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.0906801007556677, | |
| "grad_norm": 659445.4375, | |
| "learning_rate": 0.00014033825116948544, | |
| "loss": 0.452, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.115869017632242, | |
| "grad_norm": 735795.1875, | |
| "learning_rate": 0.00013961856783015474, | |
| "loss": 0.5447, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.141057934508816, | |
| "grad_norm": 107777.5390625, | |
| "learning_rate": 0.00013889888449082404, | |
| "loss": 0.5247, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.1662468513853903, | |
| "grad_norm": 577874.5, | |
| "learning_rate": 0.00013817920115149335, | |
| "loss": 0.5108, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.1914357682619645, | |
| "grad_norm": 441668.625, | |
| "learning_rate": 0.00013745951781216265, | |
| "loss": 0.4324, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.216624685138539, | |
| "grad_norm": 489653.59375, | |
| "learning_rate": 0.00013673983447283196, | |
| "loss": 0.4427, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.2418136020151134, | |
| "grad_norm": 296346.84375, | |
| "learning_rate": 0.00013602015113350126, | |
| "loss": 0.5105, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.2670025188916876, | |
| "grad_norm": 187145.796875, | |
| "learning_rate": 0.00013530046779417057, | |
| "loss": 0.325, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.2670025188916876, | |
| "eval_accuracy": 0.5068359375, | |
| "eval_f1_macro": 0.41278416064519763, | |
| "eval_loss": 2.5142855644226074, | |
| "eval_runtime": 12.173, | |
| "eval_samples_per_second": 84.12, | |
| "eval_steps_per_second": 5.258, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.292191435768262, | |
| "grad_norm": 406714.6875, | |
| "learning_rate": 0.00013458078445483987, | |
| "loss": 0.4576, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.3173803526448364, | |
| "grad_norm": 379892.40625, | |
| "learning_rate": 0.00013386110111550918, | |
| "loss": 0.4179, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.3425692695214106, | |
| "grad_norm": 540661.9375, | |
| "learning_rate": 0.00013314141777617848, | |
| "loss": 0.3386, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.367758186397985, | |
| "grad_norm": 850949.25, | |
| "learning_rate": 0.0001324217344368478, | |
| "loss": 0.5695, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.392947103274559, | |
| "grad_norm": 363627.53125, | |
| "learning_rate": 0.0001317020510975171, | |
| "loss": 0.3362, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.4181360201511337, | |
| "grad_norm": 644468.0625, | |
| "learning_rate": 0.0001309823677581864, | |
| "loss": 0.444, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.443324937027708, | |
| "grad_norm": 164236.78125, | |
| "learning_rate": 0.00013026268441885573, | |
| "loss": 0.285, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.468513853904282, | |
| "grad_norm": 688494.375, | |
| "learning_rate": 0.000129543001079525, | |
| "loss": 0.4024, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.4937027707808563, | |
| "grad_norm": 486211.0625, | |
| "learning_rate": 0.0001288233177401943, | |
| "loss": 0.4544, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.5188916876574305, | |
| "grad_norm": 273390.03125, | |
| "learning_rate": 0.00012810363440086365, | |
| "loss": 0.4501, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.5188916876574305, | |
| "eval_accuracy": 0.4482421875, | |
| "eval_f1_macro": 0.40562802574511003, | |
| "eval_loss": 2.7683629989624023, | |
| "eval_runtime": 11.8873, | |
| "eval_samples_per_second": 86.142, | |
| "eval_steps_per_second": 5.384, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.544080604534005, | |
| "grad_norm": 730712.125, | |
| "learning_rate": 0.00012738395106153292, | |
| "loss": 0.5234, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.5692695214105794, | |
| "grad_norm": 369909.34375, | |
| "learning_rate": 0.00012666426772220223, | |
| "loss": 0.4051, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.5944584382871536, | |
| "grad_norm": 507635.78125, | |
| "learning_rate": 0.00012594458438287153, | |
| "loss": 0.2967, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.619647355163728, | |
| "grad_norm": 688013.125, | |
| "learning_rate": 0.00012522490104354087, | |
| "loss": 0.5321, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.644836272040302, | |
| "grad_norm": 505216.40625, | |
| "learning_rate": 0.00012450521770421014, | |
| "loss": 0.4807, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.6700251889168767, | |
| "grad_norm": 252679.53125, | |
| "learning_rate": 0.00012378553436487945, | |
| "loss": 0.4088, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 2.695214105793451, | |
| "grad_norm": 235546.9375, | |
| "learning_rate": 0.00012306585102554878, | |
| "loss": 0.393, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 2.720403022670025, | |
| "grad_norm": 398018.84375, | |
| "learning_rate": 0.00012234616768621808, | |
| "loss": 0.3694, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.7455919395465997, | |
| "grad_norm": 374467.3125, | |
| "learning_rate": 0.00012162648434688738, | |
| "loss": 0.3599, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 2.770780856423174, | |
| "grad_norm": 533788.9375, | |
| "learning_rate": 0.0001209068010075567, | |
| "loss": 0.3191, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.770780856423174, | |
| "eval_accuracy": 0.5146484375, | |
| "eval_f1_macro": 0.432831730682114, | |
| "eval_loss": 2.467733144760132, | |
| "eval_runtime": 11.9402, | |
| "eval_samples_per_second": 85.761, | |
| "eval_steps_per_second": 5.36, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.795969773299748, | |
| "grad_norm": 27070.849609375, | |
| "learning_rate": 0.00012018711766822599, | |
| "loss": 0.2222, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 2.8211586901763224, | |
| "grad_norm": 742011.0625, | |
| "learning_rate": 0.00011946743432889529, | |
| "loss": 0.3194, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.8463476070528966, | |
| "grad_norm": 508084.53125, | |
| "learning_rate": 0.00011874775098956458, | |
| "loss": 0.5068, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.8715365239294712, | |
| "grad_norm": 472090.21875, | |
| "learning_rate": 0.00011802806765023391, | |
| "loss": 0.3457, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.8967254408060454, | |
| "grad_norm": 345946.625, | |
| "learning_rate": 0.0001173083843109032, | |
| "loss": 0.4024, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.9219143576826196, | |
| "grad_norm": 116557.78125, | |
| "learning_rate": 0.00011658870097157251, | |
| "loss": 0.3248, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.947103274559194, | |
| "grad_norm": 544007.125, | |
| "learning_rate": 0.00011586901763224183, | |
| "loss": 0.2957, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.972292191435768, | |
| "grad_norm": 558989.0, | |
| "learning_rate": 0.00011514933429291112, | |
| "loss": 0.3386, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.9974811083123427, | |
| "grad_norm": 509623.65625, | |
| "learning_rate": 0.00011442965095358043, | |
| "loss": 0.4525, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 3.022670025188917, | |
| "grad_norm": 374462.28125, | |
| "learning_rate": 0.00011370996761424974, | |
| "loss": 0.1664, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 3.022670025188917, | |
| "eval_accuracy": 0.5361328125, | |
| "eval_f1_macro": 0.4597358092759388, | |
| "eval_loss": 2.477670192718506, | |
| "eval_runtime": 11.8885, | |
| "eval_samples_per_second": 86.134, | |
| "eval_steps_per_second": 5.383, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 3.047858942065491, | |
| "grad_norm": 417895.8125, | |
| "learning_rate": 0.00011299028427491905, | |
| "loss": 0.2631, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 3.0730478589420653, | |
| "grad_norm": 307081.21875, | |
| "learning_rate": 0.00011227060093558834, | |
| "loss": 0.1544, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 3.09823677581864, | |
| "grad_norm": 1034528.125, | |
| "learning_rate": 0.00011155091759625764, | |
| "loss": 0.2107, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 3.123425692695214, | |
| "grad_norm": 659800.0625, | |
| "learning_rate": 0.00011083123425692696, | |
| "loss": 0.3405, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 3.1486146095717884, | |
| "grad_norm": 129759.8671875, | |
| "learning_rate": 0.00011011155091759627, | |
| "loss": 0.2225, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 3.1738035264483626, | |
| "grad_norm": 560157.25, | |
| "learning_rate": 0.00010939186757826556, | |
| "loss": 0.1545, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 3.1989924433249373, | |
| "grad_norm": 253132.671875, | |
| "learning_rate": 0.00010867218423893488, | |
| "loss": 0.3073, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 3.2241813602015115, | |
| "grad_norm": 407181.90625, | |
| "learning_rate": 0.00010795250089960418, | |
| "loss": 0.3247, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 3.2493702770780857, | |
| "grad_norm": 433259.09375, | |
| "learning_rate": 0.00010723281756027347, | |
| "loss": 0.127, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 3.27455919395466, | |
| "grad_norm": 187479.984375, | |
| "learning_rate": 0.0001065131342209428, | |
| "loss": 0.1469, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 3.27455919395466, | |
| "eval_accuracy": 0.5205078125, | |
| "eval_f1_macro": 0.4495261806770087, | |
| "eval_loss": 2.6402528285980225, | |
| "eval_runtime": 11.5773, | |
| "eval_samples_per_second": 88.449, | |
| "eval_steps_per_second": 5.528, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 3.299748110831234, | |
| "grad_norm": 736520.0625, | |
| "learning_rate": 0.0001057934508816121, | |
| "loss": 0.2567, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 3.3249370277078087, | |
| "grad_norm": 376107.03125, | |
| "learning_rate": 0.0001050737675422814, | |
| "loss": 0.1442, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 3.350125944584383, | |
| "grad_norm": 723020.3125, | |
| "learning_rate": 0.0001043540842029507, | |
| "loss": 0.1422, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 3.375314861460957, | |
| "grad_norm": 60796.48046875, | |
| "learning_rate": 0.00010363440086362001, | |
| "loss": 0.2077, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 3.4005037783375314, | |
| "grad_norm": 624072.0, | |
| "learning_rate": 0.00010291471752428932, | |
| "loss": 0.1422, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 3.4256926952141056, | |
| "grad_norm": 191143.5625, | |
| "learning_rate": 0.00010219503418495862, | |
| "loss": 0.2425, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 3.4508816120906802, | |
| "grad_norm": 291693.75, | |
| "learning_rate": 0.00010147535084562794, | |
| "loss": 0.2693, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 3.4760705289672544, | |
| "grad_norm": 516212.4375, | |
| "learning_rate": 0.00010075566750629723, | |
| "loss": 0.2397, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 3.5012594458438286, | |
| "grad_norm": 273365.9375, | |
| "learning_rate": 0.00010003598416696654, | |
| "loss": 0.1774, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 3.5264483627204033, | |
| "grad_norm": 518260.21875, | |
| "learning_rate": 9.931630082763584e-05, | |
| "loss": 0.3063, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 3.5264483627204033, | |
| "eval_accuracy": 0.5009765625, | |
| "eval_f1_macro": 0.441499104615912, | |
| "eval_loss": 2.7999606132507324, | |
| "eval_runtime": 11.7898, | |
| "eval_samples_per_second": 86.855, | |
| "eval_steps_per_second": 5.428, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 3.551637279596977, | |
| "grad_norm": 552209.5, | |
| "learning_rate": 9.859661748830516e-05, | |
| "loss": 0.2254, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 3.5768261964735517, | |
| "grad_norm": 7195.90625, | |
| "learning_rate": 9.787693414897445e-05, | |
| "loss": 0.1378, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 3.602015113350126, | |
| "grad_norm": 879434.0625, | |
| "learning_rate": 9.715725080964376e-05, | |
| "loss": 0.2466, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 3.6272040302267, | |
| "grad_norm": 116491.9609375, | |
| "learning_rate": 9.643756747031306e-05, | |
| "loss": 0.2094, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 3.652392947103275, | |
| "grad_norm": 524990.0625, | |
| "learning_rate": 9.571788413098237e-05, | |
| "loss": 0.2522, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 3.677581863979849, | |
| "grad_norm": 939990.8125, | |
| "learning_rate": 9.499820079165168e-05, | |
| "loss": 0.2382, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 3.702770780856423, | |
| "grad_norm": 337334.625, | |
| "learning_rate": 9.427851745232098e-05, | |
| "loss": 0.299, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 3.7279596977329974, | |
| "grad_norm": 322309.28125, | |
| "learning_rate": 9.35588341129903e-05, | |
| "loss": 0.2307, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 3.7531486146095716, | |
| "grad_norm": 80700.328125, | |
| "learning_rate": 9.283915077365959e-05, | |
| "loss": 0.1816, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 3.7783375314861463, | |
| "grad_norm": 667783.125, | |
| "learning_rate": 9.21194674343289e-05, | |
| "loss": 0.1786, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.7783375314861463, | |
| "eval_accuracy": 0.533203125, | |
| "eval_f1_macro": 0.45250014527358284, | |
| "eval_loss": 2.8164846897125244, | |
| "eval_runtime": 11.728, | |
| "eval_samples_per_second": 87.312, | |
| "eval_steps_per_second": 5.457, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.8035264483627205, | |
| "grad_norm": 340034.53125, | |
| "learning_rate": 9.139978409499821e-05, | |
| "loss": 0.1934, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 3.8287153652392947, | |
| "grad_norm": 569343.8125, | |
| "learning_rate": 9.068010075566751e-05, | |
| "loss": 0.3571, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 3.853904282115869, | |
| "grad_norm": 73828.2578125, | |
| "learning_rate": 8.996041741633682e-05, | |
| "loss": 0.2595, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 3.879093198992443, | |
| "grad_norm": 610885.5625, | |
| "learning_rate": 8.924073407700611e-05, | |
| "loss": 0.2454, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 3.9042821158690177, | |
| "grad_norm": 549705.5, | |
| "learning_rate": 8.852105073767543e-05, | |
| "loss": 0.2246, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 3.929471032745592, | |
| "grad_norm": 946495.1875, | |
| "learning_rate": 8.780136739834473e-05, | |
| "loss": 0.3156, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 3.954659949622166, | |
| "grad_norm": 89126.0546875, | |
| "learning_rate": 8.708168405901404e-05, | |
| "loss": 0.127, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 3.979848866498741, | |
| "grad_norm": 364322.40625, | |
| "learning_rate": 8.636200071968334e-05, | |
| "loss": 0.2114, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 4.005037783375315, | |
| "grad_norm": 646444.625, | |
| "learning_rate": 8.564231738035265e-05, | |
| "loss": 0.1633, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 4.030226700251889, | |
| "grad_norm": 281335.90625, | |
| "learning_rate": 8.492263404102195e-05, | |
| "loss": 0.0687, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 4.030226700251889, | |
| "eval_accuracy": 0.568359375, | |
| "eval_f1_macro": 0.49420184108051124, | |
| "eval_loss": 2.9026849269866943, | |
| "eval_runtime": 11.365, | |
| "eval_samples_per_second": 90.101, | |
| "eval_steps_per_second": 5.631, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 4.055415617128464, | |
| "grad_norm": 12626.9794921875, | |
| "learning_rate": 8.420295070169126e-05, | |
| "loss": 0.0641, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 4.080604534005038, | |
| "grad_norm": 55986.578125, | |
| "learning_rate": 8.348326736236056e-05, | |
| "loss": 0.0647, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 4.105793450881612, | |
| "grad_norm": 509303.3125, | |
| "learning_rate": 8.276358402302987e-05, | |
| "loss": 0.0746, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 4.130982367758186, | |
| "grad_norm": 21481.740234375, | |
| "learning_rate": 8.204390068369917e-05, | |
| "loss": 0.0746, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 4.156171284634761, | |
| "grad_norm": 11360.7412109375, | |
| "learning_rate": 8.132421734436848e-05, | |
| "loss": 0.0555, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 4.181360201511335, | |
| "grad_norm": 396739.0, | |
| "learning_rate": 8.06045340050378e-05, | |
| "loss": 0.0595, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 4.206549118387909, | |
| "grad_norm": 8099.65478515625, | |
| "learning_rate": 7.988485066570709e-05, | |
| "loss": 0.1391, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 4.231738035264484, | |
| "grad_norm": 208596.65625, | |
| "learning_rate": 7.916516732637639e-05, | |
| "loss": 0.1038, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 4.2569269521410575, | |
| "grad_norm": 359488.34375, | |
| "learning_rate": 7.84454839870457e-05, | |
| "loss": 0.025, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 4.282115869017632, | |
| "grad_norm": 247004.875, | |
| "learning_rate": 7.7725800647715e-05, | |
| "loss": 0.0427, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 4.282115869017632, | |
| "eval_accuracy": 0.4912109375, | |
| "eval_f1_macro": 0.4362345681944286, | |
| "eval_loss": 3.321627616882324, | |
| "eval_runtime": 11.8095, | |
| "eval_samples_per_second": 86.71, | |
| "eval_steps_per_second": 5.419, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 4.307304785894207, | |
| "grad_norm": 22134.611328125, | |
| "learning_rate": 7.700611730838432e-05, | |
| "loss": 0.1015, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 4.332493702770781, | |
| "grad_norm": 163005.84375, | |
| "learning_rate": 7.628643396905361e-05, | |
| "loss": 0.0524, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 4.357682619647355, | |
| "grad_norm": 25011.78515625, | |
| "learning_rate": 7.556675062972293e-05, | |
| "loss": 0.1, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 4.382871536523929, | |
| "grad_norm": 606683.75, | |
| "learning_rate": 7.484706729039224e-05, | |
| "loss": 0.1503, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 4.408060453400504, | |
| "grad_norm": 411167.28125, | |
| "learning_rate": 7.412738395106154e-05, | |
| "loss": 0.1075, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 4.433249370277078, | |
| "grad_norm": 20846.5390625, | |
| "learning_rate": 7.340770061173085e-05, | |
| "loss": 0.0793, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 4.458438287153652, | |
| "grad_norm": 35871.06640625, | |
| "learning_rate": 7.268801727240015e-05, | |
| "loss": 0.0607, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 4.483627204030227, | |
| "grad_norm": 58382.01171875, | |
| "learning_rate": 7.196833393306946e-05, | |
| "loss": 0.082, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 4.508816120906801, | |
| "grad_norm": 13096.5205078125, | |
| "learning_rate": 7.124865059373876e-05, | |
| "loss": 0.0754, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 4.534005037783375, | |
| "grad_norm": 21618.96875, | |
| "learning_rate": 7.052896725440807e-05, | |
| "loss": 0.1825, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 4.534005037783375, | |
| "eval_accuracy": 0.53125, | |
| "eval_f1_macro": 0.4663662196170286, | |
| "eval_loss": 3.1456074714660645, | |
| "eval_runtime": 11.9035, | |
| "eval_samples_per_second": 86.025, | |
| "eval_steps_per_second": 5.377, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 4.55919395465995, | |
| "grad_norm": 97136.0546875, | |
| "learning_rate": 6.980928391507737e-05, | |
| "loss": 0.0821, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 4.584382871536524, | |
| "grad_norm": 14197.41796875, | |
| "learning_rate": 6.908960057574667e-05, | |
| "loss": 0.1245, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 4.609571788413098, | |
| "grad_norm": 792662.5, | |
| "learning_rate": 6.836991723641598e-05, | |
| "loss": 0.0678, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 4.634760705289673, | |
| "grad_norm": 70302.1484375, | |
| "learning_rate": 6.765023389708528e-05, | |
| "loss": 0.0369, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 4.659949622166247, | |
| "grad_norm": 315541.75, | |
| "learning_rate": 6.693055055775459e-05, | |
| "loss": 0.086, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 4.685138539042821, | |
| "grad_norm": 426322.5625, | |
| "learning_rate": 6.62108672184239e-05, | |
| "loss": 0.1367, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 4.710327455919396, | |
| "grad_norm": 5201.7265625, | |
| "learning_rate": 6.54911838790932e-05, | |
| "loss": 0.0518, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 4.73551637279597, | |
| "grad_norm": 16552.916015625, | |
| "learning_rate": 6.47715005397625e-05, | |
| "loss": 0.0857, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 4.760705289672544, | |
| "grad_norm": 74746.0234375, | |
| "learning_rate": 6.405181720043182e-05, | |
| "loss": 0.178, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 4.785894206549118, | |
| "grad_norm": 30216.1328125, | |
| "learning_rate": 6.333213386110111e-05, | |
| "loss": 0.0758, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 4.785894206549118, | |
| "eval_accuracy": 0.5546875, | |
| "eval_f1_macro": 0.4578418057053371, | |
| "eval_loss": 3.2782468795776367, | |
| "eval_runtime": 11.4304, | |
| "eval_samples_per_second": 89.586, | |
| "eval_steps_per_second": 5.599, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 4.811083123425693, | |
| "grad_norm": 122862.0703125, | |
| "learning_rate": 6.261245052177043e-05, | |
| "loss": 0.1008, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 4.836272040302267, | |
| "grad_norm": 355039.96875, | |
| "learning_rate": 6.189276718243972e-05, | |
| "loss": 0.0619, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 4.861460957178841, | |
| "grad_norm": 180546.546875, | |
| "learning_rate": 6.117308384310904e-05, | |
| "loss": 0.212, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 4.886649874055416, | |
| "grad_norm": 1149145.875, | |
| "learning_rate": 6.045340050377835e-05, | |
| "loss": 0.1821, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 4.91183879093199, | |
| "grad_norm": 197082.046875, | |
| "learning_rate": 5.9733717164447645e-05, | |
| "loss": 0.0539, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 4.937027707808564, | |
| "grad_norm": 361.24676513671875, | |
| "learning_rate": 5.901403382511696e-05, | |
| "loss": 0.1526, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 4.962216624685139, | |
| "grad_norm": 362799.34375, | |
| "learning_rate": 5.8294350485786255e-05, | |
| "loss": 0.0838, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 4.987405541561713, | |
| "grad_norm": 4683.50732421875, | |
| "learning_rate": 5.757466714645556e-05, | |
| "loss": 0.0995, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 5.012594458438287, | |
| "grad_norm": 5118.85986328125, | |
| "learning_rate": 5.685498380712487e-05, | |
| "loss": 0.0488, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 5.037783375314861, | |
| "grad_norm": 6831.2880859375, | |
| "learning_rate": 5.613530046779417e-05, | |
| "loss": 0.0471, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 5.037783375314861, | |
| "eval_accuracy": 0.5517578125, | |
| "eval_f1_macro": 0.4725336026660133, | |
| "eval_loss": 3.334784507751465, | |
| "eval_runtime": 11.4554, | |
| "eval_samples_per_second": 89.39, | |
| "eval_steps_per_second": 5.587, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 5.062972292191436, | |
| "grad_norm": 88275.8359375, | |
| "learning_rate": 5.541561712846348e-05, | |
| "loss": 0.0503, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 5.08816120906801, | |
| "grad_norm": 1161589.25, | |
| "learning_rate": 5.469593378913278e-05, | |
| "loss": 0.0326, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 5.113350125944584, | |
| "grad_norm": 263008.125, | |
| "learning_rate": 5.397625044980209e-05, | |
| "loss": 0.103, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 5.138539042821159, | |
| "grad_norm": 2268.762939453125, | |
| "learning_rate": 5.32565671104714e-05, | |
| "loss": 0.0093, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 5.163727959697733, | |
| "grad_norm": 530844.6875, | |
| "learning_rate": 5.25368837711407e-05, | |
| "loss": 0.0919, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 5.188916876574307, | |
| "grad_norm": 181022.703125, | |
| "learning_rate": 5.1817200431810006e-05, | |
| "loss": 0.049, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 5.214105793450882, | |
| "grad_norm": 1360.029541015625, | |
| "learning_rate": 5.109751709247931e-05, | |
| "loss": 0.0656, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 5.239294710327456, | |
| "grad_norm": 541578.0625, | |
| "learning_rate": 5.0377833753148616e-05, | |
| "loss": 0.0287, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 5.26448362720403, | |
| "grad_norm": 8820.35546875, | |
| "learning_rate": 4.965815041381792e-05, | |
| "loss": 0.0252, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 5.289672544080605, | |
| "grad_norm": 50239.13671875, | |
| "learning_rate": 4.8938467074487226e-05, | |
| "loss": 0.0512, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 5.289672544080605, | |
| "eval_accuracy": 0.5283203125, | |
| "eval_f1_macro": 0.45143425422893263, | |
| "eval_loss": 3.718236207962036, | |
| "eval_runtime": 11.3333, | |
| "eval_samples_per_second": 90.353, | |
| "eval_steps_per_second": 5.647, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 5.314861460957179, | |
| "grad_norm": 1060627.125, | |
| "learning_rate": 4.821878373515653e-05, | |
| "loss": 0.0412, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 5.340050377833753, | |
| "grad_norm": 258190.28125, | |
| "learning_rate": 4.749910039582584e-05, | |
| "loss": 0.1177, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 5.365239294710327, | |
| "grad_norm": 126000.2265625, | |
| "learning_rate": 4.677941705649515e-05, | |
| "loss": 0.0049, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 5.390428211586902, | |
| "grad_norm": 2938.684814453125, | |
| "learning_rate": 4.605973371716445e-05, | |
| "loss": 0.1016, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 5.415617128463476, | |
| "grad_norm": 191.32302856445312, | |
| "learning_rate": 4.534005037783376e-05, | |
| "loss": 0.0051, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 5.44080604534005, | |
| "grad_norm": 70.82958984375, | |
| "learning_rate": 4.4620367038503055e-05, | |
| "loss": 0.0335, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 5.465994962216625, | |
| "grad_norm": 2224.07568359375, | |
| "learning_rate": 4.390068369917237e-05, | |
| "loss": 0.0019, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 5.491183879093199, | |
| "grad_norm": 7065.70703125, | |
| "learning_rate": 4.318100035984167e-05, | |
| "loss": 0.0489, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 5.516372795969773, | |
| "grad_norm": 218.8759765625, | |
| "learning_rate": 4.246131702051098e-05, | |
| "loss": 0.0761, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 5.541561712846348, | |
| "grad_norm": 2789.82763671875, | |
| "learning_rate": 4.174163368118028e-05, | |
| "loss": 0.0095, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 5.541561712846348, | |
| "eval_accuracy": 0.5341796875, | |
| "eval_f1_macro": 0.478510457034419, | |
| "eval_loss": 3.902801036834717, | |
| "eval_runtime": 11.7659, | |
| "eval_samples_per_second": 87.031, | |
| "eval_steps_per_second": 5.439, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 5.566750629722922, | |
| "grad_norm": 33521.52734375, | |
| "learning_rate": 4.1021950341849587e-05, | |
| "loss": 0.0445, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 5.591939546599496, | |
| "grad_norm": 339091.78125, | |
| "learning_rate": 4.03022670025189e-05, | |
| "loss": 0.0717, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 5.617128463476071, | |
| "grad_norm": 570797.6875, | |
| "learning_rate": 3.9582583663188196e-05, | |
| "loss": 0.0664, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 5.642317380352645, | |
| "grad_norm": 638298.25, | |
| "learning_rate": 3.88629003238575e-05, | |
| "loss": 0.0152, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 5.667506297229219, | |
| "grad_norm": 1308.215087890625, | |
| "learning_rate": 3.8143216984526806e-05, | |
| "loss": 0.0029, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 5.692695214105793, | |
| "grad_norm": 47.38713836669922, | |
| "learning_rate": 3.742353364519612e-05, | |
| "loss": 0.0199, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 5.717884130982368, | |
| "grad_norm": 56138.89453125, | |
| "learning_rate": 3.670385030586542e-05, | |
| "loss": 0.0748, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 5.7430730478589425, | |
| "grad_norm": 49.50386047363281, | |
| "learning_rate": 3.598416696653473e-05, | |
| "loss": 0.0174, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 5.768261964735516, | |
| "grad_norm": 7627.2783203125, | |
| "learning_rate": 3.526448362720403e-05, | |
| "loss": 0.002, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 5.793450881612091, | |
| "grad_norm": 123.52435302734375, | |
| "learning_rate": 3.454480028787334e-05, | |
| "loss": 0.0247, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 5.793450881612091, | |
| "eval_accuracy": 0.5712890625, | |
| "eval_f1_macro": 0.48791698609028533, | |
| "eval_loss": 3.960580348968506, | |
| "eval_runtime": 11.8204, | |
| "eval_samples_per_second": 86.63, | |
| "eval_steps_per_second": 5.414, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 5.818639798488665, | |
| "grad_norm": 534.7195434570312, | |
| "learning_rate": 3.382511694854264e-05, | |
| "loss": 0.0496, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 5.843828715365239, | |
| "grad_norm": 17011.57421875, | |
| "learning_rate": 3.310543360921195e-05, | |
| "loss": 0.1476, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 5.869017632241814, | |
| "grad_norm": 386354.28125, | |
| "learning_rate": 3.238575026988125e-05, | |
| "loss": 0.0593, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 5.894206549118388, | |
| "grad_norm": 102287.3828125, | |
| "learning_rate": 3.166606693055056e-05, | |
| "loss": 0.0117, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 5.919395465994962, | |
| "grad_norm": 29.17066764831543, | |
| "learning_rate": 3.094638359121986e-05, | |
| "loss": 0.0702, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 5.944584382871536, | |
| "grad_norm": 1611.7230224609375, | |
| "learning_rate": 3.0226700251889174e-05, | |
| "loss": 0.0687, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 5.969773299748111, | |
| "grad_norm": 168.02320861816406, | |
| "learning_rate": 2.950701691255848e-05, | |
| "loss": 0.0872, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 5.994962216624685, | |
| "grad_norm": 1058.51904296875, | |
| "learning_rate": 2.878733357322778e-05, | |
| "loss": 0.0008, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 6.020151133501259, | |
| "grad_norm": 23.99247932434082, | |
| "learning_rate": 2.8067650233897085e-05, | |
| "loss": 0.0526, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 6.045340050377834, | |
| "grad_norm": 2630.893798828125, | |
| "learning_rate": 2.734796689456639e-05, | |
| "loss": 0.0008, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 6.045340050377834, | |
| "eval_accuracy": 0.5654296875, | |
| "eval_f1_macro": 0.49182304765519874, | |
| "eval_loss": 4.129029750823975, | |
| "eval_runtime": 11.7471, | |
| "eval_samples_per_second": 87.17, | |
| "eval_steps_per_second": 5.448, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 6.0705289672544085, | |
| "grad_norm": 3671.589111328125, | |
| "learning_rate": 2.66282835552357e-05, | |
| "loss": 0.0009, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 6.095717884130982, | |
| "grad_norm": 246.3783721923828, | |
| "learning_rate": 2.5908600215905003e-05, | |
| "loss": 0.0279, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 6.120906801007557, | |
| "grad_norm": 250481.90625, | |
| "learning_rate": 2.5188916876574308e-05, | |
| "loss": 0.0367, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 6.146095717884131, | |
| "grad_norm": 5997.31396484375, | |
| "learning_rate": 2.4469233537243613e-05, | |
| "loss": 0.0002, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 6.171284634760705, | |
| "grad_norm": 294020.6875, | |
| "learning_rate": 2.374955019791292e-05, | |
| "loss": 0.0447, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 6.19647355163728, | |
| "grad_norm": 5173.7607421875, | |
| "learning_rate": 2.3029866858582226e-05, | |
| "loss": 0.0006, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 6.221662468513854, | |
| "grad_norm": 4.966667175292969, | |
| "learning_rate": 2.2310183519251528e-05, | |
| "loss": 0.0394, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 6.246851385390428, | |
| "grad_norm": 328896.6875, | |
| "learning_rate": 2.1590500179920836e-05, | |
| "loss": 0.0725, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 6.272040302267002, | |
| "grad_norm": 855.643798828125, | |
| "learning_rate": 2.087081684059014e-05, | |
| "loss": 0.0001, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 6.297229219143577, | |
| "grad_norm": 18.617643356323242, | |
| "learning_rate": 2.015113350125945e-05, | |
| "loss": 0.0024, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 6.297229219143577, | |
| "eval_accuracy": 0.5654296875, | |
| "eval_f1_macro": 0.4862593826724424, | |
| "eval_loss": 4.414713382720947, | |
| "eval_runtime": 11.3987, | |
| "eval_samples_per_second": 89.835, | |
| "eval_steps_per_second": 5.615, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 6.3224181360201515, | |
| "grad_norm": 43.19185256958008, | |
| "learning_rate": 1.943145016192875e-05, | |
| "loss": 0.005, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 6.347607052896725, | |
| "grad_norm": 636742.8125, | |
| "learning_rate": 1.871176682259806e-05, | |
| "loss": 0.0187, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 6.3727959697733, | |
| "grad_norm": 14.576433181762695, | |
| "learning_rate": 1.7992083483267364e-05, | |
| "loss": 0.0008, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 6.3979848866498745, | |
| "grad_norm": 22.115917205810547, | |
| "learning_rate": 1.727240014393667e-05, | |
| "loss": 0.0038, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 6.423173803526448, | |
| "grad_norm": 618.1704711914062, | |
| "learning_rate": 1.6552716804605974e-05, | |
| "loss": 0.0685, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 6.448362720403023, | |
| "grad_norm": 279531.71875, | |
| "learning_rate": 1.583303346527528e-05, | |
| "loss": 0.0035, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 6.473551637279597, | |
| "grad_norm": 265.0247802734375, | |
| "learning_rate": 1.5113350125944587e-05, | |
| "loss": 0.0245, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 6.498740554156171, | |
| "grad_norm": 635805.875, | |
| "learning_rate": 1.439366678661389e-05, | |
| "loss": 0.0052, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 6.523929471032746, | |
| "grad_norm": 8.843326568603516, | |
| "learning_rate": 1.3673983447283195e-05, | |
| "loss": 0.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 6.54911838790932, | |
| "grad_norm": 87.60523986816406, | |
| "learning_rate": 1.2954300107952502e-05, | |
| "loss": 0.0002, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 6.54911838790932, | |
| "eval_accuracy": 0.5654296875, | |
| "eval_f1_macro": 0.4913330578351924, | |
| "eval_loss": 4.520939826965332, | |
| "eval_runtime": 11.8845, | |
| "eval_samples_per_second": 86.163, | |
| "eval_steps_per_second": 5.385, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 6.574307304785894, | |
| "grad_norm": 8.957763671875, | |
| "learning_rate": 1.2234616768621806e-05, | |
| "loss": 0.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 6.599496221662468, | |
| "grad_norm": 6.1030144691467285, | |
| "learning_rate": 1.1514933429291113e-05, | |
| "loss": 0.0009, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 6.624685138539043, | |
| "grad_norm": 4.862893581390381, | |
| "learning_rate": 1.0795250089960418e-05, | |
| "loss": 0.1231, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 6.6498740554156175, | |
| "grad_norm": 598.0420532226562, | |
| "learning_rate": 1.0075566750629725e-05, | |
| "loss": 0.0001, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 6.675062972292191, | |
| "grad_norm": 5.682479381561279, | |
| "learning_rate": 9.35588341129903e-06, | |
| "loss": 0.0365, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 6.700251889168766, | |
| "grad_norm": 1.800890564918518, | |
| "learning_rate": 8.636200071968334e-06, | |
| "loss": 0.133, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 6.72544080604534, | |
| "grad_norm": 718.6954345703125, | |
| "learning_rate": 7.91651673263764e-06, | |
| "loss": 0.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 6.750629722921914, | |
| "grad_norm": 338166.46875, | |
| "learning_rate": 7.196833393306945e-06, | |
| "loss": 0.034, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 6.775818639798489, | |
| "grad_norm": 471.62005615234375, | |
| "learning_rate": 6.477150053976251e-06, | |
| "loss": 0.0599, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 6.801007556675063, | |
| "grad_norm": 308.09417724609375, | |
| "learning_rate": 5.7574667146455565e-06, | |
| "loss": 0.0055, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 6.801007556675063, | |
| "eval_accuracy": 0.58203125, | |
| "eval_f1_macro": 0.5067161880167751, | |
| "eval_loss": 4.515384197235107, | |
| "eval_runtime": 11.8883, | |
| "eval_samples_per_second": 86.135, | |
| "eval_steps_per_second": 5.383, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 6.826196473551637, | |
| "grad_norm": 407.3280334472656, | |
| "learning_rate": 5.037783375314862e-06, | |
| "loss": 0.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 6.851385390428211, | |
| "grad_norm": 396.4019470214844, | |
| "learning_rate": 4.318100035984167e-06, | |
| "loss": 0.0015, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 6.876574307304786, | |
| "grad_norm": 2.2462317943573, | |
| "learning_rate": 3.5984166966534725e-06, | |
| "loss": 0.0007, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 6.9017632241813605, | |
| "grad_norm": 40.78224182128906, | |
| "learning_rate": 2.8787333573227783e-06, | |
| "loss": 0.0001, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 6.926952141057934, | |
| "grad_norm": 11996.0771484375, | |
| "learning_rate": 2.1590500179920836e-06, | |
| "loss": 0.0002, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 6.952141057934509, | |
| "grad_norm": 2.2435834407806396, | |
| "learning_rate": 1.4393666786613891e-06, | |
| "loss": 0.0001, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 6.977329974811083, | |
| "grad_norm": 2109.901123046875, | |
| "learning_rate": 7.196833393306946e-07, | |
| "loss": 0.0832, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "step": 2779, | |
| "total_flos": 2.5585840915697664e+18, | |
| "train_loss": 0.4836694428009911, | |
| "train_runtime": 1538.4023, | |
| "train_samples_per_second": 28.894, | |
| "train_steps_per_second": 1.806 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2779, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 7, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.5585840915697664e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |