| { |
| "best_global_step": 2000, |
| "best_metric": 1.5096291303634644, |
| "best_model_checkpoint": "hieptt/vietnamese-correction-ft/checkpoint-2000", |
| "epoch": 0.04837227301310889, |
| "eval_steps": 1000, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 2.4186136506554445e-05, |
| "grad_norm": 6.7611775398254395, |
| "learning_rate": 0.0, |
| "loss": 4.8761, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.00024186136506554442, |
| "grad_norm": 7.38713264465332, |
| "learning_rate": 4.5e-07, |
| "loss": 4.9047, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.00048372273013108885, |
| "grad_norm": 9.883085250854492, |
| "learning_rate": 9.5e-07, |
| "loss": 4.7161, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0007255840951966333, |
| "grad_norm": 12.030326843261719, |
| "learning_rate": 1.45e-06, |
| "loss": 4.3936, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0009674454602621777, |
| "grad_norm": 13.895113945007324, |
| "learning_rate": 1.95e-06, |
| "loss": 4.0292, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0012093068253277222, |
| "grad_norm": 14.019319534301758, |
| "learning_rate": 2.4500000000000003e-06, |
| "loss": 3.5471, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0014511681903932665, |
| "grad_norm": 4.721616744995117, |
| "learning_rate": 2.95e-06, |
| "loss": 2.88, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.001693029555458811, |
| "grad_norm": 2.526538372039795, |
| "learning_rate": 3.4500000000000004e-06, |
| "loss": 2.3633, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.0019348909205243554, |
| "grad_norm": 1.627066969871521, |
| "learning_rate": 3.95e-06, |
| "loss": 2.0775, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.0021767522855899, |
| "grad_norm": 0.8922048807144165, |
| "learning_rate": 4.45e-06, |
| "loss": 1.9554, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0024186136506554445, |
| "grad_norm": 0.8262912631034851, |
| "learning_rate": 4.950000000000001e-06, |
| "loss": 1.887, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0026604750157209886, |
| "grad_norm": 0.8611723184585571, |
| "learning_rate": 5.45e-06, |
| "loss": 1.8405, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.002902336380786533, |
| "grad_norm": 0.7151685357093811, |
| "learning_rate": 5.95e-06, |
| "loss": 1.8095, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.0031441977458520776, |
| "grad_norm": 0.6910758018493652, |
| "learning_rate": 6.45e-06, |
| "loss": 1.7881, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.003386059110917622, |
| "grad_norm": 0.7488411664962769, |
| "learning_rate": 6.950000000000001e-06, |
| "loss": 1.7647, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.0036279204759831663, |
| "grad_norm": 0.6839690208435059, |
| "learning_rate": 7.45e-06, |
| "loss": 1.7577, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.003869781841048711, |
| "grad_norm": 0.7202065587043762, |
| "learning_rate": 7.95e-06, |
| "loss": 1.7499, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.004111643206114255, |
| "grad_norm": 0.6052983403205872, |
| "learning_rate": 8.45e-06, |
| "loss": 1.741, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0043535045711798, |
| "grad_norm": 0.7364121079444885, |
| "learning_rate": 8.95e-06, |
| "loss": 1.7283, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.004595365936245344, |
| "grad_norm": 0.7102776169776917, |
| "learning_rate": 9.450000000000001e-06, |
| "loss": 1.707, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.004837227301310889, |
| "grad_norm": 0.6977587342262268, |
| "learning_rate": 9.950000000000001e-06, |
| "loss": 1.706, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.005079088666376433, |
| "grad_norm": 0.6137486100196838, |
| "learning_rate": 1.045e-05, |
| "loss": 1.7065, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.005320950031441977, |
| "grad_norm": 0.5842403769493103, |
| "learning_rate": 1.095e-05, |
| "loss": 1.6961, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.005562811396507522, |
| "grad_norm": 0.6366673111915588, |
| "learning_rate": 1.145e-05, |
| "loss": 1.6822, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.005804672761573066, |
| "grad_norm": 0.5458840131759644, |
| "learning_rate": 1.195e-05, |
| "loss": 1.6794, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.006046534126638611, |
| "grad_norm": 0.5614727735519409, |
| "learning_rate": 1.2450000000000001e-05, |
| "loss": 1.6587, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.006288395491704155, |
| "grad_norm": 0.6590563058853149, |
| "learning_rate": 1.2950000000000001e-05, |
| "loss": 1.6756, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.006530256856769699, |
| "grad_norm": 0.6136428713798523, |
| "learning_rate": 1.3450000000000002e-05, |
| "loss": 1.6723, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.006772118221835244, |
| "grad_norm": 0.6550738215446472, |
| "learning_rate": 1.3950000000000002e-05, |
| "loss": 1.6605, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.007013979586900788, |
| "grad_norm": 0.5237274765968323, |
| "learning_rate": 1.4449999999999999e-05, |
| "loss": 1.6496, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.0072558409519663325, |
| "grad_norm": 0.6175047159194946, |
| "learning_rate": 1.4950000000000001e-05, |
| "loss": 1.6441, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.0074977023170318775, |
| "grad_norm": 0.5207669138908386, |
| "learning_rate": 1.545e-05, |
| "loss": 1.6478, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.007739563682097422, |
| "grad_norm": 0.4118206799030304, |
| "learning_rate": 1.595e-05, |
| "loss": 1.6405, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.007981425047162967, |
| "grad_norm": 0.5392739772796631, |
| "learning_rate": 1.645e-05, |
| "loss": 1.6441, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.00822328641222851, |
| "grad_norm": 0.5163934826850891, |
| "learning_rate": 1.6950000000000002e-05, |
| "loss": 1.6401, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.008465147777294055, |
| "grad_norm": 0.5002100467681885, |
| "learning_rate": 1.745e-05, |
| "loss": 1.6385, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.0087070091423596, |
| "grad_norm": 0.6193915009498596, |
| "learning_rate": 1.795e-05, |
| "loss": 1.6222, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.008948870507425145, |
| "grad_norm": 0.45196324586868286, |
| "learning_rate": 1.845e-05, |
| "loss": 1.6267, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.009190731872490688, |
| "grad_norm": 0.45798471570014954, |
| "learning_rate": 1.895e-05, |
| "loss": 1.6265, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.009432593237556233, |
| "grad_norm": 0.5480896234512329, |
| "learning_rate": 1.9450000000000002e-05, |
| "loss": 1.6208, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.009674454602621778, |
| "grad_norm": 0.4133257567882538, |
| "learning_rate": 1.995e-05, |
| "loss": 1.6137, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.009916315967687321, |
| "grad_norm": 0.45199403166770935, |
| "learning_rate": 2.045e-05, |
| "loss": 1.6196, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.010158177332752866, |
| "grad_norm": 0.5875248312950134, |
| "learning_rate": 2.095e-05, |
| "loss": 1.6049, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.010400038697818411, |
| "grad_norm": 0.6048296093940735, |
| "learning_rate": 2.145e-05, |
| "loss": 1.6083, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.010641900062883954, |
| "grad_norm": 0.5220174193382263, |
| "learning_rate": 2.195e-05, |
| "loss": 1.6074, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.0108837614279495, |
| "grad_norm": 0.5288923978805542, |
| "learning_rate": 2.245e-05, |
| "loss": 1.6033, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.011125622793015044, |
| "grad_norm": 0.5983248353004456, |
| "learning_rate": 2.2950000000000002e-05, |
| "loss": 1.5976, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.011367484158080587, |
| "grad_norm": 0.5582709312438965, |
| "learning_rate": 2.345e-05, |
| "loss": 1.5982, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.011609345523146132, |
| "grad_norm": 0.43671730160713196, |
| "learning_rate": 2.395e-05, |
| "loss": 1.5983, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.011851206888211677, |
| "grad_norm": 0.49004241824150085, |
| "learning_rate": 2.445e-05, |
| "loss": 1.5916, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.012093068253277222, |
| "grad_norm": 0.41542279720306396, |
| "learning_rate": 2.495e-05, |
| "loss": 1.5944, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.012334929618342766, |
| "grad_norm": 0.42180851101875305, |
| "learning_rate": 2.5450000000000002e-05, |
| "loss": 1.5958, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.01257679098340831, |
| "grad_norm": 0.5838198661804199, |
| "learning_rate": 2.595e-05, |
| "loss": 1.5945, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.012818652348473855, |
| "grad_norm": 0.4461694359779358, |
| "learning_rate": 2.6450000000000003e-05, |
| "loss": 1.5975, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.013060513713539399, |
| "grad_norm": 0.445316344499588, |
| "learning_rate": 2.6950000000000005e-05, |
| "loss": 1.586, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.013302375078604944, |
| "grad_norm": 0.3972742557525635, |
| "learning_rate": 2.7450000000000003e-05, |
| "loss": 1.591, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.013544236443670489, |
| "grad_norm": 0.4089615046977997, |
| "learning_rate": 2.7950000000000005e-05, |
| "loss": 1.5944, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.013786097808736032, |
| "grad_norm": 0.42213934659957886, |
| "learning_rate": 2.845e-05, |
| "loss": 1.5924, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.014027959173801577, |
| "grad_norm": 0.5217621326446533, |
| "learning_rate": 2.895e-05, |
| "loss": 1.5837, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.014269820538867122, |
| "grad_norm": 0.601890504360199, |
| "learning_rate": 2.945e-05, |
| "loss": 1.5856, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.014511681903932665, |
| "grad_norm": 0.4616837501525879, |
| "learning_rate": 2.995e-05, |
| "loss": 1.5796, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.01475354326899821, |
| "grad_norm": 0.5522668957710266, |
| "learning_rate": 3.045e-05, |
| "loss": 1.5768, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.014995404634063755, |
| "grad_norm": 0.5148155093193054, |
| "learning_rate": 3.095e-05, |
| "loss": 1.5797, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.015237265999129298, |
| "grad_norm": 0.444976270198822, |
| "learning_rate": 3.145e-05, |
| "loss": 1.5811, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.015479127364194843, |
| "grad_norm": 0.4654428958892822, |
| "learning_rate": 3.1950000000000004e-05, |
| "loss": 1.5825, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.015720988729260386, |
| "grad_norm": 0.4910729229450226, |
| "learning_rate": 3.245e-05, |
| "loss": 1.5678, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.015962850094325933, |
| "grad_norm": 0.42272791266441345, |
| "learning_rate": 3.295e-05, |
| "loss": 1.5799, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.016204711459391476, |
| "grad_norm": 0.5058956146240234, |
| "learning_rate": 3.345000000000001e-05, |
| "loss": 1.5821, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.01644657282445702, |
| "grad_norm": 0.4395345151424408, |
| "learning_rate": 3.3950000000000005e-05, |
| "loss": 1.5787, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.016688434189522566, |
| "grad_norm": 0.3926360607147217, |
| "learning_rate": 3.445e-05, |
| "loss": 1.5763, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.01693029555458811, |
| "grad_norm": 0.3995387554168701, |
| "learning_rate": 3.495e-05, |
| "loss": 1.5719, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.017172156919653656, |
| "grad_norm": 0.48855945467948914, |
| "learning_rate": 3.545e-05, |
| "loss": 1.5754, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.0174140182847192, |
| "grad_norm": 0.7298448085784912, |
| "learning_rate": 3.595e-05, |
| "loss": 1.5686, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.017655879649784743, |
| "grad_norm": 0.5547767281532288, |
| "learning_rate": 3.645e-05, |
| "loss": 1.5784, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.01789774101485029, |
| "grad_norm": 0.46988898515701294, |
| "learning_rate": 3.6950000000000004e-05, |
| "loss": 1.5653, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.018139602379915833, |
| "grad_norm": 0.48516085743904114, |
| "learning_rate": 3.745e-05, |
| "loss": 1.5718, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.018381463744981376, |
| "grad_norm": 0.6289165019989014, |
| "learning_rate": 3.795e-05, |
| "loss": 1.5604, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.018623325110046923, |
| "grad_norm": 0.5190830826759338, |
| "learning_rate": 3.845e-05, |
| "loss": 1.5656, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.018865186475112466, |
| "grad_norm": 0.5255008935928345, |
| "learning_rate": 3.8950000000000005e-05, |
| "loss": 1.5719, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.01910704784017801, |
| "grad_norm": 0.5320749878883362, |
| "learning_rate": 3.9450000000000003e-05, |
| "loss": 1.5645, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.019348909205243556, |
| "grad_norm": 0.5073422193527222, |
| "learning_rate": 3.995e-05, |
| "loss": 1.5757, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.0195907705703091, |
| "grad_norm": 0.5081677436828613, |
| "learning_rate": 4.045000000000001e-05, |
| "loss": 1.5618, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.019832631935374642, |
| "grad_norm": 0.4249745309352875, |
| "learning_rate": 4.095e-05, |
| "loss": 1.5565, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.02007449330044019, |
| "grad_norm": 0.4423615634441376, |
| "learning_rate": 4.145e-05, |
| "loss": 1.5645, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.020316354665505732, |
| "grad_norm": 0.5799271464347839, |
| "learning_rate": 4.195e-05, |
| "loss": 1.5584, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.020558216030571275, |
| "grad_norm": 0.4171762466430664, |
| "learning_rate": 4.245e-05, |
| "loss": 1.5527, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.020800077395636822, |
| "grad_norm": 0.418618768453598, |
| "learning_rate": 4.295e-05, |
| "loss": 1.5571, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.021041938760702365, |
| "grad_norm": 0.4215683937072754, |
| "learning_rate": 4.345e-05, |
| "loss": 1.5569, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.02128380012576791, |
| "grad_norm": 0.4244352877140045, |
| "learning_rate": 4.3950000000000004e-05, |
| "loss": 1.5581, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.021525661490833455, |
| "grad_norm": 0.5420682430267334, |
| "learning_rate": 4.445e-05, |
| "loss": 1.5539, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.021767522855899, |
| "grad_norm": 0.37200865149497986, |
| "learning_rate": 4.495e-05, |
| "loss": 1.5503, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.02200938422096454, |
| "grad_norm": 0.46333831548690796, |
| "learning_rate": 4.545000000000001e-05, |
| "loss": 1.5514, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.02225124558603009, |
| "grad_norm": 0.4164145290851593, |
| "learning_rate": 4.5950000000000006e-05, |
| "loss": 1.5466, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.02249310695109563, |
| "grad_norm": 0.3915160596370697, |
| "learning_rate": 4.6450000000000004e-05, |
| "loss": 1.5373, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.022734968316161175, |
| "grad_norm": 0.5574844479560852, |
| "learning_rate": 4.695e-05, |
| "loss": 1.5484, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.02297682968122672, |
| "grad_norm": 0.48569104075431824, |
| "learning_rate": 4.745e-05, |
| "loss": 1.5494, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.023218691046292265, |
| "grad_norm": 0.5068873167037964, |
| "learning_rate": 4.795e-05, |
| "loss": 1.5598, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.023460552411357808, |
| "grad_norm": 0.680260419845581, |
| "learning_rate": 4.845e-05, |
| "loss": 1.5707, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.023702413776423355, |
| "grad_norm": 0.41554513573646545, |
| "learning_rate": 4.8950000000000004e-05, |
| "loss": 1.5526, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.023944275141488898, |
| "grad_norm": 0.4825437366962433, |
| "learning_rate": 4.945e-05, |
| "loss": 1.5479, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.024186136506554445, |
| "grad_norm": 0.42075395584106445, |
| "learning_rate": 4.995e-05, |
| "loss": 1.5491, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.024186136506554445, |
| "eval_loss": 1.5370113849639893, |
| "eval_runtime": 1410.8458, |
| "eval_sacrebleu": 96.55473159449107, |
| "eval_samples_per_second": 71.621, |
| "eval_steps_per_second": 0.56, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.024427997871619988, |
| "grad_norm": 0.48966652154922485, |
| "learning_rate": 4.999999583799493e-05, |
| "loss": 1.5447, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.02466985923668553, |
| "grad_norm": 0.6714717149734497, |
| "learning_rate": 4.999998145081868e-05, |
| "loss": 1.5504, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.024911720601751078, |
| "grad_norm": 0.7765893340110779, |
| "learning_rate": 4.999995678709439e-05, |
| "loss": 1.5565, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.02515358196681662, |
| "grad_norm": 0.4742671549320221, |
| "learning_rate": 4.999992184683219e-05, |
| "loss": 1.5453, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.025395443331882164, |
| "grad_norm": 0.5306811332702637, |
| "learning_rate": 4.999987663004646e-05, |
| "loss": 1.5512, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.02563730469694771, |
| "grad_norm": 0.43038302659988403, |
| "learning_rate": 4.9999821136755766e-05, |
| "loss": 1.5513, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.025879166062013254, |
| "grad_norm": 0.788059413433075, |
| "learning_rate": 4.9999755366982925e-05, |
| "loss": 1.5326, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.026121027427078797, |
| "grad_norm": 0.4768883287906647, |
| "learning_rate": 4.999967932075499e-05, |
| "loss": 1.5526, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.026362888792144344, |
| "grad_norm": 0.383400559425354, |
| "learning_rate": 4.99995929981032e-05, |
| "loss": 1.5518, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.026604750157209887, |
| "grad_norm": 0.5224942564964294, |
| "learning_rate": 4.999949639906304e-05, |
| "loss": 1.5495, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.02684661152227543, |
| "grad_norm": 0.4375554025173187, |
| "learning_rate": 4.999938952367422e-05, |
| "loss": 1.5521, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.027088472887340977, |
| "grad_norm": 0.44675424695014954, |
| "learning_rate": 4.999927237198069e-05, |
| "loss": 1.5475, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.02733033425240652, |
| "grad_norm": 0.5646783709526062, |
| "learning_rate": 4.999914494403059e-05, |
| "loss": 1.539, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.027572195617472064, |
| "grad_norm": 0.5079995393753052, |
| "learning_rate": 4.9999007239876294e-05, |
| "loss": 1.5437, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.02781405698253761, |
| "grad_norm": 0.4094880223274231, |
| "learning_rate": 4.999885925957443e-05, |
| "loss": 1.5354, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.028055918347603154, |
| "grad_norm": 0.4403417408466339, |
| "learning_rate": 4.99987010031858e-05, |
| "loss": 1.5445, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.028297779712668697, |
| "grad_norm": 0.3404127061367035, |
| "learning_rate": 4.9998532470775484e-05, |
| "loss": 1.5321, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.028539641077734244, |
| "grad_norm": 0.4042949378490448, |
| "learning_rate": 4.999835366241274e-05, |
| "loss": 1.5442, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.028781502442799787, |
| "grad_norm": 0.3902073800563812, |
| "learning_rate": 4.9998164578171076e-05, |
| "loss": 1.5358, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.02902336380786533, |
| "grad_norm": 0.4594404399394989, |
| "learning_rate": 4.999796521812822e-05, |
| "loss": 1.5282, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.029265225172930877, |
| "grad_norm": 0.4223099648952484, |
| "learning_rate": 4.999775558236611e-05, |
| "loss": 1.5388, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.02950708653799642, |
| "grad_norm": 0.6008235812187195, |
| "learning_rate": 4.999753567097094e-05, |
| "loss": 1.5392, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.029748947903061963, |
| "grad_norm": 0.5003873705863953, |
| "learning_rate": 4.9997305484033085e-05, |
| "loss": 1.5434, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.02999080926812751, |
| "grad_norm": 0.5244422554969788, |
| "learning_rate": 4.999706502164718e-05, |
| "loss": 1.5481, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.030232670633193053, |
| "grad_norm": 0.36595821380615234, |
| "learning_rate": 4.999681428391207e-05, |
| "loss": 1.544, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.030474531998258596, |
| "grad_norm": 0.5237463116645813, |
| "learning_rate": 4.999655327093081e-05, |
| "loss": 1.5377, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.030716393363324143, |
| "grad_norm": 0.4382268190383911, |
| "learning_rate": 4.999628198281072e-05, |
| "loss": 1.5382, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.030958254728389686, |
| "grad_norm": 0.5116040706634521, |
| "learning_rate": 4.999600041966328e-05, |
| "loss": 1.5383, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.031200116093455233, |
| "grad_norm": 0.3517632782459259, |
| "learning_rate": 4.999570858160426e-05, |
| "loss": 1.5284, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.03144197745852077, |
| "grad_norm": 0.46076980233192444, |
| "learning_rate": 4.999540646875361e-05, |
| "loss": 1.5347, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.03168383882358632, |
| "grad_norm": 0.6168367266654968, |
| "learning_rate": 4.9995094081235524e-05, |
| "loss": 1.5387, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.031925700188651866, |
| "grad_norm": 0.40505921840667725, |
| "learning_rate": 4.9994771419178396e-05, |
| "loss": 1.5375, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.03216756155371741, |
| "grad_norm": 0.4371592104434967, |
| "learning_rate": 4.999443848271489e-05, |
| "loss": 1.5363, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.03240942291878295, |
| "grad_norm": 0.518997311592102, |
| "learning_rate": 4.9994095271981835e-05, |
| "loss": 1.5434, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.032651284283848496, |
| "grad_norm": 0.8396134972572327, |
| "learning_rate": 4.999374178712032e-05, |
| "loss": 1.5324, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.03289314564891404, |
| "grad_norm": 0.41988566517829895, |
| "learning_rate": 4.999337802827566e-05, |
| "loss": 1.5314, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.03313500701397959, |
| "grad_norm": 0.3672787845134735, |
| "learning_rate": 4.999300399559738e-05, |
| "loss": 1.525, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.03337686837904513, |
| "grad_norm": 0.4160480499267578, |
| "learning_rate": 4.999261968923922e-05, |
| "loss": 1.5298, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.033618729744110676, |
| "grad_norm": 0.5236791372299194, |
| "learning_rate": 4.999222510935915e-05, |
| "loss": 1.5306, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.03386059110917622, |
| "grad_norm": 0.4650459587574005, |
| "learning_rate": 4.9991820256119385e-05, |
| "loss": 1.535, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.03410245247424176, |
| "grad_norm": 0.39175882935523987, |
| "learning_rate": 4.999140512968634e-05, |
| "loss": 1.5302, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.03434431383930731, |
| "grad_norm": 0.35965096950531006, |
| "learning_rate": 4.999097973023065e-05, |
| "loss": 1.5236, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.034586175204372856, |
| "grad_norm": 0.3973771333694458, |
| "learning_rate": 4.999054405792718e-05, |
| "loss": 1.5261, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.0348280365694384, |
| "grad_norm": 0.5168911218643188, |
| "learning_rate": 4.999009811295503e-05, |
| "loss": 1.5289, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.03506989793450394, |
| "grad_norm": 0.4921228587627411, |
| "learning_rate": 4.998964189549751e-05, |
| "loss": 1.537, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.035311759299569485, |
| "grad_norm": 0.559264600276947, |
| "learning_rate": 4.9989175405742135e-05, |
| "loss": 1.5322, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.03555362066463503, |
| "grad_norm": 0.5126819014549255, |
| "learning_rate": 4.998869864388068e-05, |
| "loss": 1.5369, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.03579548202970058, |
| "grad_norm": 0.4884808361530304, |
| "learning_rate": 4.998821161010912e-05, |
| "loss": 1.5359, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.03603734339476612, |
| "grad_norm": 1.4691296815872192, |
| "learning_rate": 4.9987714304627655e-05, |
| "loss": 1.529, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.036279204759831665, |
| "grad_norm": 23.75047492980957, |
| "learning_rate": 4.9987206727640703e-05, |
| "loss": 1.9818, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.03652106612489721, |
| "grad_norm": 1.1937427520751953, |
| "learning_rate": 4.998668887935691e-05, |
| "loss": 2.3099, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.03676292748996275, |
| "grad_norm": 1.2184133529663086, |
| "learning_rate": 4.998616075998916e-05, |
| "loss": 1.9202, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.037004788855028295, |
| "grad_norm": 0.720676839351654, |
| "learning_rate": 4.9985622369754525e-05, |
| "loss": 1.8545, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.037246650220093845, |
| "grad_norm": 0.4644893705844879, |
| "learning_rate": 4.998507370887433e-05, |
| "loss": 1.6034, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.03748851158515939, |
| "grad_norm": 0.6309983134269714, |
| "learning_rate": 4.9984514777574085e-05, |
| "loss": 1.5414, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.03773037295022493, |
| "grad_norm": 0.3813267648220062, |
| "learning_rate": 4.998394557608358e-05, |
| "loss": 1.5335, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.037972234315290475, |
| "grad_norm": 0.7492319941520691, |
| "learning_rate": 4.998336610463677e-05, |
| "loss": 1.5299, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.03821409568035602, |
| "grad_norm": 0.5672308802604675, |
| "learning_rate": 4.998277636347186e-05, |
| "loss": 1.5323, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.03845595704542156, |
| "grad_norm": 0.3646668791770935, |
| "learning_rate": 4.998217635283127e-05, |
| "loss": 1.525, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.03869781841048711, |
| "grad_norm": 0.46738356351852417, |
| "learning_rate": 4.998156607296163e-05, |
| "loss": 1.5258, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.038939679775552655, |
| "grad_norm": 0.413133442401886, |
| "learning_rate": 4.998094552411382e-05, |
| "loss": 1.5317, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.0391815411406182, |
| "grad_norm": 0.9869425892829895, |
| "learning_rate": 4.9980314706542916e-05, |
| "loss": 1.5286, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.03942340250568374, |
| "grad_norm": 0.44352006912231445, |
| "learning_rate": 4.997967362050824e-05, |
| "loss": 1.518, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.039665263870749284, |
| "grad_norm": 0.33023595809936523, |
| "learning_rate": 4.997902226627329e-05, |
| "loss": 1.5239, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.03990712523581483, |
| "grad_norm": 0.5091515779495239, |
| "learning_rate": 4.997836064410583e-05, |
| "loss": 1.524, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.04014898660088038, |
| "grad_norm": 0.42869803309440613, |
| "learning_rate": 4.997768875427782e-05, |
| "loss": 1.5244, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.04039084796594592, |
| "grad_norm": 0.40443161129951477, |
| "learning_rate": 4.997700659706545e-05, |
| "loss": 1.5201, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.040632709331011464, |
| "grad_norm": 0.37971532344818115, |
| "learning_rate": 4.997631417274914e-05, |
| "loss": 1.5283, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.04087457069607701, |
| "grad_norm": 0.4408821165561676, |
| "learning_rate": 4.997561148161351e-05, |
| "loss": 1.5241, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.04111643206114255, |
| "grad_norm": 0.5017372965812683, |
| "learning_rate": 4.997489852394741e-05, |
| "loss": 1.519, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.0413582934262081, |
| "grad_norm": 0.3806293308734894, |
| "learning_rate": 4.997417530004391e-05, |
| "loss": 1.5278, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.041600154791273644, |
| "grad_norm": 1.141066312789917, |
| "learning_rate": 4.9973441810200306e-05, |
| "loss": 1.5174, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.04184201615633919, |
| "grad_norm": 0.3906162977218628, |
| "learning_rate": 4.997269805471809e-05, |
| "loss": 1.519, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.04208387752140473, |
| "grad_norm": 0.5911729335784912, |
| "learning_rate": 4.997194403390302e-05, |
| "loss": 1.536, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.042325738886470274, |
| "grad_norm": 0.6229117512702942, |
| "learning_rate": 4.9971179748065024e-05, |
| "loss": 1.5263, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.04256760025153582, |
| "grad_norm": 0.4941336512565613, |
| "learning_rate": 4.997040519751828e-05, |
| "loss": 1.5202, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.04280946161660137, |
| "grad_norm": 0.6714040040969849, |
| "learning_rate": 4.996962038258117e-05, |
| "loss": 1.5184, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.04305132298166691, |
| "grad_norm": 0.4575778841972351, |
| "learning_rate": 4.9968825303576314e-05, |
| "loss": 1.5265, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.043293184346732454, |
| "grad_norm": 0.3734686076641083, |
| "learning_rate": 4.996801996083052e-05, |
| "loss": 1.5223, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.043535045711798, |
| "grad_norm": 0.6092630624771118, |
| "learning_rate": 4.996720435467485e-05, |
| "loss": 1.5184, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.04377690707686354, |
| "grad_norm": 0.31611162424087524, |
| "learning_rate": 4.9966378485444567e-05, |
| "loss": 1.5201, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.04401876844192908, |
| "grad_norm": 0.4829297661781311, |
| "learning_rate": 4.9965542353479144e-05, |
| "loss": 1.519, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.044260629806994634, |
| "grad_norm": 0.4227820634841919, |
| "learning_rate": 4.9964695959122294e-05, |
| "loss": 1.5147, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.04450249117206018, |
| "grad_norm": 0.4444202184677124, |
| "learning_rate": 4.9963839302721936e-05, |
| "loss": 1.5241, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.04474435253712572, |
| "grad_norm": 0.42105644941329956, |
| "learning_rate": 4.99629723846302e-05, |
| "loss": 1.5248, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.04498621390219126, |
| "grad_norm": 0.34201350808143616, |
| "learning_rate": 4.996209520520346e-05, |
| "loss": 1.5097, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.045228075267256806, |
| "grad_norm": 0.410153865814209, |
| "learning_rate": 4.9961207764802275e-05, |
| "loss": 1.5191, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.04546993663232235, |
| "grad_norm": 0.38393330574035645, |
| "learning_rate": 4.996031006379145e-05, |
| "loss": 1.5119, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.0457117979973879, |
| "grad_norm": 0.3539496958255768, |
| "learning_rate": 4.9959402102539986e-05, |
| "loss": 1.5105, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.04595365936245344, |
| "grad_norm": 0.8583787679672241, |
| "learning_rate": 4.995848388142112e-05, |
| "loss": 1.5276, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.046195520727518986, |
| "grad_norm": 0.3652508854866028, |
| "learning_rate": 4.995755540081229e-05, |
| "loss": 1.5133, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.04643738209258453, |
| "grad_norm": 0.7512590885162354, |
| "learning_rate": 4.995661666109518e-05, |
| "loss": 1.5167, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.04667924345765007, |
| "grad_norm": 0.4336129128932953, |
| "learning_rate": 4.9955667662655636e-05, |
| "loss": 1.5171, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.046921104822715616, |
| "grad_norm": 0.4716378450393677, |
| "learning_rate": 4.995470840588379e-05, |
| "loss": 1.5336, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.047162966187781166, |
| "grad_norm": 0.3509134352207184, |
| "learning_rate": 4.995373889117393e-05, |
| "loss": 1.5282, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.04740482755284671, |
| "grad_norm": 0.6889932155609131, |
| "learning_rate": 4.99527591189246e-05, |
| "loss": 1.515, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.04764668891791225, |
| "grad_norm": 0.37906014919281006, |
| "learning_rate": 4.995176908953854e-05, |
| "loss": 1.5097, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.047888550282977796, |
| "grad_norm": 0.4350769519805908, |
| "learning_rate": 4.995076880342271e-05, |
| "loss": 1.5081, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.04813041164804334, |
| "grad_norm": 0.33059579133987427, |
| "learning_rate": 4.994975826098831e-05, |
| "loss": 1.5157, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.04837227301310889, |
| "grad_norm": 0.4527088701725006, |
| "learning_rate": 4.994873746265073e-05, |
| "loss": 1.5202, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.04837227301310889, |
| "eval_loss": 1.5096291303634644, |
| "eval_runtime": 1228.5547, |
| "eval_sacrebleu": 96.66770045228822, |
| "eval_samples_per_second": 82.248, |
| "eval_steps_per_second": 0.643, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 50000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 5, |
| "early_stopping_threshold": 0.0001 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 0 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.116679707171226e+16, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|