{
"best_global_step": 2000,
"best_metric": 1.5096291303634644,
"best_model_checkpoint": "hieptt/vietnamese-correction-ft/checkpoint-2000",
"epoch": 0.04837227301310889,
"eval_steps": 1000,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 2.4186136506554445e-05,
"grad_norm": 6.7611775398254395,
"learning_rate": 0.0,
"loss": 4.8761,
"step": 1
},
{
"epoch": 0.00024186136506554442,
"grad_norm": 7.38713264465332,
"learning_rate": 4.5e-07,
"loss": 4.9047,
"step": 10
},
{
"epoch": 0.00048372273013108885,
"grad_norm": 9.883085250854492,
"learning_rate": 9.5e-07,
"loss": 4.7161,
"step": 20
},
{
"epoch": 0.0007255840951966333,
"grad_norm": 12.030326843261719,
"learning_rate": 1.45e-06,
"loss": 4.3936,
"step": 30
},
{
"epoch": 0.0009674454602621777,
"grad_norm": 13.895113945007324,
"learning_rate": 1.95e-06,
"loss": 4.0292,
"step": 40
},
{
"epoch": 0.0012093068253277222,
"grad_norm": 14.019319534301758,
"learning_rate": 2.4500000000000003e-06,
"loss": 3.5471,
"step": 50
},
{
"epoch": 0.0014511681903932665,
"grad_norm": 4.721616744995117,
"learning_rate": 2.95e-06,
"loss": 2.88,
"step": 60
},
{
"epoch": 0.001693029555458811,
"grad_norm": 2.526538372039795,
"learning_rate": 3.4500000000000004e-06,
"loss": 2.3633,
"step": 70
},
{
"epoch": 0.0019348909205243554,
"grad_norm": 1.627066969871521,
"learning_rate": 3.95e-06,
"loss": 2.0775,
"step": 80
},
{
"epoch": 0.0021767522855899,
"grad_norm": 0.8922048807144165,
"learning_rate": 4.45e-06,
"loss": 1.9554,
"step": 90
},
{
"epoch": 0.0024186136506554445,
"grad_norm": 0.8262912631034851,
"learning_rate": 4.950000000000001e-06,
"loss": 1.887,
"step": 100
},
{
"epoch": 0.0026604750157209886,
"grad_norm": 0.8611723184585571,
"learning_rate": 5.45e-06,
"loss": 1.8405,
"step": 110
},
{
"epoch": 0.002902336380786533,
"grad_norm": 0.7151685357093811,
"learning_rate": 5.95e-06,
"loss": 1.8095,
"step": 120
},
{
"epoch": 0.0031441977458520776,
"grad_norm": 0.6910758018493652,
"learning_rate": 6.45e-06,
"loss": 1.7881,
"step": 130
},
{
"epoch": 0.003386059110917622,
"grad_norm": 0.7488411664962769,
"learning_rate": 6.950000000000001e-06,
"loss": 1.7647,
"step": 140
},
{
"epoch": 0.0036279204759831663,
"grad_norm": 0.6839690208435059,
"learning_rate": 7.45e-06,
"loss": 1.7577,
"step": 150
},
{
"epoch": 0.003869781841048711,
"grad_norm": 0.7202065587043762,
"learning_rate": 7.95e-06,
"loss": 1.7499,
"step": 160
},
{
"epoch": 0.004111643206114255,
"grad_norm": 0.6052983403205872,
"learning_rate": 8.45e-06,
"loss": 1.741,
"step": 170
},
{
"epoch": 0.0043535045711798,
"grad_norm": 0.7364121079444885,
"learning_rate": 8.95e-06,
"loss": 1.7283,
"step": 180
},
{
"epoch": 0.004595365936245344,
"grad_norm": 0.7102776169776917,
"learning_rate": 9.450000000000001e-06,
"loss": 1.707,
"step": 190
},
{
"epoch": 0.004837227301310889,
"grad_norm": 0.6977587342262268,
"learning_rate": 9.950000000000001e-06,
"loss": 1.706,
"step": 200
},
{
"epoch": 0.005079088666376433,
"grad_norm": 0.6137486100196838,
"learning_rate": 1.045e-05,
"loss": 1.7065,
"step": 210
},
{
"epoch": 0.005320950031441977,
"grad_norm": 0.5842403769493103,
"learning_rate": 1.095e-05,
"loss": 1.6961,
"step": 220
},
{
"epoch": 0.005562811396507522,
"grad_norm": 0.6366673111915588,
"learning_rate": 1.145e-05,
"loss": 1.6822,
"step": 230
},
{
"epoch": 0.005804672761573066,
"grad_norm": 0.5458840131759644,
"learning_rate": 1.195e-05,
"loss": 1.6794,
"step": 240
},
{
"epoch": 0.006046534126638611,
"grad_norm": 0.5614727735519409,
"learning_rate": 1.2450000000000001e-05,
"loss": 1.6587,
"step": 250
},
{
"epoch": 0.006288395491704155,
"grad_norm": 0.6590563058853149,
"learning_rate": 1.2950000000000001e-05,
"loss": 1.6756,
"step": 260
},
{
"epoch": 0.006530256856769699,
"grad_norm": 0.6136428713798523,
"learning_rate": 1.3450000000000002e-05,
"loss": 1.6723,
"step": 270
},
{
"epoch": 0.006772118221835244,
"grad_norm": 0.6550738215446472,
"learning_rate": 1.3950000000000002e-05,
"loss": 1.6605,
"step": 280
},
{
"epoch": 0.007013979586900788,
"grad_norm": 0.5237274765968323,
"learning_rate": 1.4449999999999999e-05,
"loss": 1.6496,
"step": 290
},
{
"epoch": 0.0072558409519663325,
"grad_norm": 0.6175047159194946,
"learning_rate": 1.4950000000000001e-05,
"loss": 1.6441,
"step": 300
},
{
"epoch": 0.0074977023170318775,
"grad_norm": 0.5207669138908386,
"learning_rate": 1.545e-05,
"loss": 1.6478,
"step": 310
},
{
"epoch": 0.007739563682097422,
"grad_norm": 0.4118206799030304,
"learning_rate": 1.595e-05,
"loss": 1.6405,
"step": 320
},
{
"epoch": 0.007981425047162967,
"grad_norm": 0.5392739772796631,
"learning_rate": 1.645e-05,
"loss": 1.6441,
"step": 330
},
{
"epoch": 0.00822328641222851,
"grad_norm": 0.5163934826850891,
"learning_rate": 1.6950000000000002e-05,
"loss": 1.6401,
"step": 340
},
{
"epoch": 0.008465147777294055,
"grad_norm": 0.5002100467681885,
"learning_rate": 1.745e-05,
"loss": 1.6385,
"step": 350
},
{
"epoch": 0.0087070091423596,
"grad_norm": 0.6193915009498596,
"learning_rate": 1.795e-05,
"loss": 1.6222,
"step": 360
},
{
"epoch": 0.008948870507425145,
"grad_norm": 0.45196324586868286,
"learning_rate": 1.845e-05,
"loss": 1.6267,
"step": 370
},
{
"epoch": 0.009190731872490688,
"grad_norm": 0.45798471570014954,
"learning_rate": 1.895e-05,
"loss": 1.6265,
"step": 380
},
{
"epoch": 0.009432593237556233,
"grad_norm": 0.5480896234512329,
"learning_rate": 1.9450000000000002e-05,
"loss": 1.6208,
"step": 390
},
{
"epoch": 0.009674454602621778,
"grad_norm": 0.4133257567882538,
"learning_rate": 1.995e-05,
"loss": 1.6137,
"step": 400
},
{
"epoch": 0.009916315967687321,
"grad_norm": 0.45199403166770935,
"learning_rate": 2.045e-05,
"loss": 1.6196,
"step": 410
},
{
"epoch": 0.010158177332752866,
"grad_norm": 0.5875248312950134,
"learning_rate": 2.095e-05,
"loss": 1.6049,
"step": 420
},
{
"epoch": 0.010400038697818411,
"grad_norm": 0.6048296093940735,
"learning_rate": 2.145e-05,
"loss": 1.6083,
"step": 430
},
{
"epoch": 0.010641900062883954,
"grad_norm": 0.5220174193382263,
"learning_rate": 2.195e-05,
"loss": 1.6074,
"step": 440
},
{
"epoch": 0.0108837614279495,
"grad_norm": 0.5288923978805542,
"learning_rate": 2.245e-05,
"loss": 1.6033,
"step": 450
},
{
"epoch": 0.011125622793015044,
"grad_norm": 0.5983248353004456,
"learning_rate": 2.2950000000000002e-05,
"loss": 1.5976,
"step": 460
},
{
"epoch": 0.011367484158080587,
"grad_norm": 0.5582709312438965,
"learning_rate": 2.345e-05,
"loss": 1.5982,
"step": 470
},
{
"epoch": 0.011609345523146132,
"grad_norm": 0.43671730160713196,
"learning_rate": 2.395e-05,
"loss": 1.5983,
"step": 480
},
{
"epoch": 0.011851206888211677,
"grad_norm": 0.49004241824150085,
"learning_rate": 2.445e-05,
"loss": 1.5916,
"step": 490
},
{
"epoch": 0.012093068253277222,
"grad_norm": 0.41542279720306396,
"learning_rate": 2.495e-05,
"loss": 1.5944,
"step": 500
},
{
"epoch": 0.012334929618342766,
"grad_norm": 0.42180851101875305,
"learning_rate": 2.5450000000000002e-05,
"loss": 1.5958,
"step": 510
},
{
"epoch": 0.01257679098340831,
"grad_norm": 0.5838198661804199,
"learning_rate": 2.595e-05,
"loss": 1.5945,
"step": 520
},
{
"epoch": 0.012818652348473855,
"grad_norm": 0.4461694359779358,
"learning_rate": 2.6450000000000003e-05,
"loss": 1.5975,
"step": 530
},
{
"epoch": 0.013060513713539399,
"grad_norm": 0.445316344499588,
"learning_rate": 2.6950000000000005e-05,
"loss": 1.586,
"step": 540
},
{
"epoch": 0.013302375078604944,
"grad_norm": 0.3972742557525635,
"learning_rate": 2.7450000000000003e-05,
"loss": 1.591,
"step": 550
},
{
"epoch": 0.013544236443670489,
"grad_norm": 0.4089615046977997,
"learning_rate": 2.7950000000000005e-05,
"loss": 1.5944,
"step": 560
},
{
"epoch": 0.013786097808736032,
"grad_norm": 0.42213934659957886,
"learning_rate": 2.845e-05,
"loss": 1.5924,
"step": 570
},
{
"epoch": 0.014027959173801577,
"grad_norm": 0.5217621326446533,
"learning_rate": 2.895e-05,
"loss": 1.5837,
"step": 580
},
{
"epoch": 0.014269820538867122,
"grad_norm": 0.601890504360199,
"learning_rate": 2.945e-05,
"loss": 1.5856,
"step": 590
},
{
"epoch": 0.014511681903932665,
"grad_norm": 0.4616837501525879,
"learning_rate": 2.995e-05,
"loss": 1.5796,
"step": 600
},
{
"epoch": 0.01475354326899821,
"grad_norm": 0.5522668957710266,
"learning_rate": 3.045e-05,
"loss": 1.5768,
"step": 610
},
{
"epoch": 0.014995404634063755,
"grad_norm": 0.5148155093193054,
"learning_rate": 3.095e-05,
"loss": 1.5797,
"step": 620
},
{
"epoch": 0.015237265999129298,
"grad_norm": 0.444976270198822,
"learning_rate": 3.145e-05,
"loss": 1.5811,
"step": 630
},
{
"epoch": 0.015479127364194843,
"grad_norm": 0.4654428958892822,
"learning_rate": 3.1950000000000004e-05,
"loss": 1.5825,
"step": 640
},
{
"epoch": 0.015720988729260386,
"grad_norm": 0.4910729229450226,
"learning_rate": 3.245e-05,
"loss": 1.5678,
"step": 650
},
{
"epoch": 0.015962850094325933,
"grad_norm": 0.42272791266441345,
"learning_rate": 3.295e-05,
"loss": 1.5799,
"step": 660
},
{
"epoch": 0.016204711459391476,
"grad_norm": 0.5058956146240234,
"learning_rate": 3.345000000000001e-05,
"loss": 1.5821,
"step": 670
},
{
"epoch": 0.01644657282445702,
"grad_norm": 0.4395345151424408,
"learning_rate": 3.3950000000000005e-05,
"loss": 1.5787,
"step": 680
},
{
"epoch": 0.016688434189522566,
"grad_norm": 0.3926360607147217,
"learning_rate": 3.445e-05,
"loss": 1.5763,
"step": 690
},
{
"epoch": 0.01693029555458811,
"grad_norm": 0.3995387554168701,
"learning_rate": 3.495e-05,
"loss": 1.5719,
"step": 700
},
{
"epoch": 0.017172156919653656,
"grad_norm": 0.48855945467948914,
"learning_rate": 3.545e-05,
"loss": 1.5754,
"step": 710
},
{
"epoch": 0.0174140182847192,
"grad_norm": 0.7298448085784912,
"learning_rate": 3.595e-05,
"loss": 1.5686,
"step": 720
},
{
"epoch": 0.017655879649784743,
"grad_norm": 0.5547767281532288,
"learning_rate": 3.645e-05,
"loss": 1.5784,
"step": 730
},
{
"epoch": 0.01789774101485029,
"grad_norm": 0.46988898515701294,
"learning_rate": 3.6950000000000004e-05,
"loss": 1.5653,
"step": 740
},
{
"epoch": 0.018139602379915833,
"grad_norm": 0.48516085743904114,
"learning_rate": 3.745e-05,
"loss": 1.5718,
"step": 750
},
{
"epoch": 0.018381463744981376,
"grad_norm": 0.6289165019989014,
"learning_rate": 3.795e-05,
"loss": 1.5604,
"step": 760
},
{
"epoch": 0.018623325110046923,
"grad_norm": 0.5190830826759338,
"learning_rate": 3.845e-05,
"loss": 1.5656,
"step": 770
},
{
"epoch": 0.018865186475112466,
"grad_norm": 0.5255008935928345,
"learning_rate": 3.8950000000000005e-05,
"loss": 1.5719,
"step": 780
},
{
"epoch": 0.01910704784017801,
"grad_norm": 0.5320749878883362,
"learning_rate": 3.9450000000000003e-05,
"loss": 1.5645,
"step": 790
},
{
"epoch": 0.019348909205243556,
"grad_norm": 0.5073422193527222,
"learning_rate": 3.995e-05,
"loss": 1.5757,
"step": 800
},
{
"epoch": 0.0195907705703091,
"grad_norm": 0.5081677436828613,
"learning_rate": 4.045000000000001e-05,
"loss": 1.5618,
"step": 810
},
{
"epoch": 0.019832631935374642,
"grad_norm": 0.4249745309352875,
"learning_rate": 4.095e-05,
"loss": 1.5565,
"step": 820
},
{
"epoch": 0.02007449330044019,
"grad_norm": 0.4423615634441376,
"learning_rate": 4.145e-05,
"loss": 1.5645,
"step": 830
},
{
"epoch": 0.020316354665505732,
"grad_norm": 0.5799271464347839,
"learning_rate": 4.195e-05,
"loss": 1.5584,
"step": 840
},
{
"epoch": 0.020558216030571275,
"grad_norm": 0.4171762466430664,
"learning_rate": 4.245e-05,
"loss": 1.5527,
"step": 850
},
{
"epoch": 0.020800077395636822,
"grad_norm": 0.418618768453598,
"learning_rate": 4.295e-05,
"loss": 1.5571,
"step": 860
},
{
"epoch": 0.021041938760702365,
"grad_norm": 0.4215683937072754,
"learning_rate": 4.345e-05,
"loss": 1.5569,
"step": 870
},
{
"epoch": 0.02128380012576791,
"grad_norm": 0.4244352877140045,
"learning_rate": 4.3950000000000004e-05,
"loss": 1.5581,
"step": 880
},
{
"epoch": 0.021525661490833455,
"grad_norm": 0.5420682430267334,
"learning_rate": 4.445e-05,
"loss": 1.5539,
"step": 890
},
{
"epoch": 0.021767522855899,
"grad_norm": 0.37200865149497986,
"learning_rate": 4.495e-05,
"loss": 1.5503,
"step": 900
},
{
"epoch": 0.02200938422096454,
"grad_norm": 0.46333831548690796,
"learning_rate": 4.545000000000001e-05,
"loss": 1.5514,
"step": 910
},
{
"epoch": 0.02225124558603009,
"grad_norm": 0.4164145290851593,
"learning_rate": 4.5950000000000006e-05,
"loss": 1.5466,
"step": 920
},
{
"epoch": 0.02249310695109563,
"grad_norm": 0.3915160596370697,
"learning_rate": 4.6450000000000004e-05,
"loss": 1.5373,
"step": 930
},
{
"epoch": 0.022734968316161175,
"grad_norm": 0.5574844479560852,
"learning_rate": 4.695e-05,
"loss": 1.5484,
"step": 940
},
{
"epoch": 0.02297682968122672,
"grad_norm": 0.48569104075431824,
"learning_rate": 4.745e-05,
"loss": 1.5494,
"step": 950
},
{
"epoch": 0.023218691046292265,
"grad_norm": 0.5068873167037964,
"learning_rate": 4.795e-05,
"loss": 1.5598,
"step": 960
},
{
"epoch": 0.023460552411357808,
"grad_norm": 0.680260419845581,
"learning_rate": 4.845e-05,
"loss": 1.5707,
"step": 970
},
{
"epoch": 0.023702413776423355,
"grad_norm": 0.41554513573646545,
"learning_rate": 4.8950000000000004e-05,
"loss": 1.5526,
"step": 980
},
{
"epoch": 0.023944275141488898,
"grad_norm": 0.4825437366962433,
"learning_rate": 4.945e-05,
"loss": 1.5479,
"step": 990
},
{
"epoch": 0.024186136506554445,
"grad_norm": 0.42075395584106445,
"learning_rate": 4.995e-05,
"loss": 1.5491,
"step": 1000
},
{
"epoch": 0.024186136506554445,
"eval_loss": 1.5370113849639893,
"eval_runtime": 1410.8458,
"eval_sacrebleu": 96.55473159449107,
"eval_samples_per_second": 71.621,
"eval_steps_per_second": 0.56,
"step": 1000
},
{
"epoch": 0.024427997871619988,
"grad_norm": 0.48966652154922485,
"learning_rate": 4.999999583799493e-05,
"loss": 1.5447,
"step": 1010
},
{
"epoch": 0.02466985923668553,
"grad_norm": 0.6714717149734497,
"learning_rate": 4.999998145081868e-05,
"loss": 1.5504,
"step": 1020
},
{
"epoch": 0.024911720601751078,
"grad_norm": 0.7765893340110779,
"learning_rate": 4.999995678709439e-05,
"loss": 1.5565,
"step": 1030
},
{
"epoch": 0.02515358196681662,
"grad_norm": 0.4742671549320221,
"learning_rate": 4.999992184683219e-05,
"loss": 1.5453,
"step": 1040
},
{
"epoch": 0.025395443331882164,
"grad_norm": 0.5306811332702637,
"learning_rate": 4.999987663004646e-05,
"loss": 1.5512,
"step": 1050
},
{
"epoch": 0.02563730469694771,
"grad_norm": 0.43038302659988403,
"learning_rate": 4.9999821136755766e-05,
"loss": 1.5513,
"step": 1060
},
{
"epoch": 0.025879166062013254,
"grad_norm": 0.788059413433075,
"learning_rate": 4.9999755366982925e-05,
"loss": 1.5326,
"step": 1070
},
{
"epoch": 0.026121027427078797,
"grad_norm": 0.4768883287906647,
"learning_rate": 4.999967932075499e-05,
"loss": 1.5526,
"step": 1080
},
{
"epoch": 0.026362888792144344,
"grad_norm": 0.383400559425354,
"learning_rate": 4.99995929981032e-05,
"loss": 1.5518,
"step": 1090
},
{
"epoch": 0.026604750157209887,
"grad_norm": 0.5224942564964294,
"learning_rate": 4.999949639906304e-05,
"loss": 1.5495,
"step": 1100
},
{
"epoch": 0.02684661152227543,
"grad_norm": 0.4375554025173187,
"learning_rate": 4.999938952367422e-05,
"loss": 1.5521,
"step": 1110
},
{
"epoch": 0.027088472887340977,
"grad_norm": 0.44675424695014954,
"learning_rate": 4.999927237198069e-05,
"loss": 1.5475,
"step": 1120
},
{
"epoch": 0.02733033425240652,
"grad_norm": 0.5646783709526062,
"learning_rate": 4.999914494403059e-05,
"loss": 1.539,
"step": 1130
},
{
"epoch": 0.027572195617472064,
"grad_norm": 0.5079995393753052,
"learning_rate": 4.9999007239876294e-05,
"loss": 1.5437,
"step": 1140
},
{
"epoch": 0.02781405698253761,
"grad_norm": 0.4094880223274231,
"learning_rate": 4.999885925957443e-05,
"loss": 1.5354,
"step": 1150
},
{
"epoch": 0.028055918347603154,
"grad_norm": 0.4403417408466339,
"learning_rate": 4.99987010031858e-05,
"loss": 1.5445,
"step": 1160
},
{
"epoch": 0.028297779712668697,
"grad_norm": 0.3404127061367035,
"learning_rate": 4.9998532470775484e-05,
"loss": 1.5321,
"step": 1170
},
{
"epoch": 0.028539641077734244,
"grad_norm": 0.4042949378490448,
"learning_rate": 4.999835366241274e-05,
"loss": 1.5442,
"step": 1180
},
{
"epoch": 0.028781502442799787,
"grad_norm": 0.3902073800563812,
"learning_rate": 4.9998164578171076e-05,
"loss": 1.5358,
"step": 1190
},
{
"epoch": 0.02902336380786533,
"grad_norm": 0.4594404399394989,
"learning_rate": 4.999796521812822e-05,
"loss": 1.5282,
"step": 1200
},
{
"epoch": 0.029265225172930877,
"grad_norm": 0.4223099648952484,
"learning_rate": 4.999775558236611e-05,
"loss": 1.5388,
"step": 1210
},
{
"epoch": 0.02950708653799642,
"grad_norm": 0.6008235812187195,
"learning_rate": 4.999753567097094e-05,
"loss": 1.5392,
"step": 1220
},
{
"epoch": 0.029748947903061963,
"grad_norm": 0.5003873705863953,
"learning_rate": 4.9997305484033085e-05,
"loss": 1.5434,
"step": 1230
},
{
"epoch": 0.02999080926812751,
"grad_norm": 0.5244422554969788,
"learning_rate": 4.999706502164718e-05,
"loss": 1.5481,
"step": 1240
},
{
"epoch": 0.030232670633193053,
"grad_norm": 0.36595821380615234,
"learning_rate": 4.999681428391207e-05,
"loss": 1.544,
"step": 1250
},
{
"epoch": 0.030474531998258596,
"grad_norm": 0.5237463116645813,
"learning_rate": 4.999655327093081e-05,
"loss": 1.5377,
"step": 1260
},
{
"epoch": 0.030716393363324143,
"grad_norm": 0.4382268190383911,
"learning_rate": 4.999628198281072e-05,
"loss": 1.5382,
"step": 1270
},
{
"epoch": 0.030958254728389686,
"grad_norm": 0.5116040706634521,
"learning_rate": 4.999600041966328e-05,
"loss": 1.5383,
"step": 1280
},
{
"epoch": 0.031200116093455233,
"grad_norm": 0.3517632782459259,
"learning_rate": 4.999570858160426e-05,
"loss": 1.5284,
"step": 1290
},
{
"epoch": 0.03144197745852077,
"grad_norm": 0.46076980233192444,
"learning_rate": 4.999540646875361e-05,
"loss": 1.5347,
"step": 1300
},
{
"epoch": 0.03168383882358632,
"grad_norm": 0.6168367266654968,
"learning_rate": 4.9995094081235524e-05,
"loss": 1.5387,
"step": 1310
},
{
"epoch": 0.031925700188651866,
"grad_norm": 0.40505921840667725,
"learning_rate": 4.9994771419178396e-05,
"loss": 1.5375,
"step": 1320
},
{
"epoch": 0.03216756155371741,
"grad_norm": 0.4371592104434967,
"learning_rate": 4.999443848271489e-05,
"loss": 1.5363,
"step": 1330
},
{
"epoch": 0.03240942291878295,
"grad_norm": 0.518997311592102,
"learning_rate": 4.9994095271981835e-05,
"loss": 1.5434,
"step": 1340
},
{
"epoch": 0.032651284283848496,
"grad_norm": 0.8396134972572327,
"learning_rate": 4.999374178712032e-05,
"loss": 1.5324,
"step": 1350
},
{
"epoch": 0.03289314564891404,
"grad_norm": 0.41988566517829895,
"learning_rate": 4.999337802827566e-05,
"loss": 1.5314,
"step": 1360
},
{
"epoch": 0.03313500701397959,
"grad_norm": 0.3672787845134735,
"learning_rate": 4.999300399559738e-05,
"loss": 1.525,
"step": 1370
},
{
"epoch": 0.03337686837904513,
"grad_norm": 0.4160480499267578,
"learning_rate": 4.999261968923922e-05,
"loss": 1.5298,
"step": 1380
},
{
"epoch": 0.033618729744110676,
"grad_norm": 0.5236791372299194,
"learning_rate": 4.999222510935915e-05,
"loss": 1.5306,
"step": 1390
},
{
"epoch": 0.03386059110917622,
"grad_norm": 0.4650459587574005,
"learning_rate": 4.9991820256119385e-05,
"loss": 1.535,
"step": 1400
},
{
"epoch": 0.03410245247424176,
"grad_norm": 0.39175882935523987,
"learning_rate": 4.999140512968634e-05,
"loss": 1.5302,
"step": 1410
},
{
"epoch": 0.03434431383930731,
"grad_norm": 0.35965096950531006,
"learning_rate": 4.999097973023065e-05,
"loss": 1.5236,
"step": 1420
},
{
"epoch": 0.034586175204372856,
"grad_norm": 0.3973771333694458,
"learning_rate": 4.999054405792718e-05,
"loss": 1.5261,
"step": 1430
},
{
"epoch": 0.0348280365694384,
"grad_norm": 0.5168911218643188,
"learning_rate": 4.999009811295503e-05,
"loss": 1.5289,
"step": 1440
},
{
"epoch": 0.03506989793450394,
"grad_norm": 0.4921228587627411,
"learning_rate": 4.998964189549751e-05,
"loss": 1.537,
"step": 1450
},
{
"epoch": 0.035311759299569485,
"grad_norm": 0.559264600276947,
"learning_rate": 4.9989175405742135e-05,
"loss": 1.5322,
"step": 1460
},
{
"epoch": 0.03555362066463503,
"grad_norm": 0.5126819014549255,
"learning_rate": 4.998869864388068e-05,
"loss": 1.5369,
"step": 1470
},
{
"epoch": 0.03579548202970058,
"grad_norm": 0.4884808361530304,
"learning_rate": 4.998821161010912e-05,
"loss": 1.5359,
"step": 1480
},
{
"epoch": 0.03603734339476612,
"grad_norm": 1.4691296815872192,
"learning_rate": 4.9987714304627655e-05,
"loss": 1.529,
"step": 1490
},
{
"epoch": 0.036279204759831665,
"grad_norm": 23.75047492980957,
"learning_rate": 4.9987206727640703e-05,
"loss": 1.9818,
"step": 1500
},
{
"epoch": 0.03652106612489721,
"grad_norm": 1.1937427520751953,
"learning_rate": 4.998668887935691e-05,
"loss": 2.3099,
"step": 1510
},
{
"epoch": 0.03676292748996275,
"grad_norm": 1.2184133529663086,
"learning_rate": 4.998616075998916e-05,
"loss": 1.9202,
"step": 1520
},
{
"epoch": 0.037004788855028295,
"grad_norm": 0.720676839351654,
"learning_rate": 4.9985622369754525e-05,
"loss": 1.8545,
"step": 1530
},
{
"epoch": 0.037246650220093845,
"grad_norm": 0.4644893705844879,
"learning_rate": 4.998507370887433e-05,
"loss": 1.6034,
"step": 1540
},
{
"epoch": 0.03748851158515939,
"grad_norm": 0.6309983134269714,
"learning_rate": 4.9984514777574085e-05,
"loss": 1.5414,
"step": 1550
},
{
"epoch": 0.03773037295022493,
"grad_norm": 0.3813267648220062,
"learning_rate": 4.998394557608358e-05,
"loss": 1.5335,
"step": 1560
},
{
"epoch": 0.037972234315290475,
"grad_norm": 0.7492319941520691,
"learning_rate": 4.998336610463677e-05,
"loss": 1.5299,
"step": 1570
},
{
"epoch": 0.03821409568035602,
"grad_norm": 0.5672308802604675,
"learning_rate": 4.998277636347186e-05,
"loss": 1.5323,
"step": 1580
},
{
"epoch": 0.03845595704542156,
"grad_norm": 0.3646668791770935,
"learning_rate": 4.998217635283127e-05,
"loss": 1.525,
"step": 1590
},
{
"epoch": 0.03869781841048711,
"grad_norm": 0.46738356351852417,
"learning_rate": 4.998156607296163e-05,
"loss": 1.5258,
"step": 1600
},
{
"epoch": 0.038939679775552655,
"grad_norm": 0.413133442401886,
"learning_rate": 4.998094552411382e-05,
"loss": 1.5317,
"step": 1610
},
{
"epoch": 0.0391815411406182,
"grad_norm": 0.9869425892829895,
"learning_rate": 4.9980314706542916e-05,
"loss": 1.5286,
"step": 1620
},
{
"epoch": 0.03942340250568374,
"grad_norm": 0.44352006912231445,
"learning_rate": 4.997967362050824e-05,
"loss": 1.518,
"step": 1630
},
{
"epoch": 0.039665263870749284,
"grad_norm": 0.33023595809936523,
"learning_rate": 4.997902226627329e-05,
"loss": 1.5239,
"step": 1640
},
{
"epoch": 0.03990712523581483,
"grad_norm": 0.5091515779495239,
"learning_rate": 4.997836064410583e-05,
"loss": 1.524,
"step": 1650
},
{
"epoch": 0.04014898660088038,
"grad_norm": 0.42869803309440613,
"learning_rate": 4.997768875427782e-05,
"loss": 1.5244,
"step": 1660
},
{
"epoch": 0.04039084796594592,
"grad_norm": 0.40443161129951477,
"learning_rate": 4.997700659706545e-05,
"loss": 1.5201,
"step": 1670
},
{
"epoch": 0.040632709331011464,
"grad_norm": 0.37971532344818115,
"learning_rate": 4.997631417274914e-05,
"loss": 1.5283,
"step": 1680
},
{
"epoch": 0.04087457069607701,
"grad_norm": 0.4408821165561676,
"learning_rate": 4.997561148161351e-05,
"loss": 1.5241,
"step": 1690
},
{
"epoch": 0.04111643206114255,
"grad_norm": 0.5017372965812683,
"learning_rate": 4.997489852394741e-05,
"loss": 1.519,
"step": 1700
},
{
"epoch": 0.0413582934262081,
"grad_norm": 0.3806293308734894,
"learning_rate": 4.997417530004391e-05,
"loss": 1.5278,
"step": 1710
},
{
"epoch": 0.041600154791273644,
"grad_norm": 1.141066312789917,
"learning_rate": 4.9973441810200306e-05,
"loss": 1.5174,
"step": 1720
},
{
"epoch": 0.04184201615633919,
"grad_norm": 0.3906162977218628,
"learning_rate": 4.997269805471809e-05,
"loss": 1.519,
"step": 1730
},
{
"epoch": 0.04208387752140473,
"grad_norm": 0.5911729335784912,
"learning_rate": 4.997194403390302e-05,
"loss": 1.536,
"step": 1740
},
{
"epoch": 0.042325738886470274,
"grad_norm": 0.6229117512702942,
"learning_rate": 4.9971179748065024e-05,
"loss": 1.5263,
"step": 1750
},
{
"epoch": 0.04256760025153582,
"grad_norm": 0.4941336512565613,
"learning_rate": 4.997040519751828e-05,
"loss": 1.5202,
"step": 1760
},
{
"epoch": 0.04280946161660137,
"grad_norm": 0.6714040040969849,
"learning_rate": 4.996962038258117e-05,
"loss": 1.5184,
"step": 1770
},
{
"epoch": 0.04305132298166691,
"grad_norm": 0.4575778841972351,
"learning_rate": 4.9968825303576314e-05,
"loss": 1.5265,
"step": 1780
},
{
"epoch": 0.043293184346732454,
"grad_norm": 0.3734686076641083,
"learning_rate": 4.996801996083052e-05,
"loss": 1.5223,
"step": 1790
},
{
"epoch": 0.043535045711798,
"grad_norm": 0.6092630624771118,
"learning_rate": 4.996720435467485e-05,
"loss": 1.5184,
"step": 1800
},
{
"epoch": 0.04377690707686354,
"grad_norm": 0.31611162424087524,
"learning_rate": 4.9966378485444567e-05,
"loss": 1.5201,
"step": 1810
},
{
"epoch": 0.04401876844192908,
"grad_norm": 0.4829297661781311,
"learning_rate": 4.9965542353479144e-05,
"loss": 1.519,
"step": 1820
},
{
"epoch": 0.044260629806994634,
"grad_norm": 0.4227820634841919,
"learning_rate": 4.9964695959122294e-05,
"loss": 1.5147,
"step": 1830
},
{
"epoch": 0.04450249117206018,
"grad_norm": 0.4444202184677124,
"learning_rate": 4.9963839302721936e-05,
"loss": 1.5241,
"step": 1840
},
{
"epoch": 0.04474435253712572,
"grad_norm": 0.42105644941329956,
"learning_rate": 4.99629723846302e-05,
"loss": 1.5248,
"step": 1850
},
{
"epoch": 0.04498621390219126,
"grad_norm": 0.34201350808143616,
"learning_rate": 4.996209520520346e-05,
"loss": 1.5097,
"step": 1860
},
{
"epoch": 0.045228075267256806,
"grad_norm": 0.410153865814209,
"learning_rate": 4.9961207764802275e-05,
"loss": 1.5191,
"step": 1870
},
{
"epoch": 0.04546993663232235,
"grad_norm": 0.38393330574035645,
"learning_rate": 4.996031006379145e-05,
"loss": 1.5119,
"step": 1880
},
{
"epoch": 0.0457117979973879,
"grad_norm": 0.3539496958255768,
"learning_rate": 4.9959402102539986e-05,
"loss": 1.5105,
"step": 1890
},
{
"epoch": 0.04595365936245344,
"grad_norm": 0.8583787679672241,
"learning_rate": 4.995848388142112e-05,
"loss": 1.5276,
"step": 1900
},
{
"epoch": 0.046195520727518986,
"grad_norm": 0.3652508854866028,
"learning_rate": 4.995755540081229e-05,
"loss": 1.5133,
"step": 1910
},
{
"epoch": 0.04643738209258453,
"grad_norm": 0.7512590885162354,
"learning_rate": 4.995661666109518e-05,
"loss": 1.5167,
"step": 1920
},
{
"epoch": 0.04667924345765007,
"grad_norm": 0.4336129128932953,
"learning_rate": 4.9955667662655636e-05,
"loss": 1.5171,
"step": 1930
},
{
"epoch": 0.046921104822715616,
"grad_norm": 0.4716378450393677,
"learning_rate": 4.995470840588379e-05,
"loss": 1.5336,
"step": 1940
},
{
"epoch": 0.047162966187781166,
"grad_norm": 0.3509134352207184,
"learning_rate": 4.995373889117393e-05,
"loss": 1.5282,
"step": 1950
},
{
"epoch": 0.04740482755284671,
"grad_norm": 0.6889932155609131,
"learning_rate": 4.99527591189246e-05,
"loss": 1.515,
"step": 1960
},
{
"epoch": 0.04764668891791225,
"grad_norm": 0.37906014919281006,
"learning_rate": 4.995176908953854e-05,
"loss": 1.5097,
"step": 1970
},
{
"epoch": 0.047888550282977796,
"grad_norm": 0.4350769519805908,
"learning_rate": 4.995076880342271e-05,
"loss": 1.5081,
"step": 1980
},
{
"epoch": 0.04813041164804334,
"grad_norm": 0.33059579133987427,
"learning_rate": 4.994975826098831e-05,
"loss": 1.5157,
"step": 1990
},
{
"epoch": 0.04837227301310889,
"grad_norm": 0.4527088701725006,
"learning_rate": 4.994873746265073e-05,
"loss": 1.5202,
"step": 2000
},
{
"epoch": 0.04837227301310889,
"eval_loss": 1.5096291303634644,
"eval_runtime": 1228.5547,
"eval_sacrebleu": 96.66770045228822,
"eval_samples_per_second": 82.248,
"eval_steps_per_second": 0.643,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 50000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0001
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.116679707171226e+16,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}