Training in progress, step 1000, checkpoint
- last-checkpoint/config.json +36 -0
- last-checkpoint/dict.txt +0 -0
- last-checkpoint/generation_config.json +13 -0
- last-checkpoint/model.safetensors +3 -0
- last-checkpoint/optimizer.pt +3 -0
- last-checkpoint/rng_state.pth +3 -0
- last-checkpoint/scaler.pt +3 -0
- last-checkpoint/scheduler.pt +3 -0
- last-checkpoint/sentencepiece.bpe.model +3 -0
- last-checkpoint/special_tokens_map.json +51 -0
- last-checkpoint/tokenizer_config.json +56 -0
- last-checkpoint/trainer_state.json +759 -0
- last-checkpoint/training_args.bin +3 -0
last-checkpoint/config.json
ADDED
@@ -0,0 +1,36 @@
+{
+  "activation_dropout": 0.15,
+  "activation_function": "gelu",
+  "architectures": [
+    "MBartForConditionalGeneration"
+  ],
+  "attention_dropout": 0.15,
+  "bos_token_id": 0,
+  "classifier_dropout": 0.0,
+  "d_model": 1024,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 12,
+  "decoder_start_token_id": 0,
+  "dropout": 0.15,
+  "dtype": "float32",
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 12,
+  "eos_token_id": 2,
+  "forced_eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_position_embeddings": 1024,
+  "model_type": "mbart",
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "scale_embedding": false,
+  "tokenizer_class": "BartphoTokenizer",
+  "transformers_version": "4.57.3",
+  "use_cache": true,
+  "vocab_size": 40030
+}
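The config above describes a 12-encoder/12-decoder-layer mBART-style model with a 40,030-token BARTpho vocabulary. A minimal sketch of loading this checkpoint for inference with transformers; the local path and Hub repo id are assumptions, adjust them to where the checkpoint actually lives:

# Minimal loading sketch (paths are assumptions).
from transformers import AutoTokenizer, MBartForConditionalGeneration

checkpoint_dir = "last-checkpoint"  # or e.g. "hieptt/vietnamese-correction-ft"

# AutoTokenizer resolves to BartphoTokenizer via tokenizer_config.json.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = MBartForConditionalGeneration.from_pretrained(checkpoint_dir)
model.eval()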
last-checkpoint/dict.txt
ADDED
The diff for this file is too large to render. See raw diff.
last-checkpoint/generation_config.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "bos_token_id": 0,
+  "decoder_start_token_id": 2,
+  "eos_token_id": [
+    2
+  ],
+  "forced_bos_token_id": 0,
+  "forced_eos_token_id": 2,
+  "max_length": 512,
+  "max_new_tokens": 512,
+  "pad_token_id": 1,
+  "transformers_version": "4.57.3"
+}
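These generation defaults (forced BOS 0, forced EOS 2, up to 512 new tokens) are what model.generate() picks up automatically from this file. A minimal usage sketch, assuming the model and tokenizer loaded above; the input string and beam count are illustrative assumptions:

# Sketch of running correction with the saved generation defaults.
text = "toi dang hoc tieng viet"  # hypothetical noisy input
inputs = tokenizer(text, return_tensors="pt")
# max_new_tokens mirrors generation_config.json; num_beams is an assumption.
output_ids = model.generate(**inputs, max_new_tokens=512, num_beams=4)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))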
last-checkpoint/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fa7e8dc76a1ad80fea8451376493e842c4a3fb713cc61e3730ffb7943836e26
+size 1583480280
last-checkpoint/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f61d9e38597f7e1097962bfa578b78425fe844a9ec6398e8bccb06d0d3be805e
+size 3166958572
last-checkpoint/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:172c6e9da1198fecb1104ae5588ac154055d22275bb62749b67f1d60379ff0a7
+size 14645
last-checkpoint/scaler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:27712306309eef7ef52a4de0e2e9c9c3c61f74c51a1439a49e7192d4f554d614
+size 1383
last-checkpoint/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68aec417c91400a5fbe9c98d7447dabd74ed3b0812272a5f21d640985e919bad
+size 1465
last-checkpoint/sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051
last-checkpoint/special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
last-checkpoint/tokenizer_config.json
ADDED
@@ -0,0 +1,56 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "40029": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 1024,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "BartphoTokenizer",
+  "unk_token": "<unk>"
+}
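The added_tokens_decoder above pins the special-token ids (<s>=0, <pad>=1, </s>=2, <unk>=3, <mask>=40029), which should line up with bos/pad/eos ids in config.json. A quick sanity-check sketch, assuming the tokenizer loaded earlier:

# Sanity check: special-token ids should match config.json.
assert tokenizer.bos_token_id == 0
assert tokenizer.pad_token_id == 1
assert tokenizer.eos_token_id == 2
assert tokenizer.unk_token_id == 3
assert tokenizer.mask_token_id == 40029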
last-checkpoint/trainer_state.json
ADDED
@@ -0,0 +1,759 @@
+{
+  "best_global_step": 1000,
+  "best_metric": 1.7293134927749634,
+  "best_model_checkpoint": "hieptt/vietnamese-correction-ft/checkpoint-1000",
+  "epoch": 0.024186136506554445,
+  "eval_steps": 1000,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 2.4186136506554445e-05,
+      "grad_norm": 6.7611775398254395,
+      "learning_rate": 0.0,
+      "loss": 4.8761,
+      "step": 1
+    },
+    {
+      "epoch": 0.00024186136506554442,
+      "grad_norm": 7.387487888336182,
+      "learning_rate": 4.5e-07,
+      "loss": 4.9048,
+      "step": 10
+    },
+    {
+      "epoch": 0.00048372273013108885,
+      "grad_norm": 9.89797592163086,
+      "learning_rate": 9.5e-07,
+      "loss": 4.7162,
+      "step": 20
+    },
+    {
+      "epoch": 0.0007255840951966333,
+      "grad_norm": 12.0128755569458,
+      "learning_rate": 1.45e-06,
+      "loss": 4.3937,
+      "step": 30
+    },
+    {
+      "epoch": 0.0009674454602621777,
+      "grad_norm": 13.892581939697266,
+      "learning_rate": 1.95e-06,
+      "loss": 4.0291,
+      "step": 40
+    },
+    {
+      "epoch": 0.0012093068253277222,
+      "grad_norm": 14.026715278625488,
+      "learning_rate": 2.4500000000000003e-06,
+      "loss": 3.5469,
+      "step": 50
+    },
+    {
+      "epoch": 0.0014511681903932665,
+      "grad_norm": 4.720450401306152,
+      "learning_rate": 2.95e-06,
+      "loss": 2.8797,
+      "step": 60
+    },
+    {
+      "epoch": 0.001693029555458811,
+      "grad_norm": 2.526860237121582,
+      "learning_rate": 3.4500000000000004e-06,
+      "loss": 2.3633,
+      "step": 70
+    },
+    {
+      "epoch": 0.0019348909205243554,
+      "grad_norm": 1.6215327978134155,
+      "learning_rate": 3.95e-06,
+      "loss": 2.0775,
+      "step": 80
+    },
+    {
+      "epoch": 0.0021767522855899,
+      "grad_norm": 0.8917969465255737,
+      "learning_rate": 4.45e-06,
+      "loss": 1.9554,
+      "step": 90
+    },
+    {
+      "epoch": 0.0024186136506554445,
+      "grad_norm": 0.8278113603591919,
+      "learning_rate": 4.950000000000001e-06,
+      "loss": 1.887,
+      "step": 100
+    },
+    {
+      "epoch": 0.0026604750157209886,
+      "grad_norm": 0.8582026362419128,
+      "learning_rate": 5.45e-06,
+      "loss": 1.8405,
+      "step": 110
+    },
+    {
+      "epoch": 0.002902336380786533,
+      "grad_norm": 0.7158175110816956,
+      "learning_rate": 5.95e-06,
+      "loss": 1.8095,
+      "step": 120
+    },
+    {
+      "epoch": 0.0031441977458520776,
+      "grad_norm": 0.6908526420593262,
+      "learning_rate": 6.45e-06,
+      "loss": 1.7881,
+      "step": 130
+    },
+    {
+      "epoch": 0.003386059110917622,
+      "grad_norm": 0.746881902217865,
+      "learning_rate": 6.950000000000001e-06,
+      "loss": 1.7647,
+      "step": 140
+    },
+    {
+      "epoch": 0.0036279204759831663,
+      "grad_norm": 0.6858472228050232,
+      "learning_rate": 7.45e-06,
+      "loss": 1.7577,
+      "step": 150
+    },
+    {
+      "epoch": 0.003869781841048711,
+      "grad_norm": 0.7207092642784119,
+      "learning_rate": 7.95e-06,
+      "loss": 1.7499,
+      "step": 160
+    },
+    {
+      "epoch": 0.004111643206114255,
+      "grad_norm": 0.6050883531570435,
+      "learning_rate": 8.45e-06,
+      "loss": 1.741,
+      "step": 170
+    },
+    {
+      "epoch": 0.0043535045711798,
+      "grad_norm": 0.7387958765029907,
+      "learning_rate": 8.95e-06,
+      "loss": 1.7283,
+      "step": 180
+    },
+    {
+      "epoch": 0.004595365936245344,
+      "grad_norm": 0.7111212015151978,
+      "learning_rate": 9.450000000000001e-06,
+      "loss": 1.707,
+      "step": 190
+    },
+    {
+      "epoch": 0.004837227301310889,
+      "grad_norm": 0.6965436935424805,
+      "learning_rate": 9.950000000000001e-06,
+      "loss": 1.706,
+      "step": 200
+    },
+    {
+      "epoch": 0.005079088666376433,
+      "grad_norm": 0.6116521954536438,
+      "learning_rate": 1.045e-05,
+      "loss": 1.7065,
+      "step": 210
+    },
+    {
+      "epoch": 0.005320950031441977,
+      "grad_norm": 0.5820265412330627,
+      "learning_rate": 1.095e-05,
+      "loss": 1.6962,
+      "step": 220
+    },
+    {
+      "epoch": 0.005562811396507522,
+      "grad_norm": 0.6360993981361389,
+      "learning_rate": 1.145e-05,
+      "loss": 1.6822,
+      "step": 230
+    },
+    {
+      "epoch": 0.005804672761573066,
+      "grad_norm": 0.5463513731956482,
+      "learning_rate": 1.195e-05,
+      "loss": 1.6794,
+      "step": 240
+    },
+    {
+      "epoch": 0.006046534126638611,
+      "grad_norm": 0.5612601041793823,
+      "learning_rate": 1.2450000000000001e-05,
+      "loss": 1.6587,
+      "step": 250
+    },
+    {
+      "epoch": 0.006288395491704155,
+      "grad_norm": 0.657686173915863,
+      "learning_rate": 1.2950000000000001e-05,
+      "loss": 1.6756,
+      "step": 260
+    },
+    {
+      "epoch": 0.006530256856769699,
+      "grad_norm": 0.6136758327484131,
+      "learning_rate": 1.3450000000000002e-05,
+      "loss": 1.6723,
+      "step": 270
+    },
+    {
+      "epoch": 0.006772118221835244,
+      "grad_norm": 0.6555290818214417,
+      "learning_rate": 1.3950000000000002e-05,
+      "loss": 1.6605,
+      "step": 280
+    },
+    {
+      "epoch": 0.007013979586900788,
+      "grad_norm": 0.5237599611282349,
+      "learning_rate": 1.4449999999999999e-05,
+      "loss": 1.6497,
+      "step": 290
+    },
+    {
+      "epoch": 0.0072558409519663325,
+      "grad_norm": 0.617452085018158,
+      "learning_rate": 1.4950000000000001e-05,
+      "loss": 1.6441,
+      "step": 300
+    },
+    {
+      "epoch": 0.0074977023170318775,
+      "grad_norm": 0.5212193727493286,
+      "learning_rate": 1.545e-05,
+      "loss": 1.6478,
+      "step": 310
+    },
+    {
+      "epoch": 0.007739563682097422,
+      "grad_norm": 0.41100624203681946,
+      "learning_rate": 1.595e-05,
+      "loss": 1.6405,
+      "step": 320
+    },
+    {
+      "epoch": 0.007981425047162967,
+      "grad_norm": 0.5392923355102539,
+      "learning_rate": 1.645e-05,
+      "loss": 1.6441,
+      "step": 330
+    },
+    {
+      "epoch": 0.00822328641222851,
+      "grad_norm": 0.5176446437835693,
+      "learning_rate": 1.6950000000000002e-05,
+      "loss": 1.6401,
+      "step": 340
+    },
+    {
+      "epoch": 0.008465147777294055,
+      "grad_norm": 0.4990129768848419,
+      "learning_rate": 1.745e-05,
+      "loss": 1.6385,
+      "step": 350
+    },
+    {
+      "epoch": 0.0087070091423596,
+      "grad_norm": 0.6200714111328125,
+      "learning_rate": 1.795e-05,
+      "loss": 1.6222,
+      "step": 360
+    },
+    {
+      "epoch": 0.008948870507425145,
+      "grad_norm": 0.4544118046760559,
+      "learning_rate": 1.845e-05,
+      "loss": 1.6267,
+      "step": 370
+    },
+    {
+      "epoch": 0.009190731872490688,
+      "grad_norm": 0.4579808712005615,
+      "learning_rate": 1.895e-05,
+      "loss": 1.6265,
+      "step": 380
+    },
+    {
+      "epoch": 0.009432593237556233,
+      "grad_norm": 0.5392481088638306,
+      "learning_rate": 1.9450000000000002e-05,
+      "loss": 1.6208,
+      "step": 390
+    },
+    {
+      "epoch": 0.009674454602621778,
+      "grad_norm": 0.4114883840084076,
+      "learning_rate": 1.995e-05,
+      "loss": 1.6137,
+      "step": 400
+    },
+    {
+      "epoch": 0.009916315967687321,
+      "grad_norm": 0.4515584409236908,
+      "learning_rate": 2.045e-05,
+      "loss": 1.6196,
+      "step": 410
+    },
+    {
+      "epoch": 0.010158177332752866,
+      "grad_norm": 0.585402250289917,
+      "learning_rate": 2.095e-05,
+      "loss": 1.6049,
+      "step": 420
+    },
+    {
+      "epoch": 0.010400038697818411,
+      "grad_norm": 0.6052455902099609,
+      "learning_rate": 2.145e-05,
+      "loss": 1.6083,
+      "step": 430
+    },
+    {
+      "epoch": 0.010641900062883954,
+      "grad_norm": 0.5212528109550476,
+      "learning_rate": 2.195e-05,
+      "loss": 1.6074,
+      "step": 440
+    },
+    {
+      "epoch": 0.0108837614279495,
+      "grad_norm": 0.5251073837280273,
+      "learning_rate": 2.245e-05,
+      "loss": 1.6033,
+      "step": 450
+    },
+    {
+      "epoch": 0.011125622793015044,
+      "grad_norm": 0.5946005582809448,
+      "learning_rate": 2.2950000000000002e-05,
+      "loss": 1.5976,
+      "step": 460
+    },
+    {
+      "epoch": 0.011367484158080587,
+      "grad_norm": 0.5436714887619019,
+      "learning_rate": 2.345e-05,
+      "loss": 1.5982,
+      "step": 470
+    },
+    {
+      "epoch": 0.011609345523146132,
+      "grad_norm": 0.4346768260002136,
+      "learning_rate": 2.395e-05,
+      "loss": 1.5983,
+      "step": 480
+    },
+    {
+      "epoch": 0.011851206888211677,
+      "grad_norm": 0.4886501729488373,
+      "learning_rate": 2.445e-05,
+      "loss": 1.5915,
+      "step": 490
+    },
+    {
+      "epoch": 0.012093068253277222,
+      "grad_norm": 0.41521257162094116,
+      "learning_rate": 2.495e-05,
+      "loss": 1.5944,
+      "step": 500
+    },
+    {
+      "epoch": 0.012334929618342766,
+      "grad_norm": 0.42215368151664734,
+      "learning_rate": 2.5450000000000002e-05,
+      "loss": 1.5959,
+      "step": 510
+    },
+    {
+      "epoch": 0.01257679098340831,
+      "grad_norm": 0.5773257613182068,
+      "learning_rate": 2.595e-05,
+      "loss": 1.5945,
+      "step": 520
+    },
+    {
+      "epoch": 0.012818652348473855,
+      "grad_norm": 0.4475298225879669,
+      "learning_rate": 2.6450000000000003e-05,
+      "loss": 1.5976,
+      "step": 530
+    },
+    {
+      "epoch": 0.013060513713539399,
+      "grad_norm": 0.44435611367225647,
+      "learning_rate": 2.6950000000000005e-05,
+      "loss": 1.586,
+      "step": 540
+    },
+    {
+      "epoch": 0.013302375078604944,
+      "grad_norm": 0.3894321620464325,
+      "learning_rate": 2.7450000000000003e-05,
+      "loss": 1.591,
+      "step": 550
+    },
+    {
+      "epoch": 0.013544236443670489,
+      "grad_norm": 0.5355809926986694,
+      "learning_rate": 2.7950000000000005e-05,
+      "loss": 1.5951,
+      "step": 560
+    },
+    {
+      "epoch": 0.013786097808736032,
+      "grad_norm": 0.42732134461402893,
+      "learning_rate": 2.845e-05,
+      "loss": 1.5921,
+      "step": 570
+    },
+    {
+      "epoch": 0.014027959173801577,
+      "grad_norm": 0.4931578040122986,
+      "learning_rate": 2.895e-05,
+      "loss": 1.5833,
+      "step": 580
+    },
+    {
+      "epoch": 0.014269820538867122,
+      "grad_norm": 0.4680155813694,
+      "learning_rate": 2.945e-05,
+      "loss": 1.5854,
+      "step": 590
+    },
+    {
+      "epoch": 0.014511681903932665,
+      "grad_norm": 0.4610539674758911,
+      "learning_rate": 2.995e-05,
+      "loss": 1.5796,
+      "step": 600
+    },
+    {
+      "epoch": 0.01475354326899821,
+      "grad_norm": 0.5503636002540588,
+      "learning_rate": 3.045e-05,
+      "loss": 1.5765,
+      "step": 610
+    },
+    {
+      "epoch": 0.014995404634063755,
+      "grad_norm": 0.5213884711265564,
+      "learning_rate": 3.095e-05,
+      "loss": 1.5788,
+      "step": 620
+    },
+    {
+      "epoch": 0.015237265999129298,
+      "grad_norm": 0.44397738575935364,
+      "learning_rate": 3.145e-05,
+      "loss": 1.5799,
+      "step": 630
+    },
+    {
+      "epoch": 0.015479127364194843,
+      "grad_norm": 0.4421987235546112,
+      "learning_rate": 3.1950000000000004e-05,
+      "loss": 1.5818,
+      "step": 640
+    },
+    {
+      "epoch": 0.015720988729260386,
+      "grad_norm": 0.48444682359695435,
+      "learning_rate": 3.245e-05,
+      "loss": 1.5672,
+      "step": 650
+    },
+    {
+      "epoch": 0.015962850094325933,
+      "grad_norm": 0.3913522958755493,
+      "learning_rate": 3.295e-05,
+      "loss": 1.5795,
+      "step": 660
+    },
+    {
+      "epoch": 0.016204711459391476,
+      "grad_norm": 0.5272910594940186,
+      "learning_rate": 3.345000000000001e-05,
+      "loss": 1.5814,
+      "step": 670
+    },
+    {
+      "epoch": 0.01644657282445702,
+      "grad_norm": 0.6935471296310425,
+      "learning_rate": 3.3950000000000005e-05,
+      "loss": 1.5796,
+      "step": 680
+    },
+    {
+      "epoch": 0.016688434189522566,
+      "grad_norm": 0.45269250869750977,
+      "learning_rate": 3.445e-05,
+      "loss": 1.5775,
+      "step": 690
+    },
+    {
+      "epoch": 0.01693029555458811,
+      "grad_norm": 0.5092645883560181,
+      "learning_rate": 3.495e-05,
+      "loss": 1.5734,
+      "step": 700
+    },
+    {
+      "epoch": 0.017172156919653656,
+      "grad_norm": 0.45498231053352356,
+      "learning_rate": 3.545e-05,
+      "loss": 1.5763,
+      "step": 710
+    },
+    {
+      "epoch": 0.0174140182847192,
+      "grad_norm": 0.7897817492485046,
+      "learning_rate": 3.595e-05,
+      "loss": 1.5702,
+      "step": 720
+    },
+    {
+      "epoch": 0.017655879649784743,
+      "grad_norm": 0.5634914040565491,
+      "learning_rate": 3.645e-05,
+      "loss": 1.5794,
+      "step": 730
+    },
+    {
+      "epoch": 0.01789774101485029,
+      "grad_norm": 0.47791045904159546,
+      "learning_rate": 3.6950000000000004e-05,
+      "loss": 1.5667,
+      "step": 740
+    },
+    {
+      "epoch": 0.018139602379915833,
+      "grad_norm": 0.5094493627548218,
+      "learning_rate": 3.745e-05,
+      "loss": 1.5741,
+      "step": 750
+    },
+    {
+      "epoch": 0.018381463744981376,
+      "grad_norm": 0.8162134289741516,
+      "learning_rate": 3.795e-05,
+      "loss": 1.5617,
+      "step": 760
+    },
+    {
+      "epoch": 0.018623325110046923,
+      "grad_norm": 0.5486655235290527,
+      "learning_rate": 3.845e-05,
+      "loss": 1.567,
+      "step": 770
+    },
+    {
+      "epoch": 0.018865186475112466,
+      "grad_norm": 0.5235345959663391,
+      "learning_rate": 3.8950000000000005e-05,
+      "loss": 1.5721,
+      "step": 780
+    },
+    {
+      "epoch": 0.01910704784017801,
+      "grad_norm": 0.5451304316520691,
+      "learning_rate": 3.9450000000000003e-05,
+      "loss": 1.5636,
+      "step": 790
+    },
+    {
+      "epoch": 0.019348909205243556,
+      "grad_norm": 0.9819433093070984,
+      "learning_rate": 3.995e-05,
+      "loss": 1.5763,
+      "step": 800
+    },
+    {
+      "epoch": 0.0195907705703091,
+      "grad_norm": 0.5126760601997375,
+      "learning_rate": 4.045000000000001e-05,
+      "loss": 1.5624,
+      "step": 810
+    },
+    {
+      "epoch": 0.019832631935374642,
+      "grad_norm": 0.4358855187892914,
+      "learning_rate": 4.095e-05,
+      "loss": 1.5598,
+      "step": 820
+    },
+    {
+      "epoch": 0.02007449330044019,
+      "grad_norm": 0.46795687079429626,
+      "learning_rate": 4.145e-05,
+      "loss": 1.5662,
+      "step": 830
+    },
+    {
+      "epoch": 0.020316354665505732,
+      "grad_norm": 0.7062624096870422,
+      "learning_rate": 4.195e-05,
+      "loss": 1.5606,
+      "step": 840
+    },
+    {
+      "epoch": 0.020558216030571275,
+      "grad_norm": 0.4150901734828949,
+      "learning_rate": 4.245e-05,
+      "loss": 1.5537,
+      "step": 850
+    },
+    {
+      "epoch": 0.020800077395636822,
+      "grad_norm": 0.43676918745040894,
+      "learning_rate": 4.295e-05,
+      "loss": 1.559,
+      "step": 860
+    },
+    {
+      "epoch": 0.021041938760702365,
+      "grad_norm": 0.43422532081604004,
+      "learning_rate": 4.345e-05,
+      "loss": 1.5578,
+      "step": 870
+    },
+    {
+      "epoch": 0.02128380012576791,
+      "grad_norm": 0.5572603940963745,
+      "learning_rate": 4.3950000000000004e-05,
+      "loss": 1.56,
+      "step": 880
+    },
+    {
+      "epoch": 0.021525661490833455,
+      "grad_norm": 0.5086297392845154,
+      "learning_rate": 4.445e-05,
+      "loss": 1.5552,
+      "step": 890
+    },
+    {
+      "epoch": 0.021767522855899,
+      "grad_norm": 0.4402131736278534,
+      "learning_rate": 4.495e-05,
+      "loss": 1.5529,
+      "step": 900
+    },
+    {
+      "epoch": 0.02200938422096454,
+      "grad_norm": 0.48463812470436096,
+      "learning_rate": 4.545000000000001e-05,
+      "loss": 1.5545,
+      "step": 910
+    },
+    {
+      "epoch": 0.02225124558603009,
+      "grad_norm": 0.43822240829467773,
+      "learning_rate": 4.5950000000000006e-05,
+      "loss": 1.5478,
+      "step": 920
+    },
+    {
+      "epoch": 0.02249310695109563,
+      "grad_norm": 0.4400993883609772,
+      "learning_rate": 4.6450000000000004e-05,
+      "loss": 1.5394,
+      "step": 930
+    },
+    {
+      "epoch": 0.022734968316161175,
+      "grad_norm": 0.5330896377563477,
+      "learning_rate": 4.695e-05,
+      "loss": 1.5495,
+      "step": 940
+    },
+    {
+      "epoch": 0.02297682968122672,
+      "grad_norm": 0.48620447516441345,
+      "learning_rate": 4.745e-05,
+      "loss": 1.5498,
+      "step": 950
+    },
+    {
+      "epoch": 0.023218691046292265,
+      "grad_norm": 0.587838351726532,
+      "learning_rate": 4.795e-05,
+      "loss": 1.5612,
+      "step": 960
+    },
+    {
+      "epoch": 0.023460552411357808,
+      "grad_norm": 0.7441233396530151,
+      "learning_rate": 4.845e-05,
+      "loss": 1.5732,
+      "step": 970
+    },
+    {
+      "epoch": 0.023702413776423355,
+      "grad_norm": 1.0196274518966675,
+      "learning_rate": 4.8950000000000004e-05,
+      "loss": 1.7309,
+      "step": 980
+    },
+    {
+      "epoch": 0.023944275141488898,
+      "grad_norm": 2.096895217895508,
+      "learning_rate": 4.945e-05,
+      "loss": 1.9244,
+      "step": 990
+    },
+    {
+      "epoch": 0.024186136506554445,
+      "grad_norm": 0.6858440637588501,
+      "learning_rate": 4.995e-05,
+      "loss": 1.7569,
+      "step": 1000
+    },
+    {
+      "epoch": 0.024186136506554445,
+      "eval_loss": 1.7293134927749634,
+      "eval_runtime": 1146.3923,
+      "eval_sacrebleu": 88.63943237810263,
+      "eval_samples_per_second": 88.143,
+      "eval_steps_per_second": 0.689,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 50000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 5,
+        "early_stopping_threshold": 0.0001
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.054005594049741e+16,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}
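trainer_state.json, together with optimizer.pt, scheduler.pt, scaler.pt and rng_state.pth, is what lets the Trainer continue from step 1000 instead of restarting. A minimal resume sketch; the arguments shown (batch size 64, eval/save every 1000 steps, logging every 10) mirror this state file, while output_dir, the dataset objects, and the generate/early-stopping setup are placeholders or assumptions:

# Sketch of resuming training from this checkpoint; datasets are placeholders.
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    output_dir="vietnamese-correction-ft",   # assumption
    per_device_train_batch_size=64,
    eval_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=10,
    num_train_epochs=2,
    predict_with_generate=True,
    load_best_model_at_end=True,
)
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,  # placeholder
    eval_dataset=eval_dataset,    # placeholder
)
trainer.train(resume_from_checkpoint="last-checkpoint")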
last-checkpoint/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8ac17f5232af6c5d50e0e4fa5343f65e47cffc82fe93621b234d0fdfe916e7e
+size 6033