Training in progress, epoch 1, checkpoint
- last-checkpoint/config.json +31 -0
- last-checkpoint/generation_config.json +7 -0
- last-checkpoint/model.safetensors +3 -0
- last-checkpoint/optimizer.pt +3 -0
- last-checkpoint/rng_state.pth +3 -0
- last-checkpoint/scheduler.pt +3 -0
- last-checkpoint/trainer_state.json +1154 -0
- last-checkpoint/training_args.bin +3 -0
last-checkpoint/config.json
ADDED
@@ -0,0 +1,31 @@
{
  "_name_or_path": "facebook/opt-350m",
  "_remove_final_layer_norm": false,
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "OPTForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "do_layer_norm_before": false,
  "dropout": 0.1,
  "enable_bias": true,
  "eos_token_id": 2,
  "ffn_dim": 4096,
  "hidden_size": 1024,
  "init_std": 0.02,
  "layer_norm_elementwise_affine": true,
  "layerdrop": 0.0,
  "max_position_embeddings": 2048,
  "model_type": "opt",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "prefix": "</s>",
  "torch_dtype": "float32",
  "transformers_version": "4.47.1",
  "use_cache": true,
  "vocab_size": 50272,
  "word_embed_proj_dim": 512
}
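The config above is the facebook/opt-350m architecture saved with the checkpoint. A minimal sketch of loading this checkpoint for inference with the transformers library; it assumes the files in this commit have been downloaded into a local directory named last-checkpoint (the directory name here mirrors the repo layout, the local path is an assumption), and that the tokenizer is taken from the base model since no tokenizer files are part of this commit:

# Minimal sketch: load the saved checkpoint for inference.
# Assumes transformers and torch are installed and the files above live in
# a local "last-checkpoint" directory (hypothetical local path).
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("last-checkpoint")
# The commit contains no tokenizer files, so load the tokenizer from the
# base model named in "_name_or_path".
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))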
last-checkpoint/generation_config.json
ADDED
@@ -0,0 +1,7 @@
{
  "_from_model_config": true,
  "bos_token_id": 2,
  "eos_token_id": 2,
  "pad_token_id": 1,
  "transformers_version": "4.47.1"
}
last-checkpoint/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d1d7f3cfa105d55d981feea87dc1698d387783a26648d9c65ce26728eec859c4
size 1324830880
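model.safetensors, optimizer.pt, rng_state.pth, scheduler.pt, and training_args.bin are stored as Git LFS pointer files: only the oid/size stub lives in the repository, and the actual payloads (about 1.3 GB of weights and 2.6 GB of optimizer state) are fetched separately. A sketch of resolving the real files through huggingface_hub; the repo id below is a placeholder, since this commit does not state which repository it belongs to:

# Sketch: download the resolved checkpoint files instead of the LFS stubs.
# "user/opt-350m-finetune" is a placeholder repo id (not given in this commit).
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="user/opt-350m-finetune",
    allow_patterns=["last-checkpoint/*"],
)
print(local_dir)  # local directory containing the materialized checkpoint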
last-checkpoint/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:df180fbce5b0710b2f5ccb4689f53c435b2b47130f21918e3f901a58801e0b69
size 2649896094
last-checkpoint/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:97132a0bd0373c36b3bd93d6b28f9802a143f265a6eaa7b3097196f6002ba451
size 14244
last-checkpoint/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:11e3a63c9cfc5f5b20ed4deea6d33c221d465c076914b8f644e71e94dbcbcdbc
size 1064
last-checkpoint/trainer_state.json
ADDED
@@ -0,0 +1,1154 @@
{
  "best_metric": 0.371297687292099,
  "best_model_checkpoint": "./opt_trained/checkpoint-159",
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 159,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.006314127861089187, "grad_norm": Infinity, "learning_rate": 0.0001, "loss": 6.0008, "step": 1},
    {"epoch": 0.012628255722178374, "grad_norm": 161.0778350830078, "learning_rate": 9.984177215189874e-05, "loss": 6.1896, "step": 2},
    {"epoch": 0.018942383583267563, "grad_norm": Infinity, "learning_rate": 9.984177215189874e-05, "loss": 20.1988, "step": 3},
    {"epoch": 0.025256511444356748, "grad_norm": Infinity, "learning_rate": 9.984177215189874e-05, "loss": 19.8097, "step": 4},
    {"epoch": 0.03157063930544594, "grad_norm": Infinity, "learning_rate": 9.984177215189874e-05, "loss": 19.9597, "step": 5},
    {"epoch": 0.037884767166535126, "grad_norm": 584.6585083007812, "learning_rate": 9.968354430379747e-05, "loss": 20.0818, "step": 6},
    {"epoch": 0.04419889502762431, "grad_norm": 723.80224609375, "learning_rate": 9.952531645569621e-05, "loss": 18.7822, "step": 7},
    {"epoch": 0.050513022888713496, "grad_norm": 608.1036376953125, "learning_rate": 9.936708860759493e-05, "loss": 27.5703, "step": 8},
    {"epoch": 0.056827150749802685, "grad_norm": 416.6692199707031, "learning_rate": 9.920886075949367e-05, "loss": 23.7453, "step": 9},
    {"epoch": 0.06314127861089187, "grad_norm": 377.489990234375, "learning_rate": 9.90506329113924e-05, "loss": 18.0302, "step": 10},
    {"epoch": 0.06945540647198106, "grad_norm": 187.46090698242188, "learning_rate": 9.889240506329115e-05, "loss": 13.8748, "step": 11},
    {"epoch": 0.07576953433307025, "grad_norm": 93.60346984863281, "learning_rate": 9.873417721518988e-05, "loss": 12.0784, "step": 12},
    {"epoch": 0.08208366219415943, "grad_norm": 99.17755889892578, "learning_rate": 9.857594936708862e-05, "loss": 10.016, "step": 13},
    {"epoch": 0.08839779005524862, "grad_norm": 191.79139709472656, "learning_rate": 9.841772151898735e-05, "loss": 9.5476, "step": 14},
    {"epoch": 0.0947119179163378, "grad_norm": 264.03436279296875, "learning_rate": 9.825949367088608e-05, "loss": 10.4265, "step": 15},
    {"epoch": 0.10102604577742699, "grad_norm": 212.4002685546875, "learning_rate": 9.810126582278482e-05, "loss": 10.2333, "step": 16},
    {"epoch": 0.10734017363851618, "grad_norm": 132.50941467285156, "learning_rate": 9.794303797468355e-05, "loss": 8.4413, "step": 17},
    {"epoch": 0.11365430149960537, "grad_norm": 197.4528350830078, "learning_rate": 9.778481012658228e-05, "loss": 8.8448, "step": 18},
    {"epoch": 0.11996842936069456, "grad_norm": 72.22266387939453, "learning_rate": 9.762658227848101e-05, "loss": 7.3288, "step": 19},
    {"epoch": 0.12628255722178375, "grad_norm": 99.05847930908203, "learning_rate": 9.746835443037975e-05, "loss": 6.6077, "step": 20},
    {"epoch": 0.13259668508287292, "grad_norm": 131.28546142578125, "learning_rate": 9.731012658227849e-05, "loss": 6.2343, "step": 21},
    {"epoch": 0.13891081294396213, "grad_norm": 38.381507873535156, "learning_rate": 9.715189873417721e-05, "loss": 4.9639, "step": 22},
    {"epoch": 0.1452249408050513, "grad_norm": 96.89019775390625, "learning_rate": 9.699367088607595e-05, "loss": 5.2425, "step": 23},
    {"epoch": 0.1515390686661405, "grad_norm": 91.1437759399414, "learning_rate": 9.683544303797469e-05, "loss": 4.7628, "step": 24},
    {"epoch": 0.15785319652722968, "grad_norm": 115.70115661621094, "learning_rate": 9.667721518987343e-05, "loss": 4.8574, "step": 25},
    {"epoch": 0.16416732438831885, "grad_norm": 54.2802734375, "learning_rate": 9.651898734177216e-05, "loss": 4.3247, "step": 26},
    {"epoch": 0.17048145224940806, "grad_norm": 83.99079895019531, "learning_rate": 9.63607594936709e-05, "loss": 4.6229, "step": 27},
    {"epoch": 0.17679558011049723, "grad_norm": 38.651798248291016, "learning_rate": 9.620253164556962e-05, "loss": 4.097, "step": 28},
    {"epoch": 0.18310970797158643, "grad_norm": 80.54218292236328, "learning_rate": 9.604430379746836e-05, "loss": 4.1475, "step": 29},
    {"epoch": 0.1894238358326756, "grad_norm": 57.01435470581055, "learning_rate": 9.58860759493671e-05, "loss": 4.1019, "step": 30},
    {"epoch": 0.1957379636937648, "grad_norm": 26.745500564575195, "learning_rate": 9.572784810126582e-05, "loss": 3.8805, "step": 31},
    {"epoch": 0.20205209155485399, "grad_norm": 31.093868255615234, "learning_rate": 9.556962025316456e-05, "loss": 3.8496, "step": 32},
    {"epoch": 0.20836621941594316, "grad_norm": 23.809965133666992, "learning_rate": 9.541139240506329e-05, "loss": 4.0042, "step": 33},
    {"epoch": 0.21468034727703236, "grad_norm": 21.90875816345215, "learning_rate": 9.525316455696203e-05, "loss": 3.4837, "step": 34},
    {"epoch": 0.22099447513812154, "grad_norm": 14.536842346191406, "learning_rate": 9.509493670886075e-05, "loss": 3.2669, "step": 35},
    {"epoch": 0.22730860299921074, "grad_norm": 19.5743408203125, "learning_rate": 9.493670886075949e-05, "loss": 3.5913, "step": 36},
    {"epoch": 0.23362273086029992, "grad_norm": 17.766374588012695, "learning_rate": 9.477848101265823e-05, "loss": 3.7308, "step": 37},
    {"epoch": 0.23993685872138912, "grad_norm": 17.416940689086914, "learning_rate": 9.462025316455697e-05, "loss": 3.5262, "step": 38},
    {"epoch": 0.2462509865824783, "grad_norm": 8.945268630981445, "learning_rate": 9.446202531645571e-05, "loss": 3.4766, "step": 39},
    {"epoch": 0.2525651144435675, "grad_norm": 10.85609245300293, "learning_rate": 9.430379746835444e-05, "loss": 3.4725, "step": 40},
    {"epoch": 0.25887924230465664, "grad_norm": 17.012737274169922, "learning_rate": 9.414556962025317e-05, "loss": 3.3638, "step": 41},
    {"epoch": 0.26519337016574585, "grad_norm": 9.961986541748047, "learning_rate": 9.39873417721519e-05, "loss": 3.4341, "step": 42},
    {"epoch": 0.27150749802683505, "grad_norm": 37.24739074707031, "learning_rate": 9.382911392405064e-05, "loss": 3.7622, "step": 43},
    {"epoch": 0.27782162588792425, "grad_norm": 40.56873321533203, "learning_rate": 9.367088607594936e-05, "loss": 3.4796, "step": 44},
    {"epoch": 0.2841357537490134, "grad_norm": 12.76440143585205, "learning_rate": 9.35126582278481e-05, "loss": 3.6249, "step": 45},
    {"epoch": 0.2904498816101026, "grad_norm": 27.08614730834961, "learning_rate": 9.335443037974684e-05, "loss": 3.5403, "step": 46},
    {"epoch": 0.2967640094711918, "grad_norm": 32.77756881713867, "learning_rate": 9.319620253164557e-05, "loss": 3.5529, "step": 47},
    {"epoch": 0.303078137332281, "grad_norm": 10.868654251098633, "learning_rate": 9.303797468354431e-05, "loss": 3.3178, "step": 48},
    {"epoch": 0.30939226519337015, "grad_norm": 28.319528579711914, "learning_rate": 9.287974683544303e-05, "loss": 3.3276, "step": 49},
    {"epoch": 0.31570639305445936, "grad_norm": 27.239425659179688, "learning_rate": 9.272151898734177e-05, "loss": 3.5792, "step": 50},
    {"epoch": 0.32202052091554856, "grad_norm": 7.753026962280273, "learning_rate": 9.256329113924051e-05, "loss": 3.5363, "step": 51},
    {"epoch": 0.3283346487766377, "grad_norm": 18.737960815429688, "learning_rate": 9.240506329113925e-05, "loss": 3.3422, "step": 52},
    {"epoch": 0.3346487766377269, "grad_norm": 14.71604061126709, "learning_rate": 9.224683544303798e-05, "loss": 3.2696, "step": 53},
    {"epoch": 0.3409629044988161, "grad_norm": 15.523343086242676, "learning_rate": 9.208860759493671e-05, "loss": 3.5421, "step": 54},
    {"epoch": 0.3472770323599053, "grad_norm": 11.16079044342041, "learning_rate": 9.193037974683545e-05, "loss": 3.3622, "step": 55},
    {"epoch": 0.35359116022099446, "grad_norm": 9.385275840759277, "learning_rate": 9.177215189873418e-05, "loss": 3.3368, "step": 56},
    {"epoch": 0.35990528808208366, "grad_norm": 7.030306816101074, "learning_rate": 9.161392405063292e-05, "loss": 3.383, "step": 57},
    {"epoch": 0.36621941594317287, "grad_norm": 8.905327796936035, "learning_rate": 9.145569620253164e-05, "loss": 3.4087, "step": 58},
    {"epoch": 0.372533543804262, "grad_norm": 13.373973846435547, "learning_rate": 9.129746835443038e-05, "loss": 3.407, "step": 59},
    {"epoch": 0.3788476716653512, "grad_norm": 7.750667095184326, "learning_rate": 9.113924050632912e-05, "loss": 3.3293, "step": 60},
    {"epoch": 0.3851617995264404, "grad_norm": 23.99922752380371, "learning_rate": 9.098101265822785e-05, "loss": 3.385, "step": 61},
    {"epoch": 0.3914759273875296, "grad_norm": 20.64397430419922, "learning_rate": 9.082278481012659e-05, "loss": 3.4648, "step": 62},
    {"epoch": 0.39779005524861877, "grad_norm": 8.52364444732666, "learning_rate": 9.066455696202531e-05, "loss": 3.3165, "step": 63},
    {"epoch": 0.40410418310970797, "grad_norm": 10.17910385131836, "learning_rate": 9.050632911392407e-05, "loss": 3.3308, "step": 64},
    {"epoch": 0.4104183109707972, "grad_norm": 11.811700820922852, "learning_rate": 9.034810126582279e-05, "loss": 3.316, "step": 65},
    {"epoch": 0.4167324388318863, "grad_norm": 6.778939723968506, "learning_rate": 9.018987341772153e-05, "loss": 3.0975, "step": 66},
    {"epoch": 0.4230465666929755, "grad_norm": 5.862908840179443, "learning_rate": 9.003164556962026e-05, "loss": 3.1885, "step": 67},
    {"epoch": 0.4293606945540647, "grad_norm": 5.790811538696289, "learning_rate": 8.9873417721519e-05, "loss": 3.2072, "step": 68},
    {"epoch": 0.43567482241515393, "grad_norm": 5.9749064445495605, "learning_rate": 8.971518987341772e-05, "loss": 3.0589, "step": 69},
    {"epoch": 0.4419889502762431, "grad_norm": 11.786849975585938, "learning_rate": 8.955696202531646e-05, "loss": 3.188, "step": 70},
    {"epoch": 0.4483030781373323, "grad_norm": 15.524914741516113, "learning_rate": 8.93987341772152e-05, "loss": 3.2465, "step": 71},
    {"epoch": 0.4546172059984215, "grad_norm": 14.146249771118164, "learning_rate": 8.924050632911392e-05, "loss": 3.2995, "step": 72},
    {"epoch": 0.46093133385951063, "grad_norm": 3.7766897678375244, "learning_rate": 8.908227848101266e-05, "loss": 3.2287, "step": 73},
    {"epoch": 0.46724546172059983, "grad_norm": 13.466503143310547, "learning_rate": 8.892405063291139e-05, "loss": 3.2794, "step": 74},
    {"epoch": 0.47355958958168903, "grad_norm": 9.60093879699707, "learning_rate": 8.876582278481013e-05, "loss": 3.2478, "step": 75},
    {"epoch": 0.47987371744277824, "grad_norm": 8.259520530700684, "learning_rate": 8.860759493670887e-05, "loss": 3.3586, "step": 76},
    {"epoch": 0.4861878453038674, "grad_norm": 6.9965410232543945, "learning_rate": 8.84493670886076e-05, "loss": 3.2365, "step": 77},
    {"epoch": 0.4925019731649566, "grad_norm": 12.788185119628906, "learning_rate": 8.829113924050633e-05, "loss": 3.2107, "step": 78},
    {"epoch": 0.4988161010260458, "grad_norm": 10.677863121032715, "learning_rate": 8.813291139240507e-05, "loss": 3.2843, "step": 79},
    {"epoch": 0.505130228887135, "grad_norm": 8.940823554992676, "learning_rate": 8.797468354430381e-05, "loss": 3.1655, "step": 80},
    {"epoch": 0.5114443567482242, "grad_norm": 4.832064628601074, "learning_rate": 8.781645569620253e-05, "loss": 3.1706, "step": 81},
    {"epoch": 0.5177584846093133, "grad_norm": 10.519792556762695, "learning_rate": 8.765822784810127e-05, "loss": 3.2113, "step": 82},
    {"epoch": 0.5240726124704025, "grad_norm": 21.498655319213867, "learning_rate": 8.75e-05, "loss": 3.0548, "step": 83},
    {"epoch": 0.5303867403314917, "grad_norm": 16.82743263244629, "learning_rate": 8.734177215189874e-05, "loss": 3.0727, "step": 84},
    {"epoch": 0.5367008681925809, "grad_norm": 17.08803367614746, "learning_rate": 8.718354430379748e-05, "loss": 3.1257, "step": 85},
    {"epoch": 0.5430149960536701, "grad_norm": 8.622400283813477, "learning_rate": 8.70253164556962e-05, "loss": 3.1575, "step": 86},
    {"epoch": 0.5493291239147593, "grad_norm": 17.043874740600586, "learning_rate": 8.686708860759494e-05, "loss": 3.2683, "step": 87},
    {"epoch": 0.5556432517758485, "grad_norm": 11.532078742980957, "learning_rate": 8.670886075949367e-05, "loss": 3.1915, "step": 88},
    {"epoch": 0.5619573796369376, "grad_norm": 23.534517288208008, "learning_rate": 8.65506329113924e-05, "loss": 3.2199, "step": 89},
    {"epoch": 0.5682715074980268, "grad_norm": 20.578760147094727, "learning_rate": 8.639240506329115e-05, "loss": 3.052, "step": 90},
    {"epoch": 0.574585635359116, "grad_norm": 7.6293535232543945, "learning_rate": 8.623417721518988e-05, "loss": 3.1255, "step": 91},
    {"epoch": 0.5808997632202052, "grad_norm": 13.604530334472656, "learning_rate": 8.607594936708861e-05, "loss": 3.2718, "step": 92},
    {"epoch": 0.5872138910812944, "grad_norm": 9.306023597717285, "learning_rate": 8.591772151898735e-05, "loss": 3.2007, "step": 93},
    {"epoch": 0.5935280189423836, "grad_norm": 15.916861534118652, "learning_rate": 8.575949367088609e-05, "loss": 3.0144, "step": 94},
    {"epoch": 0.5998421468034728, "grad_norm": 11.088306427001953, "learning_rate": 8.560126582278481e-05, "loss": 3.0094, "step": 95},
    {"epoch": 0.606156274664562, "grad_norm": 12.851061820983887, "learning_rate": 8.544303797468355e-05, "loss": 3.0338, "step": 96},
    {"epoch": 0.6124704025256511, "grad_norm": 8.163888931274414, "learning_rate": 8.528481012658228e-05, "loss": 3.0715, "step": 97},
    {"epoch": 0.6187845303867403, "grad_norm": 10.81649398803711, "learning_rate": 8.512658227848102e-05, "loss": 3.1136, "step": 98},
    {"epoch": 0.6250986582478295, "grad_norm": 4.056423664093018, "learning_rate": 8.496835443037974e-05, "loss": 3.1062, "step": 99},
    {"epoch": 0.6314127861089187, "grad_norm": 10.310821533203125, "learning_rate": 8.481012658227848e-05, "loss": 3.1323, "step": 100},
    {"epoch": 0.6377269139700079, "grad_norm": 4.946181774139404, "learning_rate": 8.465189873417722e-05, "loss": 3.112, "step": 101},
    {"epoch": 0.6440410418310971, "grad_norm": 12.03683853149414, "learning_rate": 8.449367088607595e-05, "loss": 3.0755, "step": 102},
    {"epoch": 0.6503551696921863, "grad_norm": 8.58892822265625, "learning_rate": 8.43354430379747e-05, "loss": 3.114, "step": 103},
    {"epoch": 0.6566692975532754, "grad_norm": 4.111664295196533, "learning_rate": 8.417721518987342e-05, "loss": 3.1824, "step": 104},
    {"epoch": 0.6629834254143646, "grad_norm": 8.031673431396484, "learning_rate": 8.401898734177216e-05, "loss": 3.0504, "step": 105},
    {"epoch": 0.6692975532754538, "grad_norm": 8.085382461547852, "learning_rate": 8.386075949367089e-05, "loss": 2.9141, "step": 106},
    {"epoch": 0.675611681136543, "grad_norm": 3.866814374923706, "learning_rate": 8.370253164556963e-05, "loss": 2.9354, "step": 107},
    {"epoch": 0.6819258089976322, "grad_norm": 21.265792846679688, "learning_rate": 8.354430379746835e-05, "loss": 3.0994, "step": 108},
    {"epoch": 0.6882399368587214, "grad_norm": 11.916318893432617, "learning_rate": 8.33860759493671e-05, "loss": 3.1088, "step": 109},
    {"epoch": 0.6945540647198106, "grad_norm": 21.871856689453125, "learning_rate": 8.322784810126583e-05, "loss": 3.1318, "step": 110},
    {"epoch": 0.7008681925808997, "grad_norm": 6.908069610595703, "learning_rate": 8.306962025316456e-05, "loss": 3.1491, "step": 111},
    {"epoch": 0.7071823204419889, "grad_norm": 36.063926696777344, "learning_rate": 8.29113924050633e-05, "loss": 3.4333, "step": 112},
    {"epoch": 0.7134964483030781, "grad_norm": 20.55953025817871, "learning_rate": 8.275316455696202e-05, "loss": 3.2292, "step": 113},
    {"epoch": 0.7198105761641673, "grad_norm": 15.739864349365234, "learning_rate": 8.259493670886076e-05, "loss": 2.993, "step": 114},
    {"epoch": 0.7261247040252565, "grad_norm": 15.668102264404297, "learning_rate": 8.243670886075949e-05, "loss": 2.9661, "step": 115},
    {"epoch": 0.7324388318863457, "grad_norm": 13.283061981201172, "learning_rate": 8.227848101265824e-05, "loss": 3.3498, "step": 116},
    {"epoch": 0.7387529597474349, "grad_norm": 10.03203296661377, "learning_rate": 8.212025316455697e-05, "loss": 3.0689, "step": 117},
    {"epoch": 0.745067087608524, "grad_norm": 7.466861248016357, "learning_rate": 8.19620253164557e-05, "loss": 3.0641, "step": 118},
    {"epoch": 0.7513812154696132, "grad_norm": 12.9628267288208, "learning_rate": 8.180379746835444e-05, "loss": 2.982, "step": 119},
    {"epoch": 0.7576953433307024, "grad_norm": 6.9294023513793945, "learning_rate": 8.164556962025317e-05, "loss": 3.1472, "step": 120},
    {"epoch": 0.7640094711917916, "grad_norm": 13.094769477844238, "learning_rate": 8.148734177215191e-05, "loss": 3.1785, "step": 121},
    {"epoch": 0.7703235990528808, "grad_norm": 15.382439613342285, "learning_rate": 8.132911392405063e-05, "loss": 3.1517, "step": 122},
    {"epoch": 0.77663772691397, "grad_norm": 3.922220230102539, "learning_rate": 8.117088607594937e-05, "loss": 3.0825, "step": 123},
    {"epoch": 0.7829518547750592, "grad_norm": 13.367452621459961, "learning_rate": 8.10126582278481e-05, "loss": 3.1165, "step": 124},
    {"epoch": 0.7892659826361483, "grad_norm": 13.817737579345703, "learning_rate": 8.085443037974684e-05, "loss": 3.3522, "step": 125},
    {"epoch": 0.7955801104972375, "grad_norm": 6.693469524383545, "learning_rate": 8.069620253164558e-05, "loss": 2.9785, "step": 126},
    {"epoch": 0.8018942383583267, "grad_norm": 6.970970630645752, "learning_rate": 8.05379746835443e-05, "loss": 3.2253, "step": 127},
    {"epoch": 0.8082083662194159, "grad_norm": 6.731435775756836, "learning_rate": 8.037974683544304e-05, "loss": 3.323, "step": 128},
    {"epoch": 0.8145224940805051, "grad_norm": 4.875186920166016, "learning_rate": 8.022151898734177e-05, "loss": 3.1744, "step": 129},
    {"epoch": 0.8208366219415943, "grad_norm": 9.212851524353027, "learning_rate": 8.006329113924052e-05, "loss": 3.3191, "step": 130},
    {"epoch": 0.8271507498026835, "grad_norm": 4.607917785644531, "learning_rate": 7.990506329113924e-05, "loss": 3.1394, "step": 131},
    {"epoch": 0.8334648776637726, "grad_norm": 14.266739845275879, "learning_rate": 7.974683544303798e-05, "loss": 2.985, "step": 132},
    {"epoch": 0.8397790055248618, "grad_norm": 7.989418983459473, "learning_rate": 7.958860759493671e-05, "loss": 2.882, "step": 133},
    {"epoch": 0.846093133385951, "grad_norm": 7.034262657165527, "learning_rate": 7.943037974683545e-05, "loss": 3.1717, "step": 134},
    {"epoch": 0.8524072612470402, "grad_norm": 7.286618709564209, "learning_rate": 7.927215189873419e-05, "loss": 3.0567, "step": 135},
    {"epoch": 0.8587213891081295, "grad_norm": 4.958951473236084, "learning_rate": 7.911392405063291e-05, "loss": 3.0012, "step": 136},
    {"epoch": 0.8650355169692187, "grad_norm": 2.9031686782836914, "learning_rate": 7.895569620253165e-05, "loss": 3.0797, "step": 137},
    {"epoch": 0.8713496448303079, "grad_norm": 6.901837348937988, "learning_rate": 7.879746835443038e-05, "loss": 3.2493, "step": 138},
    {"epoch": 0.877663772691397, "grad_norm": 3.709942579269409, "learning_rate": 7.863924050632912e-05, "loss": 3.165, "step": 139},
    {"epoch": 0.8839779005524862, "grad_norm": 7.831309795379639, "learning_rate": 7.848101265822784e-05, "loss": 3.1126, "step": 140},
    {"epoch": 0.8902920284135754, "grad_norm": 8.239750862121582, "learning_rate": 7.832278481012658e-05, "loss": 2.9461, "step": 141},
    {"epoch": 0.8966061562746646, "grad_norm": 5.883851528167725, "learning_rate": 7.816455696202532e-05, "loss": 3.1751, "step": 142},
    {"epoch": 0.9029202841357538, "grad_norm": 4.314850330352783, "learning_rate": 7.800632911392406e-05, "loss": 3.1033, "step": 143},
    {"epoch": 0.909234411996843, "grad_norm": 4.290492534637451, "learning_rate": 7.78481012658228e-05, "loss": 2.9642, "step": 144},
    {"epoch": 0.9155485398579322, "grad_norm": 5.874259948730469, "learning_rate": 7.768987341772152e-05, "loss": 3.0721, "step": 145},
    {"epoch": 0.9218626677190213, "grad_norm": 3.419154644012451, "learning_rate": 7.753164556962026e-05, "loss": 2.8922, "step": 146},
    {"epoch": 0.9281767955801105, "grad_norm": 7.392196178436279, "learning_rate": 7.737341772151899e-05, "loss": 3.3249, "step": 147},
    {"epoch": 0.9344909234411997, "grad_norm": 8.384613990783691, "learning_rate": 7.721518987341773e-05, "loss": 3.177, "step": 148},
    {"epoch": 0.9408050513022889, "grad_norm": 3.6932125091552734, "learning_rate": 7.705696202531645e-05, "loss": 2.9024, "step": 149},
    {"epoch": 0.9471191791633781, "grad_norm": 10.121501922607422, "learning_rate": 7.689873417721519e-05, "loss": 2.8811, "step": 150},
    {"epoch": 0.9534333070244673, "grad_norm": 6.764036655426025, "learning_rate": 7.674050632911393e-05, "loss": 3.0374, "step": 151},
    {"epoch": 0.9597474348855565, "grad_norm": 8.226846694946289, "learning_rate": 7.658227848101266e-05, "loss": 3.1455, "step": 152},
    {"epoch": 0.9660615627466457, "grad_norm": 7.442160129547119, "learning_rate": 7.64240506329114e-05, "loss": 3.2016, "step": 153},
    {"epoch": 0.9723756906077348, "grad_norm": 4.500311851501465, "learning_rate": 7.626582278481012e-05, "loss": 3.1067, "step": 154},
    {"epoch": 0.978689818468824, "grad_norm": 4.05573034286499, "learning_rate": 7.610759493670886e-05, "loss": 3.0543, "step": 155},
    {"epoch": 0.9850039463299132, "grad_norm": 5.813141345977783, "learning_rate": 7.59493670886076e-05, "loss": 3.0306, "step": 156},
    {"epoch": 0.9913180741910024, "grad_norm": 9.513184547424316, "learning_rate": 7.579113924050634e-05, "loss": 2.8891, "step": 157},
    {"epoch": 0.9976322020520916, "grad_norm": 4.737900257110596, "learning_rate": 7.563291139240506e-05, "loss": 3.142, "step": 158},
    {"epoch": 1.0, "grad_norm": 1.998995304107666, "learning_rate": 7.54746835443038e-05, "loss": 1.1642, "step": 159},
    {"epoch": 1.0, "eval_loss": 0.371297687292099, "eval_runtime": 19.3489, "eval_samples_per_second": 20.673, "eval_steps_per_second": 5.168, "step": 159}
  ],
  "logging_steps": 1,
  "max_steps": 632,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0705428106739712e+16,
  "train_batch_size": 6,
  "trial_name": null,
  "trial_params": null
}
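trainer_state.json records one log entry per optimizer step (159 steps for epoch 1) plus the end-of-epoch evaluation, which produced the best_metric of 0.3713 and marked this checkpoint as best_model_checkpoint. A small sketch for inspecting that log history; it assumes the file has been downloaded to last-checkpoint/trainer_state.json (local path not fixed by this commit):

# Sketch: summarize the Trainer log history saved in this checkpoint.
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)  # Python's json parser accepts the Infinity grad_norm values

train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"steps logged: {len(train_logs)}, final train loss: {train_logs[-1]['loss']}")
for e in eval_logs:
    print(f"epoch {e['epoch']}: eval_loss={e['eval_loss']}")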
last-checkpoint/training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0ab77a5eae38b0e271c42e2acd0600243694639d1424fbedb4fd0a49e94c28e2
size 5368
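training_args.bin is a pickled TrainingArguments object. Together with trainer_state.json (max_steps 632, num_train_epochs 4, save_steps 500), it lets the run be resumed from this checkpoint. A hedged sketch; loading the file locally and having the original trainer object available are assumptions, not part of this commit:

# Sketch: inspect the saved TrainingArguments and resume from this checkpoint.
import torch

args = torch.load("last-checkpoint/training_args.bin", weights_only=False)
print(args.learning_rate, args.per_device_train_batch_size, args.num_train_epochs)

# Resuming requires re-building the Trainer with the same model and datasets
# used for the original run (not shown here), then:
# trainer.train(resume_from_checkpoint="last-checkpoint")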