Subiendo modelo Yuuki

Files changed (8) hide show

generation_config.json +6 -0
model-index.yml +0 -10
model.safetensors +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +1014 -0
training_args.bin +3 -0

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.57.3"
+}

model-index.yml DELETED Viewed

@@ -1,10 +0,0 @@
-model-index:
-  - name: Yuuki
-    results:
-      - task:
-          type: text-generation
-        dataset:
-          name: bigcode/the-stack-smol-xl
-        metrics:
-          - type: perplexity
-            value: 1.0

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4522c53bb4758ec6e1a89bd459ca4b8ad402c7b64fd31da157e16f02df29a0c9
+size 327657928

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e13de0daa046bcdd46f6c8720617f9dd28cd1dfa744daac41b43d4d083adaba
+size 655362763

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8153e58819c87a064d2b27784c1b36c80e1ac58d9491de99c0804690e63a8d06
+size 14455

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6673b5da1b8c20fa3a5c9d1031738d7688c18cdb79ad14194034135716c349f5
+size 1465

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1014 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.07466666666666667,
+  "eval_steps": 500,
+  "global_step": 1400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005333333333333334,
+      "grad_norm": 5.792410850524902,
+      "learning_rate": 4.9988e-05,
+      "loss": 3.3493,
+      "step": 10
+    },
+    {
+      "epoch": 0.0010666666666666667,
+      "grad_norm": 5.163405418395996,
+      "learning_rate": 4.997466666666667e-05,
+      "loss": 3.2695,
+      "step": 20
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 4.789336204528809,
+      "learning_rate": 4.996133333333334e-05,
+      "loss": 3.3535,
+      "step": 30
+    },
+    {
+      "epoch": 0.0021333333333333334,
+      "grad_norm": 6.795251846313477,
+      "learning_rate": 4.9948000000000004e-05,
+      "loss": 3.3185,
+      "step": 40
+    },
+    {
+      "epoch": 0.0026666666666666666,
+      "grad_norm": 5.427957057952881,
+      "learning_rate": 4.993466666666667e-05,
+      "loss": 3.0503,
+      "step": 50
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 5.994451999664307,
+      "learning_rate": 4.992133333333334e-05,
+      "loss": 3.2466,
+      "step": 60
+    },
+    {
+      "epoch": 0.0037333333333333333,
+      "grad_norm": 5.032135486602783,
+      "learning_rate": 4.9908000000000004e-05,
+      "loss": 2.7803,
+      "step": 70
+    },
+    {
+      "epoch": 0.004266666666666667,
+      "grad_norm": 5.3739495277404785,
+      "learning_rate": 4.989466666666667e-05,
+      "loss": 2.8109,
+      "step": 80
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 5.321890354156494,
+      "learning_rate": 4.988133333333333e-05,
+      "loss": 2.8156,
+      "step": 90
+    },
+    {
+      "epoch": 0.005333333333333333,
+      "grad_norm": 8.430088996887207,
+      "learning_rate": 4.9868000000000004e-05,
+      "loss": 2.7049,
+      "step": 100
+    },
+    {
+      "epoch": 0.005866666666666667,
+      "grad_norm": 6.157585620880127,
+      "learning_rate": 4.985466666666667e-05,
+      "loss": 3.0948,
+      "step": 110
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 5.328046798706055,
+      "learning_rate": 4.9841333333333333e-05,
+      "loss": 2.724,
+      "step": 120
+    },
+    {
+      "epoch": 0.006933333333333333,
+      "grad_norm": 5.178571701049805,
+      "learning_rate": 4.9828000000000005e-05,
+      "loss": 3.0151,
+      "step": 130
+    },
+    {
+      "epoch": 0.007466666666666667,
+      "grad_norm": 6.57607364654541,
+      "learning_rate": 4.981466666666667e-05,
+      "loss": 2.7384,
+      "step": 140
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 11.210726737976074,
+      "learning_rate": 4.9801333333333334e-05,
+      "loss": 2.8189,
+      "step": 150
+    },
+    {
+      "epoch": 0.008533333333333334,
+      "grad_norm": 4.816577911376953,
+      "learning_rate": 4.9788e-05,
+      "loss": 2.6138,
+      "step": 160
+    },
+    {
+      "epoch": 0.009066666666666667,
+      "grad_norm": 6.2408294677734375,
+      "learning_rate": 4.977466666666667e-05,
+      "loss": 2.6402,
+      "step": 170
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 5.550693511962891,
+      "learning_rate": 4.976133333333334e-05,
+      "loss": 2.8217,
+      "step": 180
+    },
+    {
+      "epoch": 0.010133333333333333,
+      "grad_norm": 6.007472515106201,
+      "learning_rate": 4.9748e-05,
+      "loss": 2.7337,
+      "step": 190
+    },
+    {
+      "epoch": 0.010666666666666666,
+      "grad_norm": 5.4420623779296875,
+      "learning_rate": 4.973466666666667e-05,
+      "loss": 2.4179,
+      "step": 200
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 4.604928493499756,
+      "learning_rate": 4.9721333333333335e-05,
+      "loss": 2.4616,
+      "step": 210
+    },
+    {
+      "epoch": 0.011733333333333333,
+      "grad_norm": 4.743863582611084,
+      "learning_rate": 4.9708000000000006e-05,
+      "loss": 2.6844,
+      "step": 220
+    },
+    {
+      "epoch": 0.012266666666666667,
+      "grad_norm": 6.098178386688232,
+      "learning_rate": 4.969466666666667e-05,
+      "loss": 2.8374,
+      "step": 230
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 5.230837821960449,
+      "learning_rate": 4.9681333333333335e-05,
+      "loss": 2.6294,
+      "step": 240
+    },
+    {
+      "epoch": 0.013333333333333334,
+      "grad_norm": 4.995968341827393,
+      "learning_rate": 4.9668000000000006e-05,
+      "loss": 2.7009,
+      "step": 250
+    },
+    {
+      "epoch": 0.013866666666666666,
+      "grad_norm": 5.136434078216553,
+      "learning_rate": 4.965466666666667e-05,
+      "loss": 2.6702,
+      "step": 260
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 4.960114479064941,
+      "learning_rate": 4.9641333333333335e-05,
+      "loss": 2.543,
+      "step": 270
+    },
+    {
+      "epoch": 0.014933333333333333,
+      "grad_norm": 3.7399258613586426,
+      "learning_rate": 4.9628e-05,
+      "loss": 2.6073,
+      "step": 280
+    },
+    {
+      "epoch": 0.015466666666666667,
+      "grad_norm": 5.888091087341309,
+      "learning_rate": 4.961466666666667e-05,
+      "loss": 3.0508,
+      "step": 290
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 3.991913318634033,
+      "learning_rate": 4.9601333333333336e-05,
+      "loss": 2.7923,
+      "step": 300
+    },
+    {
+      "epoch": 0.016533333333333334,
+      "grad_norm": 7.179355144500732,
+      "learning_rate": 4.9588e-05,
+      "loss": 2.6183,
+      "step": 310
+    },
+    {
+      "epoch": 0.017066666666666667,
+      "grad_norm": 5.805356502532959,
+      "learning_rate": 4.957466666666667e-05,
+      "loss": 2.7449,
+      "step": 320
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 3.777961492538452,
+      "learning_rate": 4.9561333333333336e-05,
+      "loss": 2.7355,
+      "step": 330
+    },
+    {
+      "epoch": 0.018133333333333335,
+      "grad_norm": 4.463932991027832,
+      "learning_rate": 4.9548e-05,
+      "loss": 2.6022,
+      "step": 340
+    },
+    {
+      "epoch": 0.018666666666666668,
+      "grad_norm": 4.360901832580566,
+      "learning_rate": 4.9534666666666665e-05,
+      "loss": 2.4569,
+      "step": 350
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 4.430737495422363,
+      "learning_rate": 4.9521333333333336e-05,
+      "loss": 2.7994,
+      "step": 360
+    },
+    {
+      "epoch": 0.019733333333333332,
+      "grad_norm": 5.609028339385986,
+      "learning_rate": 4.9508e-05,
+      "loss": 2.4863,
+      "step": 370
+    },
+    {
+      "epoch": 0.020266666666666665,
+      "grad_norm": 4.087404251098633,
+      "learning_rate": 4.9494666666666666e-05,
+      "loss": 2.5082,
+      "step": 380
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 3.5008347034454346,
+      "learning_rate": 4.948133333333334e-05,
+      "loss": 2.6335,
+      "step": 390
+    },
+    {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 5.405452251434326,
+      "learning_rate": 4.9468e-05,
+      "loss": 2.9961,
+      "step": 400
+    },
+    {
+      "epoch": 0.021866666666666666,
+      "grad_norm": 3.3590590953826904,
+      "learning_rate": 4.945466666666667e-05,
+      "loss": 2.7085,
+      "step": 410
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 4.877053260803223,
+      "learning_rate": 4.944133333333334e-05,
+      "loss": 2.4832,
+      "step": 420
+    },
+    {
+      "epoch": 0.022933333333333333,
+      "grad_norm": 3.8995587825775146,
+      "learning_rate": 4.9428e-05,
+      "loss": 2.3182,
+      "step": 430
+    },
+    {
+      "epoch": 0.023466666666666667,
+      "grad_norm": 5.022705554962158,
+      "learning_rate": 4.941466666666667e-05,
+      "loss": 2.4094,
+      "step": 440
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 4.630198001861572,
+      "learning_rate": 4.940133333333334e-05,
+      "loss": 2.1688,
+      "step": 450
+    },
+    {
+      "epoch": 0.024533333333333334,
+      "grad_norm": 5.065393447875977,
+      "learning_rate": 4.9388e-05,
+      "loss": 2.7714,
+      "step": 460
+    },
+    {
+      "epoch": 0.025066666666666668,
+      "grad_norm": 4.215227127075195,
+      "learning_rate": 4.937466666666667e-05,
+      "loss": 2.7938,
+      "step": 470
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 3.03859281539917,
+      "learning_rate": 4.936133333333334e-05,
+      "loss": 2.5154,
+      "step": 480
+    },
+    {
+      "epoch": 0.026133333333333335,
+      "grad_norm": 4.028010845184326,
+      "learning_rate": 4.9348e-05,
+      "loss": 2.2416,
+      "step": 490
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 5.132800579071045,
+      "learning_rate": 4.933466666666667e-05,
+      "loss": 2.4243,
+      "step": 500
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 4.919140815734863,
+      "learning_rate": 4.932133333333334e-05,
+      "loss": 2.1868,
+      "step": 510
+    },
+    {
+      "epoch": 0.027733333333333332,
+      "grad_norm": 3.64642071723938,
+      "learning_rate": 4.9308e-05,
+      "loss": 2.1696,
+      "step": 520
+    },
+    {
+      "epoch": 0.028266666666666666,
+      "grad_norm": 4.6441755294799805,
+      "learning_rate": 4.929466666666667e-05,
+      "loss": 2.4774,
+      "step": 530
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 4.154843807220459,
+      "learning_rate": 4.928133333333333e-05,
+      "loss": 2.5508,
+      "step": 540
+    },
+    {
+      "epoch": 0.029333333333333333,
+      "grad_norm": 4.62436580657959,
+      "learning_rate": 4.9268e-05,
+      "loss": 2.3155,
+      "step": 550
+    },
+    {
+      "epoch": 0.029866666666666666,
+      "grad_norm": 5.344653129577637,
+      "learning_rate": 4.925466666666667e-05,
+      "loss": 2.3551,
+      "step": 560
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 3.7210919857025146,
+      "learning_rate": 4.924133333333333e-05,
+      "loss": 2.346,
+      "step": 570
+    },
+    {
+      "epoch": 0.030933333333333334,
+      "grad_norm": 4.005031585693359,
+      "learning_rate": 4.9228000000000004e-05,
+      "loss": 2.4143,
+      "step": 580
+    },
+    {
+      "epoch": 0.031466666666666664,
+      "grad_norm": 7.194045066833496,
+      "learning_rate": 4.921466666666667e-05,
+      "loss": 2.5205,
+      "step": 590
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 5.16976261138916,
+      "learning_rate": 4.920133333333334e-05,
+      "loss": 2.3044,
+      "step": 600
+    },
+    {
+      "epoch": 0.03253333333333333,
+      "grad_norm": 4.67457389831543,
+      "learning_rate": 4.9188000000000004e-05,
+      "loss": 2.4939,
+      "step": 610
+    },
+    {
+      "epoch": 0.03306666666666667,
+      "grad_norm": 3.4367618560791016,
+      "learning_rate": 4.917466666666667e-05,
+      "loss": 2.3615,
+      "step": 620
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 4.060678005218506,
+      "learning_rate": 4.916133333333334e-05,
+      "loss": 2.5419,
+      "step": 630
+    },
+    {
+      "epoch": 0.034133333333333335,
+      "grad_norm": 4.616146564483643,
+      "learning_rate": 4.9148e-05,
+      "loss": 2.0667,
+      "step": 640
+    },
+    {
+      "epoch": 0.034666666666666665,
+      "grad_norm": 4.443355560302734,
+      "learning_rate": 4.913466666666667e-05,
+      "loss": 2.2449,
+      "step": 650
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 7.294761657714844,
+      "learning_rate": 4.9121333333333334e-05,
+      "loss": 2.15,
+      "step": 660
+    },
+    {
+      "epoch": 0.03573333333333333,
+      "grad_norm": 4.311218738555908,
+      "learning_rate": 4.9108000000000005e-05,
+      "loss": 2.1496,
+      "step": 670
+    },
+    {
+      "epoch": 0.03626666666666667,
+      "grad_norm": 3.891162395477295,
+      "learning_rate": 4.909466666666667e-05,
+      "loss": 2.2169,
+      "step": 680
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 5.323464870452881,
+      "learning_rate": 4.9081333333333334e-05,
+      "loss": 2.338,
+      "step": 690
+    },
+    {
+      "epoch": 0.037333333333333336,
+      "grad_norm": 3.7327029705047607,
+      "learning_rate": 4.9068000000000005e-05,
+      "loss": 2.1279,
+      "step": 700
+    },
+    {
+      "epoch": 0.037866666666666667,
+      "grad_norm": 3.986100673675537,
+      "learning_rate": 4.905466666666667e-05,
+      "loss": 2.2684,
+      "step": 710
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 5.458177089691162,
+      "learning_rate": 4.9041333333333334e-05,
+      "loss": 2.323,
+      "step": 720
+    },
+    {
+      "epoch": 0.038933333333333334,
+      "grad_norm": 3.8247170448303223,
+      "learning_rate": 4.9028e-05,
+      "loss": 2.0734,
+      "step": 730
+    },
+    {
+      "epoch": 0.039466666666666664,
+      "grad_norm": 3.4353110790252686,
+      "learning_rate": 4.901466666666667e-05,
+      "loss": 2.3232,
+      "step": 740
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 4.783677101135254,
+      "learning_rate": 4.9001333333333335e-05,
+      "loss": 2.3483,
+      "step": 750
+    },
+    {
+      "epoch": 0.04053333333333333,
+      "grad_norm": 4.975961208343506,
+      "learning_rate": 4.8988e-05,
+      "loss": 1.9943,
+      "step": 760
+    },
+    {
+      "epoch": 0.04106666666666667,
+      "grad_norm": 3.3294217586517334,
+      "learning_rate": 4.897466666666667e-05,
+      "loss": 2.2036,
+      "step": 770
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 3.2321457862854004,
+      "learning_rate": 4.8961333333333335e-05,
+      "loss": 2.246,
+      "step": 780
+    },
+    {
+      "epoch": 0.042133333333333335,
+      "grad_norm": 3.804316520690918,
+      "learning_rate": 4.8948000000000006e-05,
+      "loss": 2.1379,
+      "step": 790
+    },
+    {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 3.2469778060913086,
+      "learning_rate": 4.893466666666667e-05,
+      "loss": 2.3573,
+      "step": 800
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 2.9724888801574707,
+      "learning_rate": 4.8921333333333335e-05,
+      "loss": 2.4392,
+      "step": 810
+    },
+    {
+      "epoch": 0.04373333333333333,
+      "grad_norm": 3.239983081817627,
+      "learning_rate": 4.890800000000001e-05,
+      "loss": 2.0712,
+      "step": 820
+    },
+    {
+      "epoch": 0.04426666666666667,
+      "grad_norm": 4.348440647125244,
+      "learning_rate": 4.8894666666666665e-05,
+      "loss": 2.4012,
+      "step": 830
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 3.6949236392974854,
+      "learning_rate": 4.8881333333333336e-05,
+      "loss": 2.5602,
+      "step": 840
+    },
+    {
+      "epoch": 0.04533333333333334,
+      "grad_norm": 4.177002429962158,
+      "learning_rate": 4.8868e-05,
+      "loss": 2.3311,
+      "step": 850
+    },
+    {
+      "epoch": 0.04586666666666667,
+      "grad_norm": 3.845810651779175,
+      "learning_rate": 4.885466666666667e-05,
+      "loss": 2.2281,
+      "step": 860
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 4.1254401206970215,
+      "learning_rate": 4.8841333333333336e-05,
+      "loss": 2.4041,
+      "step": 870
+    },
+    {
+      "epoch": 0.046933333333333334,
+      "grad_norm": 3.803191900253296,
+      "learning_rate": 4.8828e-05,
+      "loss": 2.4445,
+      "step": 880
+    },
+    {
+      "epoch": 0.047466666666666664,
+      "grad_norm": 3.87357234954834,
+      "learning_rate": 4.881466666666667e-05,
+      "loss": 2.384,
+      "step": 890
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 3.664041757583618,
+      "learning_rate": 4.8801333333333337e-05,
+      "loss": 2.3232,
+      "step": 900
+    },
+    {
+      "epoch": 0.04853333333333333,
+      "grad_norm": 3.4232048988342285,
+      "learning_rate": 4.8788e-05,
+      "loss": 2.1113,
+      "step": 910
+    },
+    {
+      "epoch": 0.04906666666666667,
+      "grad_norm": 4.911906719207764,
+      "learning_rate": 4.8774666666666666e-05,
+      "loss": 1.895,
+      "step": 920
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 5.789026737213135,
+      "learning_rate": 4.876133333333334e-05,
+      "loss": 2.0885,
+      "step": 930
+    },
+    {
+      "epoch": 0.050133333333333335,
+      "grad_norm": 3.3766918182373047,
+      "learning_rate": 4.8748e-05,
+      "loss": 2.3543,
+      "step": 940
+    },
+    {
+      "epoch": 0.050666666666666665,
+      "grad_norm": 3.5231785774230957,
+      "learning_rate": 4.8734666666666666e-05,
+      "loss": 2.1267,
+      "step": 950
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 3.2112128734588623,
+      "learning_rate": 4.872133333333334e-05,
+      "loss": 2.3124,
+      "step": 960
+    },
+    {
+      "epoch": 0.05173333333333333,
+      "grad_norm": 3.1260693073272705,
+      "learning_rate": 4.8708e-05,
+      "loss": 2.1612,
+      "step": 970
+    },
+    {
+      "epoch": 0.05226666666666667,
+      "grad_norm": 4.093555450439453,
+      "learning_rate": 4.869466666666667e-05,
+      "loss": 1.9753,
+      "step": 980
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 4.036992073059082,
+      "learning_rate": 4.868133333333333e-05,
+      "loss": 2.1585,
+      "step": 990
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 3.4580295085906982,
+      "learning_rate": 4.8668e-05,
+      "loss": 2.2032,
+      "step": 1000
+    },
+    {
+      "epoch": 0.05386666666666667,
+      "grad_norm": 3.512268304824829,
+      "learning_rate": 4.8654666666666674e-05,
+      "loss": 2.303,
+      "step": 1010
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 3.7754204273223877,
+      "learning_rate": 4.864133333333333e-05,
+      "loss": 2.5944,
+      "step": 1020
+    },
+    {
+      "epoch": 0.054933333333333334,
+      "grad_norm": 3.053835391998291,
+      "learning_rate": 4.8628e-05,
+      "loss": 2.083,
+      "step": 1030
+    },
+    {
+      "epoch": 0.055466666666666664,
+      "grad_norm": 3.839688539505005,
+      "learning_rate": 4.861466666666667e-05,
+      "loss": 2.3334,
+      "step": 1040
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 4.084265232086182,
+      "learning_rate": 4.860133333333334e-05,
+      "loss": 2.4088,
+      "step": 1050
+    },
+    {
+      "epoch": 0.05653333333333333,
+      "grad_norm": 3.9182255268096924,
+      "learning_rate": 4.8588e-05,
+      "loss": 2.4002,
+      "step": 1060
+    },
+    {
+      "epoch": 0.05706666666666667,
+      "grad_norm": 3.005012035369873,
+      "learning_rate": 4.857466666666667e-05,
+      "loss": 2.4005,
+      "step": 1070
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 3.1780312061309814,
+      "learning_rate": 4.856133333333334e-05,
+      "loss": 2.1391,
+      "step": 1080
+    },
+    {
+      "epoch": 0.058133333333333335,
+      "grad_norm": 4.896857261657715,
+      "learning_rate": 4.8548000000000003e-05,
+      "loss": 2.0301,
+      "step": 1090
+    },
+    {
+      "epoch": 0.058666666666666666,
+      "grad_norm": 3.1400632858276367,
+      "learning_rate": 4.853466666666667e-05,
+      "loss": 2.2478,
+      "step": 1100
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 2.5740103721618652,
+      "learning_rate": 4.852133333333333e-05,
+      "loss": 2.0002,
+      "step": 1110
+    },
+    {
+      "epoch": 0.05973333333333333,
+      "grad_norm": 2.3037431240081787,
+      "learning_rate": 4.8508000000000004e-05,
+      "loss": 2.0746,
+      "step": 1120
+    },
+    {
+      "epoch": 0.06026666666666667,
+      "grad_norm": 3.2235605716705322,
+      "learning_rate": 4.849466666666667e-05,
+      "loss": 2.0035,
+      "step": 1130
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 4.025834560394287,
+      "learning_rate": 4.848133333333333e-05,
+      "loss": 2.2058,
+      "step": 1140
+    },
+    {
+      "epoch": 0.06133333333333333,
+      "grad_norm": 3.152123212814331,
+      "learning_rate": 4.8468000000000004e-05,
+      "loss": 2.2427,
+      "step": 1150
+    },
+    {
+      "epoch": 0.06186666666666667,
+      "grad_norm": 3.052441120147705,
+      "learning_rate": 4.845466666666667e-05,
+      "loss": 2.2619,
+      "step": 1160
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 3.9626965522766113,
+      "learning_rate": 4.844133333333333e-05,
+      "loss": 2.4091,
+      "step": 1170
+    },
+    {
+      "epoch": 0.06293333333333333,
+      "grad_norm": 4.5252838134765625,
+      "learning_rate": 4.8428e-05,
+      "loss": 2.3108,
+      "step": 1180
+    },
+    {
+      "epoch": 0.06346666666666667,
+      "grad_norm": 3.293490409851074,
+      "learning_rate": 4.841466666666667e-05,
+      "loss": 2.1721,
+      "step": 1190
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 3.1755733489990234,
+      "learning_rate": 4.840133333333334e-05,
+      "loss": 2.0308,
+      "step": 1200
+    },
+    {
+      "epoch": 0.06453333333333333,
+      "grad_norm": 4.124546051025391,
+      "learning_rate": 4.8388e-05,
+      "loss": 2.1766,
+      "step": 1210
+    },
+    {
+      "epoch": 0.06506666666666666,
+      "grad_norm": 2.7379612922668457,
+      "learning_rate": 4.837466666666667e-05,
+      "loss": 2.0725,
+      "step": 1220
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 3.224748373031616,
+      "learning_rate": 4.8361333333333334e-05,
+      "loss": 2.115,
+      "step": 1230
+    },
+    {
+      "epoch": 0.06613333333333334,
+      "grad_norm": 3.2604916095733643,
+      "learning_rate": 4.8348000000000005e-05,
+      "loss": 2.156,
+      "step": 1240
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 2.5119776725769043,
+      "learning_rate": 4.833466666666667e-05,
+      "loss": 1.8298,
+      "step": 1250
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 2.750699043273926,
+      "learning_rate": 4.8321333333333334e-05,
+      "loss": 1.9445,
+      "step": 1260
+    },
+    {
+      "epoch": 0.06773333333333334,
+      "grad_norm": 3.47723126411438,
+      "learning_rate": 4.8308000000000006e-05,
+      "loss": 2.2289,
+      "step": 1270
+    },
+    {
+      "epoch": 0.06826666666666667,
+      "grad_norm": 3.4429843425750732,
+      "learning_rate": 4.829466666666667e-05,
+      "loss": 1.7053,
+      "step": 1280
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 3.9478533267974854,
+      "learning_rate": 4.8281333333333335e-05,
+      "loss": 1.9367,
+      "step": 1290
+    },
+    {
+      "epoch": 0.06933333333333333,
+      "grad_norm": 5.963420391082764,
+      "learning_rate": 4.8268e-05,
+      "loss": 2.0725,
+      "step": 1300
+    },
+    {
+      "epoch": 0.06986666666666666,
+      "grad_norm": 4.6685285568237305,
+      "learning_rate": 4.825466666666667e-05,
+      "loss": 2.1006,
+      "step": 1310
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 3.369166851043701,
+      "learning_rate": 4.8241333333333335e-05,
+      "loss": 2.2664,
+      "step": 1320
+    },
+    {
+      "epoch": 0.07093333333333333,
+      "grad_norm": 2.6912147998809814,
+      "learning_rate": 4.8228e-05,
+      "loss": 1.7266,
+      "step": 1330
+    },
+    {
+      "epoch": 0.07146666666666666,
+      "grad_norm": 3.640000104904175,
+      "learning_rate": 4.821466666666667e-05,
+      "loss": 1.8255,
+      "step": 1340
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 3.908271312713623,
+      "learning_rate": 4.8201333333333336e-05,
+      "loss": 2.0521,
+      "step": 1350
+    },
+    {
+      "epoch": 0.07253333333333334,
+      "grad_norm": 3.6119441986083984,
+      "learning_rate": 4.8188e-05,
+      "loss": 2.203,
+      "step": 1360
+    },
+    {
+      "epoch": 0.07306666666666667,
+      "grad_norm": 3.105259418487549,
+      "learning_rate": 4.8174666666666665e-05,
+      "loss": 1.8647,
+      "step": 1370
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 2.787991762161255,
+      "learning_rate": 4.8161333333333336e-05,
+      "loss": 1.8087,
+      "step": 1380
+    },
+    {
+      "epoch": 0.07413333333333333,
+      "grad_norm": 2.5194826126098633,
+      "learning_rate": 4.814800000000001e-05,
+      "loss": 2.033,
+      "step": 1390
+    },
+    {
+      "epoch": 0.07466666666666667,
+      "grad_norm": 3.3716447353363037,
+      "learning_rate": 4.8134666666666665e-05,
+      "loss": 2.1324,
+      "step": 1400
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 37500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 365815450828800.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02727503188e13150fa70296d6b033a9e6cbcb8193e7bb3d57f897e57d9e178a
+size 5841