init upload

Browse files

Files changed (8) hide show

config.json +42 -0
generation_config.json +8 -0
model.safetensors +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +1091 -0
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "architectures": [
+    "Lfm2ForCausalLM"
+  ],
+  "block_auto_adjust_ff_dim": true,
+  "block_ffn_dim_multiplier": 1.0,
+  "block_multiple_of": 256,
+  "bos_token_id": 1,
+  "conv_L_cache": 3,
+  "conv_bias": false,
+  "dtype": "float32",
+  "eos_token_id": 2,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 2720,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_batch_size": 512,
+  "max_position_embeddings": 2048,
+  "model_type": "lfm2",
+  "norm_eps": 1e-05,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 12,
+  "num_key_value_heads": 4,
+  "pad_token_id": 0,
+  "rope_theta": 1000000.0,
+  "transformers_version": "4.56.0",
+  "use_cache": false,
+  "vocab_size": 50257
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "transformers_version": "4.56.0",
+  "use_cache": false
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3bb92b5865aa709b54c9d7bb64d60f2c8a690fbf68b0c92744e34df75eeec855
+size 633795192

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:62ac04c1e118e5c1bc21ecafff703a9ba1d10c75001a54cb5b8b10a73889ddb7
+size 633943947

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:62fae89a87dda02c920f53346ec8f529110400776b4bc34106565d26314c1d04
+size 14645

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:13a1c6e38638738c0bc5a722660016155dc0152f6443cf33650b074db30a7091
+size 1465

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1091 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3932878867330886,
+  "eval_steps": 500,
+  "global_step": 1500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00026219192448872575,
+      "grad_norm": 22.20619010925293,
+      "learning_rate": 0.0,
+      "loss": 10.5131,
+      "step": 1
+    },
+    {
+      "epoch": 0.0026219192448872575,
+      "grad_norm": 22.429588317871094,
+      "learning_rate": 4.4999999999999996e-05,
+      "loss": 10.4662,
+      "step": 10
+    },
+    {
+      "epoch": 0.005243838489774515,
+      "grad_norm": 22.83245086669922,
+      "learning_rate": 9.5e-05,
+      "loss": 10.1612,
+      "step": 20
+    },
+    {
+      "epoch": 0.007865757734661772,
+      "grad_norm": 23.247602462768555,
+      "learning_rate": 0.000145,
+      "loss": 9.5256,
+      "step": 30
+    },
+    {
+      "epoch": 0.01048767697954903,
+      "grad_norm": 23.51291275024414,
+      "learning_rate": 0.00019500000000000002,
+      "loss": 8.5708,
+      "step": 40
+    },
+    {
+      "epoch": 0.013109596224436287,
+      "grad_norm": 22.496492385864258,
+      "learning_rate": 0.000245,
+      "loss": 7.3388,
+      "step": 50
+    },
+    {
+      "epoch": 0.015731515469323543,
+      "grad_norm": 16.345460891723633,
+      "learning_rate": 0.000295,
+      "loss": 5.9703,
+      "step": 60
+    },
+    {
+      "epoch": 0.018353434714210803,
+      "grad_norm": 3.921259880065918,
+      "learning_rate": 0.000345,
+      "loss": 4.9478,
+      "step": 70
+    },
+    {
+      "epoch": 0.02097535395909806,
+      "grad_norm": 7.0385589599609375,
+      "learning_rate": 0.000395,
+      "loss": 4.6803,
+      "step": 80
+    },
+    {
+      "epoch": 0.023597273203985317,
+      "grad_norm": 2.6207873821258545,
+      "learning_rate": 0.00044500000000000003,
+      "loss": 4.4974,
+      "step": 90
+    },
+    {
+      "epoch": 0.026219192448872573,
+      "grad_norm": 1.9961260557174683,
+      "learning_rate": 0.000495,
+      "loss": 4.3314,
+      "step": 100
+    },
+    {
+      "epoch": 0.028841111693759833,
+      "grad_norm": 1.6183704137802124,
+      "learning_rate": 0.000545,
+      "loss": 4.1959,
+      "step": 110
+    },
+    {
+      "epoch": 0.03146303093864709,
+      "grad_norm": 1.331021785736084,
+      "learning_rate": 0.0005949999999999999,
+      "loss": 4.0158,
+      "step": 120
+    },
+    {
+      "epoch": 0.03408495018353435,
+      "grad_norm": 1.14554762840271,
+      "learning_rate": 0.0006450000000000001,
+      "loss": 3.9321,
+      "step": 130
+    },
+    {
+      "epoch": 0.03670686942842161,
+      "grad_norm": 0.9175837635993958,
+      "learning_rate": 0.000695,
+      "loss": 3.802,
+      "step": 140
+    },
+    {
+      "epoch": 0.03932878867330886,
+      "grad_norm": 0.7335033416748047,
+      "learning_rate": 0.000745,
+      "loss": 3.6618,
+      "step": 150
+    },
+    {
+      "epoch": 0.04195070791819612,
+      "grad_norm": 0.5916274785995483,
+      "learning_rate": 0.000795,
+      "loss": 3.5341,
+      "step": 160
+    },
+    {
+      "epoch": 0.04457262716308338,
+      "grad_norm": 0.4947799742221832,
+      "learning_rate": 0.0008449999999999999,
+      "loss": 3.5311,
+      "step": 170
+    },
+    {
+      "epoch": 0.04719454640797063,
+      "grad_norm": 0.40263015031814575,
+      "learning_rate": 0.0008950000000000001,
+      "loss": 3.4709,
+      "step": 180
+    },
+    {
+      "epoch": 0.04981646565285789,
+      "grad_norm": 0.32677406072616577,
+      "learning_rate": 0.000945,
+      "loss": 3.2973,
+      "step": 190
+    },
+    {
+      "epoch": 0.05243838489774515,
+      "grad_norm": 0.3071628212928772,
+      "learning_rate": 0.000995,
+      "loss": 3.28,
+      "step": 200
+    },
+    {
+      "epoch": 0.05506030414263241,
+      "grad_norm": 0.3233015835285187,
+      "learning_rate": 0.001045,
+      "loss": 3.2038,
+      "step": 210
+    },
+    {
+      "epoch": 0.05768222338751967,
+      "grad_norm": 0.39402100443840027,
+      "learning_rate": 0.001095,
+      "loss": 3.1627,
+      "step": 220
+    },
+    {
+      "epoch": 0.060304142632406924,
+      "grad_norm": 0.5528343915939331,
+      "learning_rate": 0.001145,
+      "loss": 3.1341,
+      "step": 230
+    },
+    {
+      "epoch": 0.06292606187729417,
+      "grad_norm": 0.4888489842414856,
+      "learning_rate": 0.001195,
+      "loss": 3.0192,
+      "step": 240
+    },
+    {
+      "epoch": 0.06554798112218144,
+      "grad_norm": 0.5662292838096619,
+      "learning_rate": 0.0012450000000000002,
+      "loss": 2.991,
+      "step": 250
+    },
+    {
+      "epoch": 0.0681699003670687,
+      "grad_norm": 0.5800466537475586,
+      "learning_rate": 0.001295,
+      "loss": 2.992,
+      "step": 260
+    },
+    {
+      "epoch": 0.07079181961195595,
+      "grad_norm": 0.5511091947555542,
+      "learning_rate": 0.001345,
+      "loss": 2.9246,
+      "step": 270
+    },
+    {
+      "epoch": 0.07341373885684321,
+      "grad_norm": 0.7486537098884583,
+      "learning_rate": 0.001395,
+      "loss": 2.8996,
+      "step": 280
+    },
+    {
+      "epoch": 0.07603565810173046,
+      "grad_norm": 0.6995801329612732,
+      "learning_rate": 0.001445,
+      "loss": 2.7945,
+      "step": 290
+    },
+    {
+      "epoch": 0.07865757734661773,
+      "grad_norm": 0.7938666939735413,
+      "learning_rate": 0.0014950000000000002,
+      "loss": 2.7632,
+      "step": 300
+    },
+    {
+      "epoch": 0.08127949659150498,
+      "grad_norm": 0.7555065155029297,
+      "learning_rate": 0.001545,
+      "loss": 2.7513,
+      "step": 310
+    },
+    {
+      "epoch": 0.08390141583639224,
+      "grad_norm": 0.7714865803718567,
+      "learning_rate": 0.001595,
+      "loss": 2.6165,
+      "step": 320
+    },
+    {
+      "epoch": 0.08652333508127949,
+      "grad_norm": 0.7604843974113464,
+      "learning_rate": 0.001645,
+      "loss": 2.6391,
+      "step": 330
+    },
+    {
+      "epoch": 0.08914525432616675,
+      "grad_norm": 0.7840315699577332,
+      "learning_rate": 0.0016950000000000001,
+      "loss": 2.5818,
+      "step": 340
+    },
+    {
+      "epoch": 0.09176717357105402,
+      "grad_norm": 1.0126832723617554,
+      "learning_rate": 0.0017450000000000002,
+      "loss": 2.5417,
+      "step": 350
+    },
+    {
+      "epoch": 0.09438909281594127,
+      "grad_norm": 1.0092129707336426,
+      "learning_rate": 0.001795,
+      "loss": 2.4844,
+      "step": 360
+    },
+    {
+      "epoch": 0.09701101206082853,
+      "grad_norm": 1.1585489511489868,
+      "learning_rate": 0.001845,
+      "loss": 2.4645,
+      "step": 370
+    },
+    {
+      "epoch": 0.09963293130571578,
+      "grad_norm": 1.0778034925460815,
+      "learning_rate": 0.001895,
+      "loss": 2.4003,
+      "step": 380
+    },
+    {
+      "epoch": 0.10225485055060304,
+      "grad_norm": 1.146636962890625,
+      "learning_rate": 0.0019450000000000001,
+      "loss": 2.3466,
+      "step": 390
+    },
+    {
+      "epoch": 0.1048767697954903,
+      "grad_norm": 0.9742526412010193,
+      "learning_rate": 0.0019950000000000002,
+      "loss": 2.3088,
+      "step": 400
+    },
+    {
+      "epoch": 0.10749868904037756,
+      "grad_norm": 1.3035728931427002,
+      "learning_rate": 0.0019999657054386192,
+      "loss": 2.2834,
+      "step": 410
+    },
+    {
+      "epoch": 0.11012060828526482,
+      "grad_norm": 1.0689384937286377,
+      "learning_rate": 0.0019998471593574603,
+      "loss": 2.2473,
+      "step": 420
+    },
+    {
+      "epoch": 0.11274252753015207,
+      "grad_norm": 1.1519441604614258,
+      "learning_rate": 0.001999643948402709,
+      "loss": 2.1925,
+      "step": 430
+    },
+    {
+      "epoch": 0.11536444677503933,
+      "grad_norm": 0.9427940249443054,
+      "learning_rate": 0.0019993560897818255,
+      "loss": 2.1774,
+      "step": 440
+    },
+    {
+      "epoch": 0.11798636601992658,
+      "grad_norm": 0.9017934203147888,
+      "learning_rate": 0.0019989836078700496,
+      "loss": 2.152,
+      "step": 450
+    },
+    {
+      "epoch": 0.12060828526481385,
+      "grad_norm": 1.018966555595398,
+      "learning_rate": 0.001998526534208335,
+      "loss": 2.0825,
+      "step": 460
+    },
+    {
+      "epoch": 0.1232302045097011,
+      "grad_norm": 1.0533466339111328,
+      "learning_rate": 0.0019979849075006813,
+      "loss": 2.1358,
+      "step": 470
+    },
+    {
+      "epoch": 0.12585212375458835,
+      "grad_norm": 0.941605806350708,
+      "learning_rate": 0.001997358773610856,
+      "loss": 2.0524,
+      "step": 480
+    },
+    {
+      "epoch": 0.12847404299947562,
+      "grad_norm": 0.8877449035644531,
+      "learning_rate": 0.0019966481855585075,
+      "loss": 2.0308,
+      "step": 490
+    },
+    {
+      "epoch": 0.13109596224436287,
+      "grad_norm": 0.8652307391166687,
+      "learning_rate": 0.001995853203514682,
+      "loss": 2.012,
+      "step": 500
+    },
+    {
+      "epoch": 0.13371788148925012,
+      "grad_norm": 0.8943641781806946,
+      "learning_rate": 0.0019949738947967217,
+      "loss": 1.9729,
+      "step": 510
+    },
+    {
+      "epoch": 0.1363398007341374,
+      "grad_norm": 0.9359736442565918,
+      "learning_rate": 0.001994010333862568,
+      "loss": 1.9997,
+      "step": 520
+    },
+    {
+      "epoch": 0.13896171997902465,
+      "grad_norm": 1.0085017681121826,
+      "learning_rate": 0.001992962602304456,
+      "loss": 1.937,
+      "step": 530
+    },
+    {
+      "epoch": 0.1415836392239119,
+      "grad_norm": 0.7549618482589722,
+      "learning_rate": 0.0019918307888420065,
+      "loss": 1.9268,
+      "step": 540
+    },
+    {
+      "epoch": 0.14420555846879915,
+      "grad_norm": 0.8932085037231445,
+      "learning_rate": 0.0019906149893147104,
+      "loss": 1.9014,
+      "step": 550
+    },
+    {
+      "epoch": 0.14682747771368643,
+      "grad_norm": 0.8130724430084229,
+      "learning_rate": 0.001989315306673817,
+      "loss": 1.8577,
+      "step": 560
+    },
+    {
+      "epoch": 0.14944939695857368,
+      "grad_norm": 0.8497139811515808,
+      "learning_rate": 0.0019879318509736137,
+      "loss": 1.8185,
+      "step": 570
+    },
+    {
+      "epoch": 0.15207131620346093,
+      "grad_norm": 0.6299962997436523,
+      "learning_rate": 0.001986464739362106,
+      "loss": 1.811,
+      "step": 580
+    },
+    {
+      "epoch": 0.1546932354483482,
+      "grad_norm": 0.7180768251419067,
+      "learning_rate": 0.0019849140960711024,
+      "loss": 1.7944,
+      "step": 590
+    },
+    {
+      "epoch": 0.15731515469323545,
+      "grad_norm": 0.8082334399223328,
+      "learning_rate": 0.0019832800524056888,
+      "loss": 1.8333,
+      "step": 600
+    },
+    {
+      "epoch": 0.1599370739381227,
+      "grad_norm": 0.8284159302711487,
+      "learning_rate": 0.0019815627467331142,
+      "loss": 1.811,
+      "step": 610
+    },
+    {
+      "epoch": 0.16255899318300995,
+      "grad_norm": 0.7332941293716431,
+      "learning_rate": 0.0019797623244710715,
+      "loss": 1.7704,
+      "step": 620
+    },
+    {
+      "epoch": 0.16518091242789723,
+      "grad_norm": 0.7234723567962646,
+      "learning_rate": 0.0019778789380753862,
+      "loss": 1.7558,
+      "step": 630
+    },
+    {
+      "epoch": 0.16780283167278448,
+      "grad_norm": 0.693242073059082,
+      "learning_rate": 0.001975912747027104,
+      "loss": 1.742,
+      "step": 640
+    },
+    {
+      "epoch": 0.17042475091767173,
+      "grad_norm": 0.8523733019828796,
+      "learning_rate": 0.0019738639178189885,
+      "loss": 1.7438,
+      "step": 650
+    },
+    {
+      "epoch": 0.17304667016255898,
+      "grad_norm": 0.7505561709403992,
+      "learning_rate": 0.001971732623941422,
+      "loss": 1.7251,
+      "step": 660
+    },
+    {
+      "epoch": 0.17566858940744626,
+      "grad_norm": 0.7338821887969971,
+      "learning_rate": 0.0019695190458677144,
+      "loss": 1.7281,
+      "step": 670
+    },
+    {
+      "epoch": 0.1782905086523335,
+      "grad_norm": 0.8278585076332092,
+      "learning_rate": 0.001967223371038823,
+      "loss": 1.6983,
+      "step": 680
+    },
+    {
+      "epoch": 0.18091242789722076,
+      "grad_norm": 0.6785498261451721,
+      "learning_rate": 0.0019648457938474776,
+      "loss": 1.7018,
+      "step": 690
+    },
+    {
+      "epoch": 0.18353434714210803,
+      "grad_norm": 0.7954968810081482,
+      "learning_rate": 0.0019623865156217215,
+      "loss": 1.6978,
+      "step": 700
+    },
+    {
+      "epoch": 0.18615626638699528,
+      "grad_norm": 0.6877925992012024,
+      "learning_rate": 0.001959845744607864,
+      "loss": 1.6693,
+      "step": 710
+    },
+    {
+      "epoch": 0.18877818563188253,
+      "grad_norm": 0.6183112859725952,
+      "learning_rate": 0.001957223695952844,
+      "loss": 1.656,
+      "step": 720
+    },
+    {
+      "epoch": 0.19140010487676978,
+      "grad_norm": 0.6864896416664124,
+      "learning_rate": 0.0019545205916860152,
+      "loss": 1.6188,
+      "step": 730
+    },
+    {
+      "epoch": 0.19402202412165706,
+      "grad_norm": 0.6678555011749268,
+      "learning_rate": 0.0019517366607003429,
+      "loss": 1.6195,
+      "step": 740
+    },
+    {
+      "epoch": 0.1966439433665443,
+      "grad_norm": 0.724320113658905,
+      "learning_rate": 0.0019488721387330222,
+      "loss": 1.6067,
+      "step": 750
+    },
+    {
+      "epoch": 0.19926586261143156,
+      "grad_norm": 0.6665757298469543,
+      "learning_rate": 0.0019459272683455162,
+      "loss": 1.5781,
+      "step": 760
+    },
+    {
+      "epoch": 0.20188778185631884,
+      "grad_norm": 0.7139772772789001,
+      "learning_rate": 0.0019429022989030176,
+      "loss": 1.5647,
+      "step": 770
+    },
+    {
+      "epoch": 0.2045097011012061,
+      "grad_norm": 0.6505457758903503,
+      "learning_rate": 0.0019397974865533315,
+      "loss": 1.5869,
+      "step": 780
+    },
+    {
+      "epoch": 0.20713162034609334,
+      "grad_norm": 0.6815754175186157,
+      "learning_rate": 0.001936613094205186,
+      "loss": 1.5848,
+      "step": 790
+    },
+    {
+      "epoch": 0.2097535395909806,
+      "grad_norm": 0.6977171897888184,
+      "learning_rate": 0.00193334939150597,
+      "loss": 1.5284,
+      "step": 800
+    },
+    {
+      "epoch": 0.21237545883586786,
+      "grad_norm": 0.5965753197669983,
+      "learning_rate": 0.0019300066548188998,
+      "loss": 1.5468,
+      "step": 810
+    },
+    {
+      "epoch": 0.2149973780807551,
+      "grad_norm": 0.596052885055542,
+      "learning_rate": 0.001926585167199616,
+      "loss": 1.5579,
+      "step": 820
+    },
+    {
+      "epoch": 0.21761929732564236,
+      "grad_norm": 0.6821017861366272,
+      "learning_rate": 0.001923085218372218,
+      "loss": 1.4984,
+      "step": 830
+    },
+    {
+      "epoch": 0.22024121657052964,
+      "grad_norm": 0.6523297429084778,
+      "learning_rate": 0.0019195071047047277,
+      "loss": 1.537,
+      "step": 840
+    },
+    {
+      "epoch": 0.2228631358154169,
+      "grad_norm": 0.648935079574585,
+      "learning_rate": 0.0019158511291839945,
+      "loss": 1.5192,
+      "step": 850
+    },
+    {
+      "epoch": 0.22548505506030414,
+      "grad_norm": 0.6102792620658875,
+      "learning_rate": 0.0019121176013900407,
+      "loss": 1.5209,
+      "step": 860
+    },
+    {
+      "epoch": 0.2281069743051914,
+      "grad_norm": 0.6573307514190674,
+      "learning_rate": 0.0019083068374698448,
+      "loss": 1.49,
+      "step": 870
+    },
+    {
+      "epoch": 0.23072889355007867,
+      "grad_norm": 0.6355723738670349,
+      "learning_rate": 0.0019044191601105727,
+      "loss": 1.4929,
+      "step": 880
+    },
+    {
+      "epoch": 0.23335081279496592,
+      "grad_norm": 0.5931225419044495,
+      "learning_rate": 0.0019004548985122511,
+      "loss": 1.4813,
+      "step": 890
+    },
+    {
+      "epoch": 0.23597273203985317,
+      "grad_norm": 0.6640650629997253,
+      "learning_rate": 0.0018964143883598936,
+      "loss": 1.4808,
+      "step": 900
+    },
+    {
+      "epoch": 0.23859465128474042,
+      "grad_norm": 0.6377866268157959,
+      "learning_rate": 0.0018922979717950748,
+      "loss": 1.4901,
+      "step": 910
+    },
+    {
+      "epoch": 0.2412165705296277,
+      "grad_norm": 0.6502982378005981,
+      "learning_rate": 0.0018881059973869581,
+      "loss": 1.4501,
+      "step": 920
+    },
+    {
+      "epoch": 0.24383848977451494,
+      "grad_norm": 0.602969765663147,
+      "learning_rate": 0.0018838388201027805,
+      "loss": 1.4661,
+      "step": 930
+    },
+    {
+      "epoch": 0.2464604090194022,
+      "grad_norm": 0.6061879396438599,
+      "learning_rate": 0.001879496801277794,
+      "loss": 1.4408,
+      "step": 940
+    },
+    {
+      "epoch": 0.24908232826428947,
+      "grad_norm": 0.8049127459526062,
+      "learning_rate": 0.001875080308584669,
+      "loss": 1.4466,
+      "step": 950
+    },
+    {
+      "epoch": 0.2517042475091767,
+      "grad_norm": 0.46771517395973206,
+      "learning_rate": 0.00187058971600236,
+      "loss": 1.4382,
+      "step": 960
+    },
+    {
+      "epoch": 0.254326166754064,
+      "grad_norm": 0.6081333756446838,
+      "learning_rate": 0.001866025403784439,
+      "loss": 1.4518,
+      "step": 970
+    },
+    {
+      "epoch": 0.25694808599895125,
+      "grad_norm": 0.6247040033340454,
+      "learning_rate": 0.0018613877584268944,
+      "loss": 1.4639,
+      "step": 980
+    },
+    {
+      "epoch": 0.2595700052438385,
+      "grad_norm": 0.5699506998062134,
+      "learning_rate": 0.0018566771726354063,
+      "loss": 1.4218,
+      "step": 990
+    },
+    {
+      "epoch": 0.26219192448872575,
+      "grad_norm": 0.5360729694366455,
+      "learning_rate": 0.0018518940452920906,
+      "loss": 1.4189,
+      "step": 1000
+    },
+    {
+      "epoch": 0.264813843733613,
+      "grad_norm": 0.5921474695205688,
+      "learning_rate": 0.0018470387814217232,
+      "loss": 1.424,
+      "step": 1010
+    },
+    {
+      "epoch": 0.26743576297850025,
+      "grad_norm": 0.6162559986114502,
+      "learning_rate": 0.0018421117921574438,
+      "loss": 1.4307,
+      "step": 1020
+    },
+    {
+      "epoch": 0.2700576822233875,
+      "grad_norm": 0.5530286431312561,
+      "learning_rate": 0.001837113494705942,
+      "loss": 1.4158,
+      "step": 1030
+    },
+    {
+      "epoch": 0.2726796014682748,
+      "grad_norm": 0.5585499405860901,
+      "learning_rate": 0.0018320443123121283,
+      "loss": 1.3861,
+      "step": 1040
+    },
+    {
+      "epoch": 0.27530152071316205,
+      "grad_norm": 0.6225973963737488,
+      "learning_rate": 0.0018269046742232966,
+      "loss": 1.3942,
+      "step": 1050
+    },
+    {
+      "epoch": 0.2779234399580493,
+      "grad_norm": 0.49642321467399597,
+      "learning_rate": 0.0018216950156527737,
+      "loss": 1.3912,
+      "step": 1060
+    },
+    {
+      "epoch": 0.28054535920293655,
+      "grad_norm": 0.6089576482772827,
+      "learning_rate": 0.0018164157777430681,
+      "loss": 1.3732,
+      "step": 1070
+    },
+    {
+      "epoch": 0.2831672784478238,
+      "grad_norm": 0.5753847360610962,
+      "learning_rate": 0.0018110674075285157,
+      "loss": 1.398,
+      "step": 1080
+    },
+    {
+      "epoch": 0.28578919769271105,
+      "grad_norm": 0.5357734560966492,
+      "learning_rate": 0.0018056503578974242,
+      "loss": 1.3851,
+      "step": 1090
+    },
+    {
+      "epoch": 0.2884111169375983,
+      "grad_norm": 0.5319791436195374,
+      "learning_rate": 0.001800165087553724,
+      "loss": 1.3804,
+      "step": 1100
+    },
+    {
+      "epoch": 0.2910330361824856,
+      "grad_norm": 0.5765709280967712,
+      "learning_rate": 0.0017946120609781276,
+      "loss": 1.3534,
+      "step": 1110
+    },
+    {
+      "epoch": 0.29365495542737285,
+      "grad_norm": 0.48765453696250916,
+      "learning_rate": 0.001788991748388796,
+      "loss": 1.3693,
+      "step": 1120
+    },
+    {
+      "epoch": 0.2962768746722601,
+      "grad_norm": 0.5916075110435486,
+      "learning_rate": 0.001783304625701524,
+      "loss": 1.3697,
+      "step": 1130
+    },
+    {
+      "epoch": 0.29889879391714735,
+      "grad_norm": 0.411699503660202,
+      "learning_rate": 0.0017775511744894384,
+      "loss": 1.3588,
+      "step": 1140
+    },
+    {
+      "epoch": 0.3015207131620346,
+      "grad_norm": 0.5155631899833679,
+      "learning_rate": 0.0017717318819422214,
+      "loss": 1.3697,
+      "step": 1150
+    },
+    {
+      "epoch": 0.30414263240692185,
+      "grad_norm": 0.5687488913536072,
+      "learning_rate": 0.0017658472408248551,
+      "loss": 1.3558,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3067645516518091,
+      "grad_norm": 0.5609891414642334,
+      "learning_rate": 0.0017598977494358967,
+      "loss": 1.3376,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3093864708966964,
+      "grad_norm": 0.5137512683868408,
+      "learning_rate": 0.0017538839115652817,
+      "loss": 1.3534,
+      "step": 1180
+    },
+    {
+      "epoch": 0.31200839014158366,
+      "grad_norm": 0.5840641260147095,
+      "learning_rate": 0.001747806236451666,
+      "loss": 1.3394,
+      "step": 1190
+    },
+    {
+      "epoch": 0.3146303093864709,
+      "grad_norm": 0.5758949518203735,
+      "learning_rate": 0.0017416652387393027,
+      "loss": 1.3417,
+      "step": 1200
+    },
+    {
+      "epoch": 0.31725222863135816,
+      "grad_norm": 0.5121742486953735,
+      "learning_rate": 0.0017354614384344658,
+      "loss": 1.341,
+      "step": 1210
+    },
+    {
+      "epoch": 0.3198741478762454,
+      "grad_norm": 0.5056650638580322,
+      "learning_rate": 0.001729195360861414,
+      "loss": 1.316,
+      "step": 1220
+    },
+    {
+      "epoch": 0.32249606712113266,
+      "grad_norm": 0.4782615602016449,
+      "learning_rate": 0.0017228675366179106,
+      "loss": 1.3226,
+      "step": 1230
+    },
+    {
+      "epoch": 0.3251179863660199,
+      "grad_norm": 0.49403342604637146,
+      "learning_rate": 0.0017164785015302906,
+      "loss": 1.37,
+      "step": 1240
+    },
+    {
+      "epoch": 0.3277399056109072,
+      "grad_norm": 0.4836321175098419,
+      "learning_rate": 0.0017100287966080906,
+      "loss": 1.3272,
+      "step": 1250
+    },
+    {
+      "epoch": 0.33036182485579446,
+      "grad_norm": 0.48174890875816345,
+      "learning_rate": 0.001703518967998236,
+      "loss": 1.3148,
+      "step": 1260
+    },
+    {
+      "epoch": 0.3329837441006817,
+      "grad_norm": 0.4627121090888977,
+      "learning_rate": 0.001696949566938795,
+      "loss": 1.3161,
+      "step": 1270
+    },
+    {
+      "epoch": 0.33560566334556896,
+      "grad_norm": 0.470414936542511,
+      "learning_rate": 0.0016903211497123003,
+      "loss": 1.3313,
+      "step": 1280
+    },
+    {
+      "epoch": 0.3382275825904562,
+      "grad_norm": 0.4437310993671417,
+      "learning_rate": 0.0016836342775986446,
+      "loss": 1.3073,
+      "step": 1290
+    },
+    {
+      "epoch": 0.34084950183534346,
+      "grad_norm": 0.47688329219818115,
+      "learning_rate": 0.0016768895168275534,
+      "loss": 1.3128,
+      "step": 1300
+    },
+    {
+      "epoch": 0.3434714210802307,
+      "grad_norm": 0.5143507122993469,
+      "learning_rate": 0.0016700874385306363,
+      "loss": 1.3357,
+      "step": 1310
+    },
+    {
+      "epoch": 0.34609334032511796,
+      "grad_norm": 0.4100657105445862,
+      "learning_rate": 0.0016632286186930275,
+      "loss": 1.3061,
+      "step": 1320
+    },
+    {
+      "epoch": 0.34871525957000526,
+      "grad_norm": 0.4421868920326233,
+      "learning_rate": 0.0016563136381046088,
+      "loss": 1.3158,
+      "step": 1330
+    },
+    {
+      "epoch": 0.3513371788148925,
+      "grad_norm": 0.4668099582195282,
+      "learning_rate": 0.0016493430823108332,
+      "loss": 1.3088,
+      "step": 1340
+    },
+    {
+      "epoch": 0.35395909805977976,
+      "grad_norm": 0.5451709032058716,
+      "learning_rate": 0.0016423175415631404,
+      "loss": 1.3344,
+      "step": 1350
+    },
+    {
+      "epoch": 0.356581017304667,
+      "grad_norm": 0.45294106006622314,
+      "learning_rate": 0.0016352376107689754,
+      "loss": 1.2778,
+      "step": 1360
+    },
+    {
+      "epoch": 0.35920293654955426,
+      "grad_norm": 0.4404051601886749,
+      "learning_rate": 0.0016281038894414143,
+      "loss": 1.2871,
+      "step": 1370
+    },
+    {
+      "epoch": 0.3618248557944415,
+      "grad_norm": 0.45863279700279236,
+      "learning_rate": 0.0016209169816483971,
+      "loss": 1.3286,
+      "step": 1380
+    },
+    {
+      "epoch": 0.36444677503932876,
+      "grad_norm": 0.45011425018310547,
+      "learning_rate": 0.0016136774959615784,
+      "loss": 1.2979,
+      "step": 1390
+    },
+    {
+      "epoch": 0.36706869428421607,
+      "grad_norm": 0.5113876461982727,
+      "learning_rate": 0.0016063860454047943,
+      "loss": 1.3088,
+      "step": 1400
+    },
+    {
+      "epoch": 0.3696906135291033,
+      "grad_norm": 0.40740302205085754,
+      "learning_rate": 0.001599043247402151,
+      "loss": 1.2703,
+      "step": 1410
+    },
+    {
+      "epoch": 0.37231253277399057,
+      "grad_norm": 0.4261358976364136,
+      "learning_rate": 0.0015916497237257455,
+      "loss": 1.2681,
+      "step": 1420
+    },
+    {
+      "epoch": 0.3749344520188778,
+      "grad_norm": 0.4349290132522583,
+      "learning_rate": 0.0015842061004430145,
+      "loss": 1.317,
+      "step": 1430
+    },
+    {
+      "epoch": 0.37755637126376507,
+      "grad_norm": 0.4363626539707184,
+      "learning_rate": 0.0015767130078637183,
+      "loss": 1.2707,
+      "step": 1440
+    },
+    {
+      "epoch": 0.3801782905086523,
+      "grad_norm": 0.41238006949424744,
+      "learning_rate": 0.0015691710804865706,
+      "loss": 1.2763,
+      "step": 1450
+    },
+    {
+      "epoch": 0.38280020975353957,
+      "grad_norm": 0.476226270198822,
+      "learning_rate": 0.0015615809569455089,
+      "loss": 1.3037,
+      "step": 1460
+    },
+    {
+      "epoch": 0.38542212899842687,
+      "grad_norm": 0.45900896191596985,
+      "learning_rate": 0.0015539432799556159,
+      "loss": 1.287,
+      "step": 1470
+    },
+    {
+      "epoch": 0.3880440482433141,
+      "grad_norm": 0.3873949348926544,
+      "learning_rate": 0.0015462586962586972,
+      "loss": 1.2793,
+      "step": 1480
+    },
+    {
+      "epoch": 0.39066596748820137,
+      "grad_norm": 0.4380306601524353,
+      "learning_rate": 0.001538527856568515,
+      "loss": 1.2916,
+      "step": 1490
+    },
+    {
+      "epoch": 0.3932878867330886,
+      "grad_norm": 0.39479300379753113,
+      "learning_rate": 0.0015307514155156895,
+      "loss": 1.272,
+      "step": 1500
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3814,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.0095396499845284e+18,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:81c91fc77496ffd4fb9e10a9ec46455f5246c9c164ddf0a8d2f8b08013987959
+size 5777