Training in progress, step 5000

Browse files

Files changed (12) hide show

config.json +28 -0
model.safetensors +3 -0
optimizer.pt +3 -0
rng_state_0.pth +3 -0
rng_state_1.pth +3 -0
runs/Jun07_20-36-29_829f6f605e43/events.out.tfevents.1717792718.829f6f605e43.85.0 +3 -0
scheduler.pt +3 -0
special_tokens_map.json +37 -0
tokenizer.json +0 -0
tokenizer_config.json +53 -0
trainer_state.json +1792 -0
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "architectures": [
+    "AlbertForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0,
+  "bos_token_id": 2,
+  "classifier_dropout_prob": 0.1,
+  "embedding_size": 128,
+  "eos_token_id": 3,
+  "hidden_act": "gelu_new",
+  "hidden_dropout_prob": 0,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "inner_group_num": 1,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 32,
+  "model_type": "albert",
+  "num_attention_heads": 6,
+  "num_hidden_groups": 1,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.2",
+  "type_vocab_size": 2,
+  "vocab_size": 30000
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8abaa42d3463b1fade9c9a8962318359c17b20036209deee3f1533c022f6ff45
+size 44644496

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82254252c8439c400690d9e912a9b41091500366183fc979cee1cef9b2d35ddb
+size 11230198

rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6cb795a5cea0baa625c50007a6c9da09c6bbb5c16b560424070384a479e7d8a6
+size 14512

rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f19604377bd828eb366c68946ad997a4ff4d69beaeea93ee58915135768ec63
+size 14512

runs/Jun07_20-36-29_829f6f605e43/events.out.tfevents.1717792718.829f6f605e43.85.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c5f5a58aa39d257803087286f6970f617b220fb828688a142a7a6f6b2ebe0a9a
+size 57851

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe25b65e375d09c26af85ce4a53e5909be64fae7753341d624d8971238990a51
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,53 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "model_max_length": 8192,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "[UNK]",
+  "use_fast_tokenizer": true
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1792 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.0904895484571532,
+  "eval_steps": 5000,
+  "global_step": 5000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0003619581938286128,
+      "grad_norm": 1.063517451286316,
+      "learning_rate": 1.0858418290402365e-06,
+      "loss": 10.3292,
+      "step": 20
+    },
+    {
+      "epoch": 0.0007239163876572256,
+      "grad_norm": 0.7639716267585754,
+      "learning_rate": 2.171683658080473e-06,
+      "loss": 10.3095,
+      "step": 40
+    },
+    {
+      "epoch": 0.0010858745814858383,
+      "grad_norm": 0.5479791760444641,
+      "learning_rate": 3.2575254871207094e-06,
+      "loss": 10.2858,
+      "step": 60
+    },
+    {
+      "epoch": 0.0014478327753144511,
+      "grad_norm": 0.5541791319847107,
+      "learning_rate": 4.343367316160946e-06,
+      "loss": 10.2636,
+      "step": 80
+    },
+    {
+      "epoch": 0.001809790969143064,
+      "grad_norm": 0.5530744791030884,
+      "learning_rate": 5.429209145201182e-06,
+      "loss": 10.2491,
+      "step": 100
+    },
+    {
+      "epoch": 0.0021717491629716767,
+      "grad_norm": 0.5137726068496704,
+      "learning_rate": 6.515050974241419e-06,
+      "loss": 10.2343,
+      "step": 120
+    },
+    {
+      "epoch": 0.0025337073568002895,
+      "grad_norm": 0.512411892414093,
+      "learning_rate": 7.600892803281655e-06,
+      "loss": 10.2178,
+      "step": 140
+    },
+    {
+      "epoch": 0.0028956655506289022,
+      "grad_norm": 0.5807462334632874,
+      "learning_rate": 8.686734632321892e-06,
+      "loss": 10.1966,
+      "step": 160
+    },
+    {
+      "epoch": 0.003257623744457515,
+      "grad_norm": 0.7059533596038818,
+      "learning_rate": 9.772576461362129e-06,
+      "loss": 10.1748,
+      "step": 180
+    },
+    {
+      "epoch": 0.003619581938286128,
+      "grad_norm": 1.3133445978164673,
+      "learning_rate": 1.0858418290402365e-05,
+      "loss": 10.1451,
+      "step": 200
+    },
+    {
+      "epoch": 0.003981540132114741,
+      "grad_norm": 1.6613627672195435,
+      "learning_rate": 1.1944260119442602e-05,
+      "loss": 10.1148,
+      "step": 220
+    },
+    {
+      "epoch": 0.004343498325943353,
+      "grad_norm": 1.7141027450561523,
+      "learning_rate": 1.3030101948482837e-05,
+      "loss": 10.0838,
+      "step": 240
+    },
+    {
+      "epoch": 0.004705456519771966,
+      "grad_norm": 2.0595993995666504,
+      "learning_rate": 1.4115943777523073e-05,
+      "loss": 10.0519,
+      "step": 260
+    },
+    {
+      "epoch": 0.005067414713600579,
+      "grad_norm": 2.2947471141815186,
+      "learning_rate": 1.520178560656331e-05,
+      "loss": 10.0166,
+      "step": 280
+    },
+    {
+      "epoch": 0.005429372907429192,
+      "grad_norm": 2.2053394317626953,
+      "learning_rate": 1.628762743560355e-05,
+      "loss": 9.9809,
+      "step": 300
+    },
+    {
+      "epoch": 0.0057913311012578045,
+      "grad_norm": 2.2479448318481445,
+      "learning_rate": 1.7373469264643783e-05,
+      "loss": 9.9443,
+      "step": 320
+    },
+    {
+      "epoch": 0.006153289295086417,
+      "grad_norm": 1.9384366273880005,
+      "learning_rate": 1.845931109368402e-05,
+      "loss": 9.9023,
+      "step": 340
+    },
+    {
+      "epoch": 0.00651524748891503,
+      "grad_norm": 1.7514017820358276,
+      "learning_rate": 1.9545152922724258e-05,
+      "loss": 9.8601,
+      "step": 360
+    },
+    {
+      "epoch": 0.006877205682743643,
+      "grad_norm": 1.5420582294464111,
+      "learning_rate": 2.0630994751764492e-05,
+      "loss": 9.8214,
+      "step": 380
+    },
+    {
+      "epoch": 0.007239163876572256,
+      "grad_norm": 1.223652958869934,
+      "learning_rate": 2.171683658080473e-05,
+      "loss": 9.7768,
+      "step": 400
+    },
+    {
+      "epoch": 0.007601122070400868,
+      "grad_norm": 1.058700442314148,
+      "learning_rate": 2.2802678409844966e-05,
+      "loss": 9.7302,
+      "step": 420
+    },
+    {
+      "epoch": 0.007963080264229481,
+      "grad_norm": 1.0271687507629395,
+      "learning_rate": 2.3888520238885204e-05,
+      "loss": 9.6882,
+      "step": 440
+    },
+    {
+      "epoch": 0.008325038458058095,
+      "grad_norm": 0.9583302140235901,
+      "learning_rate": 2.4974362067925438e-05,
+      "loss": 9.6446,
+      "step": 460
+    },
+    {
+      "epoch": 0.008686996651886707,
+      "grad_norm": 0.8648065328598022,
+      "learning_rate": 2.6060203896965675e-05,
+      "loss": 9.6003,
+      "step": 480
+    },
+    {
+      "epoch": 0.00904895484571532,
+      "grad_norm": 0.7204363346099854,
+      "learning_rate": 2.7146045726005912e-05,
+      "loss": 9.5586,
+      "step": 500
+    },
+    {
+      "epoch": 0.009410913039543932,
+      "grad_norm": 0.8680516481399536,
+      "learning_rate": 2.8231887555046146e-05,
+      "loss": 9.5111,
+      "step": 520
+    },
+    {
+      "epoch": 0.009772871233372546,
+      "grad_norm": 0.6860862374305725,
+      "learning_rate": 2.9317729384086387e-05,
+      "loss": 9.4771,
+      "step": 540
+    },
+    {
+      "epoch": 0.010134829427201158,
+      "grad_norm": 0.711931049823761,
+      "learning_rate": 3.040357121312662e-05,
+      "loss": 9.4369,
+      "step": 560
+    },
+    {
+      "epoch": 0.010496787621029771,
+      "grad_norm": 0.7360084652900696,
+      "learning_rate": 3.148941304216686e-05,
+      "loss": 9.3904,
+      "step": 580
+    },
+    {
+      "epoch": 0.010858745814858383,
+      "grad_norm": 0.7077947854995728,
+      "learning_rate": 3.25752548712071e-05,
+      "loss": 9.3552,
+      "step": 600
+    },
+    {
+      "epoch": 0.011220704008686997,
+      "grad_norm": 0.562315821647644,
+      "learning_rate": 3.366109670024733e-05,
+      "loss": 9.3164,
+      "step": 620
+    },
+    {
+      "epoch": 0.011582662202515609,
+      "grad_norm": 0.6069725751876831,
+      "learning_rate": 3.4746938529287566e-05,
+      "loss": 9.2838,
+      "step": 640
+    },
+    {
+      "epoch": 0.011944620396344223,
+      "grad_norm": 0.6498883962631226,
+      "learning_rate": 3.583278035832781e-05,
+      "loss": 9.2487,
+      "step": 660
+    },
+    {
+      "epoch": 0.012306578590172835,
+      "grad_norm": 0.615738034248352,
+      "learning_rate": 3.691862218736804e-05,
+      "loss": 9.2222,
+      "step": 680
+    },
+    {
+      "epoch": 0.012668536784001448,
+      "grad_norm": 0.5743957757949829,
+      "learning_rate": 3.8004464016408275e-05,
+      "loss": 9.1898,
+      "step": 700
+    },
+    {
+      "epoch": 0.01303049497783006,
+      "grad_norm": 0.5580448508262634,
+      "learning_rate": 3.9090305845448516e-05,
+      "loss": 9.1659,
+      "step": 720
+    },
+    {
+      "epoch": 0.013392453171658674,
+      "grad_norm": 0.5862032771110535,
+      "learning_rate": 4.017614767448875e-05,
+      "loss": 9.1376,
+      "step": 740
+    },
+    {
+      "epoch": 0.013754411365487286,
+      "grad_norm": 0.650966465473175,
+      "learning_rate": 4.1261989503528983e-05,
+      "loss": 9.1042,
+      "step": 760
+    },
+    {
+      "epoch": 0.0141163695593159,
+      "grad_norm": 0.6448431015014648,
+      "learning_rate": 4.2347831332569224e-05,
+      "loss": 9.0836,
+      "step": 780
+    },
+    {
+      "epoch": 0.014478327753144511,
+      "grad_norm": 0.5597690939903259,
+      "learning_rate": 4.343367316160946e-05,
+      "loss": 9.0728,
+      "step": 800
+    },
+    {
+      "epoch": 0.014840285946973125,
+      "grad_norm": 0.5503562092781067,
+      "learning_rate": 4.451951499064969e-05,
+      "loss": 9.0544,
+      "step": 820
+    },
+    {
+      "epoch": 0.015202244140801737,
+      "grad_norm": 0.5494912266731262,
+      "learning_rate": 4.560535681968993e-05,
+      "loss": 9.0229,
+      "step": 840
+    },
+    {
+      "epoch": 0.01556420233463035,
+      "grad_norm": 0.5567879676818848,
+      "learning_rate": 4.6691198648730167e-05,
+      "loss": 9.016,
+      "step": 860
+    },
+    {
+      "epoch": 0.015926160528458962,
+      "grad_norm": 0.5893707275390625,
+      "learning_rate": 4.777704047777041e-05,
+      "loss": 8.9989,
+      "step": 880
+    },
+    {
+      "epoch": 0.016288118722287574,
+      "grad_norm": 0.5258359909057617,
+      "learning_rate": 4.886288230681064e-05,
+      "loss": 8.9894,
+      "step": 900
+    },
+    {
+      "epoch": 0.01665007691611619,
+      "grad_norm": 0.5892924070358276,
+      "learning_rate": 4.9948724135850875e-05,
+      "loss": 8.9762,
+      "step": 920
+    },
+    {
+      "epoch": 0.0170120351099448,
+      "grad_norm": 0.6365529894828796,
+      "learning_rate": 5.1034565964891116e-05,
+      "loss": 8.9545,
+      "step": 940
+    },
+    {
+      "epoch": 0.017373993303773413,
+      "grad_norm": 0.5396739840507507,
+      "learning_rate": 5.212040779393135e-05,
+      "loss": 8.9387,
+      "step": 960
+    },
+    {
+      "epoch": 0.017735951497602025,
+      "grad_norm": 0.5578063130378723,
+      "learning_rate": 5.3206249622971584e-05,
+      "loss": 8.9452,
+      "step": 980
+    },
+    {
+      "epoch": 0.01809790969143064,
+      "grad_norm": 0.49258968234062195,
+      "learning_rate": 5.4292091452011824e-05,
+      "loss": 8.9329,
+      "step": 1000
+    },
+    {
+      "epoch": 0.018459867885259253,
+      "grad_norm": 0.5692590475082397,
+      "learning_rate": 5.537793328105206e-05,
+      "loss": 8.9197,
+      "step": 1020
+    },
+    {
+      "epoch": 0.018821826079087865,
+      "grad_norm": 0.5728791356086731,
+      "learning_rate": 5.646377511009229e-05,
+      "loss": 8.915,
+      "step": 1040
+    },
+    {
+      "epoch": 0.019183784272916476,
+      "grad_norm": 0.4755489230155945,
+      "learning_rate": 5.754961693913253e-05,
+      "loss": 8.8982,
+      "step": 1060
+    },
+    {
+      "epoch": 0.019545742466745092,
+      "grad_norm": 0.5339570641517639,
+      "learning_rate": 5.8635458768172773e-05,
+      "loss": 8.896,
+      "step": 1080
+    },
+    {
+      "epoch": 0.019907700660573704,
+      "grad_norm": 0.58181232213974,
+      "learning_rate": 5.9721300597213e-05,
+      "loss": 8.8863,
+      "step": 1100
+    },
+    {
+      "epoch": 0.020269658854402316,
+      "grad_norm": 0.5068308115005493,
+      "learning_rate": 6.080714242625324e-05,
+      "loss": 8.8815,
+      "step": 1120
+    },
+    {
+      "epoch": 0.020631617048230928,
+      "grad_norm": 0.5282208919525146,
+      "learning_rate": 6.189298425529349e-05,
+      "loss": 8.8762,
+      "step": 1140
+    },
+    {
+      "epoch": 0.020993575242059543,
+      "grad_norm": 0.6251116991043091,
+      "learning_rate": 6.297882608433372e-05,
+      "loss": 8.875,
+      "step": 1160
+    },
+    {
+      "epoch": 0.021355533435888155,
+      "grad_norm": 0.7021293044090271,
+      "learning_rate": 6.406466791337396e-05,
+      "loss": 8.8549,
+      "step": 1180
+    },
+    {
+      "epoch": 0.021717491629716767,
+      "grad_norm": 0.5882667303085327,
+      "learning_rate": 6.51505097424142e-05,
+      "loss": 8.8602,
+      "step": 1200
+    },
+    {
+      "epoch": 0.022079449823545382,
+      "grad_norm": 0.552535891532898,
+      "learning_rate": 6.623635157145442e-05,
+      "loss": 8.8478,
+      "step": 1220
+    },
+    {
+      "epoch": 0.022441408017373994,
+      "grad_norm": 0.5536801218986511,
+      "learning_rate": 6.732219340049467e-05,
+      "loss": 8.8465,
+      "step": 1240
+    },
+    {
+      "epoch": 0.022803366211202606,
+      "grad_norm": 0.7319750189781189,
+      "learning_rate": 6.84080352295349e-05,
+      "loss": 8.8456,
+      "step": 1260
+    },
+    {
+      "epoch": 0.023165324405031218,
+      "grad_norm": 0.6104538440704346,
+      "learning_rate": 6.949387705857513e-05,
+      "loss": 8.8237,
+      "step": 1280
+    },
+    {
+      "epoch": 0.023527282598859833,
+      "grad_norm": 0.596066415309906,
+      "learning_rate": 7.057971888761537e-05,
+      "loss": 8.8247,
+      "step": 1300
+    },
+    {
+      "epoch": 0.023889240792688445,
+      "grad_norm": 0.516325056552887,
+      "learning_rate": 7.166556071665561e-05,
+      "loss": 8.8194,
+      "step": 1320
+    },
+    {
+      "epoch": 0.024251198986517057,
+      "grad_norm": 0.5828744173049927,
+      "learning_rate": 7.275140254569584e-05,
+      "loss": 8.8158,
+      "step": 1340
+    },
+    {
+      "epoch": 0.02461315718034567,
+      "grad_norm": 0.6400988101959229,
+      "learning_rate": 7.383724437473608e-05,
+      "loss": 8.797,
+      "step": 1360
+    },
+    {
+      "epoch": 0.024975115374174284,
+      "grad_norm": 0.6185368895530701,
+      "learning_rate": 7.492308620377632e-05,
+      "loss": 8.798,
+      "step": 1380
+    },
+    {
+      "epoch": 0.025337073568002896,
+      "grad_norm": 0.5269763469696045,
+      "learning_rate": 7.600892803281655e-05,
+      "loss": 8.7835,
+      "step": 1400
+    },
+    {
+      "epoch": 0.025699031761831508,
+      "grad_norm": 0.5572532415390015,
+      "learning_rate": 7.709476986185679e-05,
+      "loss": 8.7743,
+      "step": 1420
+    },
+    {
+      "epoch": 0.02606098995566012,
+      "grad_norm": 0.5819870233535767,
+      "learning_rate": 7.818061169089703e-05,
+      "loss": 8.7728,
+      "step": 1440
+    },
+    {
+      "epoch": 0.026422948149488736,
+      "grad_norm": 0.724423348903656,
+      "learning_rate": 7.926645351993726e-05,
+      "loss": 8.7537,
+      "step": 1460
+    },
+    {
+      "epoch": 0.026784906343317347,
+      "grad_norm": 0.729290246963501,
+      "learning_rate": 8.03522953489775e-05,
+      "loss": 8.7566,
+      "step": 1480
+    },
+    {
+      "epoch": 0.02714686453714596,
+      "grad_norm": 0.6978908777236938,
+      "learning_rate": 8.143813717801774e-05,
+      "loss": 8.7412,
+      "step": 1500
+    },
+    {
+      "epoch": 0.02750882273097457,
+      "grad_norm": 0.9260867238044739,
+      "learning_rate": 8.252397900705797e-05,
+      "loss": 8.7469,
+      "step": 1520
+    },
+    {
+      "epoch": 0.027870780924803187,
+      "grad_norm": 0.8353539109230042,
+      "learning_rate": 8.360982083609821e-05,
+      "loss": 8.726,
+      "step": 1540
+    },
+    {
+      "epoch": 0.0282327391186318,
+      "grad_norm": 0.6905779242515564,
+      "learning_rate": 8.469566266513845e-05,
+      "loss": 8.7128,
+      "step": 1560
+    },
+    {
+      "epoch": 0.02859469731246041,
+      "grad_norm": 0.6228475570678711,
+      "learning_rate": 8.578150449417868e-05,
+      "loss": 8.7487,
+      "step": 1580
+    },
+    {
+      "epoch": 0.028956655506289022,
+      "grad_norm": 0.9750573635101318,
+      "learning_rate": 8.686734632321892e-05,
+      "loss": 8.7054,
+      "step": 1600
+    },
+    {
+      "epoch": 0.029318613700117638,
+      "grad_norm": 0.874622106552124,
+      "learning_rate": 8.795318815225916e-05,
+      "loss": 8.715,
+      "step": 1620
+    },
+    {
+      "epoch": 0.02968057189394625,
+      "grad_norm": 0.7310810685157776,
+      "learning_rate": 8.903902998129938e-05,
+      "loss": 8.6835,
+      "step": 1640
+    },
+    {
+      "epoch": 0.03004253008777486,
+      "grad_norm": 0.7120797038078308,
+      "learning_rate": 9.012487181033962e-05,
+      "loss": 8.6734,
+      "step": 1660
+    },
+    {
+      "epoch": 0.030404488281603474,
+      "grad_norm": 1.0088528394699097,
+      "learning_rate": 9.121071363937987e-05,
+      "loss": 8.6657,
+      "step": 1680
+    },
+    {
+      "epoch": 0.03076644647543209,
+      "grad_norm": 1.4212431907653809,
+      "learning_rate": 9.229655546842009e-05,
+      "loss": 8.6429,
+      "step": 1700
+    },
+    {
+      "epoch": 0.0311284046692607,
+      "grad_norm": 1.2623440027236938,
+      "learning_rate": 9.338239729746033e-05,
+      "loss": 8.6424,
+      "step": 1720
+    },
+    {
+      "epoch": 0.031490362863089316,
+      "grad_norm": 1.190894365310669,
+      "learning_rate": 9.446823912650057e-05,
+      "loss": 8.616,
+      "step": 1740
+    },
+    {
+      "epoch": 0.031852321056917925,
+      "grad_norm": 0.8714050054550171,
+      "learning_rate": 9.555408095554081e-05,
+      "loss": 8.6261,
+      "step": 1760
+    },
+    {
+      "epoch": 0.03221427925074654,
+      "grad_norm": 0.7428932189941406,
+      "learning_rate": 9.663992278458104e-05,
+      "loss": 8.6054,
+      "step": 1780
+    },
+    {
+      "epoch": 0.03257623744457515,
+      "grad_norm": 0.8327052593231201,
+      "learning_rate": 9.772576461362128e-05,
+      "loss": 8.6072,
+      "step": 1800
+    },
+    {
+      "epoch": 0.032938195638403764,
+      "grad_norm": 0.7944102883338928,
+      "learning_rate": 9.881160644266152e-05,
+      "loss": 8.5757,
+      "step": 1820
+    },
+    {
+      "epoch": 0.03330015383223238,
+      "grad_norm": 0.929286003112793,
+      "learning_rate": 9.989744827170175e-05,
+      "loss": 8.5651,
+      "step": 1840
+    },
+    {
+      "epoch": 0.03366211202606099,
+      "grad_norm": 0.9866963028907776,
+      "learning_rate": 0.00010098329010074199,
+      "loss": 8.55,
+      "step": 1860
+    },
+    {
+      "epoch": 0.0340240702198896,
+      "grad_norm": 1.5684864521026611,
+      "learning_rate": 0.00010206913192978223,
+      "loss": 8.5508,
+      "step": 1880
+    },
+    {
+      "epoch": 0.03438602841371822,
+      "grad_norm": 1.268312692642212,
+      "learning_rate": 0.00010315497375882246,
+      "loss": 8.5303,
+      "step": 1900
+    },
+    {
+      "epoch": 0.03474798660754683,
+      "grad_norm": 1.384734034538269,
+      "learning_rate": 0.0001042408155878627,
+      "loss": 8.526,
+      "step": 1920
+    },
+    {
+      "epoch": 0.03510994480137544,
+      "grad_norm": 1.1619597673416138,
+      "learning_rate": 0.00010532665741690294,
+      "loss": 8.5083,
+      "step": 1940
+    },
+    {
+      "epoch": 0.03547190299520405,
+      "grad_norm": 1.4747376441955566,
+      "learning_rate": 0.00010641249924594317,
+      "loss": 8.492,
+      "step": 1960
+    },
+    {
+      "epoch": 0.035833861189032666,
+      "grad_norm": 1.1740059852600098,
+      "learning_rate": 0.00010749834107498341,
+      "loss": 8.4921,
+      "step": 1980
+    },
+    {
+      "epoch": 0.03619581938286128,
+      "grad_norm": 1.6106988191604614,
+      "learning_rate": 0.00010858418290402365,
+      "loss": 8.4829,
+      "step": 2000
+    },
+    {
+      "epoch": 0.03655777757668989,
+      "grad_norm": 1.2826563119888306,
+      "learning_rate": 0.00010967002473306388,
+      "loss": 8.4542,
+      "step": 2020
+    },
+    {
+      "epoch": 0.036919735770518505,
+      "grad_norm": 1.4442939758300781,
+      "learning_rate": 0.00011075586656210412,
+      "loss": 8.4472,
+      "step": 2040
+    },
+    {
+      "epoch": 0.03728169396434712,
+      "grad_norm": 1.3481926918029785,
+      "learning_rate": 0.00011184170839114436,
+      "loss": 8.4245,
+      "step": 2060
+    },
+    {
+      "epoch": 0.03764365215817573,
+      "grad_norm": 1.377825379371643,
+      "learning_rate": 0.00011292755022018458,
+      "loss": 8.4233,
+      "step": 2080
+    },
+    {
+      "epoch": 0.038005610352004345,
+      "grad_norm": 1.656119465827942,
+      "learning_rate": 0.00011401339204922482,
+      "loss": 8.4225,
+      "step": 2100
+    },
+    {
+      "epoch": 0.03836756854583295,
+      "grad_norm": 1.5955251455307007,
+      "learning_rate": 0.00011509923387826507,
+      "loss": 8.4216,
+      "step": 2120
+    },
+    {
+      "epoch": 0.03872952673966157,
+      "grad_norm": 1.6190309524536133,
+      "learning_rate": 0.0001161850757073053,
+      "loss": 8.4182,
+      "step": 2140
+    },
+    {
+      "epoch": 0.039091484933490184,
+      "grad_norm": 1.4823400974273682,
+      "learning_rate": 0.00011727091753634555,
+      "loss": 8.3738,
+      "step": 2160
+    },
+    {
+      "epoch": 0.03945344312731879,
+      "grad_norm": 4.104274749755859,
+      "learning_rate": 0.00011835675936538576,
+      "loss": 8.3636,
+      "step": 2180
+    },
+    {
+      "epoch": 0.03981540132114741,
+      "grad_norm": 1.6707680225372314,
+      "learning_rate": 0.000119442601194426,
+      "loss": 8.3745,
+      "step": 2200
+    },
+    {
+      "epoch": 0.04017735951497602,
+      "grad_norm": 1.1206501722335815,
+      "learning_rate": 0.00012052844302346624,
+      "loss": 8.353,
+      "step": 2220
+    },
+    {
+      "epoch": 0.04053931770880463,
+      "grad_norm": 2.2229607105255127,
+      "learning_rate": 0.00012161428485250648,
+      "loss": 8.3539,
+      "step": 2240
+    },
+    {
+      "epoch": 0.04090127590263325,
+      "grad_norm": 1.9011199474334717,
+      "learning_rate": 0.00012270012668154674,
+      "loss": 8.3293,
+      "step": 2260
+    },
+    {
+      "epoch": 0.041263234096461855,
+      "grad_norm": 2.2467918395996094,
+      "learning_rate": 0.00012378596851058698,
+      "loss": 8.326,
+      "step": 2280
+    },
+    {
+      "epoch": 0.04162519229029047,
+      "grad_norm": 1.530720829963684,
+      "learning_rate": 0.0001248718103396272,
+      "loss": 8.3197,
+      "step": 2300
+    },
+    {
+      "epoch": 0.041987150484119086,
+      "grad_norm": 2.6763076782226562,
+      "learning_rate": 0.00012595765216866743,
+      "loss": 8.3004,
+      "step": 2320
+    },
+    {
+      "epoch": 0.042349108677947694,
+      "grad_norm": 2.142010450363159,
+      "learning_rate": 0.00012704349399770767,
+      "loss": 8.3065,
+      "step": 2340
+    },
+    {
+      "epoch": 0.04271106687177631,
+      "grad_norm": 2.1896350383758545,
+      "learning_rate": 0.0001281293358267479,
+      "loss": 8.2688,
+      "step": 2360
+    },
+    {
+      "epoch": 0.043073025065604925,
+      "grad_norm": 2.1078433990478516,
+      "learning_rate": 0.00012921517765578815,
+      "loss": 8.2639,
+      "step": 2380
+    },
+    {
+      "epoch": 0.043434983259433534,
+      "grad_norm": 1.8464548587799072,
+      "learning_rate": 0.0001303010194848284,
+      "loss": 8.2638,
+      "step": 2400
+    },
+    {
+      "epoch": 0.04379694145326215,
+      "grad_norm": 2.71945858001709,
+      "learning_rate": 0.0001313868613138686,
+      "loss": 8.2516,
+      "step": 2420
+    },
+    {
+      "epoch": 0.044158899647090764,
+      "grad_norm": 1.1496859788894653,
+      "learning_rate": 0.00013247270314290885,
+      "loss": 8.2561,
+      "step": 2440
+    },
+    {
+      "epoch": 0.04452085784091937,
+      "grad_norm": 2.2801716327667236,
+      "learning_rate": 0.0001335585449719491,
+      "loss": 8.2525,
+      "step": 2460
+    },
+    {
+      "epoch": 0.04488281603474799,
+      "grad_norm": 2.1865906715393066,
+      "learning_rate": 0.00013464438680098933,
+      "loss": 8.2548,
+      "step": 2480
+    },
+    {
+      "epoch": 0.0452447742285766,
+      "grad_norm": 1.8173776865005493,
+      "learning_rate": 0.00013573022863002957,
+      "loss": 8.2447,
+      "step": 2500
+    },
+    {
+      "epoch": 0.04560673242240521,
+      "grad_norm": 2.018167018890381,
+      "learning_rate": 0.0001368160704590698,
+      "loss": 8.2116,
+      "step": 2520
+    },
+    {
+      "epoch": 0.04596869061623383,
+      "grad_norm": 2.387749433517456,
+      "learning_rate": 0.00013790191228811003,
+      "loss": 8.2071,
+      "step": 2540
+    },
+    {
+      "epoch": 0.046330648810062436,
+      "grad_norm": 2.1164238452911377,
+      "learning_rate": 0.00013898775411715027,
+      "loss": 8.2173,
+      "step": 2560
+    },
+    {
+      "epoch": 0.04669260700389105,
+      "grad_norm": 2.6271204948425293,
+      "learning_rate": 0.0001400735959461905,
+      "loss": 8.1928,
+      "step": 2580
+    },
+    {
+      "epoch": 0.04705456519771967,
+      "grad_norm": 2.146430730819702,
+      "learning_rate": 0.00014115943777523075,
+      "loss": 8.2093,
+      "step": 2600
+    },
+    {
+      "epoch": 0.047416523391548275,
+      "grad_norm": 1.758144736289978,
+      "learning_rate": 0.000142245279604271,
+      "loss": 8.1709,
+      "step": 2620
+    },
+    {
+      "epoch": 0.04777848158537689,
+      "grad_norm": 1.3466659784317017,
+      "learning_rate": 0.00014333112143331123,
+      "loss": 8.1766,
+      "step": 2640
+    },
+    {
+      "epoch": 0.0481404397792055,
+      "grad_norm": 1.9450665712356567,
+      "learning_rate": 0.00014441696326235144,
+      "loss": 8.1901,
+      "step": 2660
+    },
+    {
+      "epoch": 0.048502397973034114,
+      "grad_norm": 1.6330885887145996,
+      "learning_rate": 0.00014550280509139168,
+      "loss": 8.1911,
+      "step": 2680
+    },
+    {
+      "epoch": 0.04886435616686273,
+      "grad_norm": 1.8187795877456665,
+      "learning_rate": 0.00014658864692043192,
+      "loss": 8.1737,
+      "step": 2700
+    },
+    {
+      "epoch": 0.04922631436069134,
+      "grad_norm": 2.8557980060577393,
+      "learning_rate": 0.00014767448874947216,
+      "loss": 8.1732,
+      "step": 2720
+    },
+    {
+      "epoch": 0.049588272554519953,
+      "grad_norm": 2.0480148792266846,
+      "learning_rate": 0.0001487603305785124,
+      "loss": 8.1636,
+      "step": 2740
+    },
+    {
+      "epoch": 0.04995023074834857,
+      "grad_norm": 1.8413054943084717,
+      "learning_rate": 0.00014984617240755265,
+      "loss": 8.1644,
+      "step": 2760
+    },
+    {
+      "epoch": 0.05031218894217718,
+      "grad_norm": 1.5977145433425903,
+      "learning_rate": 0.00015093201423659289,
+      "loss": 8.1652,
+      "step": 2780
+    },
+    {
+      "epoch": 0.05067414713600579,
+      "grad_norm": 2.060908317565918,
+      "learning_rate": 0.0001520178560656331,
+      "loss": 8.1747,
+      "step": 2800
+    },
+    {
+      "epoch": 0.0510361053298344,
+      "grad_norm": 2.097968339920044,
+      "learning_rate": 0.00015310369789467334,
+      "loss": 8.1595,
+      "step": 2820
+    },
+    {
+      "epoch": 0.051398063523663017,
+      "grad_norm": 2.275170087814331,
+      "learning_rate": 0.00015418953972371358,
+      "loss": 8.1677,
+      "step": 2840
+    },
+    {
+      "epoch": 0.05176002171749163,
+      "grad_norm": 1.372065544128418,
+      "learning_rate": 0.00015527538155275382,
+      "loss": 8.14,
+      "step": 2860
+    },
+    {
+      "epoch": 0.05212197991132024,
+      "grad_norm": 1.472987174987793,
+      "learning_rate": 0.00015636122338179406,
+      "loss": 8.1664,
+      "step": 2880
+    },
+    {
+      "epoch": 0.052483938105148856,
+      "grad_norm": 1.398430347442627,
+      "learning_rate": 0.0001574470652108343,
+      "loss": 8.1435,
+      "step": 2900
+    },
+    {
+      "epoch": 0.05284589629897747,
+      "grad_norm": 2.2276878356933594,
+      "learning_rate": 0.00015853290703987452,
+      "loss": 8.1588,
+      "step": 2920
+    },
+    {
+      "epoch": 0.05320785449280608,
+      "grad_norm": 2.8768556118011475,
+      "learning_rate": 0.00015961874886891476,
+      "loss": 8.15,
+      "step": 2940
+    },
+    {
+      "epoch": 0.053569812686634695,
+      "grad_norm": 2.1943325996398926,
+      "learning_rate": 0.000160704590697955,
+      "loss": 8.1515,
+      "step": 2960
+    },
+    {
+      "epoch": 0.0539317708804633,
+      "grad_norm": 2.0583720207214355,
+      "learning_rate": 0.00016179043252699524,
+      "loss": 8.1343,
+      "step": 2980
+    },
+    {
+      "epoch": 0.05429372907429192,
+      "grad_norm": 2.024512767791748,
+      "learning_rate": 0.00016282198226458345,
+      "loss": 8.1324,
+      "step": 3000
+    },
+    {
+      "epoch": 0.054655687268120534,
+      "grad_norm": 2.0019822120666504,
+      "learning_rate": 0.0001639078240936237,
+      "loss": 8.1268,
+      "step": 3020
+    },
+    {
+      "epoch": 0.05501764546194914,
+      "grad_norm": 2.0442886352539062,
+      "learning_rate": 0.00016499366592266393,
+      "loss": 8.1173,
+      "step": 3040
+    },
+    {
+      "epoch": 0.05537960365577776,
+      "grad_norm": 2.780470371246338,
+      "learning_rate": 0.00016607950775170417,
+      "loss": 8.1345,
+      "step": 3060
+    },
+    {
+      "epoch": 0.05574156184960637,
+      "grad_norm": 2.3489673137664795,
+      "learning_rate": 0.00016716534958074441,
+      "loss": 8.1356,
+      "step": 3080
+    },
+    {
+      "epoch": 0.05610352004343498,
+      "grad_norm": 1.5495760440826416,
+      "learning_rate": 0.00016825119140978465,
+      "loss": 8.1344,
+      "step": 3100
+    },
+    {
+      "epoch": 0.0564654782372636,
+      "grad_norm": 1.9256787300109863,
+      "learning_rate": 0.00016933703323882487,
+      "loss": 8.1298,
+      "step": 3120
+    },
+    {
+      "epoch": 0.056827436431092206,
+      "grad_norm": 2.0177950859069824,
+      "learning_rate": 0.0001704228750678651,
+      "loss": 8.1168,
+      "step": 3140
+    },
+    {
+      "epoch": 0.05718939462492082,
+      "grad_norm": 2.141857147216797,
+      "learning_rate": 0.00017150871689690535,
+      "loss": 8.1286,
+      "step": 3160
+    },
+    {
+      "epoch": 0.057551352818749436,
+      "grad_norm": 2.1056764125823975,
+      "learning_rate": 0.0001725945587259456,
+      "loss": 8.1398,
+      "step": 3180
+    },
+    {
+      "epoch": 0.057913311012578045,
+      "grad_norm": 1.9108024835586548,
+      "learning_rate": 0.00017368040055498583,
+      "loss": 8.1244,
+      "step": 3200
+    },
+    {
+      "epoch": 0.05827526920640666,
+      "grad_norm": 1.7270424365997314,
+      "learning_rate": 0.00017476624238402607,
+      "loss": 8.1102,
+      "step": 3220
+    },
+    {
+      "epoch": 0.058637227400235276,
+      "grad_norm": 1.684164047241211,
+      "learning_rate": 0.00017585208421306629,
+      "loss": 8.1079,
+      "step": 3240
+    },
+    {
+      "epoch": 0.058999185594063884,
+      "grad_norm": 2.553480625152588,
+      "learning_rate": 0.00017688363395065452,
+      "loss": 8.1094,
+      "step": 3260
+    },
+    {
+      "epoch": 0.0593611437878925,
+      "grad_norm": 1.9391483068466187,
+      "learning_rate": 0.00017796947577969477,
+      "loss": 8.1078,
+      "step": 3280
+    },
+    {
+      "epoch": 0.05972310198172111,
+      "grad_norm": 2.539398193359375,
+      "learning_rate": 0.000179055317608735,
+      "loss": 8.1159,
+      "step": 3300
+    },
+    {
+      "epoch": 0.06008506017554972,
+      "grad_norm": 2.3800413608551025,
+      "learning_rate": 0.00018014115943777525,
+      "loss": 8.1043,
+      "step": 3320
+    },
+    {
+      "epoch": 0.06044701836937834,
+      "grad_norm": 2.1216628551483154,
+      "learning_rate": 0.00018122700126681546,
+      "loss": 8.0919,
+      "step": 3340
+    },
+    {
+      "epoch": 0.06080897656320695,
+      "grad_norm": 3.430650472640991,
+      "learning_rate": 0.0001823128430958557,
+      "loss": 8.1078,
+      "step": 3360
+    },
+    {
+      "epoch": 0.06117093475703556,
+      "grad_norm": 2.1858770847320557,
+      "learning_rate": 0.00018339868492489594,
+      "loss": 8.1184,
+      "step": 3380
+    },
+    {
+      "epoch": 0.06153289295086418,
+      "grad_norm": 2.896089553833008,
+      "learning_rate": 0.00018448452675393618,
+      "loss": 8.106,
+      "step": 3400
+    },
+    {
+      "epoch": 0.061894851144692786,
+      "grad_norm": 2.7624003887176514,
+      "learning_rate": 0.00018557036858297642,
+      "loss": 8.1072,
+      "step": 3420
+    },
+    {
+      "epoch": 0.0622568093385214,
+      "grad_norm": 2.464115619659424,
+      "learning_rate": 0.00018665621041201666,
+      "loss": 8.0962,
+      "step": 3440
+    },
+    {
+      "epoch": 0.06261876753235002,
+      "grad_norm": 2.5943491458892822,
+      "learning_rate": 0.00018774205224105688,
+      "loss": 8.0914,
+      "step": 3460
+    },
+    {
+      "epoch": 0.06298072572617863,
+      "grad_norm": 2.0824356079101562,
+      "learning_rate": 0.00018882789407009712,
+      "loss": 8.1222,
+      "step": 3480
+    },
+    {
+      "epoch": 0.06334268392000723,
+      "grad_norm": 2.8781402111053467,
+      "learning_rate": 0.00018991373589913736,
+      "loss": 8.088,
+      "step": 3500
+    },
+    {
+      "epoch": 0.06370464211383585,
+      "grad_norm": 2.0000219345092773,
+      "learning_rate": 0.0001909995777281776,
+      "loss": 8.103,
+      "step": 3520
+    },
+    {
+      "epoch": 0.06406660030766446,
+      "grad_norm": 2.4691524505615234,
+      "learning_rate": 0.00019208541955721784,
+      "loss": 8.1045,
+      "step": 3540
+    },
+    {
+      "epoch": 0.06442855850149308,
+      "grad_norm": 2.583723545074463,
+      "learning_rate": 0.00019317126138625808,
+      "loss": 8.0879,
+      "step": 3560
+    },
+    {
+      "epoch": 0.0647905166953217,
+      "grad_norm": 2.7288269996643066,
+      "learning_rate": 0.0001942571032152983,
+      "loss": 8.091,
+      "step": 3580
+    },
+    {
+      "epoch": 0.0651524748891503,
+      "grad_norm": 2.360276699066162,
+      "learning_rate": 0.00019534294504433854,
+      "loss": 8.0894,
+      "step": 3600
+    },
+    {
+      "epoch": 0.06551443308297891,
+      "grad_norm": 2.6591217517852783,
+      "learning_rate": 0.00019642878687337878,
+      "loss": 8.082,
+      "step": 3620
+    },
+    {
+      "epoch": 0.06587639127680753,
+      "grad_norm": 2.5572097301483154,
+      "learning_rate": 0.00019751462870241902,
+      "loss": 8.1033,
+      "step": 3640
+    },
+    {
+      "epoch": 0.06623834947063614,
+      "grad_norm": 2.643139600753784,
+      "learning_rate": 0.00019860047053145926,
+      "loss": 8.1124,
+      "step": 3660
+    },
+    {
+      "epoch": 0.06660030766446476,
+      "grad_norm": 3.0322484970092773,
+      "learning_rate": 0.0001996863123604995,
+      "loss": 8.1049,
+      "step": 3680
+    },
+    {
+      "epoch": 0.06696226585829337,
+      "grad_norm": 2.7740566730499268,
+      "learning_rate": 0.0002007721541895397,
+      "loss": 8.0876,
+      "step": 3700
+    },
+    {
+      "epoch": 0.06732422405212198,
+      "grad_norm": 2.7816436290740967,
+      "learning_rate": 0.00020185799601857995,
+      "loss": 8.1121,
+      "step": 3720
+    },
+    {
+      "epoch": 0.06768618224595059,
+      "grad_norm": 2.4198217391967773,
+      "learning_rate": 0.0002029438378476202,
+      "loss": 8.0958,
+      "step": 3740
+    },
+    {
+      "epoch": 0.0680481404397792,
+      "grad_norm": 2.8162808418273926,
+      "learning_rate": 0.00020402967967666043,
+      "loss": 8.0897,
+      "step": 3760
+    },
+    {
+      "epoch": 0.06841009863360782,
+      "grad_norm": 2.883631706237793,
+      "learning_rate": 0.00020511552150570067,
+      "loss": 8.085,
+      "step": 3780
+    },
+    {
+      "epoch": 0.06877205682743644,
+      "grad_norm": 2.4725236892700195,
+      "learning_rate": 0.00020620136333474092,
+      "loss": 8.0807,
+      "step": 3800
+    },
+    {
+      "epoch": 0.06913401502126504,
+      "grad_norm": 2.3156867027282715,
+      "learning_rate": 0.00020728720516378116,
+      "loss": 8.0747,
+      "step": 3820
+    },
+    {
+      "epoch": 0.06949597321509365,
+      "grad_norm": 3.308699607849121,
+      "learning_rate": 0.00020837304699282137,
+      "loss": 8.0767,
+      "step": 3840
+    },
+    {
+      "epoch": 0.06985793140892227,
+      "grad_norm": 3.143287181854248,
+      "learning_rate": 0.0002094588888218616,
+      "loss": 8.0831,
+      "step": 3860
+    },
+    {
+      "epoch": 0.07021988960275088,
+      "grad_norm": 3.100562810897827,
+      "learning_rate": 0.00021054473065090185,
+      "loss": 8.0862,
+      "step": 3880
+    },
+    {
+      "epoch": 0.0705818477965795,
+      "grad_norm": 2.48494029045105,
+      "learning_rate": 0.0002116305724799421,
+      "loss": 8.0952,
+      "step": 3900
+    },
+    {
+      "epoch": 0.0709438059904081,
+      "grad_norm": 3.1432759761810303,
+      "learning_rate": 0.00021271641430898233,
+      "loss": 8.0906,
+      "step": 3920
+    },
+    {
+      "epoch": 0.07130576418423672,
+      "grad_norm": 3.330761194229126,
+      "learning_rate": 0.00021380225613802257,
+      "loss": 8.0856,
+      "step": 3940
+    },
+    {
+      "epoch": 0.07166772237806533,
+      "grad_norm": 3.6338093280792236,
+      "learning_rate": 0.00021488809796706279,
+      "loss": 8.0733,
+      "step": 3960
+    },
+    {
+      "epoch": 0.07202968057189395,
+      "grad_norm": 3.014366388320923,
+      "learning_rate": 0.00021597393979610303,
+      "loss": 8.086,
+      "step": 3980
+    },
+    {
+      "epoch": 0.07239163876572256,
+      "grad_norm": 2.774247169494629,
+      "learning_rate": 0.00021705978162514327,
+      "loss": 8.093,
+      "step": 4000
+    },
+    {
+      "epoch": 0.07275359695955118,
+      "grad_norm": 4.0621867179870605,
+      "learning_rate": 0.0002181456234541835,
+      "loss": 8.0801,
+      "step": 4020
+    },
+    {
+      "epoch": 0.07311555515337978,
+      "grad_norm": 3.9556710720062256,
+      "learning_rate": 0.00021923146528322375,
+      "loss": 8.0632,
+      "step": 4040
+    },
+    {
+      "epoch": 0.0734775133472084,
+      "grad_norm": 3.0062179565429688,
+      "learning_rate": 0.000220317307112264,
+      "loss": 8.0713,
+      "step": 4060
+    },
+    {
+      "epoch": 0.07383947154103701,
+      "grad_norm": 3.4333982467651367,
+      "learning_rate": 0.0002214031489413042,
+      "loss": 8.0656,
+      "step": 4080
+    },
+    {
+      "epoch": 0.07420142973486563,
+      "grad_norm": 3.107091188430786,
+      "learning_rate": 0.00022248899077034444,
+      "loss": 8.0832,
+      "step": 4100
+    },
+    {
+      "epoch": 0.07456338792869424,
+      "grad_norm": 3.5279381275177,
+      "learning_rate": 0.00022357483259938468,
+      "loss": 8.0703,
+      "step": 4120
+    },
+    {
+      "epoch": 0.07492534612252284,
+      "grad_norm": 2.9503841400146484,
+      "learning_rate": 0.00022466067442842493,
+      "loss": 8.0796,
+      "step": 4140
+    },
+    {
+      "epoch": 0.07528730431635146,
+      "grad_norm": 3.018066644668579,
+      "learning_rate": 0.00022574651625746517,
+      "loss": 8.07,
+      "step": 4160
+    },
+    {
+      "epoch": 0.07564926251018007,
+      "grad_norm": 3.552546501159668,
+      "learning_rate": 0.0002268323580865054,
+      "loss": 8.1046,
+      "step": 4180
+    },
+    {
+      "epoch": 0.07601122070400869,
+      "grad_norm": 3.881967306137085,
+      "learning_rate": 0.00022791819991554565,
+      "loss": 8.0622,
+      "step": 4200
+    },
+    {
+      "epoch": 0.0763731788978373,
+      "grad_norm": 4.438878536224365,
+      "learning_rate": 0.0002290040417445859,
+      "loss": 8.0783,
+      "step": 4220
+    },
+    {
+      "epoch": 0.0767351370916659,
+      "grad_norm": 3.928950071334839,
+      "learning_rate": 0.00023008988357362613,
+      "loss": 8.0887,
+      "step": 4240
+    },
+    {
+      "epoch": 0.07709709528549452,
+      "grad_norm": 4.781463623046875,
+      "learning_rate": 0.00023117572540266632,
+      "loss": 8.0672,
+      "step": 4260
+    },
+    {
+      "epoch": 0.07745905347932314,
+      "grad_norm": 3.8346338272094727,
+      "learning_rate": 0.00023226156723170656,
+      "loss": 8.0822,
+      "step": 4280
+    },
+    {
+      "epoch": 0.07782101167315175,
+      "grad_norm": 3.835999011993408,
+      "learning_rate": 0.0002333474090607468,
+      "loss": 8.0662,
+      "step": 4300
+    },
+    {
+      "epoch": 0.07818296986698037,
+      "grad_norm": 4.432645320892334,
+      "learning_rate": 0.00023443325088978704,
+      "loss": 8.0816,
+      "step": 4320
+    },
+    {
+      "epoch": 0.07854492806080898,
+      "grad_norm": 3.856933116912842,
+      "learning_rate": 0.00023551909271882728,
+      "loss": 8.055,
+      "step": 4340
+    },
+    {
+      "epoch": 0.07890688625463758,
+      "grad_norm": 4.055251598358154,
+      "learning_rate": 0.00023660493454786752,
+      "loss": 8.0833,
+      "step": 4360
+    },
+    {
+      "epoch": 0.0792688444484662,
+      "grad_norm": 4.129009246826172,
+      "learning_rate": 0.00023769077637690776,
+      "loss": 8.0715,
+      "step": 4380
+    },
+    {
+      "epoch": 0.07963080264229482,
+      "grad_norm": 3.944307565689087,
+      "learning_rate": 0.000238776618205948,
+      "loss": 8.0721,
+      "step": 4400
+    },
+    {
+      "epoch": 0.07999276083612343,
+      "grad_norm": 4.226454257965088,
+      "learning_rate": 0.00023986246003498824,
+      "loss": 8.081,
+      "step": 4420
+    },
+    {
+      "epoch": 0.08035471902995205,
+      "grad_norm": 4.180859088897705,
+      "learning_rate": 0.00024094830186402848,
+      "loss": 8.0674,
+      "step": 4440
+    },
+    {
+      "epoch": 0.08071667722378065,
+      "grad_norm": 4.678595066070557,
+      "learning_rate": 0.0002418712674187127,
+      "loss": 8.6491,
+      "step": 4460
+    },
+    {
+      "epoch": 0.08107863541760926,
+      "grad_norm": 3.916884422302246,
+      "learning_rate": 0.00024295710924775293,
+      "loss": 8.0667,
+      "step": 4480
+    },
+    {
+      "epoch": 0.08144059361143788,
+      "grad_norm": 5.014923572540283,
+      "learning_rate": 0.00024404295107679317,
+      "loss": 8.0709,
+      "step": 4500
+    },
+    {
+      "epoch": 0.0818025518052665,
+      "grad_norm": 5.424384593963623,
+      "learning_rate": 0.00024512879290583333,
+      "loss": 8.0714,
+      "step": 4520
+    },
+    {
+      "epoch": 0.08216450999909511,
+      "grad_norm": 4.7506890296936035,
+      "learning_rate": 0.0002462146347348736,
+      "loss": 8.1164,
+      "step": 4540
+    },
+    {
+      "epoch": 0.08252646819292371,
+      "grad_norm": 4.271077632904053,
+      "learning_rate": 0.0002473004765639138,
+      "loss": 8.0456,
+      "step": 4560
+    },
+    {
+      "epoch": 0.08288842638675233,
+      "grad_norm": 5.108422756195068,
+      "learning_rate": 0.00024838631839295405,
+      "loss": 8.0683,
+      "step": 4580
+    },
+    {
+      "epoch": 0.08325038458058094,
+      "grad_norm": 4.00137186050415,
+      "learning_rate": 0.0002494721602219943,
+      "loss": 8.0877,
+      "step": 4600
+    },
+    {
+      "epoch": 0.08361234277440956,
+      "grad_norm": 5.020883560180664,
+      "learning_rate": 0.00025055800205103454,
+      "loss": 8.0669,
+      "step": 4620
+    },
+    {
+      "epoch": 0.08397430096823817,
+      "grad_norm": 4.774672985076904,
+      "learning_rate": 0.0002516438438800748,
+      "loss": 8.0978,
+      "step": 4640
+    },
+    {
+      "epoch": 0.08433625916206679,
+      "grad_norm": 4.414867877960205,
+      "learning_rate": 0.000252729685709115,
+      "loss": 8.0909,
+      "step": 4660
+    },
+    {
+      "epoch": 0.08469821735589539,
+      "grad_norm": 4.128323554992676,
+      "learning_rate": 0.00025381552753815526,
+      "loss": 8.09,
+      "step": 4680
+    },
+    {
+      "epoch": 0.085060175549724,
+      "grad_norm": 4.486546039581299,
+      "learning_rate": 0.0002549013693671955,
+      "loss": 8.0744,
+      "step": 4700
+    },
+    {
+      "epoch": 0.08542213374355262,
+      "grad_norm": 4.218783855438232,
+      "learning_rate": 0.00025598721119623574,
+      "loss": 8.0902,
+      "step": 4720
+    },
+    {
+      "epoch": 0.08578409193738123,
+      "grad_norm": 4.9139084815979,
+      "learning_rate": 0.000257073053025276,
+      "loss": 8.0726,
+      "step": 4740
+    },
+    {
+      "epoch": 0.08614605013120985,
+      "grad_norm": 4.544093132019043,
+      "learning_rate": 0.0002581588948543162,
+      "loss": 8.0735,
+      "step": 4760
+    },
+    {
+      "epoch": 0.08650800832503845,
+      "grad_norm": 5.45837926864624,
+      "learning_rate": 0.0002592447366833564,
+      "loss": 8.0781,
+      "step": 4780
+    },
+    {
+      "epoch": 0.08686996651886707,
+      "grad_norm": 5.2784423828125,
+      "learning_rate": 0.00026033057851239665,
+      "loss": 8.088,
+      "step": 4800
+    },
+    {
+      "epoch": 0.08723192471269568,
+      "grad_norm": 4.507415294647217,
+      "learning_rate": 0.0002614164203414369,
+      "loss": 8.0707,
+      "step": 4820
+    },
+    {
+      "epoch": 0.0875938829065243,
+      "grad_norm": 4.857511520385742,
+      "learning_rate": 0.00026250226217047713,
+      "loss": 8.0994,
+      "step": 4840
+    },
+    {
+      "epoch": 0.08795584110035291,
+      "grad_norm": 4.420199871063232,
+      "learning_rate": 0.00026358810399951737,
+      "loss": 8.0605,
+      "step": 4860
+    },
+    {
+      "epoch": 0.08831779929418153,
+      "grad_norm": 3.7216994762420654,
+      "learning_rate": 0.0002646739458285576,
+      "loss": 8.1182,
+      "step": 4880
+    },
+    {
+      "epoch": 0.08867975748801013,
+      "grad_norm": 4.462796688079834,
+      "learning_rate": 0.00026575978765759785,
+      "loss": 8.0666,
+      "step": 4900
+    },
+    {
+      "epoch": 0.08904171568183875,
+      "grad_norm": 4.29760217666626,
+      "learning_rate": 0.0002668456294866381,
+      "loss": 8.0548,
+      "step": 4920
+    },
+    {
+      "epoch": 0.08940367387566736,
+      "grad_norm": 5.2155046463012695,
+      "learning_rate": 0.00026793147131567833,
+      "loss": 8.1614,
+      "step": 4940
+    },
+    {
+      "epoch": 0.08976563206949598,
+      "grad_norm": 4.687706470489502,
+      "learning_rate": 0.0002690173131447186,
+      "loss": 8.051,
+      "step": 4960
+    },
+    {
+      "epoch": 0.09012759026332459,
+      "grad_norm": 3.9040534496307373,
+      "learning_rate": 0.0002701031549737588,
+      "loss": 8.0645,
+      "step": 4980
+    },
+    {
+      "epoch": 0.0904895484571532,
+      "grad_norm": 4.2080159187316895,
+      "learning_rate": 0.00027118899680279905,
+      "loss": 8.059,
+      "step": 5000
+    },
+    {
+      "epoch": 0.0904895484571532,
+      "eval_accuracy": 0.10885986383166214,
+      "eval_loss": 8.154509544372559,
+      "eval_runtime": 172.8056,
+      "eval_samples_per_second": 3517.479,
+      "eval_steps_per_second": 3.437,
+      "step": 5000
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 165765,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 5000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7191810539520000.0,
+  "train_batch_size": 512,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f84e7b8139fc15504603acf198a1d0e2de337ab7857597ceb1e053e7d16b4de6
+size 5176