ivanmorals committed 22 days ago

Commit

519470f

verified ·

1 Parent(s): e47bff5

add model_bart

Browse files

Files changed (39) hide show

models/dostoievsky_v1/checkpoint-21500/config.json +71 -0
models/dostoievsky_v1/checkpoint-21500/generation_config.json +16 -0
models/dostoievsky_v1/checkpoint-21500/model.safetensors +3 -0
models/dostoievsky_v1/checkpoint-21500/optimizer.pt +3 -0
models/dostoievsky_v1/checkpoint-21500/rng_state.pth +3 -0
models/dostoievsky_v1/checkpoint-21500/scaler.pt +3 -0
models/dostoievsky_v1/checkpoint-21500/scheduler.pt +3 -0
models/dostoievsky_v1/checkpoint-21500/tokenizer.json +0 -0
models/dostoievsky_v1/checkpoint-21500/tokenizer_config.json +16 -0
models/dostoievsky_v1/checkpoint-21500/trainer_state.json +1547 -0
models/dostoievsky_v1/checkpoint-21500/training_args.bin +3 -0
models/dostoievsky_v1/checkpoint-22000/config.json +71 -0
models/dostoievsky_v1/checkpoint-22000/generation_config.json +16 -0
models/dostoievsky_v1/checkpoint-22000/model.safetensors +3 -0
models/dostoievsky_v1/checkpoint-22000/optimizer.pt +3 -0
models/dostoievsky_v1/checkpoint-22000/rng_state.pth +3 -0
models/dostoievsky_v1/checkpoint-22000/scaler.pt +3 -0
models/dostoievsky_v1/checkpoint-22000/scheduler.pt +3 -0
models/dostoievsky_v1/checkpoint-22000/tokenizer.json +0 -0
models/dostoievsky_v1/checkpoint-22000/tokenizer_config.json +16 -0
models/dostoievsky_v1/checkpoint-22000/trainer_state.json +1582 -0
models/dostoievsky_v1/checkpoint-22000/training_args.bin +3 -0
models/dostoievsky_v1/checkpoint-22016/config.json +71 -0
models/dostoievsky_v1/checkpoint-22016/generation_config.json +16 -0
models/dostoievsky_v1/checkpoint-22016/model.safetensors +3 -0
models/dostoievsky_v1/checkpoint-22016/optimizer.pt +3 -0
models/dostoievsky_v1/checkpoint-22016/rng_state.pth +3 -0
models/dostoievsky_v1/checkpoint-22016/scaler.pt +3 -0
models/dostoievsky_v1/checkpoint-22016/scheduler.pt +3 -0
models/dostoievsky_v1/checkpoint-22016/tokenizer.json +0 -0
models/dostoievsky_v1/checkpoint-22016/tokenizer_config.json +16 -0
models/dostoievsky_v1/checkpoint-22016/trainer_state.json +1582 -0
models/dostoievsky_v1/checkpoint-22016/training_args.bin +3 -0
models/dostoievsky_v1/final_model/config.json +71 -0
models/dostoievsky_v1/final_model/generation_config.json +16 -0
models/dostoievsky_v1/final_model/model.safetensors +3 -0
models/dostoievsky_v1/final_model/tokenizer.json +0 -0
models/dostoievsky_v1/final_model/tokenizer_config.json +16 -0
models/dostoievsky_v1/final_model/training_args.bin +3 -0

models/dostoievsky_v1/checkpoint-21500/config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "activation_dropout": 0.1,
+  "activation_function": "gelu",
+  "add_bias_logits": false,
+  "add_final_layer_norm": false,
+  "architectures": [
+    "BartForConditionalGeneration"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 0,
+  "classif_dropout": 0.1,
+  "classifier_dropout": 0.0,
+  "d_model": 768,
+  "decoder_attention_heads": 12,
+  "decoder_ffn_dim": 3072,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 6,
+  "decoder_start_token_id": 2,
+  "dropout": 0.1,
+  "dtype": "float32",
+  "encoder_attention_heads": 12,
+  "encoder_ffn_dim": 3072,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 6,
+  "eos_token_id": 2,
+  "forced_eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "init_std": 0.02,
+  "is_decoder": false,
+  "is_encoder_decoder": true,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "max_position_embeddings": 1024,
+  "model_type": "bart",
+  "normalize_before": false,
+  "normalize_embedding": true,
+  "pad_token_id": 1,
+  "scale_embedding": false,
+  "task_specific_params": {
+    "summarization": {
+      "length_penalty": 1.0,
+      "max_length": 128,
+      "min_length": 12,
+      "num_beams": 4
+    },
+    "summarization_cnn": {
+      "length_penalty": 2.0,
+      "max_length": 142,
+      "min_length": 56,
+      "num_beams": 4
+    },
+    "summarization_xsum": {
+      "length_penalty": 1.0,
+      "max_length": 62,
+      "min_length": 11,
+      "num_beams": 6
+    }
+  },
+  "tie_word_embeddings": true,
+  "transformers_version": "5.5.0",
+  "use_cache": false,
+  "vocab_size": 50265
+}

models/dostoievsky_v1/checkpoint-21500/generation_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "decoder_start_token_id": 2,
+  "early_stopping": true,
+  "eos_token_id": [
+    2
+  ],
+  "forced_bos_token_id": 0,
+  "forced_eos_token_id": 2,
+  "no_repeat_ngram_size": 3,
+  "num_beams": 4,
+  "pad_token_id": 1,
+  "transformers_version": "5.5.0",
+  "use_cache": true
+}

models/dostoievsky_v1/checkpoint-21500/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:acfdbd1127b03f850a3a4fc3107a7df8a0856325e1ce149a5e6d040f6c708422
+size 557912620

models/dostoievsky_v1/checkpoint-21500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8bef3b9865106bb639656edae5b6df8906c13a65dc86e92c13bba152cb4e25be
+size 1115583947

models/dostoievsky_v1/checkpoint-21500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4d9d9a8088d7f25d4d4e3b3b5cf3b6a1457481bb5f05575ed6491980c3ce339a
+size 14645

models/dostoievsky_v1/checkpoint-21500/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad8834c6e87c2456e1563a68ce36f528eb72a4db9786b554beef824028ea6c9c
+size 1383

models/dostoievsky_v1/checkpoint-21500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9528466ddff0be39dea387c74652e8426c9e6284d21cabd291b3a0b686c01d7b
+size 1465

models/dostoievsky_v1/checkpoint-21500/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

models/dostoievsky_v1/checkpoint-21500/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": "<unk>"
+}

models/dostoievsky_v1/checkpoint-21500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1547 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.953125,
+  "eval_steps": 500,
+  "global_step": 21500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.009084302325581396,
+      "grad_norm": 1.9626818895339966,
+      "learning_rate": 4.9775163517441866e-05,
+      "loss": 3.2140347290039064,
+      "step": 100
+    },
+    {
+      "epoch": 0.018168604651162792,
+      "grad_norm": 1.511451244354248,
+      "learning_rate": 4.9548055959302324e-05,
+      "loss": 0.01261173963546753,
+      "step": 200
+    },
+    {
+      "epoch": 0.027252906976744186,
+      "grad_norm": 0.18194368481636047,
+      "learning_rate": 4.9320948401162795e-05,
+      "loss": 0.009183254837989808,
+      "step": 300
+    },
+    {
+      "epoch": 0.036337209302325583,
+      "grad_norm": 0.27592894434928894,
+      "learning_rate": 4.909384084302326e-05,
+      "loss": 0.0027826914191246034,
+      "step": 400
+    },
+    {
+      "epoch": 0.045421511627906974,
+      "grad_norm": 1.5584100484848022,
+      "learning_rate": 4.8866733284883724e-05,
+      "loss": 0.0029672542214393615,
+      "step": 500
+    },
+    {
+      "epoch": 0.05450581395348837,
+      "grad_norm": 0.02698471024632454,
+      "learning_rate": 4.863962572674419e-05,
+      "loss": 0.004296095669269561,
+      "step": 600
+    },
+    {
+      "epoch": 0.06359011627906977,
+      "grad_norm": 0.137993723154068,
+      "learning_rate": 4.841251816860465e-05,
+      "loss": 0.0031410756707191466,
+      "step": 700
+    },
+    {
+      "epoch": 0.07267441860465117,
+      "grad_norm": 0.07874622195959091,
+      "learning_rate": 4.818541061046512e-05,
+      "loss": 0.003037240505218506,
+      "step": 800
+    },
+    {
+      "epoch": 0.08175872093023256,
+      "grad_norm": 0.013660268858075142,
+      "learning_rate": 4.795830305232558e-05,
+      "loss": 0.0031082597374916076,
+      "step": 900
+    },
+    {
+      "epoch": 0.09084302325581395,
+      "grad_norm": 0.3354911804199219,
+      "learning_rate": 4.7731195494186046e-05,
+      "loss": 0.0048285979032516475,
+      "step": 1000
+    },
+    {
+      "epoch": 0.09992732558139535,
+      "grad_norm": 0.43667030334472656,
+      "learning_rate": 4.750408793604652e-05,
+      "loss": 0.0019270157814025878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.10901162790697674,
+      "grad_norm": 0.03849343582987785,
+      "learning_rate": 4.7276980377906975e-05,
+      "loss": 0.001011388897895813,
+      "step": 1200
+    },
+    {
+      "epoch": 0.11809593023255814,
+      "grad_norm": 0.017985744401812553,
+      "learning_rate": 4.7049872819767446e-05,
+      "loss": 0.0018200889229774476,
+      "step": 1300
+    },
+    {
+      "epoch": 0.12718023255813954,
+      "grad_norm": 0.05241613835096359,
+      "learning_rate": 4.682276526162791e-05,
+      "loss": 0.004675151705741882,
+      "step": 1400
+    },
+    {
+      "epoch": 0.13626453488372092,
+      "grad_norm": 0.16119782626628876,
+      "learning_rate": 4.6595657703488375e-05,
+      "loss": 0.0025834646821022034,
+      "step": 1500
+    },
+    {
+      "epoch": 0.14534883720930233,
+      "grad_norm": 0.8985283374786377,
+      "learning_rate": 4.636855014534884e-05,
+      "loss": 0.001825849711894989,
+      "step": 1600
+    },
+    {
+      "epoch": 0.15443313953488372,
+      "grad_norm": 0.021285999566316605,
+      "learning_rate": 4.6141442587209304e-05,
+      "loss": 0.004164438843727112,
+      "step": 1700
+    },
+    {
+      "epoch": 0.16351744186046513,
+      "grad_norm": 0.01013926975429058,
+      "learning_rate": 4.591433502906977e-05,
+      "loss": 0.002582077383995056,
+      "step": 1800
+    },
+    {
+      "epoch": 0.1726017441860465,
+      "grad_norm": 0.005713903810828924,
+      "learning_rate": 4.568722747093023e-05,
+      "loss": 0.0026018735766410827,
+      "step": 1900
+    },
+    {
+      "epoch": 0.1816860465116279,
+      "grad_norm": 0.019032707437872887,
+      "learning_rate": 4.54601199127907e-05,
+      "loss": 0.0017240011692047118,
+      "step": 2000
+    },
+    {
+      "epoch": 0.1907703488372093,
+      "grad_norm": 0.03191379830241203,
+      "learning_rate": 4.523301235465117e-05,
+      "loss": 0.0022749193012714386,
+      "step": 2100
+    },
+    {
+      "epoch": 0.1998546511627907,
+      "grad_norm": 0.12945719063282013,
+      "learning_rate": 4.5005904796511626e-05,
+      "loss": 0.0017636509239673615,
+      "step": 2200
+    },
+    {
+      "epoch": 0.2089389534883721,
+      "grad_norm": 0.01494428887963295,
+      "learning_rate": 4.47787972383721e-05,
+      "loss": 0.0017163331806659698,
+      "step": 2300
+    },
+    {
+      "epoch": 0.2180232558139535,
+      "grad_norm": 0.08089974522590637,
+      "learning_rate": 4.455168968023256e-05,
+      "loss": 0.0023377402126789092,
+      "step": 2400
+    },
+    {
+      "epoch": 0.22710755813953487,
+      "grad_norm": 0.004136559087783098,
+      "learning_rate": 4.4324582122093026e-05,
+      "loss": 0.001353910267353058,
+      "step": 2500
+    },
+    {
+      "epoch": 0.23619186046511628,
+      "grad_norm": 0.25111478567123413,
+      "learning_rate": 4.409747456395349e-05,
+      "loss": 0.0022939679026603697,
+      "step": 2600
+    },
+    {
+      "epoch": 0.24527616279069767,
+      "grad_norm": 0.963623583316803,
+      "learning_rate": 4.3870367005813955e-05,
+      "loss": 0.0018738456070423125,
+      "step": 2700
+    },
+    {
+      "epoch": 0.2543604651162791,
+      "grad_norm": 0.6530899405479431,
+      "learning_rate": 4.364325944767442e-05,
+      "loss": 0.0015843257308006287,
+      "step": 2800
+    },
+    {
+      "epoch": 0.26344476744186046,
+      "grad_norm": 0.0018010369967669249,
+      "learning_rate": 4.3416151889534884e-05,
+      "loss": 0.0006610354781150817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.27252906976744184,
+      "grad_norm": 0.1644030064344406,
+      "learning_rate": 4.318904433139535e-05,
+      "loss": 0.0031197205185890197,
+      "step": 3000
+    },
+    {
+      "epoch": 0.28161337209302323,
+      "grad_norm": 0.14440134167671204,
+      "learning_rate": 4.296193677325582e-05,
+      "loss": 0.0011694706976413728,
+      "step": 3100
+    },
+    {
+      "epoch": 0.29069767441860467,
+      "grad_norm": 0.0124755734577775,
+      "learning_rate": 4.273482921511628e-05,
+      "loss": 0.0013250903785228729,
+      "step": 3200
+    },
+    {
+      "epoch": 0.29978197674418605,
+      "grad_norm": 0.15910762548446655,
+      "learning_rate": 4.250772165697675e-05,
+      "loss": 0.0007842753082513809,
+      "step": 3300
+    },
+    {
+      "epoch": 0.30886627906976744,
+      "grad_norm": 0.011540662497282028,
+      "learning_rate": 4.228061409883721e-05,
+      "loss": 0.0018773898482322693,
+      "step": 3400
+    },
+    {
+      "epoch": 0.3179505813953488,
+      "grad_norm": 0.005148735363036394,
+      "learning_rate": 4.205350654069768e-05,
+      "loss": 0.001056775525212288,
+      "step": 3500
+    },
+    {
+      "epoch": 0.32703488372093026,
+      "grad_norm": 0.007442768197506666,
+      "learning_rate": 4.182639898255814e-05,
+      "loss": 0.0014348265528678894,
+      "step": 3600
+    },
+    {
+      "epoch": 0.33611918604651164,
+      "grad_norm": 0.00586783979088068,
+      "learning_rate": 4.1599291424418606e-05,
+      "loss": 0.0014780126512050629,
+      "step": 3700
+    },
+    {
+      "epoch": 0.345203488372093,
+      "grad_norm": 0.006909696385264397,
+      "learning_rate": 4.137218386627907e-05,
+      "loss": 0.0019674110412597656,
+      "step": 3800
+    },
+    {
+      "epoch": 0.3542877906976744,
+      "grad_norm": 0.020171863958239555,
+      "learning_rate": 4.1145076308139535e-05,
+      "loss": 0.00249418705701828,
+      "step": 3900
+    },
+    {
+      "epoch": 0.3633720930232558,
+      "grad_norm": 0.031723715364933014,
+      "learning_rate": 4.091796875e-05,
+      "loss": 0.0030515387654304503,
+      "step": 4000
+    },
+    {
+      "epoch": 0.37245639534883723,
+      "grad_norm": 0.30729565024375916,
+      "learning_rate": 4.069086119186047e-05,
+      "loss": 0.0009509802609682083,
+      "step": 4100
+    },
+    {
+      "epoch": 0.3815406976744186,
+      "grad_norm": 0.0021714100148528814,
+      "learning_rate": 4.046375363372093e-05,
+      "loss": 0.0013920699059963226,
+      "step": 4200
+    },
+    {
+      "epoch": 0.390625,
+      "grad_norm": 0.001017636270262301,
+      "learning_rate": 4.02366460755814e-05,
+      "loss": 0.0010310819000005722,
+      "step": 4300
+    },
+    {
+      "epoch": 0.3997093023255814,
+      "grad_norm": 0.0009663284290581942,
+      "learning_rate": 4.0009538517441864e-05,
+      "loss": 0.001098722442984581,
+      "step": 4400
+    },
+    {
+      "epoch": 0.40879360465116277,
+      "grad_norm": 0.003507931251078844,
+      "learning_rate": 3.978243095930232e-05,
+      "loss": 0.0013840131461620331,
+      "step": 4500
+    },
+    {
+      "epoch": 0.4178779069767442,
+      "grad_norm": 0.21988199651241302,
+      "learning_rate": 3.955532340116279e-05,
+      "loss": 0.0017136169970035553,
+      "step": 4600
+    },
+    {
+      "epoch": 0.4269622093023256,
+      "grad_norm": 0.0012972657568752766,
+      "learning_rate": 3.932821584302326e-05,
+      "loss": 0.0008040449023246765,
+      "step": 4700
+    },
+    {
+      "epoch": 0.436046511627907,
+      "grad_norm": 0.033857911825180054,
+      "learning_rate": 3.910110828488372e-05,
+      "loss": 0.0005662231892347336,
+      "step": 4800
+    },
+    {
+      "epoch": 0.44513081395348836,
+      "grad_norm": 0.0008128180052153766,
+      "learning_rate": 3.8874000726744187e-05,
+      "loss": 0.001040520742535591,
+      "step": 4900
+    },
+    {
+      "epoch": 0.45421511627906974,
+      "grad_norm": 0.0017111338675022125,
+      "learning_rate": 3.864689316860465e-05,
+      "loss": 0.0010922805964946748,
+      "step": 5000
+    },
+    {
+      "epoch": 0.4632994186046512,
+      "grad_norm": 0.0013291583163663745,
+      "learning_rate": 3.841978561046512e-05,
+      "loss": 0.0005277743935585021,
+      "step": 5100
+    },
+    {
+      "epoch": 0.47238372093023256,
+      "grad_norm": 0.09846807271242142,
+      "learning_rate": 3.819267805232558e-05,
+      "loss": 0.0010997675359249114,
+      "step": 5200
+    },
+    {
+      "epoch": 0.48146802325581395,
+      "grad_norm": 0.005185174290090799,
+      "learning_rate": 3.796557049418605e-05,
+      "loss": 0.000698111355304718,
+      "step": 5300
+    },
+    {
+      "epoch": 0.49055232558139533,
+      "grad_norm": 0.019086388871073723,
+      "learning_rate": 3.7738462936046515e-05,
+      "loss": 0.0009016367793083191,
+      "step": 5400
+    },
+    {
+      "epoch": 0.49963662790697677,
+      "grad_norm": 0.05122831463813782,
+      "learning_rate": 3.751135537790697e-05,
+      "loss": 0.0015524370968341826,
+      "step": 5500
+    },
+    {
+      "epoch": 0.5087209302325582,
+      "grad_norm": 0.012596211396157742,
+      "learning_rate": 3.7284247819767444e-05,
+      "loss": 0.00220917209982872,
+      "step": 5600
+    },
+    {
+      "epoch": 0.5178052325581395,
+      "grad_norm": 0.009361029602587223,
+      "learning_rate": 3.705714026162791e-05,
+      "loss": 0.0010362663865089417,
+      "step": 5700
+    },
+    {
+      "epoch": 0.5268895348837209,
+      "grad_norm": 0.5205378532409668,
+      "learning_rate": 3.683003270348837e-05,
+      "loss": 0.0022550773620605468,
+      "step": 5800
+    },
+    {
+      "epoch": 0.5359738372093024,
+      "grad_norm": 0.0006470124353654683,
+      "learning_rate": 3.660292514534884e-05,
+      "loss": 0.0011264414340257645,
+      "step": 5900
+    },
+    {
+      "epoch": 0.5450581395348837,
+      "grad_norm": 0.058231666684150696,
+      "learning_rate": 3.63758175872093e-05,
+      "loss": 0.0011266635358333588,
+      "step": 6000
+    },
+    {
+      "epoch": 0.5541424418604651,
+      "grad_norm": 0.0013441125629469752,
+      "learning_rate": 3.614871002906977e-05,
+      "loss": 0.0008001205325126648,
+      "step": 6100
+    },
+    {
+      "epoch": 0.5632267441860465,
+      "grad_norm": 0.0021539030130952597,
+      "learning_rate": 3.592160247093023e-05,
+      "loss": 0.0008849448710680007,
+      "step": 6200
+    },
+    {
+      "epoch": 0.5723110465116279,
+      "grad_norm": 0.9019960761070251,
+      "learning_rate": 3.56944949127907e-05,
+      "loss": 0.0006517694145441055,
+      "step": 6300
+    },
+    {
+      "epoch": 0.5813953488372093,
+      "grad_norm": 0.001283760997466743,
+      "learning_rate": 3.546738735465117e-05,
+      "loss": 0.0021727623045444486,
+      "step": 6400
+    },
+    {
+      "epoch": 0.5904796511627907,
+      "grad_norm": 0.005666423588991165,
+      "learning_rate": 3.5240279796511624e-05,
+      "loss": 0.0012587083876132966,
+      "step": 6500
+    },
+    {
+      "epoch": 0.5995639534883721,
+      "grad_norm": 1.4842944145202637,
+      "learning_rate": 3.5013172238372096e-05,
+      "loss": 0.001180112287402153,
+      "step": 6600
+    },
+    {
+      "epoch": 0.6086482558139535,
+      "grad_norm": 0.02973734401166439,
+      "learning_rate": 3.478606468023256e-05,
+      "loss": 0.0010163599252700805,
+      "step": 6700
+    },
+    {
+      "epoch": 0.6177325581395349,
+      "grad_norm": 0.00399527233093977,
+      "learning_rate": 3.4558957122093024e-05,
+      "loss": 0.0010414445400238038,
+      "step": 6800
+    },
+    {
+      "epoch": 0.6268168604651163,
+      "grad_norm": 0.39672717452049255,
+      "learning_rate": 3.433184956395349e-05,
+      "loss": 0.0012094499170780182,
+      "step": 6900
+    },
+    {
+      "epoch": 0.6359011627906976,
+      "grad_norm": 0.5308993458747864,
+      "learning_rate": 3.410474200581395e-05,
+      "loss": 0.0006021633744239807,
+      "step": 7000
+    },
+    {
+      "epoch": 0.6449854651162791,
+      "grad_norm": 0.003343795659020543,
+      "learning_rate": 3.3877634447674425e-05,
+      "loss": 0.0005116893351078033,
+      "step": 7100
+    },
+    {
+      "epoch": 0.6540697674418605,
+      "grad_norm": 0.0033038391266018152,
+      "learning_rate": 3.365052688953488e-05,
+      "loss": 0.002106922417879105,
+      "step": 7200
+    },
+    {
+      "epoch": 0.6631540697674418,
+      "grad_norm": 0.004004980903118849,
+      "learning_rate": 3.3423419331395353e-05,
+      "loss": 0.0010233993828296661,
+      "step": 7300
+    },
+    {
+      "epoch": 0.6722383720930233,
+      "grad_norm": 0.00490298168733716,
+      "learning_rate": 3.319631177325582e-05,
+      "loss": 0.0007062336057424546,
+      "step": 7400
+    },
+    {
+      "epoch": 0.6813226744186046,
+      "grad_norm": 0.04243200644850731,
+      "learning_rate": 3.2969204215116276e-05,
+      "loss": 0.0005479569733142852,
+      "step": 7500
+    },
+    {
+      "epoch": 0.690406976744186,
+      "grad_norm": 0.008556894026696682,
+      "learning_rate": 3.274209665697675e-05,
+      "loss": 0.0010938134789466858,
+      "step": 7600
+    },
+    {
+      "epoch": 0.6994912790697675,
+      "grad_norm": 0.016938723623752594,
+      "learning_rate": 3.251498909883721e-05,
+      "loss": 0.0009466408193111419,
+      "step": 7700
+    },
+    {
+      "epoch": 0.7085755813953488,
+      "grad_norm": 0.09418574720621109,
+      "learning_rate": 3.2287881540697676e-05,
+      "loss": 0.00145876482129097,
+      "step": 7800
+    },
+    {
+      "epoch": 0.7176598837209303,
+      "grad_norm": 0.002359782112762332,
+      "learning_rate": 3.206077398255814e-05,
+      "loss": 0.0015742655098438263,
+      "step": 7900
+    },
+    {
+      "epoch": 0.7267441860465116,
+      "grad_norm": 0.3374776840209961,
+      "learning_rate": 3.1833666424418605e-05,
+      "loss": 0.0007466593384742737,
+      "step": 8000
+    },
+    {
+      "epoch": 0.735828488372093,
+      "grad_norm": 0.000735403154976666,
+      "learning_rate": 3.1606558866279076e-05,
+      "loss": 0.0008171546459197998,
+      "step": 8100
+    },
+    {
+      "epoch": 0.7449127906976745,
+      "grad_norm": 0.01963644102215767,
+      "learning_rate": 3.1379451308139533e-05,
+      "loss": 0.0007363802939653396,
+      "step": 8200
+    },
+    {
+      "epoch": 0.7539970930232558,
+      "grad_norm": 0.09964141249656677,
+      "learning_rate": 3.1152343750000005e-05,
+      "loss": 0.00030826406553387644,
+      "step": 8300
+    },
+    {
+      "epoch": 0.7630813953488372,
+      "grad_norm": 0.0029934593476355076,
+      "learning_rate": 3.092523619186047e-05,
+      "loss": 0.0003284827247262001,
+      "step": 8400
+    },
+    {
+      "epoch": 0.7721656976744186,
+      "grad_norm": 0.002162993187084794,
+      "learning_rate": 3.069812863372093e-05,
+      "loss": 0.0010288888961076737,
+      "step": 8500
+    },
+    {
+      "epoch": 0.78125,
+      "grad_norm": 0.013634397648274899,
+      "learning_rate": 3.0471021075581398e-05,
+      "loss": 0.0006983913481235504,
+      "step": 8600
+    },
+    {
+      "epoch": 0.7903343023255814,
+      "grad_norm": 0.0833001658320427,
+      "learning_rate": 3.0243913517441862e-05,
+      "loss": 0.0009967025369405746,
+      "step": 8700
+    },
+    {
+      "epoch": 0.7994186046511628,
+      "grad_norm": 0.4697379767894745,
+      "learning_rate": 3.0016805959302323e-05,
+      "loss": 0.0007716407626867294,
+      "step": 8800
+    },
+    {
+      "epoch": 0.8085029069767442,
+      "grad_norm": 0.04588587209582329,
+      "learning_rate": 2.978969840116279e-05,
+      "loss": 0.000263803955167532,
+      "step": 8900
+    },
+    {
+      "epoch": 0.8175872093023255,
+      "grad_norm": 0.003420337103307247,
+      "learning_rate": 2.956259084302326e-05,
+      "loss": 0.0009599439054727555,
+      "step": 9000
+    },
+    {
+      "epoch": 0.826671511627907,
+      "grad_norm": 0.0024436134845018387,
+      "learning_rate": 2.9335483284883724e-05,
+      "loss": 0.012270723581314086,
+      "step": 9100
+    },
+    {
+      "epoch": 0.8357558139534884,
+      "grad_norm": 0.001418731757439673,
+      "learning_rate": 2.9108375726744185e-05,
+      "loss": 0.0009713788330554962,
+      "step": 9200
+    },
+    {
+      "epoch": 0.8448401162790697,
+      "grad_norm": 0.0026375153101980686,
+      "learning_rate": 2.8881268168604652e-05,
+      "loss": 0.0007654589414596558,
+      "step": 9300
+    },
+    {
+      "epoch": 0.8539244186046512,
+      "grad_norm": 0.0013760777655988932,
+      "learning_rate": 2.865416061046512e-05,
+      "loss": 0.0006276721507310867,
+      "step": 9400
+    },
+    {
+      "epoch": 0.8630087209302325,
+      "grad_norm": 0.0033582421019673347,
+      "learning_rate": 2.842705305232558e-05,
+      "loss": 0.0003520375117659569,
+      "step": 9500
+    },
+    {
+      "epoch": 0.872093023255814,
+      "grad_norm": 0.0043012769892811775,
+      "learning_rate": 2.819994549418605e-05,
+      "loss": 0.0011182524263858794,
+      "step": 9600
+    },
+    {
+      "epoch": 0.8811773255813954,
+      "grad_norm": 0.0015025343745946884,
+      "learning_rate": 2.7972837936046514e-05,
+      "loss": 0.00044553544372320173,
+      "step": 9700
+    },
+    {
+      "epoch": 0.8902616279069767,
+      "grad_norm": 0.0018241156358271837,
+      "learning_rate": 2.7745730377906975e-05,
+      "loss": 0.00023134740069508553,
+      "step": 9800
+    },
+    {
+      "epoch": 0.8993459302325582,
+      "grad_norm": 0.001082456554286182,
+      "learning_rate": 2.7518622819767442e-05,
+      "loss": 0.001018296480178833,
+      "step": 9900
+    },
+    {
+      "epoch": 0.9084302325581395,
+      "grad_norm": 0.01155087724328041,
+      "learning_rate": 2.729151526162791e-05,
+      "loss": 0.0005589094385504722,
+      "step": 10000
+    },
+    {
+      "epoch": 0.9175145348837209,
+      "grad_norm": 0.0022482366766780615,
+      "learning_rate": 2.7064407703488375e-05,
+      "loss": 0.000512191392481327,
+      "step": 10100
+    },
+    {
+      "epoch": 0.9265988372093024,
+      "grad_norm": 0.20549768209457397,
+      "learning_rate": 2.6837300145348836e-05,
+      "loss": 0.00038118865340948106,
+      "step": 10200
+    },
+    {
+      "epoch": 0.9356831395348837,
+      "grad_norm": 0.0013188497396185994,
+      "learning_rate": 2.6610192587209304e-05,
+      "loss": 0.0003574254736304283,
+      "step": 10300
+    },
+    {
+      "epoch": 0.9447674418604651,
+      "grad_norm": 0.0006103936466388404,
+      "learning_rate": 2.638308502906977e-05,
+      "loss": 0.00024356411769986154,
+      "step": 10400
+    },
+    {
+      "epoch": 0.9538517441860465,
+      "grad_norm": 0.00047453015577048063,
+      "learning_rate": 2.6155977470930233e-05,
+      "loss": 0.0013259868323802948,
+      "step": 10500
+    },
+    {
+      "epoch": 0.9629360465116279,
+      "grad_norm": 0.025760261341929436,
+      "learning_rate": 2.59288699127907e-05,
+      "loss": 0.0006390263140201569,
+      "step": 10600
+    },
+    {
+      "epoch": 0.9720203488372093,
+      "grad_norm": 0.0025282115675508976,
+      "learning_rate": 2.5701762354651165e-05,
+      "loss": 0.001759422868490219,
+      "step": 10700
+    },
+    {
+      "epoch": 0.9811046511627907,
+      "grad_norm": 2.990682601928711,
+      "learning_rate": 2.5474654796511626e-05,
+      "loss": 0.00041438560932874677,
+      "step": 10800
+    },
+    {
+      "epoch": 0.9901889534883721,
+      "grad_norm": 0.0036354295443743467,
+      "learning_rate": 2.5247547238372094e-05,
+      "loss": 0.000467718243598938,
+      "step": 10900
+    },
+    {
+      "epoch": 0.9992732558139535,
+      "grad_norm": 0.02648981101810932,
+      "learning_rate": 2.502043968023256e-05,
+      "loss": 0.0007214382290840149,
+      "step": 11000
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 8.017317304620519e-05,
+      "eval_runtime": 132.6261,
+      "eval_samples_per_second": 165.993,
+      "eval_steps_per_second": 20.75,
+      "step": 11008
+    },
+    {
+      "epoch": 1.008357558139535,
+      "grad_norm": 0.0011333210859447718,
+      "learning_rate": 2.4793332122093026e-05,
+      "loss": 0.000609114095568657,
+      "step": 11100
+    },
+    {
+      "epoch": 1.0174418604651163,
+      "grad_norm": 0.0006669509457424283,
+      "learning_rate": 2.4566224563953487e-05,
+      "loss": 0.000269341841340065,
+      "step": 11200
+    },
+    {
+      "epoch": 1.0265261627906976,
+      "grad_norm": 0.0006085751811042428,
+      "learning_rate": 2.4339117005813955e-05,
+      "loss": 0.00018008088693022728,
+      "step": 11300
+    },
+    {
+      "epoch": 1.035610465116279,
+      "grad_norm": 0.00029874974279664457,
+      "learning_rate": 2.411200944767442e-05,
+      "loss": 8.143479935824871e-05,
+      "step": 11400
+    },
+    {
+      "epoch": 1.0446947674418605,
+      "grad_norm": 0.0020065035205334425,
+      "learning_rate": 2.3884901889534887e-05,
+      "loss": 0.0004405264183878899,
+      "step": 11500
+    },
+    {
+      "epoch": 1.0537790697674418,
+      "grad_norm": 0.3139957785606384,
+      "learning_rate": 2.3657794331395348e-05,
+      "loss": 0.0007688279449939728,
+      "step": 11600
+    },
+    {
+      "epoch": 1.0628633720930232,
+      "grad_norm": 0.014869201928377151,
+      "learning_rate": 2.3430686773255813e-05,
+      "loss": 0.00045145414769649507,
+      "step": 11700
+    },
+    {
+      "epoch": 1.0719476744186047,
+      "grad_norm": 0.03592238947749138,
+      "learning_rate": 2.320357921511628e-05,
+      "loss": 0.0013660797476768493,
+      "step": 11800
+    },
+    {
+      "epoch": 1.081031976744186,
+      "grad_norm": 0.010270297527313232,
+      "learning_rate": 2.2976471656976745e-05,
+      "loss": 0.0005171676725149155,
+      "step": 11900
+    },
+    {
+      "epoch": 1.0901162790697674,
+      "grad_norm": 0.0026885548140853643,
+      "learning_rate": 2.2749364098837213e-05,
+      "loss": 0.00046057451516389846,
+      "step": 12000
+    },
+    {
+      "epoch": 1.099200581395349,
+      "grad_norm": 0.005681134294718504,
+      "learning_rate": 2.2522256540697674e-05,
+      "loss": 0.0006235280632972718,
+      "step": 12100
+    },
+    {
+      "epoch": 1.1082848837209303,
+      "grad_norm": 0.0028001824393868446,
+      "learning_rate": 2.2295148982558138e-05,
+      "loss": 0.0008198145776987075,
+      "step": 12200
+    },
+    {
+      "epoch": 1.1173691860465116,
+      "grad_norm": 0.0006385694723576307,
+      "learning_rate": 2.2068041424418606e-05,
+      "loss": 0.0003722207620739937,
+      "step": 12300
+    },
+    {
+      "epoch": 1.1264534883720931,
+      "grad_norm": 0.0008649047813378274,
+      "learning_rate": 2.184093386627907e-05,
+      "loss": 0.0003394853696227074,
+      "step": 12400
+    },
+    {
+      "epoch": 1.1355377906976745,
+      "grad_norm": 0.004796006251126528,
+      "learning_rate": 2.1613826308139538e-05,
+      "loss": 0.00042502928525209424,
+      "step": 12500
+    },
+    {
+      "epoch": 1.1446220930232558,
+      "grad_norm": 0.001498043886385858,
+      "learning_rate": 2.138671875e-05,
+      "loss": 0.0009064202010631562,
+      "step": 12600
+    },
+    {
+      "epoch": 1.1537063953488371,
+      "grad_norm": 0.01664622500538826,
+      "learning_rate": 2.1159611191860464e-05,
+      "loss": 0.0002878718450665474,
+      "step": 12700
+    },
+    {
+      "epoch": 1.1627906976744187,
+      "grad_norm": 0.010639426298439503,
+      "learning_rate": 2.093250363372093e-05,
+      "loss": 0.0011860337108373642,
+      "step": 12800
+    },
+    {
+      "epoch": 1.171875,
+      "grad_norm": 0.050644513219594955,
+      "learning_rate": 2.0705396075581396e-05,
+      "loss": 0.0006260576844215393,
+      "step": 12900
+    },
+    {
+      "epoch": 1.1809593023255813,
+      "grad_norm": 0.001398528809659183,
+      "learning_rate": 2.0478288517441864e-05,
+      "loss": 0.0001833665184676647,
+      "step": 13000
+    },
+    {
+      "epoch": 1.1900436046511629,
+      "grad_norm": 0.0057320562191307545,
+      "learning_rate": 2.0251180959302325e-05,
+      "loss": 0.00032943256199359896,
+      "step": 13100
+    },
+    {
+      "epoch": 1.1991279069767442,
+      "grad_norm": 0.0002997924748342484,
+      "learning_rate": 2.002407340116279e-05,
+      "loss": 6.249572150409222e-05,
+      "step": 13200
+    },
+    {
+      "epoch": 1.2082122093023255,
+      "grad_norm": 0.0008130258647724986,
+      "learning_rate": 1.9796965843023257e-05,
+      "loss": 0.00026577839627861977,
+      "step": 13300
+    },
+    {
+      "epoch": 1.2172965116279069,
+      "grad_norm": 0.0006040658336132765,
+      "learning_rate": 1.956985828488372e-05,
+      "loss": 0.00023248378187417984,
+      "step": 13400
+    },
+    {
+      "epoch": 1.2263808139534884,
+      "grad_norm": 0.000803643895778805,
+      "learning_rate": 1.934275072674419e-05,
+      "loss": 0.0002730824239552021,
+      "step": 13500
+    },
+    {
+      "epoch": 1.2354651162790697,
+      "grad_norm": 0.0005482266424223781,
+      "learning_rate": 1.911564316860465e-05,
+      "loss": 0.0005848826467990875,
+      "step": 13600
+    },
+    {
+      "epoch": 1.244549418604651,
+      "grad_norm": 0.0034759771078824997,
+      "learning_rate": 1.8888535610465115e-05,
+      "loss": 0.0005462893471121788,
+      "step": 13700
+    },
+    {
+      "epoch": 1.2536337209302326,
+      "grad_norm": 0.0002226918877568096,
+      "learning_rate": 1.8661428052325583e-05,
+      "loss": 7.945694960653782e-05,
+      "step": 13800
+    },
+    {
+      "epoch": 1.262718023255814,
+      "grad_norm": 0.002374310279265046,
+      "learning_rate": 1.8434320494186047e-05,
+      "loss": 0.000744214728474617,
+      "step": 13900
+    },
+    {
+      "epoch": 1.2718023255813953,
+      "grad_norm": 0.0009076696587726474,
+      "learning_rate": 1.8207212936046515e-05,
+      "loss": 0.0003250580281019211,
+      "step": 14000
+    },
+    {
+      "epoch": 1.2808866279069768,
+      "grad_norm": 0.010181100107729435,
+      "learning_rate": 1.7980105377906976e-05,
+      "loss": 0.0005765938758850097,
+      "step": 14100
+    },
+    {
+      "epoch": 1.2899709302325582,
+      "grad_norm": 0.0012442917795851827,
+      "learning_rate": 1.775299781976744e-05,
+      "loss": 0.00032320961356163024,
+      "step": 14200
+    },
+    {
+      "epoch": 1.2990552325581395,
+      "grad_norm": 0.0005617383285425603,
+      "learning_rate": 1.752589026162791e-05,
+      "loss": 0.0001776321791112423,
+      "step": 14300
+    },
+    {
+      "epoch": 1.308139534883721,
+      "grad_norm": 0.0010064981179311872,
+      "learning_rate": 1.7298782703488373e-05,
+      "loss": 0.00012774170376360415,
+      "step": 14400
+    },
+    {
+      "epoch": 1.3172238372093024,
+      "grad_norm": 0.00039220438338816166,
+      "learning_rate": 1.707167514534884e-05,
+      "loss": 0.0006190959364175796,
+      "step": 14500
+    },
+    {
+      "epoch": 1.3263081395348837,
+      "grad_norm": 0.0009137202869169414,
+      "learning_rate": 1.6844567587209302e-05,
+      "loss": 0.0007051125913858414,
+      "step": 14600
+    },
+    {
+      "epoch": 1.3353924418604652,
+      "grad_norm": 0.0006007241318002343,
+      "learning_rate": 1.6617460029069766e-05,
+      "loss": 7.074063178151846e-05,
+      "step": 14700
+    },
+    {
+      "epoch": 1.3444767441860466,
+      "grad_norm": 0.0006039150175638497,
+      "learning_rate": 1.6390352470930234e-05,
+      "loss": 0.000489293597638607,
+      "step": 14800
+    },
+    {
+      "epoch": 1.353561046511628,
+      "grad_norm": 0.0014955669175833464,
+      "learning_rate": 1.61632449127907e-05,
+      "loss": 0.0004155828058719635,
+      "step": 14900
+    },
+    {
+      "epoch": 1.3626453488372092,
+      "grad_norm": 0.1508745402097702,
+      "learning_rate": 1.5936137354651163e-05,
+      "loss": 0.00024092141538858413,
+      "step": 15000
+    },
+    {
+      "epoch": 1.3717296511627908,
+      "grad_norm": 0.0008113393560051918,
+      "learning_rate": 1.5709029796511627e-05,
+      "loss": 0.00040562458336353304,
+      "step": 15100
+    },
+    {
+      "epoch": 1.380813953488372,
+      "grad_norm": 0.0026459796354174614,
+      "learning_rate": 1.5481922238372092e-05,
+      "loss": 0.0007934534549713135,
+      "step": 15200
+    },
+    {
+      "epoch": 1.3898982558139534,
+      "grad_norm": 0.0005005365237593651,
+      "learning_rate": 1.525481468023256e-05,
+      "loss": 0.00023073634132742883,
+      "step": 15300
+    },
+    {
+      "epoch": 1.3989825581395348,
+      "grad_norm": 0.02727348543703556,
+      "learning_rate": 1.5027707122093024e-05,
+      "loss": 0.0003524136170744896,
+      "step": 15400
+    },
+    {
+      "epoch": 1.4080668604651163,
+      "grad_norm": 0.0002921252744272351,
+      "learning_rate": 1.4800599563953488e-05,
+      "loss": 0.000525415763258934,
+      "step": 15500
+    },
+    {
+      "epoch": 1.4171511627906976,
+      "grad_norm": 0.0016339289722964168,
+      "learning_rate": 1.4573492005813955e-05,
+      "loss": 0.0003766526654362678,
+      "step": 15600
+    },
+    {
+      "epoch": 1.426235465116279,
+      "grad_norm": 0.003485665889456868,
+      "learning_rate": 1.4346384447674419e-05,
+      "loss": 0.00021764757111668586,
+      "step": 15700
+    },
+    {
+      "epoch": 1.4353197674418605,
+      "grad_norm": 0.0011686257785186172,
+      "learning_rate": 1.4119276889534885e-05,
+      "loss": 0.00026009151712059976,
+      "step": 15800
+    },
+    {
+      "epoch": 1.4444040697674418,
+      "grad_norm": 0.00034656753996387124,
+      "learning_rate": 1.389216933139535e-05,
+      "loss": 0.0003694407269358635,
+      "step": 15900
+    },
+    {
+      "epoch": 1.4534883720930232,
+      "grad_norm": 0.0023575718514621258,
+      "learning_rate": 1.3665061773255814e-05,
+      "loss": 0.0001483263447880745,
+      "step": 16000
+    },
+    {
+      "epoch": 1.4625726744186047,
+      "grad_norm": 0.0011886970605701208,
+      "learning_rate": 1.343795421511628e-05,
+      "loss": 0.0004465998336672783,
+      "step": 16100
+    },
+    {
+      "epoch": 1.471656976744186,
+      "grad_norm": 0.00039876584196463227,
+      "learning_rate": 1.3210846656976745e-05,
+      "loss": 0.00015990335494279863,
+      "step": 16200
+    },
+    {
+      "epoch": 1.4807412790697674,
+      "grad_norm": 0.00023756893642712384,
+      "learning_rate": 1.298373909883721e-05,
+      "loss": 5.0160493701696394e-05,
+      "step": 16300
+    },
+    {
+      "epoch": 1.489825581395349,
+      "grad_norm": 0.0002809664292726666,
+      "learning_rate": 1.2756631540697675e-05,
+      "loss": 0.0001377291791141033,
+      "step": 16400
+    },
+    {
+      "epoch": 1.4989098837209303,
+      "grad_norm": 0.00037873705150559545,
+      "learning_rate": 1.252952398255814e-05,
+      "loss": 7.196901366114616e-05,
+      "step": 16500
+    },
+    {
+      "epoch": 1.5079941860465116,
+      "grad_norm": 1.4199703931808472,
+      "learning_rate": 1.2302416424418606e-05,
+      "loss": 0.0005905186012387276,
+      "step": 16600
+    },
+    {
+      "epoch": 1.5170784883720931,
+      "grad_norm": 0.0003422704176045954,
+      "learning_rate": 1.207530886627907e-05,
+      "loss": 0.00013428066857159137,
+      "step": 16700
+    },
+    {
+      "epoch": 1.5261627906976745,
+      "grad_norm": 0.0010882618371397257,
+      "learning_rate": 1.1848201308139535e-05,
+      "loss": 0.00020068021491169929,
+      "step": 16800
+    },
+    {
+      "epoch": 1.5352470930232558,
+      "grad_norm": 0.024530770257115364,
+      "learning_rate": 1.162109375e-05,
+      "loss": 0.0004120354726910591,
+      "step": 16900
+    },
+    {
+      "epoch": 1.5443313953488373,
+      "grad_norm": 0.0006738721276633441,
+      "learning_rate": 1.1393986191860465e-05,
+      "loss": 0.0002767092920839787,
+      "step": 17000
+    },
+    {
+      "epoch": 1.5534156976744184,
+      "grad_norm": 0.00019564498506952077,
+      "learning_rate": 1.1166878633720931e-05,
+      "loss": 0.00021817052736878396,
+      "step": 17100
+    },
+    {
+      "epoch": 1.5625,
+      "grad_norm": 0.0002832361788023263,
+      "learning_rate": 1.0939771075581396e-05,
+      "loss": 5.168498028069734e-05,
+      "step": 17200
+    },
+    {
+      "epoch": 1.5715843023255816,
+      "grad_norm": 0.0005729420809075236,
+      "learning_rate": 1.071266351744186e-05,
+      "loss": 0.00016681572422385215,
+      "step": 17300
+    },
+    {
+      "epoch": 1.5806686046511627,
+      "grad_norm": 0.0007383101619780064,
+      "learning_rate": 1.0485555959302326e-05,
+      "loss": 0.0003441007435321808,
+      "step": 17400
+    },
+    {
+      "epoch": 1.5897529069767442,
+      "grad_norm": 0.005897729191929102,
+      "learning_rate": 1.0258448401162791e-05,
+      "loss": 0.0003161391615867615,
+      "step": 17500
+    },
+    {
+      "epoch": 1.5988372093023255,
+      "grad_norm": 0.00025281202397309244,
+      "learning_rate": 1.0031340843023257e-05,
+      "loss": 0.00011429931037127971,
+      "step": 17600
+    },
+    {
+      "epoch": 1.6079215116279069,
+      "grad_norm": 0.0004013874859083444,
+      "learning_rate": 9.804233284883721e-06,
+      "loss": 2.793062012642622e-05,
+      "step": 17700
+    },
+    {
+      "epoch": 1.6170058139534884,
+      "grad_norm": 0.008875502273440361,
+      "learning_rate": 9.577125726744186e-06,
+      "loss": 5.882895085960627e-05,
+      "step": 17800
+    },
+    {
+      "epoch": 1.6260901162790697,
+      "grad_norm": 0.001823420519940555,
+      "learning_rate": 9.350018168604652e-06,
+      "loss": 0.0002583874017000198,
+      "step": 17900
+    },
+    {
+      "epoch": 1.635174418604651,
+      "grad_norm": 0.6227073073387146,
+      "learning_rate": 9.122910610465116e-06,
+      "loss": 0.0007115737348794937,
+      "step": 18000
+    },
+    {
+      "epoch": 1.6442587209302326,
+      "grad_norm": 0.0017345056403428316,
+      "learning_rate": 8.895803052325581e-06,
+      "loss": 0.00027986690402030944,
+      "step": 18100
+    },
+    {
+      "epoch": 1.653343023255814,
+      "grad_norm": 0.0009033643291331828,
+      "learning_rate": 8.668695494186047e-06,
+      "loss": 0.0002979452162981033,
+      "step": 18200
+    },
+    {
+      "epoch": 1.6624273255813953,
+      "grad_norm": 0.00028923453646712005,
+      "learning_rate": 8.441587936046511e-06,
+      "loss": 1.926603843457997e-05,
+      "step": 18300
+    },
+    {
+      "epoch": 1.6715116279069768,
+      "grad_norm": 0.0006103311898186803,
+      "learning_rate": 8.214480377906978e-06,
+      "loss": 6.331724114716053e-05,
+      "step": 18400
+    },
+    {
+      "epoch": 1.6805959302325582,
+      "grad_norm": 0.0002818437642417848,
+      "learning_rate": 7.987372819767442e-06,
+      "loss": 0.00014105773530900478,
+      "step": 18500
+    },
+    {
+      "epoch": 1.6896802325581395,
+      "grad_norm": 0.0011020454112440348,
+      "learning_rate": 7.760265261627907e-06,
+      "loss": 0.0002292773686349392,
+      "step": 18600
+    },
+    {
+      "epoch": 1.698764534883721,
+      "grad_norm": 0.001500884653069079,
+      "learning_rate": 7.533157703488372e-06,
+      "loss": 0.00011514685116708279,
+      "step": 18700
+    },
+    {
+      "epoch": 1.7078488372093024,
+      "grad_norm": 0.00016000888717826456,
+      "learning_rate": 7.306050145348838e-06,
+      "loss": 2.0523781422525645e-05,
+      "step": 18800
+    },
+    {
+      "epoch": 1.7169331395348837,
+      "grad_norm": 0.003748674876987934,
+      "learning_rate": 7.078942587209303e-06,
+      "loss": 0.00043839264661073686,
+      "step": 18900
+    },
+    {
+      "epoch": 1.7260174418604652,
+      "grad_norm": 0.06772974133491516,
+      "learning_rate": 6.8518350290697685e-06,
+      "loss": 2.126413397490978e-05,
+      "step": 19000
+    },
+    {
+      "epoch": 1.7351017441860463,
+      "grad_norm": 0.0004789210797753185,
+      "learning_rate": 6.624727470930232e-06,
+      "loss": 0.00020929597318172455,
+      "step": 19100
+    },
+    {
+      "epoch": 1.744186046511628,
+      "grad_norm": 7.055519381538033e-05,
+      "learning_rate": 6.397619912790697e-06,
+      "loss": 0.00032072752714157104,
+      "step": 19200
+    },
+    {
+      "epoch": 1.7532703488372094,
+      "grad_norm": 0.00010641128028510138,
+      "learning_rate": 6.1705123546511635e-06,
+      "loss": 8.132393471896649e-05,
+      "step": 19300
+    },
+    {
+      "epoch": 1.7623546511627906,
+      "grad_norm": 0.0005615473492071033,
+      "learning_rate": 5.943404796511629e-06,
+      "loss": 9.162256610579788e-06,
+      "step": 19400
+    },
+    {
+      "epoch": 1.771438953488372,
+      "grad_norm": 0.004505404736846685,
+      "learning_rate": 5.716297238372093e-06,
+      "loss": 9.514865465462207e-05,
+      "step": 19500
+    },
+    {
+      "epoch": 1.7805232558139537,
+      "grad_norm": 0.0011890050955116749,
+      "learning_rate": 5.4891896802325586e-06,
+      "loss": 3.7443286273628474e-05,
+      "step": 19600
+    },
+    {
+      "epoch": 1.7896075581395348,
+      "grad_norm": 0.000441042153397575,
+      "learning_rate": 5.262082122093023e-06,
+      "loss": 5.087008234113455e-05,
+      "step": 19700
+    },
+    {
+      "epoch": 1.7986918604651163,
+      "grad_norm": 0.0001777316356310621,
+      "learning_rate": 5.034974563953489e-06,
+      "loss": 0.00017192648723721505,
+      "step": 19800
+    },
+    {
+      "epoch": 1.8077761627906976,
+      "grad_norm": 0.00015439293929375708,
+      "learning_rate": 4.8078670058139536e-06,
+      "loss": 0.0001058769691735506,
+      "step": 19900
+    },
+    {
+      "epoch": 1.816860465116279,
+      "grad_norm": 0.0016349812503904104,
+      "learning_rate": 4.580759447674419e-06,
+      "loss": 1.0620863176882267e-05,
+      "step": 20000
+    },
+    {
+      "epoch": 1.8259447674418605,
+      "grad_norm": 9.556530858390033e-05,
+      "learning_rate": 4.353651889534884e-06,
+      "loss": 0.00019107908010482789,
+      "step": 20100
+    },
+    {
+      "epoch": 1.8350290697674418,
+      "grad_norm": 0.0002568990457803011,
+      "learning_rate": 4.126544331395349e-06,
+      "loss": 0.00026563439518213273,
+      "step": 20200
+    },
+    {
+      "epoch": 1.8441133720930232,
+      "grad_norm": 0.5719628930091858,
+      "learning_rate": 3.899436773255814e-06,
+      "loss": 5.403281655162573e-05,
+      "step": 20300
+    },
+    {
+      "epoch": 1.8531976744186047,
+      "grad_norm": 0.017383404076099396,
+      "learning_rate": 3.672329215116279e-06,
+      "loss": 0.00036652404814958573,
+      "step": 20400
+    },
+    {
+      "epoch": 1.862281976744186,
+      "grad_norm": 0.00022910887491889298,
+      "learning_rate": 3.4452216569767445e-06,
+      "loss": 0.000403200164437294,
+      "step": 20500
+    },
+    {
+      "epoch": 1.8713662790697674,
+      "grad_norm": 0.00033295468892902136,
+      "learning_rate": 3.2181140988372097e-06,
+      "loss": 8.646129630506039e-05,
+      "step": 20600
+    },
+    {
+      "epoch": 1.880450581395349,
+      "grad_norm": 0.00012523184705059975,
+      "learning_rate": 2.9910065406976746e-06,
+      "loss": 0.00016864996403455733,
+      "step": 20700
+    },
+    {
+      "epoch": 1.8895348837209303,
+      "grad_norm": 0.00022476979938801378,
+      "learning_rate": 2.7638989825581395e-06,
+      "loss": 5.167209077626467e-05,
+      "step": 20800
+    },
+    {
+      "epoch": 1.8986191860465116,
+      "grad_norm": 0.0002389108412899077,
+      "learning_rate": 2.5367914244186048e-06,
+      "loss": 6.591790355741977e-05,
+      "step": 20900
+    },
+    {
+      "epoch": 1.9077034883720931,
+      "grad_norm": 0.0013002109481021762,
+      "learning_rate": 2.30968386627907e-06,
+      "loss": 0.00010993644595146179,
+      "step": 21000
+    },
+    {
+      "epoch": 1.9167877906976745,
+      "grad_norm": 0.13188259303569794,
+      "learning_rate": 2.082576308139535e-06,
+      "loss": 0.00014082306995987892,
+      "step": 21100
+    },
+    {
+      "epoch": 1.9258720930232558,
+      "grad_norm": 0.0010737127158790827,
+      "learning_rate": 1.85546875e-06,
+      "loss": 0.00030819986015558244,
+      "step": 21200
+    },
+    {
+      "epoch": 1.9349563953488373,
+      "grad_norm": 7.047707185847685e-05,
+      "learning_rate": 1.628361191860465e-06,
+      "loss": 2.0567586179822683e-05,
+      "step": 21300
+    },
+    {
+      "epoch": 1.9440406976744184,
+      "grad_norm": 0.0001350079692201689,
+      "learning_rate": 1.4012536337209304e-06,
+      "loss": 7.989832083694636e-06,
+      "step": 21400
+    },
+    {
+      "epoch": 1.953125,
+      "grad_norm": 0.0008243785705417395,
+      "learning_rate": 1.1741460755813954e-06,
+      "loss": 0.0002570002153515816,
+      "step": 21500
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 22016,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.310887654981632e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

models/dostoievsky_v1/checkpoint-21500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:edab2539a81b65c2a76c4c39bed219aa02358f44f3286429f83165442a1b53fa
+size 5329

models/dostoievsky_v1/checkpoint-22000/config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "activation_dropout": 0.1,
+  "activation_function": "gelu",
+  "add_bias_logits": false,
+  "add_final_layer_norm": false,
+  "architectures": [
+    "BartForConditionalGeneration"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 0,
+  "classif_dropout": 0.1,
+  "classifier_dropout": 0.0,
+  "d_model": 768,
+  "decoder_attention_heads": 12,
+  "decoder_ffn_dim": 3072,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 6,
+  "decoder_start_token_id": 2,
+  "dropout": 0.1,
+  "dtype": "float32",
+  "encoder_attention_heads": 12,
+  "encoder_ffn_dim": 3072,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 6,
+  "eos_token_id": 2,
+  "forced_eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "init_std": 0.02,
+  "is_decoder": false,
+  "is_encoder_decoder": true,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "max_position_embeddings": 1024,
+  "model_type": "bart",
+  "normalize_before": false,
+  "normalize_embedding": true,
+  "pad_token_id": 1,
+  "scale_embedding": false,
+  "task_specific_params": {
+    "summarization": {
+      "length_penalty": 1.0,
+      "max_length": 128,
+      "min_length": 12,
+      "num_beams": 4
+    },
+    "summarization_cnn": {
+      "length_penalty": 2.0,
+      "max_length": 142,
+      "min_length": 56,
+      "num_beams": 4
+    },
+    "summarization_xsum": {
+      "length_penalty": 1.0,
+      "max_length": 62,
+      "min_length": 11,
+      "num_beams": 6
+    }
+  },
+  "tie_word_embeddings": true,
+  "transformers_version": "5.5.0",
+  "use_cache": false,
+  "vocab_size": 50265
+}

models/dostoievsky_v1/checkpoint-22000/generation_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "decoder_start_token_id": 2,
+  "early_stopping": true,
+  "eos_token_id": [
+    2
+  ],
+  "forced_bos_token_id": 0,
+  "forced_eos_token_id": 2,
+  "no_repeat_ngram_size": 3,
+  "num_beams": 4,
+  "pad_token_id": 1,
+  "transformers_version": "5.5.0",
+  "use_cache": true
+}

models/dostoievsky_v1/checkpoint-22000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:633c3641654a45feb7257a935fd947b6989e946fa6299b29e37ba5e4b5ad0b0e
+size 557912620

models/dostoievsky_v1/checkpoint-22000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d1e4af5742f873951b83cbc14534ab7498884479c96fcdac27b2628a59510db
+size 1115583947

models/dostoievsky_v1/checkpoint-22000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f7968204ee7cf2f156da4351cf9b12e9e660827a0cbfd71d4a12e060ad798ec
+size 14645

models/dostoievsky_v1/checkpoint-22000/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7f093c3a5ebc5ff143ef105fa917c11d68551c7b60bb71edb210a12bc3eab0e
+size 1383

models/dostoievsky_v1/checkpoint-22000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:09cfa34929b7369dcf65b9d61627476e95b27fd4cd32f7aa9c5a8fa4268f140f
+size 1465

models/dostoievsky_v1/checkpoint-22000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

models/dostoievsky_v1/checkpoint-22000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": "<unk>"
+}

models/dostoievsky_v1/checkpoint-22000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1582 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9985465116279069,
+  "eval_steps": 500,
+  "global_step": 22000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.009084302325581396,
+      "grad_norm": 1.9626818895339966,
+      "learning_rate": 4.9775163517441866e-05,
+      "loss": 3.2140347290039064,
+      "step": 100
+    },
+    {
+      "epoch": 0.018168604651162792,
+      "grad_norm": 1.511451244354248,
+      "learning_rate": 4.9548055959302324e-05,
+      "loss": 0.01261173963546753,
+      "step": 200
+    },
+    {
+      "epoch": 0.027252906976744186,
+      "grad_norm": 0.18194368481636047,
+      "learning_rate": 4.9320948401162795e-05,
+      "loss": 0.009183254837989808,
+      "step": 300
+    },
+    {
+      "epoch": 0.036337209302325583,
+      "grad_norm": 0.27592894434928894,
+      "learning_rate": 4.909384084302326e-05,
+      "loss": 0.0027826914191246034,
+      "step": 400
+    },
+    {
+      "epoch": 0.045421511627906974,
+      "grad_norm": 1.5584100484848022,
+      "learning_rate": 4.8866733284883724e-05,
+      "loss": 0.0029672542214393615,
+      "step": 500
+    },
+    {
+      "epoch": 0.05450581395348837,
+      "grad_norm": 0.02698471024632454,
+      "learning_rate": 4.863962572674419e-05,
+      "loss": 0.004296095669269561,
+      "step": 600
+    },
+    {
+      "epoch": 0.06359011627906977,
+      "grad_norm": 0.137993723154068,
+      "learning_rate": 4.841251816860465e-05,
+      "loss": 0.0031410756707191466,
+      "step": 700
+    },
+    {
+      "epoch": 0.07267441860465117,
+      "grad_norm": 0.07874622195959091,
+      "learning_rate": 4.818541061046512e-05,
+      "loss": 0.003037240505218506,
+      "step": 800
+    },
+    {
+      "epoch": 0.08175872093023256,
+      "grad_norm": 0.013660268858075142,
+      "learning_rate": 4.795830305232558e-05,
+      "loss": 0.0031082597374916076,
+      "step": 900
+    },
+    {
+      "epoch": 0.09084302325581395,
+      "grad_norm": 0.3354911804199219,
+      "learning_rate": 4.7731195494186046e-05,
+      "loss": 0.0048285979032516475,
+      "step": 1000
+    },
+    {
+      "epoch": 0.09992732558139535,
+      "grad_norm": 0.43667030334472656,
+      "learning_rate": 4.750408793604652e-05,
+      "loss": 0.0019270157814025878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.10901162790697674,
+      "grad_norm": 0.03849343582987785,
+      "learning_rate": 4.7276980377906975e-05,
+      "loss": 0.001011388897895813,
+      "step": 1200
+    },
+    {
+      "epoch": 0.11809593023255814,
+      "grad_norm": 0.017985744401812553,
+      "learning_rate": 4.7049872819767446e-05,
+      "loss": 0.0018200889229774476,
+      "step": 1300
+    },
+    {
+      "epoch": 0.12718023255813954,
+      "grad_norm": 0.05241613835096359,
+      "learning_rate": 4.682276526162791e-05,
+      "loss": 0.004675151705741882,
+      "step": 1400
+    },
+    {
+      "epoch": 0.13626453488372092,
+      "grad_norm": 0.16119782626628876,
+      "learning_rate": 4.6595657703488375e-05,
+      "loss": 0.0025834646821022034,
+      "step": 1500
+    },
+    {
+      "epoch": 0.14534883720930233,
+      "grad_norm": 0.8985283374786377,
+      "learning_rate": 4.636855014534884e-05,
+      "loss": 0.001825849711894989,
+      "step": 1600
+    },
+    {
+      "epoch": 0.15443313953488372,
+      "grad_norm": 0.021285999566316605,
+      "learning_rate": 4.6141442587209304e-05,
+      "loss": 0.004164438843727112,
+      "step": 1700
+    },
+    {
+      "epoch": 0.16351744186046513,
+      "grad_norm": 0.01013926975429058,
+      "learning_rate": 4.591433502906977e-05,
+      "loss": 0.002582077383995056,
+      "step": 1800
+    },
+    {
+      "epoch": 0.1726017441860465,
+      "grad_norm": 0.005713903810828924,
+      "learning_rate": 4.568722747093023e-05,
+      "loss": 0.0026018735766410827,
+      "step": 1900
+    },
+    {
+      "epoch": 0.1816860465116279,
+      "grad_norm": 0.019032707437872887,
+      "learning_rate": 4.54601199127907e-05,
+      "loss": 0.0017240011692047118,
+      "step": 2000
+    },
+    {
+      "epoch": 0.1907703488372093,
+      "grad_norm": 0.03191379830241203,
+      "learning_rate": 4.523301235465117e-05,
+      "loss": 0.0022749193012714386,
+      "step": 2100
+    },
+    {
+      "epoch": 0.1998546511627907,
+      "grad_norm": 0.12945719063282013,
+      "learning_rate": 4.5005904796511626e-05,
+      "loss": 0.0017636509239673615,
+      "step": 2200
+    },
+    {
+      "epoch": 0.2089389534883721,
+      "grad_norm": 0.01494428887963295,
+      "learning_rate": 4.47787972383721e-05,
+      "loss": 0.0017163331806659698,
+      "step": 2300
+    },
+    {
+      "epoch": 0.2180232558139535,
+      "grad_norm": 0.08089974522590637,
+      "learning_rate": 4.455168968023256e-05,
+      "loss": 0.0023377402126789092,
+      "step": 2400
+    },
+    {
+      "epoch": 0.22710755813953487,
+      "grad_norm": 0.004136559087783098,
+      "learning_rate": 4.4324582122093026e-05,
+      "loss": 0.001353910267353058,
+      "step": 2500
+    },
+    {
+      "epoch": 0.23619186046511628,
+      "grad_norm": 0.25111478567123413,
+      "learning_rate": 4.409747456395349e-05,
+      "loss": 0.0022939679026603697,
+      "step": 2600
+    },
+    {
+      "epoch": 0.24527616279069767,
+      "grad_norm": 0.963623583316803,
+      "learning_rate": 4.3870367005813955e-05,
+      "loss": 0.0018738456070423125,
+      "step": 2700
+    },
+    {
+      "epoch": 0.2543604651162791,
+      "grad_norm": 0.6530899405479431,
+      "learning_rate": 4.364325944767442e-05,
+      "loss": 0.0015843257308006287,
+      "step": 2800
+    },
+    {
+      "epoch": 0.26344476744186046,
+      "grad_norm": 0.0018010369967669249,
+      "learning_rate": 4.3416151889534884e-05,
+      "loss": 0.0006610354781150817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.27252906976744184,
+      "grad_norm": 0.1644030064344406,
+      "learning_rate": 4.318904433139535e-05,
+      "loss": 0.0031197205185890197,
+      "step": 3000
+    },
+    {
+      "epoch": 0.28161337209302323,
+      "grad_norm": 0.14440134167671204,
+      "learning_rate": 4.296193677325582e-05,
+      "loss": 0.0011694706976413728,
+      "step": 3100
+    },
+    {
+      "epoch": 0.29069767441860467,
+      "grad_norm": 0.0124755734577775,
+      "learning_rate": 4.273482921511628e-05,
+      "loss": 0.0013250903785228729,
+      "step": 3200
+    },
+    {
+      "epoch": 0.29978197674418605,
+      "grad_norm": 0.15910762548446655,
+      "learning_rate": 4.250772165697675e-05,
+      "loss": 0.0007842753082513809,
+      "step": 3300
+    },
+    {
+      "epoch": 0.30886627906976744,
+      "grad_norm": 0.011540662497282028,
+      "learning_rate": 4.228061409883721e-05,
+      "loss": 0.0018773898482322693,
+      "step": 3400
+    },
+    {
+      "epoch": 0.3179505813953488,
+      "grad_norm": 0.005148735363036394,
+      "learning_rate": 4.205350654069768e-05,
+      "loss": 0.001056775525212288,
+      "step": 3500
+    },
+    {
+      "epoch": 0.32703488372093026,
+      "grad_norm": 0.007442768197506666,
+      "learning_rate": 4.182639898255814e-05,
+      "loss": 0.0014348265528678894,
+      "step": 3600
+    },
+    {
+      "epoch": 0.33611918604651164,
+      "grad_norm": 0.00586783979088068,
+      "learning_rate": 4.1599291424418606e-05,
+      "loss": 0.0014780126512050629,
+      "step": 3700
+    },
+    {
+      "epoch": 0.345203488372093,
+      "grad_norm": 0.006909696385264397,
+      "learning_rate": 4.137218386627907e-05,
+      "loss": 0.0019674110412597656,
+      "step": 3800
+    },
+    {
+      "epoch": 0.3542877906976744,
+      "grad_norm": 0.020171863958239555,
+      "learning_rate": 4.1145076308139535e-05,
+      "loss": 0.00249418705701828,
+      "step": 3900
+    },
+    {
+      "epoch": 0.3633720930232558,
+      "grad_norm": 0.031723715364933014,
+      "learning_rate": 4.091796875e-05,
+      "loss": 0.0030515387654304503,
+      "step": 4000
+    },
+    {
+      "epoch": 0.37245639534883723,
+      "grad_norm": 0.30729565024375916,
+      "learning_rate": 4.069086119186047e-05,
+      "loss": 0.0009509802609682083,
+      "step": 4100
+    },
+    {
+      "epoch": 0.3815406976744186,
+      "grad_norm": 0.0021714100148528814,
+      "learning_rate": 4.046375363372093e-05,
+      "loss": 0.0013920699059963226,
+      "step": 4200
+    },
+    {
+      "epoch": 0.390625,
+      "grad_norm": 0.001017636270262301,
+      "learning_rate": 4.02366460755814e-05,
+      "loss": 0.0010310819000005722,
+      "step": 4300
+    },
+    {
+      "epoch": 0.3997093023255814,
+      "grad_norm": 0.0009663284290581942,
+      "learning_rate": 4.0009538517441864e-05,
+      "loss": 0.001098722442984581,
+      "step": 4400
+    },
+    {
+      "epoch": 0.40879360465116277,
+      "grad_norm": 0.003507931251078844,
+      "learning_rate": 3.978243095930232e-05,
+      "loss": 0.0013840131461620331,
+      "step": 4500
+    },
+    {
+      "epoch": 0.4178779069767442,
+      "grad_norm": 0.21988199651241302,
+      "learning_rate": 3.955532340116279e-05,
+      "loss": 0.0017136169970035553,
+      "step": 4600
+    },
+    {
+      "epoch": 0.4269622093023256,
+      "grad_norm": 0.0012972657568752766,
+      "learning_rate": 3.932821584302326e-05,
+      "loss": 0.0008040449023246765,
+      "step": 4700
+    },
+    {
+      "epoch": 0.436046511627907,
+      "grad_norm": 0.033857911825180054,
+      "learning_rate": 3.910110828488372e-05,
+      "loss": 0.0005662231892347336,
+      "step": 4800
+    },
+    {
+      "epoch": 0.44513081395348836,
+      "grad_norm": 0.0008128180052153766,
+      "learning_rate": 3.8874000726744187e-05,
+      "loss": 0.001040520742535591,
+      "step": 4900
+    },
+    {
+      "epoch": 0.45421511627906974,
+      "grad_norm": 0.0017111338675022125,
+      "learning_rate": 3.864689316860465e-05,
+      "loss": 0.0010922805964946748,
+      "step": 5000
+    },
+    {
+      "epoch": 0.4632994186046512,
+      "grad_norm": 0.0013291583163663745,
+      "learning_rate": 3.841978561046512e-05,
+      "loss": 0.0005277743935585021,
+      "step": 5100
+    },
+    {
+      "epoch": 0.47238372093023256,
+      "grad_norm": 0.09846807271242142,
+      "learning_rate": 3.819267805232558e-05,
+      "loss": 0.0010997675359249114,
+      "step": 5200
+    },
+    {
+      "epoch": 0.48146802325581395,
+      "grad_norm": 0.005185174290090799,
+      "learning_rate": 3.796557049418605e-05,
+      "loss": 0.000698111355304718,
+      "step": 5300
+    },
+    {
+      "epoch": 0.49055232558139533,
+      "grad_norm": 0.019086388871073723,
+      "learning_rate": 3.7738462936046515e-05,
+      "loss": 0.0009016367793083191,
+      "step": 5400
+    },
+    {
+      "epoch": 0.49963662790697677,
+      "grad_norm": 0.05122831463813782,
+      "learning_rate": 3.751135537790697e-05,
+      "loss": 0.0015524370968341826,
+      "step": 5500
+    },
+    {
+      "epoch": 0.5087209302325582,
+      "grad_norm": 0.012596211396157742,
+      "learning_rate": 3.7284247819767444e-05,
+      "loss": 0.00220917209982872,
+      "step": 5600
+    },
+    {
+      "epoch": 0.5178052325581395,
+      "grad_norm": 0.009361029602587223,
+      "learning_rate": 3.705714026162791e-05,
+      "loss": 0.0010362663865089417,
+      "step": 5700
+    },
+    {
+      "epoch": 0.5268895348837209,
+      "grad_norm": 0.5205378532409668,
+      "learning_rate": 3.683003270348837e-05,
+      "loss": 0.0022550773620605468,
+      "step": 5800
+    },
+    {
+      "epoch": 0.5359738372093024,
+      "grad_norm": 0.0006470124353654683,
+      "learning_rate": 3.660292514534884e-05,
+      "loss": 0.0011264414340257645,
+      "step": 5900
+    },
+    {
+      "epoch": 0.5450581395348837,
+      "grad_norm": 0.058231666684150696,
+      "learning_rate": 3.63758175872093e-05,
+      "loss": 0.0011266635358333588,
+      "step": 6000
+    },
+    {
+      "epoch": 0.5541424418604651,
+      "grad_norm": 0.0013441125629469752,
+      "learning_rate": 3.614871002906977e-05,
+      "loss": 0.0008001205325126648,
+      "step": 6100
+    },
+    {
+      "epoch": 0.5632267441860465,
+      "grad_norm": 0.0021539030130952597,
+      "learning_rate": 3.592160247093023e-05,
+      "loss": 0.0008849448710680007,
+      "step": 6200
+    },
+    {
+      "epoch": 0.5723110465116279,
+      "grad_norm": 0.9019960761070251,
+      "learning_rate": 3.56944949127907e-05,
+      "loss": 0.0006517694145441055,
+      "step": 6300
+    },
+    {
+      "epoch": 0.5813953488372093,
+      "grad_norm": 0.001283760997466743,
+      "learning_rate": 3.546738735465117e-05,
+      "loss": 0.0021727623045444486,
+      "step": 6400
+    },
+    {
+      "epoch": 0.5904796511627907,
+      "grad_norm": 0.005666423588991165,
+      "learning_rate": 3.5240279796511624e-05,
+      "loss": 0.0012587083876132966,
+      "step": 6500
+    },
+    {
+      "epoch": 0.5995639534883721,
+      "grad_norm": 1.4842944145202637,
+      "learning_rate": 3.5013172238372096e-05,
+      "loss": 0.001180112287402153,
+      "step": 6600
+    },
+    {
+      "epoch": 0.6086482558139535,
+      "grad_norm": 0.02973734401166439,
+      "learning_rate": 3.478606468023256e-05,
+      "loss": 0.0010163599252700805,
+      "step": 6700
+    },
+    {
+      "epoch": 0.6177325581395349,
+      "grad_norm": 0.00399527233093977,
+      "learning_rate": 3.4558957122093024e-05,
+      "loss": 0.0010414445400238038,
+      "step": 6800
+    },
+    {
+      "epoch": 0.6268168604651163,
+      "grad_norm": 0.39672717452049255,
+      "learning_rate": 3.433184956395349e-05,
+      "loss": 0.0012094499170780182,
+      "step": 6900
+    },
+    {
+      "epoch": 0.6359011627906976,
+      "grad_norm": 0.5308993458747864,
+      "learning_rate": 3.410474200581395e-05,
+      "loss": 0.0006021633744239807,
+      "step": 7000
+    },
+    {
+      "epoch": 0.6449854651162791,
+      "grad_norm": 0.003343795659020543,
+      "learning_rate": 3.3877634447674425e-05,
+      "loss": 0.0005116893351078033,
+      "step": 7100
+    },
+    {
+      "epoch": 0.6540697674418605,
+      "grad_norm": 0.0033038391266018152,
+      "learning_rate": 3.365052688953488e-05,
+      "loss": 0.002106922417879105,
+      "step": 7200
+    },
+    {
+      "epoch": 0.6631540697674418,
+      "grad_norm": 0.004004980903118849,
+      "learning_rate": 3.3423419331395353e-05,
+      "loss": 0.0010233993828296661,
+      "step": 7300
+    },
+    {
+      "epoch": 0.6722383720930233,
+      "grad_norm": 0.00490298168733716,
+      "learning_rate": 3.319631177325582e-05,
+      "loss": 0.0007062336057424546,
+      "step": 7400
+    },
+    {
+      "epoch": 0.6813226744186046,
+      "grad_norm": 0.04243200644850731,
+      "learning_rate": 3.2969204215116276e-05,
+      "loss": 0.0005479569733142852,
+      "step": 7500
+    },
+    {
+      "epoch": 0.690406976744186,
+      "grad_norm": 0.008556894026696682,
+      "learning_rate": 3.274209665697675e-05,
+      "loss": 0.0010938134789466858,
+      "step": 7600
+    },
+    {
+      "epoch": 0.6994912790697675,
+      "grad_norm": 0.016938723623752594,
+      "learning_rate": 3.251498909883721e-05,
+      "loss": 0.0009466408193111419,
+      "step": 7700
+    },
+    {
+      "epoch": 0.7085755813953488,
+      "grad_norm": 0.09418574720621109,
+      "learning_rate": 3.2287881540697676e-05,
+      "loss": 0.00145876482129097,
+      "step": 7800
+    },
+    {
+      "epoch": 0.7176598837209303,
+      "grad_norm": 0.002359782112762332,
+      "learning_rate": 3.206077398255814e-05,
+      "loss": 0.0015742655098438263,
+      "step": 7900
+    },
+    {
+      "epoch": 0.7267441860465116,
+      "grad_norm": 0.3374776840209961,
+      "learning_rate": 3.1833666424418605e-05,
+      "loss": 0.0007466593384742737,
+      "step": 8000
+    },
+    {
+      "epoch": 0.735828488372093,
+      "grad_norm": 0.000735403154976666,
+      "learning_rate": 3.1606558866279076e-05,
+      "loss": 0.0008171546459197998,
+      "step": 8100
+    },
+    {
+      "epoch": 0.7449127906976745,
+      "grad_norm": 0.01963644102215767,
+      "learning_rate": 3.1379451308139533e-05,
+      "loss": 0.0007363802939653396,
+      "step": 8200
+    },
+    {
+      "epoch": 0.7539970930232558,
+      "grad_norm": 0.09964141249656677,
+      "learning_rate": 3.1152343750000005e-05,
+      "loss": 0.00030826406553387644,
+      "step": 8300
+    },
+    {
+      "epoch": 0.7630813953488372,
+      "grad_norm": 0.0029934593476355076,
+      "learning_rate": 3.092523619186047e-05,
+      "loss": 0.0003284827247262001,
+      "step": 8400
+    },
+    {
+      "epoch": 0.7721656976744186,
+      "grad_norm": 0.002162993187084794,
+      "learning_rate": 3.069812863372093e-05,
+      "loss": 0.0010288888961076737,
+      "step": 8500
+    },
+    {
+      "epoch": 0.78125,
+      "grad_norm": 0.013634397648274899,
+      "learning_rate": 3.0471021075581398e-05,
+      "loss": 0.0006983913481235504,
+      "step": 8600
+    },
+    {
+      "epoch": 0.7903343023255814,
+      "grad_norm": 0.0833001658320427,
+      "learning_rate": 3.0243913517441862e-05,
+      "loss": 0.0009967025369405746,
+      "step": 8700
+    },
+    {
+      "epoch": 0.7994186046511628,
+      "grad_norm": 0.4697379767894745,
+      "learning_rate": 3.0016805959302323e-05,
+      "loss": 0.0007716407626867294,
+      "step": 8800
+    },
+    {
+      "epoch": 0.8085029069767442,
+      "grad_norm": 0.04588587209582329,
+      "learning_rate": 2.978969840116279e-05,
+      "loss": 0.000263803955167532,
+      "step": 8900
+    },
+    {
+      "epoch": 0.8175872093023255,
+      "grad_norm": 0.003420337103307247,
+      "learning_rate": 2.956259084302326e-05,
+      "loss": 0.0009599439054727555,
+      "step": 9000
+    },
+    {
+      "epoch": 0.826671511627907,
+      "grad_norm": 0.0024436134845018387,
+      "learning_rate": 2.9335483284883724e-05,
+      "loss": 0.012270723581314086,
+      "step": 9100
+    },
+    {
+      "epoch": 0.8357558139534884,
+      "grad_norm": 0.001418731757439673,
+      "learning_rate": 2.9108375726744185e-05,
+      "loss": 0.0009713788330554962,
+      "step": 9200
+    },
+    {
+      "epoch": 0.8448401162790697,
+      "grad_norm": 0.0026375153101980686,
+      "learning_rate": 2.8881268168604652e-05,
+      "loss": 0.0007654589414596558,
+      "step": 9300
+    },
+    {
+      "epoch": 0.8539244186046512,
+      "grad_norm": 0.0013760777655988932,
+      "learning_rate": 2.865416061046512e-05,
+      "loss": 0.0006276721507310867,
+      "step": 9400
+    },
+    {
+      "epoch": 0.8630087209302325,
+      "grad_norm": 0.0033582421019673347,
+      "learning_rate": 2.842705305232558e-05,
+      "loss": 0.0003520375117659569,
+      "step": 9500
+    },
+    {
+      "epoch": 0.872093023255814,
+      "grad_norm": 0.0043012769892811775,
+      "learning_rate": 2.819994549418605e-05,
+      "loss": 0.0011182524263858794,
+      "step": 9600
+    },
+    {
+      "epoch": 0.8811773255813954,
+      "grad_norm": 0.0015025343745946884,
+      "learning_rate": 2.7972837936046514e-05,
+      "loss": 0.00044553544372320173,
+      "step": 9700
+    },
+    {
+      "epoch": 0.8902616279069767,
+      "grad_norm": 0.0018241156358271837,
+      "learning_rate": 2.7745730377906975e-05,
+      "loss": 0.00023134740069508553,
+      "step": 9800
+    },
+    {
+      "epoch": 0.8993459302325582,
+      "grad_norm": 0.001082456554286182,
+      "learning_rate": 2.7518622819767442e-05,
+      "loss": 0.001018296480178833,
+      "step": 9900
+    },
+    {
+      "epoch": 0.9084302325581395,
+      "grad_norm": 0.01155087724328041,
+      "learning_rate": 2.729151526162791e-05,
+      "loss": 0.0005589094385504722,
+      "step": 10000
+    },
+    {
+      "epoch": 0.9175145348837209,
+      "grad_norm": 0.0022482366766780615,
+      "learning_rate": 2.7064407703488375e-05,
+      "loss": 0.000512191392481327,
+      "step": 10100
+    },
+    {
+      "epoch": 0.9265988372093024,
+      "grad_norm": 0.20549768209457397,
+      "learning_rate": 2.6837300145348836e-05,
+      "loss": 0.00038118865340948106,
+      "step": 10200
+    },
+    {
+      "epoch": 0.9356831395348837,
+      "grad_norm": 0.0013188497396185994,
+      "learning_rate": 2.6610192587209304e-05,
+      "loss": 0.0003574254736304283,
+      "step": 10300
+    },
+    {
+      "epoch": 0.9447674418604651,
+      "grad_norm": 0.0006103936466388404,
+      "learning_rate": 2.638308502906977e-05,
+      "loss": 0.00024356411769986154,
+      "step": 10400
+    },
+    {
+      "epoch": 0.9538517441860465,
+      "grad_norm": 0.00047453015577048063,
+      "learning_rate": 2.6155977470930233e-05,
+      "loss": 0.0013259868323802948,
+      "step": 10500
+    },
+    {
+      "epoch": 0.9629360465116279,
+      "grad_norm": 0.025760261341929436,
+      "learning_rate": 2.59288699127907e-05,
+      "loss": 0.0006390263140201569,
+      "step": 10600
+    },
+    {
+      "epoch": 0.9720203488372093,
+      "grad_norm": 0.0025282115675508976,
+      "learning_rate": 2.5701762354651165e-05,
+      "loss": 0.001759422868490219,
+      "step": 10700
+    },
+    {
+      "epoch": 0.9811046511627907,
+      "grad_norm": 2.990682601928711,
+      "learning_rate": 2.5474654796511626e-05,
+      "loss": 0.00041438560932874677,
+      "step": 10800
+    },
+    {
+      "epoch": 0.9901889534883721,
+      "grad_norm": 0.0036354295443743467,
+      "learning_rate": 2.5247547238372094e-05,
+      "loss": 0.000467718243598938,
+      "step": 10900
+    },
+    {
+      "epoch": 0.9992732558139535,
+      "grad_norm": 0.02648981101810932,
+      "learning_rate": 2.502043968023256e-05,
+      "loss": 0.0007214382290840149,
+      "step": 11000
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 8.017317304620519e-05,
+      "eval_runtime": 132.6261,
+      "eval_samples_per_second": 165.993,
+      "eval_steps_per_second": 20.75,
+      "step": 11008
+    },
+    {
+      "epoch": 1.008357558139535,
+      "grad_norm": 0.0011333210859447718,
+      "learning_rate": 2.4793332122093026e-05,
+      "loss": 0.000609114095568657,
+      "step": 11100
+    },
+    {
+      "epoch": 1.0174418604651163,
+      "grad_norm": 0.0006669509457424283,
+      "learning_rate": 2.4566224563953487e-05,
+      "loss": 0.000269341841340065,
+      "step": 11200
+    },
+    {
+      "epoch": 1.0265261627906976,
+      "grad_norm": 0.0006085751811042428,
+      "learning_rate": 2.4339117005813955e-05,
+      "loss": 0.00018008088693022728,
+      "step": 11300
+    },
+    {
+      "epoch": 1.035610465116279,
+      "grad_norm": 0.00029874974279664457,
+      "learning_rate": 2.411200944767442e-05,
+      "loss": 8.143479935824871e-05,
+      "step": 11400
+    },
+    {
+      "epoch": 1.0446947674418605,
+      "grad_norm": 0.0020065035205334425,
+      "learning_rate": 2.3884901889534887e-05,
+      "loss": 0.0004405264183878899,
+      "step": 11500
+    },
+    {
+      "epoch": 1.0537790697674418,
+      "grad_norm": 0.3139957785606384,
+      "learning_rate": 2.3657794331395348e-05,
+      "loss": 0.0007688279449939728,
+      "step": 11600
+    },
+    {
+      "epoch": 1.0628633720930232,
+      "grad_norm": 0.014869201928377151,
+      "learning_rate": 2.3430686773255813e-05,
+      "loss": 0.00045145414769649507,
+      "step": 11700
+    },
+    {
+      "epoch": 1.0719476744186047,
+      "grad_norm": 0.03592238947749138,
+      "learning_rate": 2.320357921511628e-05,
+      "loss": 0.0013660797476768493,
+      "step": 11800
+    },
+    {
+      "epoch": 1.081031976744186,
+      "grad_norm": 0.010270297527313232,
+      "learning_rate": 2.2976471656976745e-05,
+      "loss": 0.0005171676725149155,
+      "step": 11900
+    },
+    {
+      "epoch": 1.0901162790697674,
+      "grad_norm": 0.0026885548140853643,
+      "learning_rate": 2.2749364098837213e-05,
+      "loss": 0.00046057451516389846,
+      "step": 12000
+    },
+    {
+      "epoch": 1.099200581395349,
+      "grad_norm": 0.005681134294718504,
+      "learning_rate": 2.2522256540697674e-05,
+      "loss": 0.0006235280632972718,
+      "step": 12100
+    },
+    {
+      "epoch": 1.1082848837209303,
+      "grad_norm": 0.0028001824393868446,
+      "learning_rate": 2.2295148982558138e-05,
+      "loss": 0.0008198145776987075,
+      "step": 12200
+    },
+    {
+      "epoch": 1.1173691860465116,
+      "grad_norm": 0.0006385694723576307,
+      "learning_rate": 2.2068041424418606e-05,
+      "loss": 0.0003722207620739937,
+      "step": 12300
+    },
+    {
+      "epoch": 1.1264534883720931,
+      "grad_norm": 0.0008649047813378274,
+      "learning_rate": 2.184093386627907e-05,
+      "loss": 0.0003394853696227074,
+      "step": 12400
+    },
+    {
+      "epoch": 1.1355377906976745,
+      "grad_norm": 0.004796006251126528,
+      "learning_rate": 2.1613826308139538e-05,
+      "loss": 0.00042502928525209424,
+      "step": 12500
+    },
+    {
+      "epoch": 1.1446220930232558,
+      "grad_norm": 0.001498043886385858,
+      "learning_rate": 2.138671875e-05,
+      "loss": 0.0009064202010631562,
+      "step": 12600
+    },
+    {
+      "epoch": 1.1537063953488371,
+      "grad_norm": 0.01664622500538826,
+      "learning_rate": 2.1159611191860464e-05,
+      "loss": 0.0002878718450665474,
+      "step": 12700
+    },
+    {
+      "epoch": 1.1627906976744187,
+      "grad_norm": 0.010639426298439503,
+      "learning_rate": 2.093250363372093e-05,
+      "loss": 0.0011860337108373642,
+      "step": 12800
+    },
+    {
+      "epoch": 1.171875,
+      "grad_norm": 0.050644513219594955,
+      "learning_rate": 2.0705396075581396e-05,
+      "loss": 0.0006260576844215393,
+      "step": 12900
+    },
+    {
+      "epoch": 1.1809593023255813,
+      "grad_norm": 0.001398528809659183,
+      "learning_rate": 2.0478288517441864e-05,
+      "loss": 0.0001833665184676647,
+      "step": 13000
+    },
+    {
+      "epoch": 1.1900436046511629,
+      "grad_norm": 0.0057320562191307545,
+      "learning_rate": 2.0251180959302325e-05,
+      "loss": 0.00032943256199359896,
+      "step": 13100
+    },
+    {
+      "epoch": 1.1991279069767442,
+      "grad_norm": 0.0002997924748342484,
+      "learning_rate": 2.002407340116279e-05,
+      "loss": 6.249572150409222e-05,
+      "step": 13200
+    },
+    {
+      "epoch": 1.2082122093023255,
+      "grad_norm": 0.0008130258647724986,
+      "learning_rate": 1.9796965843023257e-05,
+      "loss": 0.00026577839627861977,
+      "step": 13300
+    },
+    {
+      "epoch": 1.2172965116279069,
+      "grad_norm": 0.0006040658336132765,
+      "learning_rate": 1.956985828488372e-05,
+      "loss": 0.00023248378187417984,
+      "step": 13400
+    },
+    {
+      "epoch": 1.2263808139534884,
+      "grad_norm": 0.000803643895778805,
+      "learning_rate": 1.934275072674419e-05,
+      "loss": 0.0002730824239552021,
+      "step": 13500
+    },
+    {
+      "epoch": 1.2354651162790697,
+      "grad_norm": 0.0005482266424223781,
+      "learning_rate": 1.911564316860465e-05,
+      "loss": 0.0005848826467990875,
+      "step": 13600
+    },
+    {
+      "epoch": 1.244549418604651,
+      "grad_norm": 0.0034759771078824997,
+      "learning_rate": 1.8888535610465115e-05,
+      "loss": 0.0005462893471121788,
+      "step": 13700
+    },
+    {
+      "epoch": 1.2536337209302326,
+      "grad_norm": 0.0002226918877568096,
+      "learning_rate": 1.8661428052325583e-05,
+      "loss": 7.945694960653782e-05,
+      "step": 13800
+    },
+    {
+      "epoch": 1.262718023255814,
+      "grad_norm": 0.002374310279265046,
+      "learning_rate": 1.8434320494186047e-05,
+      "loss": 0.000744214728474617,
+      "step": 13900
+    },
+    {
+      "epoch": 1.2718023255813953,
+      "grad_norm": 0.0009076696587726474,
+      "learning_rate": 1.8207212936046515e-05,
+      "loss": 0.0003250580281019211,
+      "step": 14000
+    },
+    {
+      "epoch": 1.2808866279069768,
+      "grad_norm": 0.010181100107729435,
+      "learning_rate": 1.7980105377906976e-05,
+      "loss": 0.0005765938758850097,
+      "step": 14100
+    },
+    {
+      "epoch": 1.2899709302325582,
+      "grad_norm": 0.0012442917795851827,
+      "learning_rate": 1.775299781976744e-05,
+      "loss": 0.00032320961356163024,
+      "step": 14200
+    },
+    {
+      "epoch": 1.2990552325581395,
+      "grad_norm": 0.0005617383285425603,
+      "learning_rate": 1.752589026162791e-05,
+      "loss": 0.0001776321791112423,
+      "step": 14300
+    },
+    {
+      "epoch": 1.308139534883721,
+      "grad_norm": 0.0010064981179311872,
+      "learning_rate": 1.7298782703488373e-05,
+      "loss": 0.00012774170376360415,
+      "step": 14400
+    },
+    {
+      "epoch": 1.3172238372093024,
+      "grad_norm": 0.00039220438338816166,
+      "learning_rate": 1.707167514534884e-05,
+      "loss": 0.0006190959364175796,
+      "step": 14500
+    },
+    {
+      "epoch": 1.3263081395348837,
+      "grad_norm": 0.0009137202869169414,
+      "learning_rate": 1.6844567587209302e-05,
+      "loss": 0.0007051125913858414,
+      "step": 14600
+    },
+    {
+      "epoch": 1.3353924418604652,
+      "grad_norm": 0.0006007241318002343,
+      "learning_rate": 1.6617460029069766e-05,
+      "loss": 7.074063178151846e-05,
+      "step": 14700
+    },
+    {
+      "epoch": 1.3444767441860466,
+      "grad_norm": 0.0006039150175638497,
+      "learning_rate": 1.6390352470930234e-05,
+      "loss": 0.000489293597638607,
+      "step": 14800
+    },
+    {
+      "epoch": 1.353561046511628,
+      "grad_norm": 0.0014955669175833464,
+      "learning_rate": 1.61632449127907e-05,
+      "loss": 0.0004155828058719635,
+      "step": 14900
+    },
+    {
+      "epoch": 1.3626453488372092,
+      "grad_norm": 0.1508745402097702,
+      "learning_rate": 1.5936137354651163e-05,
+      "loss": 0.00024092141538858413,
+      "step": 15000
+    },
+    {
+      "epoch": 1.3717296511627908,
+      "grad_norm": 0.0008113393560051918,
+      "learning_rate": 1.5709029796511627e-05,
+      "loss": 0.00040562458336353304,
+      "step": 15100
+    },
+    {
+      "epoch": 1.380813953488372,
+      "grad_norm": 0.0026459796354174614,
+      "learning_rate": 1.5481922238372092e-05,
+      "loss": 0.0007934534549713135,
+      "step": 15200
+    },
+    {
+      "epoch": 1.3898982558139534,
+      "grad_norm": 0.0005005365237593651,
+      "learning_rate": 1.525481468023256e-05,
+      "loss": 0.00023073634132742883,
+      "step": 15300
+    },
+    {
+      "epoch": 1.3989825581395348,
+      "grad_norm": 0.02727348543703556,
+      "learning_rate": 1.5027707122093024e-05,
+      "loss": 0.0003524136170744896,
+      "step": 15400
+    },
+    {
+      "epoch": 1.4080668604651163,
+      "grad_norm": 0.0002921252744272351,
+      "learning_rate": 1.4800599563953488e-05,
+      "loss": 0.000525415763258934,
+      "step": 15500
+    },
+    {
+      "epoch": 1.4171511627906976,
+      "grad_norm": 0.0016339289722964168,
+      "learning_rate": 1.4573492005813955e-05,
+      "loss": 0.0003766526654362678,
+      "step": 15600
+    },
+    {
+      "epoch": 1.426235465116279,
+      "grad_norm": 0.003485665889456868,
+      "learning_rate": 1.4346384447674419e-05,
+      "loss": 0.00021764757111668586,
+      "step": 15700
+    },
+    {
+      "epoch": 1.4353197674418605,
+      "grad_norm": 0.0011686257785186172,
+      "learning_rate": 1.4119276889534885e-05,
+      "loss": 0.00026009151712059976,
+      "step": 15800
+    },
+    {
+      "epoch": 1.4444040697674418,
+      "grad_norm": 0.00034656753996387124,
+      "learning_rate": 1.389216933139535e-05,
+      "loss": 0.0003694407269358635,
+      "step": 15900
+    },
+    {
+      "epoch": 1.4534883720930232,
+      "grad_norm": 0.0023575718514621258,
+      "learning_rate": 1.3665061773255814e-05,
+      "loss": 0.0001483263447880745,
+      "step": 16000
+    },
+    {
+      "epoch": 1.4625726744186047,
+      "grad_norm": 0.0011886970605701208,
+      "learning_rate": 1.343795421511628e-05,
+      "loss": 0.0004465998336672783,
+      "step": 16100
+    },
+    {
+      "epoch": 1.471656976744186,
+      "grad_norm": 0.00039876584196463227,
+      "learning_rate": 1.3210846656976745e-05,
+      "loss": 0.00015990335494279863,
+      "step": 16200
+    },
+    {
+      "epoch": 1.4807412790697674,
+      "grad_norm": 0.00023756893642712384,
+      "learning_rate": 1.298373909883721e-05,
+      "loss": 5.0160493701696394e-05,
+      "step": 16300
+    },
+    {
+      "epoch": 1.489825581395349,
+      "grad_norm": 0.0002809664292726666,
+      "learning_rate": 1.2756631540697675e-05,
+      "loss": 0.0001377291791141033,
+      "step": 16400
+    },
+    {
+      "epoch": 1.4989098837209303,
+      "grad_norm": 0.00037873705150559545,
+      "learning_rate": 1.252952398255814e-05,
+      "loss": 7.196901366114616e-05,
+      "step": 16500
+    },
+    {
+      "epoch": 1.5079941860465116,
+      "grad_norm": 1.4199703931808472,
+      "learning_rate": 1.2302416424418606e-05,
+      "loss": 0.0005905186012387276,
+      "step": 16600
+    },
+    {
+      "epoch": 1.5170784883720931,
+      "grad_norm": 0.0003422704176045954,
+      "learning_rate": 1.207530886627907e-05,
+      "loss": 0.00013428066857159137,
+      "step": 16700
+    },
+    {
+      "epoch": 1.5261627906976745,
+      "grad_norm": 0.0010882618371397257,
+      "learning_rate": 1.1848201308139535e-05,
+      "loss": 0.00020068021491169929,
+      "step": 16800
+    },
+    {
+      "epoch": 1.5352470930232558,
+      "grad_norm": 0.024530770257115364,
+      "learning_rate": 1.162109375e-05,
+      "loss": 0.0004120354726910591,
+      "step": 16900
+    },
+    {
+      "epoch": 1.5443313953488373,
+      "grad_norm": 0.0006738721276633441,
+      "learning_rate": 1.1393986191860465e-05,
+      "loss": 0.0002767092920839787,
+      "step": 17000
+    },
+    {
+      "epoch": 1.5534156976744184,
+      "grad_norm": 0.00019564498506952077,
+      "learning_rate": 1.1166878633720931e-05,
+      "loss": 0.00021817052736878396,
+      "step": 17100
+    },
+    {
+      "epoch": 1.5625,
+      "grad_norm": 0.0002832361788023263,
+      "learning_rate": 1.0939771075581396e-05,
+      "loss": 5.168498028069734e-05,
+      "step": 17200
+    },
+    {
+      "epoch": 1.5715843023255816,
+      "grad_norm": 0.0005729420809075236,
+      "learning_rate": 1.071266351744186e-05,
+      "loss": 0.00016681572422385215,
+      "step": 17300
+    },
+    {
+      "epoch": 1.5806686046511627,
+      "grad_norm": 0.0007383101619780064,
+      "learning_rate": 1.0485555959302326e-05,
+      "loss": 0.0003441007435321808,
+      "step": 17400
+    },
+    {
+      "epoch": 1.5897529069767442,
+      "grad_norm": 0.005897729191929102,
+      "learning_rate": 1.0258448401162791e-05,
+      "loss": 0.0003161391615867615,
+      "step": 17500
+    },
+    {
+      "epoch": 1.5988372093023255,
+      "grad_norm": 0.00025281202397309244,
+      "learning_rate": 1.0031340843023257e-05,
+      "loss": 0.00011429931037127971,
+      "step": 17600
+    },
+    {
+      "epoch": 1.6079215116279069,
+      "grad_norm": 0.0004013874859083444,
+      "learning_rate": 9.804233284883721e-06,
+      "loss": 2.793062012642622e-05,
+      "step": 17700
+    },
+    {
+      "epoch": 1.6170058139534884,
+      "grad_norm": 0.008875502273440361,
+      "learning_rate": 9.577125726744186e-06,
+      "loss": 5.882895085960627e-05,
+      "step": 17800
+    },
+    {
+      "epoch": 1.6260901162790697,
+      "grad_norm": 0.001823420519940555,
+      "learning_rate": 9.350018168604652e-06,
+      "loss": 0.0002583874017000198,
+      "step": 17900
+    },
+    {
+      "epoch": 1.635174418604651,
+      "grad_norm": 0.6227073073387146,
+      "learning_rate": 9.122910610465116e-06,
+      "loss": 0.0007115737348794937,
+      "step": 18000
+    },
+    {
+      "epoch": 1.6442587209302326,
+      "grad_norm": 0.0017345056403428316,
+      "learning_rate": 8.895803052325581e-06,
+      "loss": 0.00027986690402030944,
+      "step": 18100
+    },
+    {
+      "epoch": 1.653343023255814,
+      "grad_norm": 0.0009033643291331828,
+      "learning_rate": 8.668695494186047e-06,
+      "loss": 0.0002979452162981033,
+      "step": 18200
+    },
+    {
+      "epoch": 1.6624273255813953,
+      "grad_norm": 0.00028923453646712005,
+      "learning_rate": 8.441587936046511e-06,
+      "loss": 1.926603843457997e-05,
+      "step": 18300
+    },
+    {
+      "epoch": 1.6715116279069768,
+      "grad_norm": 0.0006103311898186803,
+      "learning_rate": 8.214480377906978e-06,
+      "loss": 6.331724114716053e-05,
+      "step": 18400
+    },
+    {
+      "epoch": 1.6805959302325582,
+      "grad_norm": 0.0002818437642417848,
+      "learning_rate": 7.987372819767442e-06,
+      "loss": 0.00014105773530900478,
+      "step": 18500
+    },
+    {
+      "epoch": 1.6896802325581395,
+      "grad_norm": 0.0011020454112440348,
+      "learning_rate": 7.760265261627907e-06,
+      "loss": 0.0002292773686349392,
+      "step": 18600
+    },
+    {
+      "epoch": 1.698764534883721,
+      "grad_norm": 0.001500884653069079,
+      "learning_rate": 7.533157703488372e-06,
+      "loss": 0.00011514685116708279,
+      "step": 18700
+    },
+    {
+      "epoch": 1.7078488372093024,
+      "grad_norm": 0.00016000888717826456,
+      "learning_rate": 7.306050145348838e-06,
+      "loss": 2.0523781422525645e-05,
+      "step": 18800
+    },
+    {
+      "epoch": 1.7169331395348837,
+      "grad_norm": 0.003748674876987934,
+      "learning_rate": 7.078942587209303e-06,
+      "loss": 0.00043839264661073686,
+      "step": 18900
+    },
+    {
+      "epoch": 1.7260174418604652,
+      "grad_norm": 0.06772974133491516,
+      "learning_rate": 6.8518350290697685e-06,
+      "loss": 2.126413397490978e-05,
+      "step": 19000
+    },
+    {
+      "epoch": 1.7351017441860463,
+      "grad_norm": 0.0004789210797753185,
+      "learning_rate": 6.624727470930232e-06,
+      "loss": 0.00020929597318172455,
+      "step": 19100
+    },
+    {
+      "epoch": 1.744186046511628,
+      "grad_norm": 7.055519381538033e-05,
+      "learning_rate": 6.397619912790697e-06,
+      "loss": 0.00032072752714157104,
+      "step": 19200
+    },
+    {
+      "epoch": 1.7532703488372094,
+      "grad_norm": 0.00010641128028510138,
+      "learning_rate": 6.1705123546511635e-06,
+      "loss": 8.132393471896649e-05,
+      "step": 19300
+    },
+    {
+      "epoch": 1.7623546511627906,
+      "grad_norm": 0.0005615473492071033,
+      "learning_rate": 5.943404796511629e-06,
+      "loss": 9.162256610579788e-06,
+      "step": 19400
+    },
+    {
+      "epoch": 1.771438953488372,
+      "grad_norm": 0.004505404736846685,
+      "learning_rate": 5.716297238372093e-06,
+      "loss": 9.514865465462207e-05,
+      "step": 19500
+    },
+    {
+      "epoch": 1.7805232558139537,
+      "grad_norm": 0.0011890050955116749,
+      "learning_rate": 5.4891896802325586e-06,
+      "loss": 3.7443286273628474e-05,
+      "step": 19600
+    },
+    {
+      "epoch": 1.7896075581395348,
+      "grad_norm": 0.000441042153397575,
+      "learning_rate": 5.262082122093023e-06,
+      "loss": 5.087008234113455e-05,
+      "step": 19700
+    },
+    {
+      "epoch": 1.7986918604651163,
+      "grad_norm": 0.0001777316356310621,
+      "learning_rate": 5.034974563953489e-06,
+      "loss": 0.00017192648723721505,
+      "step": 19800
+    },
+    {
+      "epoch": 1.8077761627906976,
+      "grad_norm": 0.00015439293929375708,
+      "learning_rate": 4.8078670058139536e-06,
+      "loss": 0.0001058769691735506,
+      "step": 19900
+    },
+    {
+      "epoch": 1.816860465116279,
+      "grad_norm": 0.0016349812503904104,
+      "learning_rate": 4.580759447674419e-06,
+      "loss": 1.0620863176882267e-05,
+      "step": 20000
+    },
+    {
+      "epoch": 1.8259447674418605,
+      "grad_norm": 9.556530858390033e-05,
+      "learning_rate": 4.353651889534884e-06,
+      "loss": 0.00019107908010482789,
+      "step": 20100
+    },
+    {
+      "epoch": 1.8350290697674418,
+      "grad_norm": 0.0002568990457803011,
+      "learning_rate": 4.126544331395349e-06,
+      "loss": 0.00026563439518213273,
+      "step": 20200
+    },
+    {
+      "epoch": 1.8441133720930232,
+      "grad_norm": 0.5719628930091858,
+      "learning_rate": 3.899436773255814e-06,
+      "loss": 5.403281655162573e-05,
+      "step": 20300
+    },
+    {
+      "epoch": 1.8531976744186047,
+      "grad_norm": 0.017383404076099396,
+      "learning_rate": 3.672329215116279e-06,
+      "loss": 0.00036652404814958573,
+      "step": 20400
+    },
+    {
+      "epoch": 1.862281976744186,
+      "grad_norm": 0.00022910887491889298,
+      "learning_rate": 3.4452216569767445e-06,
+      "loss": 0.000403200164437294,
+      "step": 20500
+    },
+    {
+      "epoch": 1.8713662790697674,
+      "grad_norm": 0.00033295468892902136,
+      "learning_rate": 3.2181140988372097e-06,
+      "loss": 8.646129630506039e-05,
+      "step": 20600
+    },
+    {
+      "epoch": 1.880450581395349,
+      "grad_norm": 0.00012523184705059975,
+      "learning_rate": 2.9910065406976746e-06,
+      "loss": 0.00016864996403455733,
+      "step": 20700
+    },
+    {
+      "epoch": 1.8895348837209303,
+      "grad_norm": 0.00022476979938801378,
+      "learning_rate": 2.7638989825581395e-06,
+      "loss": 5.167209077626467e-05,
+      "step": 20800
+    },
+    {
+      "epoch": 1.8986191860465116,
+      "grad_norm": 0.0002389108412899077,
+      "learning_rate": 2.5367914244186048e-06,
+      "loss": 6.591790355741977e-05,
+      "step": 20900
+    },
+    {
+      "epoch": 1.9077034883720931,
+      "grad_norm": 0.0013002109481021762,
+      "learning_rate": 2.30968386627907e-06,
+      "loss": 0.00010993644595146179,
+      "step": 21000
+    },
+    {
+      "epoch": 1.9167877906976745,
+      "grad_norm": 0.13188259303569794,
+      "learning_rate": 2.082576308139535e-06,
+      "loss": 0.00014082306995987892,
+      "step": 21100
+    },
+    {
+      "epoch": 1.9258720930232558,
+      "grad_norm": 0.0010737127158790827,
+      "learning_rate": 1.85546875e-06,
+      "loss": 0.00030819986015558244,
+      "step": 21200
+    },
+    {
+      "epoch": 1.9349563953488373,
+      "grad_norm": 7.047707185847685e-05,
+      "learning_rate": 1.628361191860465e-06,
+      "loss": 2.0567586179822683e-05,
+      "step": 21300
+    },
+    {
+      "epoch": 1.9440406976744184,
+      "grad_norm": 0.0001350079692201689,
+      "learning_rate": 1.4012536337209304e-06,
+      "loss": 7.989832083694636e-06,
+      "step": 21400
+    },
+    {
+      "epoch": 1.953125,
+      "grad_norm": 0.0008243785705417395,
+      "learning_rate": 1.1741460755813954e-06,
+      "loss": 0.0002570002153515816,
+      "step": 21500
+    },
+    {
+      "epoch": 1.9622093023255816,
+      "grad_norm": 0.001545518171042204,
+      "learning_rate": 9.470385174418604e-07,
+      "loss": 2.7221005875617266e-05,
+      "step": 21600
+    },
+    {
+      "epoch": 1.9712936046511627,
+      "grad_norm": 5.5686323321424425e-05,
+      "learning_rate": 7.199309593023256e-07,
+      "loss": 0.00010406752116978168,
+      "step": 21700
+    },
+    {
+      "epoch": 1.9803779069767442,
+      "grad_norm": 5.525942106032744e-05,
+      "learning_rate": 4.928234011627908e-07,
+      "loss": 3.853037022054195e-05,
+      "step": 21800
+    },
+    {
+      "epoch": 1.9894622093023255,
+      "grad_norm": 0.0002285480877617374,
+      "learning_rate": 2.657158430232558e-07,
+      "loss": 0.00048015639185905457,
+      "step": 21900
+    },
+    {
+      "epoch": 1.9985465116279069,
+      "grad_norm": 8.369733404833823e-05,
+      "learning_rate": 3.860828488372093e-08,
+      "loss": 6.105023785494268e-06,
+      "step": 22000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 22016,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.341374477893632e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

models/dostoievsky_v1/checkpoint-22000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:edab2539a81b65c2a76c4c39bed219aa02358f44f3286429f83165442a1b53fa
+size 5329

models/dostoievsky_v1/checkpoint-22016/config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "activation_dropout": 0.1,
+  "activation_function": "gelu",
+  "add_bias_logits": false,
+  "add_final_layer_norm": false,
+  "architectures": [
+    "BartForConditionalGeneration"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 0,
+  "classif_dropout": 0.1,
+  "classifier_dropout": 0.0,
+  "d_model": 768,
+  "decoder_attention_heads": 12,
+  "decoder_ffn_dim": 3072,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 6,
+  "decoder_start_token_id": 2,
+  "dropout": 0.1,
+  "dtype": "float32",
+  "encoder_attention_heads": 12,
+  "encoder_ffn_dim": 3072,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 6,
+  "eos_token_id": 2,
+  "forced_eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "init_std": 0.02,
+  "is_decoder": false,
+  "is_encoder_decoder": true,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "max_position_embeddings": 1024,
+  "model_type": "bart",
+  "normalize_before": false,
+  "normalize_embedding": true,
+  "pad_token_id": 1,
+  "scale_embedding": false,
+  "task_specific_params": {
+    "summarization": {
+      "length_penalty": 1.0,
+      "max_length": 128,
+      "min_length": 12,
+      "num_beams": 4
+    },
+    "summarization_cnn": {
+      "length_penalty": 2.0,
+      "max_length": 142,
+      "min_length": 56,
+      "num_beams": 4
+    },
+    "summarization_xsum": {
+      "length_penalty": 1.0,
+      "max_length": 62,
+      "min_length": 11,
+      "num_beams": 6
+    }
+  },
+  "tie_word_embeddings": true,
+  "transformers_version": "5.5.0",
+  "use_cache": false,
+  "vocab_size": 50265
+}

models/dostoievsky_v1/checkpoint-22016/generation_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "decoder_start_token_id": 2,
+  "early_stopping": true,
+  "eos_token_id": [
+    2
+  ],
+  "forced_bos_token_id": 0,
+  "forced_eos_token_id": 2,
+  "no_repeat_ngram_size": 3,
+  "num_beams": 4,
+  "pad_token_id": 1,
+  "transformers_version": "5.5.0",
+  "use_cache": true
+}

models/dostoievsky_v1/checkpoint-22016/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d9dc9ffae85920773b2b5774f37b9630b197fc455008db011e6288a451870cb8
+size 557912620

models/dostoievsky_v1/checkpoint-22016/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c3cef9d187ecb037f2d7e4d910620d3c26ea5e61d28c9406a25018cdeb2412f2
+size 1115583947

models/dostoievsky_v1/checkpoint-22016/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6bef8c98d0370faf4d9e294ae6bb5a54459b7df1614ab0d628dfa9ab8c16113d
+size 14645

models/dostoievsky_v1/checkpoint-22016/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:de1988b91eb0266803c28f7f4b4c45861f95d52f7293e9c027e38a68aedd566a
+size 1383

models/dostoievsky_v1/checkpoint-22016/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:af2fefc736f21c2829d14de379d49371f2072daa593ac72eb437dc21a1f555df
+size 1465

models/dostoievsky_v1/checkpoint-22016/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

models/dostoievsky_v1/checkpoint-22016/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": "<unk>"
+}

models/dostoievsky_v1/checkpoint-22016/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1582 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 22016,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.009084302325581396,
+      "grad_norm": 1.9626818895339966,
+      "learning_rate": 4.9775163517441866e-05,
+      "loss": 3.2140347290039064,
+      "step": 100
+    },
+    {
+      "epoch": 0.018168604651162792,
+      "grad_norm": 1.511451244354248,
+      "learning_rate": 4.9548055959302324e-05,
+      "loss": 0.01261173963546753,
+      "step": 200
+    },
+    {
+      "epoch": 0.027252906976744186,
+      "grad_norm": 0.18194368481636047,
+      "learning_rate": 4.9320948401162795e-05,
+      "loss": 0.009183254837989808,
+      "step": 300
+    },
+    {
+      "epoch": 0.036337209302325583,
+      "grad_norm": 0.27592894434928894,
+      "learning_rate": 4.909384084302326e-05,
+      "loss": 0.0027826914191246034,
+      "step": 400
+    },
+    {
+      "epoch": 0.045421511627906974,
+      "grad_norm": 1.5584100484848022,
+      "learning_rate": 4.8866733284883724e-05,
+      "loss": 0.0029672542214393615,
+      "step": 500
+    },
+    {
+      "epoch": 0.05450581395348837,
+      "grad_norm": 0.02698471024632454,
+      "learning_rate": 4.863962572674419e-05,
+      "loss": 0.004296095669269561,
+      "step": 600
+    },
+    {
+      "epoch": 0.06359011627906977,
+      "grad_norm": 0.137993723154068,
+      "learning_rate": 4.841251816860465e-05,
+      "loss": 0.0031410756707191466,
+      "step": 700
+    },
+    {
+      "epoch": 0.07267441860465117,
+      "grad_norm": 0.07874622195959091,
+      "learning_rate": 4.818541061046512e-05,
+      "loss": 0.003037240505218506,
+      "step": 800
+    },
+    {
+      "epoch": 0.08175872093023256,
+      "grad_norm": 0.013660268858075142,
+      "learning_rate": 4.795830305232558e-05,
+      "loss": 0.0031082597374916076,
+      "step": 900
+    },
+    {
+      "epoch": 0.09084302325581395,
+      "grad_norm": 0.3354911804199219,
+      "learning_rate": 4.7731195494186046e-05,
+      "loss": 0.0048285979032516475,
+      "step": 1000
+    },
+    {
+      "epoch": 0.09992732558139535,
+      "grad_norm": 0.43667030334472656,
+      "learning_rate": 4.750408793604652e-05,
+      "loss": 0.0019270157814025878,
+      "step": 1100
+    },
+    {
+      "epoch": 0.10901162790697674,
+      "grad_norm": 0.03849343582987785,
+      "learning_rate": 4.7276980377906975e-05,
+      "loss": 0.001011388897895813,
+      "step": 1200
+    },
+    {
+      "epoch": 0.11809593023255814,
+      "grad_norm": 0.017985744401812553,
+      "learning_rate": 4.7049872819767446e-05,
+      "loss": 0.0018200889229774476,
+      "step": 1300
+    },
+    {
+      "epoch": 0.12718023255813954,
+      "grad_norm": 0.05241613835096359,
+      "learning_rate": 4.682276526162791e-05,
+      "loss": 0.004675151705741882,
+      "step": 1400
+    },
+    {
+      "epoch": 0.13626453488372092,
+      "grad_norm": 0.16119782626628876,
+      "learning_rate": 4.6595657703488375e-05,
+      "loss": 0.0025834646821022034,
+      "step": 1500
+    },
+    {
+      "epoch": 0.14534883720930233,
+      "grad_norm": 0.8985283374786377,
+      "learning_rate": 4.636855014534884e-05,
+      "loss": 0.001825849711894989,
+      "step": 1600
+    },
+    {
+      "epoch": 0.15443313953488372,
+      "grad_norm": 0.021285999566316605,
+      "learning_rate": 4.6141442587209304e-05,
+      "loss": 0.004164438843727112,
+      "step": 1700
+    },
+    {
+      "epoch": 0.16351744186046513,
+      "grad_norm": 0.01013926975429058,
+      "learning_rate": 4.591433502906977e-05,
+      "loss": 0.002582077383995056,
+      "step": 1800
+    },
+    {
+      "epoch": 0.1726017441860465,
+      "grad_norm": 0.005713903810828924,
+      "learning_rate": 4.568722747093023e-05,
+      "loss": 0.0026018735766410827,
+      "step": 1900
+    },
+    {
+      "epoch": 0.1816860465116279,
+      "grad_norm": 0.019032707437872887,
+      "learning_rate": 4.54601199127907e-05,
+      "loss": 0.0017240011692047118,
+      "step": 2000
+    },
+    {
+      "epoch": 0.1907703488372093,
+      "grad_norm": 0.03191379830241203,
+      "learning_rate": 4.523301235465117e-05,
+      "loss": 0.0022749193012714386,
+      "step": 2100
+    },
+    {
+      "epoch": 0.1998546511627907,
+      "grad_norm": 0.12945719063282013,
+      "learning_rate": 4.5005904796511626e-05,
+      "loss": 0.0017636509239673615,
+      "step": 2200
+    },
+    {
+      "epoch": 0.2089389534883721,
+      "grad_norm": 0.01494428887963295,
+      "learning_rate": 4.47787972383721e-05,
+      "loss": 0.0017163331806659698,
+      "step": 2300
+    },
+    {
+      "epoch": 0.2180232558139535,
+      "grad_norm": 0.08089974522590637,
+      "learning_rate": 4.455168968023256e-05,
+      "loss": 0.0023377402126789092,
+      "step": 2400
+    },
+    {
+      "epoch": 0.22710755813953487,
+      "grad_norm": 0.004136559087783098,
+      "learning_rate": 4.4324582122093026e-05,
+      "loss": 0.001353910267353058,
+      "step": 2500
+    },
+    {
+      "epoch": 0.23619186046511628,
+      "grad_norm": 0.25111478567123413,
+      "learning_rate": 4.409747456395349e-05,
+      "loss": 0.0022939679026603697,
+      "step": 2600
+    },
+    {
+      "epoch": 0.24527616279069767,
+      "grad_norm": 0.963623583316803,
+      "learning_rate": 4.3870367005813955e-05,
+      "loss": 0.0018738456070423125,
+      "step": 2700
+    },
+    {
+      "epoch": 0.2543604651162791,
+      "grad_norm": 0.6530899405479431,
+      "learning_rate": 4.364325944767442e-05,
+      "loss": 0.0015843257308006287,
+      "step": 2800
+    },
+    {
+      "epoch": 0.26344476744186046,
+      "grad_norm": 0.0018010369967669249,
+      "learning_rate": 4.3416151889534884e-05,
+      "loss": 0.0006610354781150817,
+      "step": 2900
+    },
+    {
+      "epoch": 0.27252906976744184,
+      "grad_norm": 0.1644030064344406,
+      "learning_rate": 4.318904433139535e-05,
+      "loss": 0.0031197205185890197,
+      "step": 3000
+    },
+    {
+      "epoch": 0.28161337209302323,
+      "grad_norm": 0.14440134167671204,
+      "learning_rate": 4.296193677325582e-05,
+      "loss": 0.0011694706976413728,
+      "step": 3100
+    },
+    {
+      "epoch": 0.29069767441860467,
+      "grad_norm": 0.0124755734577775,
+      "learning_rate": 4.273482921511628e-05,
+      "loss": 0.0013250903785228729,
+      "step": 3200
+    },
+    {
+      "epoch": 0.29978197674418605,
+      "grad_norm": 0.15910762548446655,
+      "learning_rate": 4.250772165697675e-05,
+      "loss": 0.0007842753082513809,
+      "step": 3300
+    },
+    {
+      "epoch": 0.30886627906976744,
+      "grad_norm": 0.011540662497282028,
+      "learning_rate": 4.228061409883721e-05,
+      "loss": 0.0018773898482322693,
+      "step": 3400
+    },
+    {
+      "epoch": 0.3179505813953488,
+      "grad_norm": 0.005148735363036394,
+      "learning_rate": 4.205350654069768e-05,
+      "loss": 0.001056775525212288,
+      "step": 3500
+    },
+    {
+      "epoch": 0.32703488372093026,
+      "grad_norm": 0.007442768197506666,
+      "learning_rate": 4.182639898255814e-05,
+      "loss": 0.0014348265528678894,
+      "step": 3600
+    },
+    {
+      "epoch": 0.33611918604651164,
+      "grad_norm": 0.00586783979088068,
+      "learning_rate": 4.1599291424418606e-05,
+      "loss": 0.0014780126512050629,
+      "step": 3700
+    },
+    {
+      "epoch": 0.345203488372093,
+      "grad_norm": 0.006909696385264397,
+      "learning_rate": 4.137218386627907e-05,
+      "loss": 0.0019674110412597656,
+      "step": 3800
+    },
+    {
+      "epoch": 0.3542877906976744,
+      "grad_norm": 0.020171863958239555,
+      "learning_rate": 4.1145076308139535e-05,
+      "loss": 0.00249418705701828,
+      "step": 3900
+    },
+    {
+      "epoch": 0.3633720930232558,
+      "grad_norm": 0.031723715364933014,
+      "learning_rate": 4.091796875e-05,
+      "loss": 0.0030515387654304503,
+      "step": 4000
+    },
+    {
+      "epoch": 0.37245639534883723,
+      "grad_norm": 0.30729565024375916,
+      "learning_rate": 4.069086119186047e-05,
+      "loss": 0.0009509802609682083,
+      "step": 4100
+    },
+    {
+      "epoch": 0.3815406976744186,
+      "grad_norm": 0.0021714100148528814,
+      "learning_rate": 4.046375363372093e-05,
+      "loss": 0.0013920699059963226,
+      "step": 4200
+    },
+    {
+      "epoch": 0.390625,
+      "grad_norm": 0.001017636270262301,
+      "learning_rate": 4.02366460755814e-05,
+      "loss": 0.0010310819000005722,
+      "step": 4300
+    },
+    {
+      "epoch": 0.3997093023255814,
+      "grad_norm": 0.0009663284290581942,
+      "learning_rate": 4.0009538517441864e-05,
+      "loss": 0.001098722442984581,
+      "step": 4400
+    },
+    {
+      "epoch": 0.40879360465116277,
+      "grad_norm": 0.003507931251078844,
+      "learning_rate": 3.978243095930232e-05,
+      "loss": 0.0013840131461620331,
+      "step": 4500
+    },
+    {
+      "epoch": 0.4178779069767442,
+      "grad_norm": 0.21988199651241302,
+      "learning_rate": 3.955532340116279e-05,
+      "loss": 0.0017136169970035553,
+      "step": 4600
+    },
+    {
+      "epoch": 0.4269622093023256,
+      "grad_norm": 0.0012972657568752766,
+      "learning_rate": 3.932821584302326e-05,
+      "loss": 0.0008040449023246765,
+      "step": 4700
+    },
+    {
+      "epoch": 0.436046511627907,
+      "grad_norm": 0.033857911825180054,
+      "learning_rate": 3.910110828488372e-05,
+      "loss": 0.0005662231892347336,
+      "step": 4800
+    },
+    {
+      "epoch": 0.44513081395348836,
+      "grad_norm": 0.0008128180052153766,
+      "learning_rate": 3.8874000726744187e-05,
+      "loss": 0.001040520742535591,
+      "step": 4900
+    },
+    {
+      "epoch": 0.45421511627906974,
+      "grad_norm": 0.0017111338675022125,
+      "learning_rate": 3.864689316860465e-05,
+      "loss": 0.0010922805964946748,
+      "step": 5000
+    },
+    {
+      "epoch": 0.4632994186046512,
+      "grad_norm": 0.0013291583163663745,
+      "learning_rate": 3.841978561046512e-05,
+      "loss": 0.0005277743935585021,
+      "step": 5100
+    },
+    {
+      "epoch": 0.47238372093023256,
+      "grad_norm": 0.09846807271242142,
+      "learning_rate": 3.819267805232558e-05,
+      "loss": 0.0010997675359249114,
+      "step": 5200
+    },
+    {
+      "epoch": 0.48146802325581395,
+      "grad_norm": 0.005185174290090799,
+      "learning_rate": 3.796557049418605e-05,
+      "loss": 0.000698111355304718,
+      "step": 5300
+    },
+    {
+      "epoch": 0.49055232558139533,
+      "grad_norm": 0.019086388871073723,
+      "learning_rate": 3.7738462936046515e-05,
+      "loss": 0.0009016367793083191,
+      "step": 5400
+    },
+    {
+      "epoch": 0.49963662790697677,
+      "grad_norm": 0.05122831463813782,
+      "learning_rate": 3.751135537790697e-05,
+      "loss": 0.0015524370968341826,
+      "step": 5500
+    },
+    {
+      "epoch": 0.5087209302325582,
+      "grad_norm": 0.012596211396157742,
+      "learning_rate": 3.7284247819767444e-05,
+      "loss": 0.00220917209982872,
+      "step": 5600
+    },
+    {
+      "epoch": 0.5178052325581395,
+      "grad_norm": 0.009361029602587223,
+      "learning_rate": 3.705714026162791e-05,
+      "loss": 0.0010362663865089417,
+      "step": 5700
+    },
+    {
+      "epoch": 0.5268895348837209,
+      "grad_norm": 0.5205378532409668,
+      "learning_rate": 3.683003270348837e-05,
+      "loss": 0.0022550773620605468,
+      "step": 5800
+    },
+    {
+      "epoch": 0.5359738372093024,
+      "grad_norm": 0.0006470124353654683,
+      "learning_rate": 3.660292514534884e-05,
+      "loss": 0.0011264414340257645,
+      "step": 5900
+    },
+    {
+      "epoch": 0.5450581395348837,
+      "grad_norm": 0.058231666684150696,
+      "learning_rate": 3.63758175872093e-05,
+      "loss": 0.0011266635358333588,
+      "step": 6000
+    },
+    {
+      "epoch": 0.5541424418604651,
+      "grad_norm": 0.0013441125629469752,
+      "learning_rate": 3.614871002906977e-05,
+      "loss": 0.0008001205325126648,
+      "step": 6100
+    },
+    {
+      "epoch": 0.5632267441860465,
+      "grad_norm": 0.0021539030130952597,
+      "learning_rate": 3.592160247093023e-05,
+      "loss": 0.0008849448710680007,
+      "step": 6200
+    },
+    {
+      "epoch": 0.5723110465116279,
+      "grad_norm": 0.9019960761070251,
+      "learning_rate": 3.56944949127907e-05,
+      "loss": 0.0006517694145441055,
+      "step": 6300
+    },
+    {
+      "epoch": 0.5813953488372093,
+      "grad_norm": 0.001283760997466743,
+      "learning_rate": 3.546738735465117e-05,
+      "loss": 0.0021727623045444486,
+      "step": 6400
+    },
+    {
+      "epoch": 0.5904796511627907,
+      "grad_norm": 0.005666423588991165,
+      "learning_rate": 3.5240279796511624e-05,
+      "loss": 0.0012587083876132966,
+      "step": 6500
+    },
+    {
+      "epoch": 0.5995639534883721,
+      "grad_norm": 1.4842944145202637,
+      "learning_rate": 3.5013172238372096e-05,
+      "loss": 0.001180112287402153,
+      "step": 6600
+    },
+    {
+      "epoch": 0.6086482558139535,
+      "grad_norm": 0.02973734401166439,
+      "learning_rate": 3.478606468023256e-05,
+      "loss": 0.0010163599252700805,
+      "step": 6700
+    },
+    {
+      "epoch": 0.6177325581395349,
+      "grad_norm": 0.00399527233093977,
+      "learning_rate": 3.4558957122093024e-05,
+      "loss": 0.0010414445400238038,
+      "step": 6800
+    },
+    {
+      "epoch": 0.6268168604651163,
+      "grad_norm": 0.39672717452049255,
+      "learning_rate": 3.433184956395349e-05,
+      "loss": 0.0012094499170780182,
+      "step": 6900
+    },
+    {
+      "epoch": 0.6359011627906976,
+      "grad_norm": 0.5308993458747864,
+      "learning_rate": 3.410474200581395e-05,
+      "loss": 0.0006021633744239807,
+      "step": 7000
+    },
+    {
+      "epoch": 0.6449854651162791,
+      "grad_norm": 0.003343795659020543,
+      "learning_rate": 3.3877634447674425e-05,
+      "loss": 0.0005116893351078033,
+      "step": 7100
+    },
+    {
+      "epoch": 0.6540697674418605,
+      "grad_norm": 0.0033038391266018152,
+      "learning_rate": 3.365052688953488e-05,
+      "loss": 0.002106922417879105,
+      "step": 7200
+    },
+    {
+      "epoch": 0.6631540697674418,
+      "grad_norm": 0.004004980903118849,
+      "learning_rate": 3.3423419331395353e-05,
+      "loss": 0.0010233993828296661,
+      "step": 7300
+    },
+    {
+      "epoch": 0.6722383720930233,
+      "grad_norm": 0.00490298168733716,
+      "learning_rate": 3.319631177325582e-05,
+      "loss": 0.0007062336057424546,
+      "step": 7400
+    },
+    {
+      "epoch": 0.6813226744186046,
+      "grad_norm": 0.04243200644850731,
+      "learning_rate": 3.2969204215116276e-05,
+      "loss": 0.0005479569733142852,
+      "step": 7500
+    },
+    {
+      "epoch": 0.690406976744186,
+      "grad_norm": 0.008556894026696682,
+      "learning_rate": 3.274209665697675e-05,
+      "loss": 0.0010938134789466858,
+      "step": 7600
+    },
+    {
+      "epoch": 0.6994912790697675,
+      "grad_norm": 0.016938723623752594,
+      "learning_rate": 3.251498909883721e-05,
+      "loss": 0.0009466408193111419,
+      "step": 7700
+    },
+    {
+      "epoch": 0.7085755813953488,
+      "grad_norm": 0.09418574720621109,
+      "learning_rate": 3.2287881540697676e-05,
+      "loss": 0.00145876482129097,
+      "step": 7800
+    },
+    {
+      "epoch": 0.7176598837209303,
+      "grad_norm": 0.002359782112762332,
+      "learning_rate": 3.206077398255814e-05,
+      "loss": 0.0015742655098438263,
+      "step": 7900
+    },
+    {
+      "epoch": 0.7267441860465116,
+      "grad_norm": 0.3374776840209961,
+      "learning_rate": 3.1833666424418605e-05,
+      "loss": 0.0007466593384742737,
+      "step": 8000
+    },
+    {
+      "epoch": 0.735828488372093,
+      "grad_norm": 0.000735403154976666,
+      "learning_rate": 3.1606558866279076e-05,
+      "loss": 0.0008171546459197998,
+      "step": 8100
+    },
+    {
+      "epoch": 0.7449127906976745,
+      "grad_norm": 0.01963644102215767,
+      "learning_rate": 3.1379451308139533e-05,
+      "loss": 0.0007363802939653396,
+      "step": 8200
+    },
+    {
+      "epoch": 0.7539970930232558,
+      "grad_norm": 0.09964141249656677,
+      "learning_rate": 3.1152343750000005e-05,
+      "loss": 0.00030826406553387644,
+      "step": 8300
+    },
+    {
+      "epoch": 0.7630813953488372,
+      "grad_norm": 0.0029934593476355076,
+      "learning_rate": 3.092523619186047e-05,
+      "loss": 0.0003284827247262001,
+      "step": 8400
+    },
+    {
+      "epoch": 0.7721656976744186,
+      "grad_norm": 0.002162993187084794,
+      "learning_rate": 3.069812863372093e-05,
+      "loss": 0.0010288888961076737,
+      "step": 8500
+    },
+    {
+      "epoch": 0.78125,
+      "grad_norm": 0.013634397648274899,
+      "learning_rate": 3.0471021075581398e-05,
+      "loss": 0.0006983913481235504,
+      "step": 8600
+    },
+    {
+      "epoch": 0.7903343023255814,
+      "grad_norm": 0.0833001658320427,
+      "learning_rate": 3.0243913517441862e-05,
+      "loss": 0.0009967025369405746,
+      "step": 8700
+    },
+    {
+      "epoch": 0.7994186046511628,
+      "grad_norm": 0.4697379767894745,
+      "learning_rate": 3.0016805959302323e-05,
+      "loss": 0.0007716407626867294,
+      "step": 8800
+    },
+    {
+      "epoch": 0.8085029069767442,
+      "grad_norm": 0.04588587209582329,
+      "learning_rate": 2.978969840116279e-05,
+      "loss": 0.000263803955167532,
+      "step": 8900
+    },
+    {
+      "epoch": 0.8175872093023255,
+      "grad_norm": 0.003420337103307247,
+      "learning_rate": 2.956259084302326e-05,
+      "loss": 0.0009599439054727555,
+      "step": 9000
+    },
+    {
+      "epoch": 0.826671511627907,
+      "grad_norm": 0.0024436134845018387,
+      "learning_rate": 2.9335483284883724e-05,
+      "loss": 0.012270723581314086,
+      "step": 9100
+    },
+    {
+      "epoch": 0.8357558139534884,
+      "grad_norm": 0.001418731757439673,
+      "learning_rate": 2.9108375726744185e-05,
+      "loss": 0.0009713788330554962,
+      "step": 9200
+    },
+    {
+      "epoch": 0.8448401162790697,
+      "grad_norm": 0.0026375153101980686,
+      "learning_rate": 2.8881268168604652e-05,
+      "loss": 0.0007654589414596558,
+      "step": 9300
+    },
+    {
+      "epoch": 0.8539244186046512,
+      "grad_norm": 0.0013760777655988932,
+      "learning_rate": 2.865416061046512e-05,
+      "loss": 0.0006276721507310867,
+      "step": 9400
+    },
+    {
+      "epoch": 0.8630087209302325,
+      "grad_norm": 0.0033582421019673347,
+      "learning_rate": 2.842705305232558e-05,
+      "loss": 0.0003520375117659569,
+      "step": 9500
+    },
+    {
+      "epoch": 0.872093023255814,
+      "grad_norm": 0.0043012769892811775,
+      "learning_rate": 2.819994549418605e-05,
+      "loss": 0.0011182524263858794,
+      "step": 9600
+    },
+    {
+      "epoch": 0.8811773255813954,
+      "grad_norm": 0.0015025343745946884,
+      "learning_rate": 2.7972837936046514e-05,
+      "loss": 0.00044553544372320173,
+      "step": 9700
+    },
+    {
+      "epoch": 0.8902616279069767,
+      "grad_norm": 0.0018241156358271837,
+      "learning_rate": 2.7745730377906975e-05,
+      "loss": 0.00023134740069508553,
+      "step": 9800
+    },
+    {
+      "epoch": 0.8993459302325582,
+      "grad_norm": 0.001082456554286182,
+      "learning_rate": 2.7518622819767442e-05,
+      "loss": 0.001018296480178833,
+      "step": 9900
+    },
+    {
+      "epoch": 0.9084302325581395,
+      "grad_norm": 0.01155087724328041,
+      "learning_rate": 2.729151526162791e-05,
+      "loss": 0.0005589094385504722,
+      "step": 10000
+    },
+    {
+      "epoch": 0.9175145348837209,
+      "grad_norm": 0.0022482366766780615,
+      "learning_rate": 2.7064407703488375e-05,
+      "loss": 0.000512191392481327,
+      "step": 10100
+    },
+    {
+      "epoch": 0.9265988372093024,
+      "grad_norm": 0.20549768209457397,
+      "learning_rate": 2.6837300145348836e-05,
+      "loss": 0.00038118865340948106,
+      "step": 10200
+    },
+    {
+      "epoch": 0.9356831395348837,
+      "grad_norm": 0.0013188497396185994,
+      "learning_rate": 2.6610192587209304e-05,
+      "loss": 0.0003574254736304283,
+      "step": 10300
+    },
+    {
+      "epoch": 0.9447674418604651,
+      "grad_norm": 0.0006103936466388404,
+      "learning_rate": 2.638308502906977e-05,
+      "loss": 0.00024356411769986154,
+      "step": 10400
+    },
+    {
+      "epoch": 0.9538517441860465,
+      "grad_norm": 0.00047453015577048063,
+      "learning_rate": 2.6155977470930233e-05,
+      "loss": 0.0013259868323802948,
+      "step": 10500
+    },
+    {
+      "epoch": 0.9629360465116279,
+      "grad_norm": 0.025760261341929436,
+      "learning_rate": 2.59288699127907e-05,
+      "loss": 0.0006390263140201569,
+      "step": 10600
+    },
+    {
+      "epoch": 0.9720203488372093,
+      "grad_norm": 0.0025282115675508976,
+      "learning_rate": 2.5701762354651165e-05,
+      "loss": 0.001759422868490219,
+      "step": 10700
+    },
+    {
+      "epoch": 0.9811046511627907,
+      "grad_norm": 2.990682601928711,
+      "learning_rate": 2.5474654796511626e-05,
+      "loss": 0.00041438560932874677,
+      "step": 10800
+    },
+    {
+      "epoch": 0.9901889534883721,
+      "grad_norm": 0.0036354295443743467,
+      "learning_rate": 2.5247547238372094e-05,
+      "loss": 0.000467718243598938,
+      "step": 10900
+    },
+    {
+      "epoch": 0.9992732558139535,
+      "grad_norm": 0.02648981101810932,
+      "learning_rate": 2.502043968023256e-05,
+      "loss": 0.0007214382290840149,
+      "step": 11000
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 8.017317304620519e-05,
+      "eval_runtime": 132.6261,
+      "eval_samples_per_second": 165.993,
+      "eval_steps_per_second": 20.75,
+      "step": 11008
+    },
+    {
+      "epoch": 1.008357558139535,
+      "grad_norm": 0.0011333210859447718,
+      "learning_rate": 2.4793332122093026e-05,
+      "loss": 0.000609114095568657,
+      "step": 11100
+    },
+    {
+      "epoch": 1.0174418604651163,
+      "grad_norm": 0.0006669509457424283,
+      "learning_rate": 2.4566224563953487e-05,
+      "loss": 0.000269341841340065,
+      "step": 11200
+    },
+    {
+      "epoch": 1.0265261627906976,
+      "grad_norm": 0.0006085751811042428,
+      "learning_rate": 2.4339117005813955e-05,
+      "loss": 0.00018008088693022728,
+      "step": 11300
+    },
+    {
+      "epoch": 1.035610465116279,
+      "grad_norm": 0.00029874974279664457,
+      "learning_rate": 2.411200944767442e-05,
+      "loss": 8.143479935824871e-05,
+      "step": 11400
+    },
+    {
+      "epoch": 1.0446947674418605,
+      "grad_norm": 0.0020065035205334425,
+      "learning_rate": 2.3884901889534887e-05,
+      "loss": 0.0004405264183878899,
+      "step": 11500
+    },
+    {
+      "epoch": 1.0537790697674418,
+      "grad_norm": 0.3139957785606384,
+      "learning_rate": 2.3657794331395348e-05,
+      "loss": 0.0007688279449939728,
+      "step": 11600
+    },
+    {
+      "epoch": 1.0628633720930232,
+      "grad_norm": 0.014869201928377151,
+      "learning_rate": 2.3430686773255813e-05,
+      "loss": 0.00045145414769649507,
+      "step": 11700
+    },
+    {
+      "epoch": 1.0719476744186047,
+      "grad_norm": 0.03592238947749138,
+      "learning_rate": 2.320357921511628e-05,
+      "loss": 0.0013660797476768493,
+      "step": 11800
+    },
+    {
+      "epoch": 1.081031976744186,
+      "grad_norm": 0.010270297527313232,
+      "learning_rate": 2.2976471656976745e-05,
+      "loss": 0.0005171676725149155,
+      "step": 11900
+    },
+    {
+      "epoch": 1.0901162790697674,
+      "grad_norm": 0.0026885548140853643,
+      "learning_rate": 2.2749364098837213e-05,
+      "loss": 0.00046057451516389846,
+      "step": 12000
+    },
+    {
+      "epoch": 1.099200581395349,
+      "grad_norm": 0.005681134294718504,
+      "learning_rate": 2.2522256540697674e-05,
+      "loss": 0.0006235280632972718,
+      "step": 12100
+    },
+    {
+      "epoch": 1.1082848837209303,
+      "grad_norm": 0.0028001824393868446,
+      "learning_rate": 2.2295148982558138e-05,
+      "loss": 0.0008198145776987075,
+      "step": 12200
+    },
+    {
+      "epoch": 1.1173691860465116,
+      "grad_norm": 0.0006385694723576307,
+      "learning_rate": 2.2068041424418606e-05,
+      "loss": 0.0003722207620739937,
+      "step": 12300
+    },
+    {
+      "epoch": 1.1264534883720931,
+      "grad_norm": 0.0008649047813378274,
+      "learning_rate": 2.184093386627907e-05,
+      "loss": 0.0003394853696227074,
+      "step": 12400
+    },
+    {
+      "epoch": 1.1355377906976745,
+      "grad_norm": 0.004796006251126528,
+      "learning_rate": 2.1613826308139538e-05,
+      "loss": 0.00042502928525209424,
+      "step": 12500
+    },
+    {
+      "epoch": 1.1446220930232558,
+      "grad_norm": 0.001498043886385858,
+      "learning_rate": 2.138671875e-05,
+      "loss": 0.0009064202010631562,
+      "step": 12600
+    },
+    {
+      "epoch": 1.1537063953488371,
+      "grad_norm": 0.01664622500538826,
+      "learning_rate": 2.1159611191860464e-05,
+      "loss": 0.0002878718450665474,
+      "step": 12700
+    },
+    {
+      "epoch": 1.1627906976744187,
+      "grad_norm": 0.010639426298439503,
+      "learning_rate": 2.093250363372093e-05,
+      "loss": 0.0011860337108373642,
+      "step": 12800
+    },
+    {
+      "epoch": 1.171875,
+      "grad_norm": 0.050644513219594955,
+      "learning_rate": 2.0705396075581396e-05,
+      "loss": 0.0006260576844215393,
+      "step": 12900
+    },
+    {
+      "epoch": 1.1809593023255813,
+      "grad_norm": 0.001398528809659183,
+      "learning_rate": 2.0478288517441864e-05,
+      "loss": 0.0001833665184676647,
+      "step": 13000
+    },
+    {
+      "epoch": 1.1900436046511629,
+      "grad_norm": 0.0057320562191307545,
+      "learning_rate": 2.0251180959302325e-05,
+      "loss": 0.00032943256199359896,
+      "step": 13100
+    },
+    {
+      "epoch": 1.1991279069767442,
+      "grad_norm": 0.0002997924748342484,
+      "learning_rate": 2.002407340116279e-05,
+      "loss": 6.249572150409222e-05,
+      "step": 13200
+    },
+    {
+      "epoch": 1.2082122093023255,
+      "grad_norm": 0.0008130258647724986,
+      "learning_rate": 1.9796965843023257e-05,
+      "loss": 0.00026577839627861977,
+      "step": 13300
+    },
+    {
+      "epoch": 1.2172965116279069,
+      "grad_norm": 0.0006040658336132765,
+      "learning_rate": 1.956985828488372e-05,
+      "loss": 0.00023248378187417984,
+      "step": 13400
+    },
+    {
+      "epoch": 1.2263808139534884,
+      "grad_norm": 0.000803643895778805,
+      "learning_rate": 1.934275072674419e-05,
+      "loss": 0.0002730824239552021,
+      "step": 13500
+    },
+    {
+      "epoch": 1.2354651162790697,
+      "grad_norm": 0.0005482266424223781,
+      "learning_rate": 1.911564316860465e-05,
+      "loss": 0.0005848826467990875,
+      "step": 13600
+    },
+    {
+      "epoch": 1.244549418604651,
+      "grad_norm": 0.0034759771078824997,
+      "learning_rate": 1.8888535610465115e-05,
+      "loss": 0.0005462893471121788,
+      "step": 13700
+    },
+    {
+      "epoch": 1.2536337209302326,
+      "grad_norm": 0.0002226918877568096,
+      "learning_rate": 1.8661428052325583e-05,
+      "loss": 7.945694960653782e-05,
+      "step": 13800
+    },
+    {
+      "epoch": 1.262718023255814,
+      "grad_norm": 0.002374310279265046,
+      "learning_rate": 1.8434320494186047e-05,
+      "loss": 0.000744214728474617,
+      "step": 13900
+    },
+    {
+      "epoch": 1.2718023255813953,
+      "grad_norm": 0.0009076696587726474,
+      "learning_rate": 1.8207212936046515e-05,
+      "loss": 0.0003250580281019211,
+      "step": 14000
+    },
+    {
+      "epoch": 1.2808866279069768,
+      "grad_norm": 0.010181100107729435,
+      "learning_rate": 1.7980105377906976e-05,
+      "loss": 0.0005765938758850097,
+      "step": 14100
+    },
+    {
+      "epoch": 1.2899709302325582,
+      "grad_norm": 0.0012442917795851827,
+      "learning_rate": 1.775299781976744e-05,
+      "loss": 0.00032320961356163024,
+      "step": 14200
+    },
+    {
+      "epoch": 1.2990552325581395,
+      "grad_norm": 0.0005617383285425603,
+      "learning_rate": 1.752589026162791e-05,
+      "loss": 0.0001776321791112423,
+      "step": 14300
+    },
+    {
+      "epoch": 1.308139534883721,
+      "grad_norm": 0.0010064981179311872,
+      "learning_rate": 1.7298782703488373e-05,
+      "loss": 0.00012774170376360415,
+      "step": 14400
+    },
+    {
+      "epoch": 1.3172238372093024,
+      "grad_norm": 0.00039220438338816166,
+      "learning_rate": 1.707167514534884e-05,
+      "loss": 0.0006190959364175796,
+      "step": 14500
+    },
+    {
+      "epoch": 1.3263081395348837,
+      "grad_norm": 0.0009137202869169414,
+      "learning_rate": 1.6844567587209302e-05,
+      "loss": 0.0007051125913858414,
+      "step": 14600
+    },
+    {
+      "epoch": 1.3353924418604652,
+      "grad_norm": 0.0006007241318002343,
+      "learning_rate": 1.6617460029069766e-05,
+      "loss": 7.074063178151846e-05,
+      "step": 14700
+    },
+    {
+      "epoch": 1.3444767441860466,
+      "grad_norm": 0.0006039150175638497,
+      "learning_rate": 1.6390352470930234e-05,
+      "loss": 0.000489293597638607,
+      "step": 14800
+    },
+    {
+      "epoch": 1.353561046511628,
+      "grad_norm": 0.0014955669175833464,
+      "learning_rate": 1.61632449127907e-05,
+      "loss": 0.0004155828058719635,
+      "step": 14900
+    },
+    {
+      "epoch": 1.3626453488372092,
+      "grad_norm": 0.1508745402097702,
+      "learning_rate": 1.5936137354651163e-05,
+      "loss": 0.00024092141538858413,
+      "step": 15000
+    },
+    {
+      "epoch": 1.3717296511627908,
+      "grad_norm": 0.0008113393560051918,
+      "learning_rate": 1.5709029796511627e-05,
+      "loss": 0.00040562458336353304,
+      "step": 15100
+    },
+    {
+      "epoch": 1.380813953488372,
+      "grad_norm": 0.0026459796354174614,
+      "learning_rate": 1.5481922238372092e-05,
+      "loss": 0.0007934534549713135,
+      "step": 15200
+    },
+    {
+      "epoch": 1.3898982558139534,
+      "grad_norm": 0.0005005365237593651,
+      "learning_rate": 1.525481468023256e-05,
+      "loss": 0.00023073634132742883,
+      "step": 15300
+    },
+    {
+      "epoch": 1.3989825581395348,
+      "grad_norm": 0.02727348543703556,
+      "learning_rate": 1.5027707122093024e-05,
+      "loss": 0.0003524136170744896,
+      "step": 15400
+    },
+    {
+      "epoch": 1.4080668604651163,
+      "grad_norm": 0.0002921252744272351,
+      "learning_rate": 1.4800599563953488e-05,
+      "loss": 0.000525415763258934,
+      "step": 15500
+    },
+    {
+      "epoch": 1.4171511627906976,
+      "grad_norm": 0.0016339289722964168,
+      "learning_rate": 1.4573492005813955e-05,
+      "loss": 0.0003766526654362678,
+      "step": 15600
+    },
+    {
+      "epoch": 1.426235465116279,
+      "grad_norm": 0.003485665889456868,
+      "learning_rate": 1.4346384447674419e-05,
+      "loss": 0.00021764757111668586,
+      "step": 15700
+    },
+    {
+      "epoch": 1.4353197674418605,
+      "grad_norm": 0.0011686257785186172,
+      "learning_rate": 1.4119276889534885e-05,
+      "loss": 0.00026009151712059976,
+      "step": 15800
+    },
+    {
+      "epoch": 1.4444040697674418,
+      "grad_norm": 0.00034656753996387124,
+      "learning_rate": 1.389216933139535e-05,
+      "loss": 0.0003694407269358635,
+      "step": 15900
+    },
+    {
+      "epoch": 1.4534883720930232,
+      "grad_norm": 0.0023575718514621258,
+      "learning_rate": 1.3665061773255814e-05,
+      "loss": 0.0001483263447880745,
+      "step": 16000
+    },
+    {
+      "epoch": 1.4625726744186047,
+      "grad_norm": 0.0011886970605701208,
+      "learning_rate": 1.343795421511628e-05,
+      "loss": 0.0004465998336672783,
+      "step": 16100
+    },
+    {
+      "epoch": 1.471656976744186,
+      "grad_norm": 0.00039876584196463227,
+      "learning_rate": 1.3210846656976745e-05,
+      "loss": 0.00015990335494279863,
+      "step": 16200
+    },
+    {
+      "epoch": 1.4807412790697674,
+      "grad_norm": 0.00023756893642712384,
+      "learning_rate": 1.298373909883721e-05,
+      "loss": 5.0160493701696394e-05,
+      "step": 16300
+    },
+    {
+      "epoch": 1.489825581395349,
+      "grad_norm": 0.0002809664292726666,
+      "learning_rate": 1.2756631540697675e-05,
+      "loss": 0.0001377291791141033,
+      "step": 16400
+    },
+    {
+      "epoch": 1.4989098837209303,
+      "grad_norm": 0.00037873705150559545,
+      "learning_rate": 1.252952398255814e-05,
+      "loss": 7.196901366114616e-05,
+      "step": 16500
+    },
+    {
+      "epoch": 1.5079941860465116,
+      "grad_norm": 1.4199703931808472,
+      "learning_rate": 1.2302416424418606e-05,
+      "loss": 0.0005905186012387276,
+      "step": 16600
+    },
+    {
+      "epoch": 1.5170784883720931,
+      "grad_norm": 0.0003422704176045954,
+      "learning_rate": 1.207530886627907e-05,
+      "loss": 0.00013428066857159137,
+      "step": 16700
+    },
+    {
+      "epoch": 1.5261627906976745,
+      "grad_norm": 0.0010882618371397257,
+      "learning_rate": 1.1848201308139535e-05,
+      "loss": 0.00020068021491169929,
+      "step": 16800
+    },
+    {
+      "epoch": 1.5352470930232558,
+      "grad_norm": 0.024530770257115364,
+      "learning_rate": 1.162109375e-05,
+      "loss": 0.0004120354726910591,
+      "step": 16900
+    },
+    {
+      "epoch": 1.5443313953488373,
+      "grad_norm": 0.0006738721276633441,
+      "learning_rate": 1.1393986191860465e-05,
+      "loss": 0.0002767092920839787,
+      "step": 17000
+    },
+    {
+      "epoch": 1.5534156976744184,
+      "grad_norm": 0.00019564498506952077,
+      "learning_rate": 1.1166878633720931e-05,
+      "loss": 0.00021817052736878396,
+      "step": 17100
+    },
+    {
+      "epoch": 1.5625,
+      "grad_norm": 0.0002832361788023263,
+      "learning_rate": 1.0939771075581396e-05,
+      "loss": 5.168498028069734e-05,
+      "step": 17200
+    },
+    {
+      "epoch": 1.5715843023255816,
+      "grad_norm": 0.0005729420809075236,
+      "learning_rate": 1.071266351744186e-05,
+      "loss": 0.00016681572422385215,
+      "step": 17300
+    },
+    {
+      "epoch": 1.5806686046511627,
+      "grad_norm": 0.0007383101619780064,
+      "learning_rate": 1.0485555959302326e-05,
+      "loss": 0.0003441007435321808,
+      "step": 17400
+    },
+    {
+      "epoch": 1.5897529069767442,
+      "grad_norm": 0.005897729191929102,
+      "learning_rate": 1.0258448401162791e-05,
+      "loss": 0.0003161391615867615,
+      "step": 17500
+    },
+    {
+      "epoch": 1.5988372093023255,
+      "grad_norm": 0.00025281202397309244,
+      "learning_rate": 1.0031340843023257e-05,
+      "loss": 0.00011429931037127971,
+      "step": 17600
+    },
+    {
+      "epoch": 1.6079215116279069,
+      "grad_norm": 0.0004013874859083444,
+      "learning_rate": 9.804233284883721e-06,
+      "loss": 2.793062012642622e-05,
+      "step": 17700
+    },
+    {
+      "epoch": 1.6170058139534884,
+      "grad_norm": 0.008875502273440361,
+      "learning_rate": 9.577125726744186e-06,
+      "loss": 5.882895085960627e-05,
+      "step": 17800
+    },
+    {
+      "epoch": 1.6260901162790697,
+      "grad_norm": 0.001823420519940555,
+      "learning_rate": 9.350018168604652e-06,
+      "loss": 0.0002583874017000198,
+      "step": 17900
+    },
+    {
+      "epoch": 1.635174418604651,
+      "grad_norm": 0.6227073073387146,
+      "learning_rate": 9.122910610465116e-06,
+      "loss": 0.0007115737348794937,
+      "step": 18000
+    },
+    {
+      "epoch": 1.6442587209302326,
+      "grad_norm": 0.0017345056403428316,
+      "learning_rate": 8.895803052325581e-06,
+      "loss": 0.00027986690402030944,
+      "step": 18100
+    },
+    {
+      "epoch": 1.653343023255814,
+      "grad_norm": 0.0009033643291331828,
+      "learning_rate": 8.668695494186047e-06,
+      "loss": 0.0002979452162981033,
+      "step": 18200
+    },
+    {
+      "epoch": 1.6624273255813953,
+      "grad_norm": 0.00028923453646712005,
+      "learning_rate": 8.441587936046511e-06,
+      "loss": 1.926603843457997e-05,
+      "step": 18300
+    },
+    {
+      "epoch": 1.6715116279069768,
+      "grad_norm": 0.0006103311898186803,
+      "learning_rate": 8.214480377906978e-06,
+      "loss": 6.331724114716053e-05,
+      "step": 18400
+    },
+    {
+      "epoch": 1.6805959302325582,
+      "grad_norm": 0.0002818437642417848,
+      "learning_rate": 7.987372819767442e-06,
+      "loss": 0.00014105773530900478,
+      "step": 18500
+    },
+    {
+      "epoch": 1.6896802325581395,
+      "grad_norm": 0.0011020454112440348,
+      "learning_rate": 7.760265261627907e-06,
+      "loss": 0.0002292773686349392,
+      "step": 18600
+    },
+    {
+      "epoch": 1.698764534883721,
+      "grad_norm": 0.001500884653069079,
+      "learning_rate": 7.533157703488372e-06,
+      "loss": 0.00011514685116708279,
+      "step": 18700
+    },
+    {
+      "epoch": 1.7078488372093024,
+      "grad_norm": 0.00016000888717826456,
+      "learning_rate": 7.306050145348838e-06,
+      "loss": 2.0523781422525645e-05,
+      "step": 18800
+    },
+    {
+      "epoch": 1.7169331395348837,
+      "grad_norm": 0.003748674876987934,
+      "learning_rate": 7.078942587209303e-06,
+      "loss": 0.00043839264661073686,
+      "step": 18900
+    },
+    {
+      "epoch": 1.7260174418604652,
+      "grad_norm": 0.06772974133491516,
+      "learning_rate": 6.8518350290697685e-06,
+      "loss": 2.126413397490978e-05,
+      "step": 19000
+    },
+    {
+      "epoch": 1.7351017441860463,
+      "grad_norm": 0.0004789210797753185,
+      "learning_rate": 6.624727470930232e-06,
+      "loss": 0.00020929597318172455,
+      "step": 19100
+    },
+    {
+      "epoch": 1.744186046511628,
+      "grad_norm": 7.055519381538033e-05,
+      "learning_rate": 6.397619912790697e-06,
+      "loss": 0.00032072752714157104,
+      "step": 19200
+    },
+    {
+      "epoch": 1.7532703488372094,
+      "grad_norm": 0.00010641128028510138,
+      "learning_rate": 6.1705123546511635e-06,
+      "loss": 8.132393471896649e-05,
+      "step": 19300
+    },
+    {
+      "epoch": 1.7623546511627906,
+      "grad_norm": 0.0005615473492071033,
+      "learning_rate": 5.943404796511629e-06,
+      "loss": 9.162256610579788e-06,
+      "step": 19400
+    },
+    {
+      "epoch": 1.771438953488372,
+      "grad_norm": 0.004505404736846685,
+      "learning_rate": 5.716297238372093e-06,
+      "loss": 9.514865465462207e-05,
+      "step": 19500
+    },
+    {
+      "epoch": 1.7805232558139537,
+      "grad_norm": 0.0011890050955116749,
+      "learning_rate": 5.4891896802325586e-06,
+      "loss": 3.7443286273628474e-05,
+      "step": 19600
+    },
+    {
+      "epoch": 1.7896075581395348,
+      "grad_norm": 0.000441042153397575,
+      "learning_rate": 5.262082122093023e-06,
+      "loss": 5.087008234113455e-05,
+      "step": 19700
+    },
+    {
+      "epoch": 1.7986918604651163,
+      "grad_norm": 0.0001777316356310621,
+      "learning_rate": 5.034974563953489e-06,
+      "loss": 0.00017192648723721505,
+      "step": 19800
+    },
+    {
+      "epoch": 1.8077761627906976,
+      "grad_norm": 0.00015439293929375708,
+      "learning_rate": 4.8078670058139536e-06,
+      "loss": 0.0001058769691735506,
+      "step": 19900
+    },
+    {
+      "epoch": 1.816860465116279,
+      "grad_norm": 0.0016349812503904104,
+      "learning_rate": 4.580759447674419e-06,
+      "loss": 1.0620863176882267e-05,
+      "step": 20000
+    },
+    {
+      "epoch": 1.8259447674418605,
+      "grad_norm": 9.556530858390033e-05,
+      "learning_rate": 4.353651889534884e-06,
+      "loss": 0.00019107908010482789,
+      "step": 20100
+    },
+    {
+      "epoch": 1.8350290697674418,
+      "grad_norm": 0.0002568990457803011,
+      "learning_rate": 4.126544331395349e-06,
+      "loss": 0.00026563439518213273,
+      "step": 20200
+    },
+    {
+      "epoch": 1.8441133720930232,
+      "grad_norm": 0.5719628930091858,
+      "learning_rate": 3.899436773255814e-06,
+      "loss": 5.403281655162573e-05,
+      "step": 20300
+    },
+    {
+      "epoch": 1.8531976744186047,
+      "grad_norm": 0.017383404076099396,
+      "learning_rate": 3.672329215116279e-06,
+      "loss": 0.00036652404814958573,
+      "step": 20400
+    },
+    {
+      "epoch": 1.862281976744186,
+      "grad_norm": 0.00022910887491889298,
+      "learning_rate": 3.4452216569767445e-06,
+      "loss": 0.000403200164437294,
+      "step": 20500
+    },
+    {
+      "epoch": 1.8713662790697674,
+      "grad_norm": 0.00033295468892902136,
+      "learning_rate": 3.2181140988372097e-06,
+      "loss": 8.646129630506039e-05,
+      "step": 20600
+    },
+    {
+      "epoch": 1.880450581395349,
+      "grad_norm": 0.00012523184705059975,
+      "learning_rate": 2.9910065406976746e-06,
+      "loss": 0.00016864996403455733,
+      "step": 20700
+    },
+    {
+      "epoch": 1.8895348837209303,
+      "grad_norm": 0.00022476979938801378,
+      "learning_rate": 2.7638989825581395e-06,
+      "loss": 5.167209077626467e-05,
+      "step": 20800
+    },
+    {
+      "epoch": 1.8986191860465116,
+      "grad_norm": 0.0002389108412899077,
+      "learning_rate": 2.5367914244186048e-06,
+      "loss": 6.591790355741977e-05,
+      "step": 20900
+    },
+    {
+      "epoch": 1.9077034883720931,
+      "grad_norm": 0.0013002109481021762,
+      "learning_rate": 2.30968386627907e-06,
+      "loss": 0.00010993644595146179,
+      "step": 21000
+    },
+    {
+      "epoch": 1.9167877906976745,
+      "grad_norm": 0.13188259303569794,
+      "learning_rate": 2.082576308139535e-06,
+      "loss": 0.00014082306995987892,
+      "step": 21100
+    },
+    {
+      "epoch": 1.9258720930232558,
+      "grad_norm": 0.0010737127158790827,
+      "learning_rate": 1.85546875e-06,
+      "loss": 0.00030819986015558244,
+      "step": 21200
+    },
+    {
+      "epoch": 1.9349563953488373,
+      "grad_norm": 7.047707185847685e-05,
+      "learning_rate": 1.628361191860465e-06,
+      "loss": 2.0567586179822683e-05,
+      "step": 21300
+    },
+    {
+      "epoch": 1.9440406976744184,
+      "grad_norm": 0.0001350079692201689,
+      "learning_rate": 1.4012536337209304e-06,
+      "loss": 7.989832083694636e-06,
+      "step": 21400
+    },
+    {
+      "epoch": 1.953125,
+      "grad_norm": 0.0008243785705417395,
+      "learning_rate": 1.1741460755813954e-06,
+      "loss": 0.0002570002153515816,
+      "step": 21500
+    },
+    {
+      "epoch": 1.9622093023255816,
+      "grad_norm": 0.001545518171042204,
+      "learning_rate": 9.470385174418604e-07,
+      "loss": 2.7221005875617266e-05,
+      "step": 21600
+    },
+    {
+      "epoch": 1.9712936046511627,
+      "grad_norm": 5.5686323321424425e-05,
+      "learning_rate": 7.199309593023256e-07,
+      "loss": 0.00010406752116978168,
+      "step": 21700
+    },
+    {
+      "epoch": 1.9803779069767442,
+      "grad_norm": 5.525942106032744e-05,
+      "learning_rate": 4.928234011627908e-07,
+      "loss": 3.853037022054195e-05,
+      "step": 21800
+    },
+    {
+      "epoch": 1.9894622093023255,
+      "grad_norm": 0.0002285480877617374,
+      "learning_rate": 2.657158430232558e-07,
+      "loss": 0.00048015639185905457,
+      "step": 21900
+    },
+    {
+      "epoch": 1.9985465116279069,
+      "grad_norm": 8.369733404833823e-05,
+      "learning_rate": 3.860828488372093e-08,
+      "loss": 6.105023785494268e-06,
+      "step": 22000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 22016,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.342304325992448e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

models/dostoievsky_v1/checkpoint-22016/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:edab2539a81b65c2a76c4c39bed219aa02358f44f3286429f83165442a1b53fa
+size 5329

models/dostoievsky_v1/final_model/config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "activation_dropout": 0.1,
+  "activation_function": "gelu",
+  "add_bias_logits": false,
+  "add_final_layer_norm": false,
+  "architectures": [
+    "BartForConditionalGeneration"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 0,
+  "classif_dropout": 0.1,
+  "classifier_dropout": 0.0,
+  "d_model": 768,
+  "decoder_attention_heads": 12,
+  "decoder_ffn_dim": 3072,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 6,
+  "decoder_start_token_id": 2,
+  "dropout": 0.1,
+  "dtype": "float32",
+  "encoder_attention_heads": 12,
+  "encoder_ffn_dim": 3072,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 6,
+  "eos_token_id": 2,
+  "forced_eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "init_std": 0.02,
+  "is_decoder": false,
+  "is_encoder_decoder": true,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "max_position_embeddings": 1024,
+  "model_type": "bart",
+  "normalize_before": false,
+  "normalize_embedding": true,
+  "pad_token_id": 1,
+  "scale_embedding": false,
+  "task_specific_params": {
+    "summarization": {
+      "length_penalty": 1.0,
+      "max_length": 128,
+      "min_length": 12,
+      "num_beams": 4
+    },
+    "summarization_cnn": {
+      "length_penalty": 2.0,
+      "max_length": 142,
+      "min_length": 56,
+      "num_beams": 4
+    },
+    "summarization_xsum": {
+      "length_penalty": 1.0,
+      "max_length": 62,
+      "min_length": 11,
+      "num_beams": 6
+    }
+  },
+  "tie_word_embeddings": true,
+  "transformers_version": "5.5.0",
+  "use_cache": false,
+  "vocab_size": 50265
+}

models/dostoievsky_v1/final_model/generation_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "decoder_start_token_id": 2,
+  "early_stopping": true,
+  "eos_token_id": [
+    2
+  ],
+  "forced_bos_token_id": 0,
+  "forced_eos_token_id": 2,
+  "no_repeat_ngram_size": 3,
+  "num_beams": 4,
+  "pad_token_id": 1,
+  "transformers_version": "5.5.0",
+  "use_cache": true
+}

models/dostoievsky_v1/final_model/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d9dc9ffae85920773b2b5774f37b9630b197fc455008db011e6288a451870cb8
+size 557912620

models/dostoievsky_v1/final_model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

models/dostoievsky_v1/final_model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": "<unk>"
+}

models/dostoievsky_v1/final_model/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:edab2539a81b65c2a76c4c39bed219aa02358f44f3286429f83165442a1b53fa
+size 5329