upload checkpoints

Browse files

Files changed (13) hide show

fourier-spectral-norm-classifier/checkpoint-1500/config.json +29 -0
fourier-spectral-norm-classifier/checkpoint-1500/config_hyperparams.json +82 -0
fourier-spectral-norm-classifier/checkpoint-1500/merges.txt +0 -0
fourier-spectral-norm-classifier/checkpoint-1500/model.safetensors +3 -0
fourier-spectral-norm-classifier/checkpoint-1500/optimizer.pt +3 -0
fourier-spectral-norm-classifier/checkpoint-1500/rng_state.pth +3 -0
fourier-spectral-norm-classifier/checkpoint-1500/scheduler.pt +3 -0
fourier-spectral-norm-classifier/checkpoint-1500/special_tokens_map.json +51 -0
fourier-spectral-norm-classifier/checkpoint-1500/tokenizer.json +0 -0
fourier-spectral-norm-classifier/checkpoint-1500/tokenizer_config.json +58 -0
fourier-spectral-norm-classifier/checkpoint-1500/train_config.yaml +55 -0
fourier-spectral-norm-classifier/checkpoint-1500/trainer_state.json +2184 -0
fourier-spectral-norm-classifier/checkpoint-1500/vocab.json +0 -0

fourier-spectral-norm-classifier/checkpoint-1500/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "architectures": [
+    "RobertaForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.3,
+  "bos_token_id": 0,
+  "classifier_dropout": 0.4,
+  "dtype": "float32",
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.3,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "transformers_version": "4.56.0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

fourier-spectral-norm-classifier/checkpoint-1500/config_hyperparams.json ADDED Viewed

	@@ -0,0 +1,82 @@

+{
+  "train_config": {
+    "TEST": false,
+    "model_name": "/kaggle/input/models/dzung271828/microsoft-graphcodebert-base/transformers/default/1",
+    "output_dir": "training/fourier-spectral-norm-classifier/",
+    "num_epochs": 5,
+    "max_steps": -1,
+    "batch_size": 512,
+    "learning_rate": 1e-06,
+    "max_length": 512,
+    "num_labels": 2,
+    "use_wandb": false,
+    "freeze_base": true,
+    "loss_type": "ce",
+    "focal_alpha": 1.0,
+    "focal_gamma": 2.0,
+    "r_drop_alpha": 6.0,
+    "infonce_temperature": 0.07,
+    "infonce_weight": 0.5,
+    "seed": 42,
+    "resume_from_checkpoint": null,
+    "save_steps": 500,
+    "eval_steps": 500,
+    "logging_steps": 5,
+    "label_smoothing": 0.5,
+    "adversarial_epsilon": 0.5,
+    "use_swa": true,
+    "swa_start_epoch": 0,
+    "swa_lr": 1e-06,
+    "data_augmentation": true,
+    "aug_rename_prob": 0.7,
+    "aug_format_prob": 0.7,
+    "weight_decay": 0.1,
+    "mixup_alpha": 1.0,
+    "low_pass_keep_ratio": 0.5,
+    "freq_consistency_weight": 0.2,
+    "use_mixcode": true,
+    "use_fgm": true,
+    "fgm_freq": 5,
+    "use_r_drop": true,
+    "use_freq_consistency_loss": true,
+    "use_attn_spectral": false,
+    "attn_spectral_weight": 0.1,
+    "attn_spectral_cutoff_ratio": 0.25,
+    "hidden_dropout_prob": 0.3,
+    "attention_probs_dropout_prob": 0.3,
+    "classifier_dropout": 0.4,
+    "device": "cuda",
+    "torch_compile": true,
+    "cache_dir": "./tokenized_cache",
+    "use_swa_actual": true,
+    "use_fgm_actual": true,
+    "use_r_drop_actual": true,
+    "use_mixcode_actual": true,
+    "use_attn_spectral_actual": false,
+    "use_freq_consistency_loss_actual": true,
+    "use_spectral_norm": true
+  },
+  "training_arguments": {
+    "output_dir": "training/fourier-spectral-norm-classifier/",
+    "num_train_epochs": 5,
+    "per_device_train_batch_size": 512,
+    "per_device_eval_batch_size": 1024,
+    "learning_rate": 1e-06,
+    "warmup_steps": 488,
+    "weight_decay": 0.1,
+    "logging_steps": 5,
+    "eval_steps": 500,
+    "save_steps": 500,
+    "metric_for_best_model": "macro_f1",
+    "greater_is_better": true,
+    "save_total_limit": 5,
+    "fp16": false,
+    "seed": 42
+  },
+  "training_state": {
+    "global_step": 1500,
+    "epoch": 1.5353121801432958,
+    "best_metric": 0.6724504812400831,
+    "best_model_checkpoint": "training/fourier-spectral-norm-classifier/checkpoint-1000"
+  }
+}

fourier-spectral-norm-classifier/checkpoint-1500/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

fourier-spectral-norm-classifier/checkpoint-1500/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e4bc60c5b8f6a36e32e0d39d6e8298433aad8375eb96017225baf2ef95e07ce
+size 498619200

fourier-spectral-norm-classifier/checkpoint-1500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a83a04a97cc63d010ee0194538f8c07493223183d14eab2d89cd2b4303321ee2
+size 4741923

fourier-spectral-norm-classifier/checkpoint-1500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5909ab7c9c269b6f87c8f78a629bf9a4d31ba187687dd61dda9c2b3eec7a4b6
+size 14645

fourier-spectral-norm-classifier/checkpoint-1500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b851d3befbd44b60717bbc46c57e884e1a9f534520ed08499a8654800370d4cd
+size 1465

fourier-spectral-norm-classifier/checkpoint-1500/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

fourier-spectral-norm-classifier/checkpoint-1500/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

fourier-spectral-norm-classifier/checkpoint-1500/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50264": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": "<unk>"
+}

fourier-spectral-norm-classifier/checkpoint-1500/train_config.yaml ADDED Viewed

	@@ -0,0 +1,55 @@

+TEST: false
+model_name: /kaggle/input/models/dzung271828/microsoft-graphcodebert-base/transformers/default/1
+output_dir: training/fourier-spectral-norm-classifier/
+num_epochs: 5
+max_steps: -1
+batch_size: 512
+learning_rate: 1.0e-06
+max_length: 512
+num_labels: 2
+use_wandb: false
+freeze_base: true
+loss_type: ce
+focal_alpha: 1.0
+focal_gamma: 2.0
+r_drop_alpha: 6.0
+infonce_temperature: 0.07
+infonce_weight: 0.5
+seed: 42
+resume_from_checkpoint: null
+save_steps: 500
+eval_steps: 500
+logging_steps: 5
+label_smoothing: 0.5
+adversarial_epsilon: 0.5
+use_swa: true
+swa_start_epoch: 0
+swa_lr: 1.0e-06
+data_augmentation: true
+aug_rename_prob: 0.7
+aug_format_prob: 0.7
+weight_decay: 0.1
+mixup_alpha: 1.0
+low_pass_keep_ratio: 0.5
+freq_consistency_weight: 0.2
+use_mixcode: true
+use_fgm: true
+fgm_freq: 5
+use_r_drop: true
+use_freq_consistency_loss: true
+use_attn_spectral: false
+attn_spectral_weight: 0.1
+attn_spectral_cutoff_ratio: 0.25
+hidden_dropout_prob: 0.3
+attention_probs_dropout_prob: 0.3
+classifier_dropout: 0.4
+device: cuda
+torch_compile: true
+cache_dir: ./tokenized_cache
+use_swa_actual: true
+use_fgm_actual: true
+use_r_drop_actual: true
+use_mixcode_actual: true
+use_attn_spectral_actual: false
+use_freq_consistency_loss_actual: true
+use_spectral_norm: true

fourier-spectral-norm-classifier/checkpoint-1500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2184 @@

+{
+  "best_global_step": 1000,
+  "best_metric": 0.6724504812400831,
+  "best_model_checkpoint": "training/fourier-spectral-norm-classifier/checkpoint-1000",
+  "epoch": 1.5353121801432958,
+  "eval_steps": 500,
+  "global_step": 1500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "SWA": "started",
+      "epoch": 0,
+      "step": 0
+    },
+    {
+      "epoch": 0.00511770726714432,
+      "grad_norm": 1.7937116622924805,
+      "learning_rate": 8.19672131147541e-09,
+      "loss": 0.8149,
+      "step": 5
+    },
+    {
+      "epoch": 0.01023541453428864,
+      "grad_norm": 1.8986879587173462,
+      "learning_rate": 1.844262295081967e-08,
+      "loss": 0.8145,
+      "step": 10
+    },
+    {
+      "epoch": 0.015353121801432957,
+      "grad_norm": 1.8692522048950195,
+      "learning_rate": 2.8688524590163933e-08,
+      "loss": 0.8031,
+      "step": 15
+    },
+    {
+      "epoch": 0.02047082906857728,
+      "grad_norm": 1.6589646339416504,
+      "learning_rate": 3.8934426229508196e-08,
+      "loss": 0.8208,
+      "step": 20
+    },
+    {
+      "epoch": 0.0255885363357216,
+      "grad_norm": 2.377978563308716,
+      "learning_rate": 4.918032786885246e-08,
+      "loss": 0.8054,
+      "step": 25
+    },
+    {
+      "epoch": 0.030706243602865915,
+      "grad_norm": 2.000364065170288,
+      "learning_rate": 5.9426229508196716e-08,
+      "loss": 0.8064,
+      "step": 30
+    },
+    {
+      "epoch": 0.03582395087001024,
+      "grad_norm": 1.8844542503356934,
+      "learning_rate": 6.967213114754098e-08,
+      "loss": 0.8047,
+      "step": 35
+    },
+    {
+      "epoch": 0.04094165813715456,
+      "grad_norm": 2.0933573246002197,
+      "learning_rate": 7.991803278688524e-08,
+      "loss": 0.8156,
+      "step": 40
+    },
+    {
+      "epoch": 0.04605936540429888,
+      "grad_norm": 1.8126033544540405,
+      "learning_rate": 9.01639344262295e-08,
+      "loss": 0.8074,
+      "step": 45
+    },
+    {
+      "epoch": 0.0511770726714432,
+      "grad_norm": 2.5709195137023926,
+      "learning_rate": 1.0040983606557377e-07,
+      "loss": 0.8124,
+      "step": 50
+    },
+    {
+      "epoch": 0.05629477993858751,
+      "grad_norm": 2.1875293254852295,
+      "learning_rate": 1.1065573770491803e-07,
+      "loss": 0.8143,
+      "step": 55
+    },
+    {
+      "epoch": 0.06141248720573183,
+      "grad_norm": 2.0810351371765137,
+      "learning_rate": 1.209016393442623e-07,
+      "loss": 0.8149,
+      "step": 60
+    },
+    {
+      "epoch": 0.06653019447287616,
+      "grad_norm": 1.7912037372589111,
+      "learning_rate": 1.3114754098360656e-07,
+      "loss": 0.8022,
+      "step": 65
+    },
+    {
+      "epoch": 0.07164790174002048,
+      "grad_norm": 1.7301534414291382,
+      "learning_rate": 1.413934426229508e-07,
+      "loss": 0.8149,
+      "step": 70
+    },
+    {
+      "epoch": 0.0767656090071648,
+      "grad_norm": 1.9520158767700195,
+      "learning_rate": 1.5163934426229508e-07,
+      "loss": 0.8201,
+      "step": 75
+    },
+    {
+      "epoch": 0.08188331627430911,
+      "grad_norm": 2.11938214302063,
+      "learning_rate": 1.6188524590163935e-07,
+      "loss": 0.8079,
+      "step": 80
+    },
+    {
+      "epoch": 0.08700102354145343,
+      "grad_norm": 2.1483607292175293,
+      "learning_rate": 1.7213114754098358e-07,
+      "loss": 0.8084,
+      "step": 85
+    },
+    {
+      "epoch": 0.09211873080859775,
+      "grad_norm": 2.1716372966766357,
+      "learning_rate": 1.8237704918032787e-07,
+      "loss": 0.8188,
+      "step": 90
+    },
+    {
+      "epoch": 0.09723643807574207,
+      "grad_norm": 2.3327996730804443,
+      "learning_rate": 1.926229508196721e-07,
+      "loss": 0.8153,
+      "step": 95
+    },
+    {
+      "epoch": 0.1023541453428864,
+      "grad_norm": 1.762168526649475,
+      "learning_rate": 2.028688524590164e-07,
+      "loss": 0.8064,
+      "step": 100
+    },
+    {
+      "epoch": 0.10747185261003071,
+      "grad_norm": 1.7200757265090942,
+      "learning_rate": 2.1311475409836064e-07,
+      "loss": 0.8063,
+      "step": 105
+    },
+    {
+      "epoch": 0.11258955987717502,
+      "grad_norm": 2.490513324737549,
+      "learning_rate": 2.233606557377049e-07,
+      "loss": 0.8192,
+      "step": 110
+    },
+    {
+      "epoch": 0.11770726714431934,
+      "grad_norm": 2.244020938873291,
+      "learning_rate": 2.336065573770492e-07,
+      "loss": 0.8153,
+      "step": 115
+    },
+    {
+      "epoch": 0.12282497441146366,
+      "grad_norm": 2.1315150260925293,
+      "learning_rate": 2.438524590163934e-07,
+      "loss": 0.807,
+      "step": 120
+    },
+    {
+      "epoch": 0.12794268167860798,
+      "grad_norm": 2.320936918258667,
+      "learning_rate": 2.540983606557377e-07,
+      "loss": 0.8163,
+      "step": 125
+    },
+    {
+      "epoch": 0.1330603889457523,
+      "grad_norm": 2.7143912315368652,
+      "learning_rate": 2.643442622950819e-07,
+      "loss": 0.8166,
+      "step": 130
+    },
+    {
+      "epoch": 0.13817809621289662,
+      "grad_norm": 1.649880290031433,
+      "learning_rate": 2.7459016393442624e-07,
+      "loss": 0.8113,
+      "step": 135
+    },
+    {
+      "epoch": 0.14329580348004095,
+      "grad_norm": 2.171790361404419,
+      "learning_rate": 2.848360655737705e-07,
+      "loss": 0.805,
+      "step": 140
+    },
+    {
+      "epoch": 0.14841351074718526,
+      "grad_norm": 2.093440294265747,
+      "learning_rate": 2.950819672131147e-07,
+      "loss": 0.8118,
+      "step": 145
+    },
+    {
+      "epoch": 0.1535312180143296,
+      "grad_norm": 1.9067059755325317,
+      "learning_rate": 3.05327868852459e-07,
+      "loss": 0.8047,
+      "step": 150
+    },
+    {
+      "epoch": 0.1586489252814739,
+      "grad_norm": 1.9988980293273926,
+      "learning_rate": 3.155737704918033e-07,
+      "loss": 0.8091,
+      "step": 155
+    },
+    {
+      "epoch": 0.16376663254861823,
+      "grad_norm": 1.696977972984314,
+      "learning_rate": 3.258196721311475e-07,
+      "loss": 0.8101,
+      "step": 160
+    },
+    {
+      "epoch": 0.16888433981576254,
+      "grad_norm": 2.098017454147339,
+      "learning_rate": 3.3606557377049177e-07,
+      "loss": 0.81,
+      "step": 165
+    },
+    {
+      "epoch": 0.17400204708290687,
+      "grad_norm": 2.0255584716796875,
+      "learning_rate": 3.463114754098361e-07,
+      "loss": 0.814,
+      "step": 170
+    },
+    {
+      "epoch": 0.17911975435005117,
+      "grad_norm": 1.8376339673995972,
+      "learning_rate": 3.565573770491803e-07,
+      "loss": 0.8053,
+      "step": 175
+    },
+    {
+      "epoch": 0.1842374616171955,
+      "grad_norm": 1.9230207204818726,
+      "learning_rate": 3.6680327868852456e-07,
+      "loss": 0.8022,
+      "step": 180
+    },
+    {
+      "epoch": 0.18935516888433981,
+      "grad_norm": 1.939705729484558,
+      "learning_rate": 3.770491803278688e-07,
+      "loss": 0.8075,
+      "step": 185
+    },
+    {
+      "epoch": 0.19447287615148415,
+      "grad_norm": 1.6276813745498657,
+      "learning_rate": 3.8729508196721314e-07,
+      "loss": 0.8097,
+      "step": 190
+    },
+    {
+      "epoch": 0.19959058341862845,
+      "grad_norm": 1.7544569969177246,
+      "learning_rate": 3.9754098360655735e-07,
+      "loss": 0.8046,
+      "step": 195
+    },
+    {
+      "epoch": 0.2047082906857728,
+      "grad_norm": 1.7406467199325562,
+      "learning_rate": 4.077868852459016e-07,
+      "loss": 0.8149,
+      "step": 200
+    },
+    {
+      "epoch": 0.2098259979529171,
+      "grad_norm": 1.7330560684204102,
+      "learning_rate": 4.180327868852459e-07,
+      "loss": 0.8077,
+      "step": 205
+    },
+    {
+      "epoch": 0.21494370522006143,
+      "grad_norm": 1.417546033859253,
+      "learning_rate": 4.2827868852459014e-07,
+      "loss": 0.807,
+      "step": 210
+    },
+    {
+      "epoch": 0.22006141248720573,
+      "grad_norm": 2.1064000129699707,
+      "learning_rate": 4.385245901639344e-07,
+      "loss": 0.8041,
+      "step": 215
+    },
+    {
+      "epoch": 0.22517911975435004,
+      "grad_norm": 1.637609601020813,
+      "learning_rate": 4.487704918032787e-07,
+      "loss": 0.7992,
+      "step": 220
+    },
+    {
+      "epoch": 0.23029682702149437,
+      "grad_norm": 1.659397840499878,
+      "learning_rate": 4.590163934426229e-07,
+      "loss": 0.802,
+      "step": 225
+    },
+    {
+      "epoch": 0.23541453428863868,
+      "grad_norm": 1.6912051439285278,
+      "learning_rate": 4.692622950819672e-07,
+      "loss": 0.8005,
+      "step": 230
+    },
+    {
+      "epoch": 0.240532241555783,
+      "grad_norm": 1.9433246850967407,
+      "learning_rate": 4.795081967213115e-07,
+      "loss": 0.8079,
+      "step": 235
+    },
+    {
+      "epoch": 0.24564994882292732,
+      "grad_norm": 1.9640270471572876,
+      "learning_rate": 4.897540983606557e-07,
+      "loss": 0.8127,
+      "step": 240
+    },
+    {
+      "epoch": 0.2507676560900716,
+      "grad_norm": 2.3167271614074707,
+      "learning_rate": 5e-07,
+      "loss": 0.8058,
+      "step": 245
+    },
+    {
+      "epoch": 0.25588536335721596,
+      "grad_norm": 1.6469106674194336,
+      "learning_rate": 5.102459016393442e-07,
+      "loss": 0.8011,
+      "step": 250
+    },
+    {
+      "epoch": 0.2610030706243603,
+      "grad_norm": 1.5691314935684204,
+      "learning_rate": 5.204918032786885e-07,
+      "loss": 0.7968,
+      "step": 255
+    },
+    {
+      "epoch": 0.2661207778915046,
+      "grad_norm": 1.663665533065796,
+      "learning_rate": 5.307377049180327e-07,
+      "loss": 0.8018,
+      "step": 260
+    },
+    {
+      "epoch": 0.2712384851586489,
+      "grad_norm": 1.99347984790802,
+      "learning_rate": 5.40983606557377e-07,
+      "loss": 0.8006,
+      "step": 265
+    },
+    {
+      "epoch": 0.27635619242579323,
+      "grad_norm": 1.4906947612762451,
+      "learning_rate": 5.512295081967213e-07,
+      "loss": 0.7977,
+      "step": 270
+    },
+    {
+      "epoch": 0.28147389969293757,
+      "grad_norm": 1.786527395248413,
+      "learning_rate": 5.614754098360656e-07,
+      "loss": 0.8041,
+      "step": 275
+    },
+    {
+      "epoch": 0.2865916069600819,
+      "grad_norm": 1.9175364971160889,
+      "learning_rate": 5.717213114754098e-07,
+      "loss": 0.8079,
+      "step": 280
+    },
+    {
+      "epoch": 0.2917093142272262,
+      "grad_norm": 1.678741216659546,
+      "learning_rate": 5.819672131147541e-07,
+      "loss": 0.7974,
+      "step": 285
+    },
+    {
+      "epoch": 0.2968270214943705,
+      "grad_norm": 2.0347344875335693,
+      "learning_rate": 5.922131147540983e-07,
+      "loss": 0.8011,
+      "step": 290
+    },
+    {
+      "epoch": 0.30194472876151485,
+      "grad_norm": 1.8914201259613037,
+      "learning_rate": 6.024590163934425e-07,
+      "loss": 0.8026,
+      "step": 295
+    },
+    {
+      "epoch": 0.3070624360286592,
+      "grad_norm": 1.6236293315887451,
+      "learning_rate": 6.127049180327869e-07,
+      "loss": 0.7981,
+      "step": 300
+    },
+    {
+      "epoch": 0.31218014329580346,
+      "grad_norm": 1.4731358289718628,
+      "learning_rate": 6.229508196721311e-07,
+      "loss": 0.7972,
+      "step": 305
+    },
+    {
+      "epoch": 0.3172978505629478,
+      "grad_norm": 1.7494508028030396,
+      "learning_rate": 6.331967213114754e-07,
+      "loss": 0.797,
+      "step": 310
+    },
+    {
+      "epoch": 0.3224155578300921,
+      "grad_norm": 1.696869134902954,
+      "learning_rate": 6.434426229508197e-07,
+      "loss": 0.7972,
+      "step": 315
+    },
+    {
+      "epoch": 0.32753326509723646,
+      "grad_norm": 1.5431866645812988,
+      "learning_rate": 6.536885245901639e-07,
+      "loss": 0.7919,
+      "step": 320
+    },
+    {
+      "epoch": 0.33265097236438074,
+      "grad_norm": 1.6396448612213135,
+      "learning_rate": 6.639344262295081e-07,
+      "loss": 0.7986,
+      "step": 325
+    },
+    {
+      "epoch": 0.33776867963152507,
+      "grad_norm": 1.7315205335617065,
+      "learning_rate": 6.741803278688525e-07,
+      "loss": 0.7966,
+      "step": 330
+    },
+    {
+      "epoch": 0.3428863868986694,
+      "grad_norm": 1.6142867803573608,
+      "learning_rate": 6.844262295081967e-07,
+      "loss": 0.7964,
+      "step": 335
+    },
+    {
+      "epoch": 0.34800409416581374,
+      "grad_norm": 1.332783818244934,
+      "learning_rate": 6.94672131147541e-07,
+      "loss": 0.7969,
+      "step": 340
+    },
+    {
+      "epoch": 0.353121801432958,
+      "grad_norm": 1.434688687324524,
+      "learning_rate": 7.049180327868852e-07,
+      "loss": 0.8015,
+      "step": 345
+    },
+    {
+      "epoch": 0.35823950870010235,
+      "grad_norm": 1.7243021726608276,
+      "learning_rate": 7.151639344262295e-07,
+      "loss": 0.791,
+      "step": 350
+    },
+    {
+      "epoch": 0.3633572159672467,
+      "grad_norm": 1.603244662284851,
+      "learning_rate": 7.254098360655737e-07,
+      "loss": 0.7926,
+      "step": 355
+    },
+    {
+      "epoch": 0.368474923234391,
+      "grad_norm": 1.645308256149292,
+      "learning_rate": 7.356557377049179e-07,
+      "loss": 0.7988,
+      "step": 360
+    },
+    {
+      "epoch": 0.3735926305015353,
+      "grad_norm": 1.3321951627731323,
+      "learning_rate": 7.459016393442623e-07,
+      "loss": 0.7923,
+      "step": 365
+    },
+    {
+      "epoch": 0.37871033776867963,
+      "grad_norm": 2.1083521842956543,
+      "learning_rate": 7.561475409836066e-07,
+      "loss": 0.7935,
+      "step": 370
+    },
+    {
+      "epoch": 0.38382804503582396,
+      "grad_norm": 1.3414019346237183,
+      "learning_rate": 7.663934426229508e-07,
+      "loss": 0.7894,
+      "step": 375
+    },
+    {
+      "epoch": 0.3889457523029683,
+      "grad_norm": 1.8279671669006348,
+      "learning_rate": 7.766393442622951e-07,
+      "loss": 0.7916,
+      "step": 380
+    },
+    {
+      "epoch": 0.3940634595701126,
+      "grad_norm": 1.6233114004135132,
+      "learning_rate": 7.868852459016393e-07,
+      "loss": 0.7886,
+      "step": 385
+    },
+    {
+      "epoch": 0.3991811668372569,
+      "grad_norm": 1.4336532354354858,
+      "learning_rate": 7.971311475409835e-07,
+      "loss": 0.7884,
+      "step": 390
+    },
+    {
+      "epoch": 0.40429887410440124,
+      "grad_norm": 1.597020149230957,
+      "learning_rate": 8.073770491803278e-07,
+      "loss": 0.7904,
+      "step": 395
+    },
+    {
+      "epoch": 0.4094165813715456,
+      "grad_norm": 1.3191157579421997,
+      "learning_rate": 8.176229508196721e-07,
+      "loss": 0.787,
+      "step": 400
+    },
+    {
+      "epoch": 0.41453428863868985,
+      "grad_norm": 1.6425617933273315,
+      "learning_rate": 8.278688524590164e-07,
+      "loss": 0.7887,
+      "step": 405
+    },
+    {
+      "epoch": 0.4196519959058342,
+      "grad_norm": 1.3924281597137451,
+      "learning_rate": 8.381147540983607e-07,
+      "loss": 0.7976,
+      "step": 410
+    },
+    {
+      "epoch": 0.4247697031729785,
+      "grad_norm": 1.2975757122039795,
+      "learning_rate": 8.483606557377049e-07,
+      "loss": 0.7895,
+      "step": 415
+    },
+    {
+      "epoch": 0.42988741044012285,
+      "grad_norm": 1.3045737743377686,
+      "learning_rate": 8.586065573770491e-07,
+      "loss": 0.7894,
+      "step": 420
+    },
+    {
+      "epoch": 0.43500511770726713,
+      "grad_norm": 1.9618183374404907,
+      "learning_rate": 8.688524590163933e-07,
+      "loss": 0.7865,
+      "step": 425
+    },
+    {
+      "epoch": 0.44012282497441146,
+      "grad_norm": 1.3976588249206543,
+      "learning_rate": 8.790983606557376e-07,
+      "loss": 0.7896,
+      "step": 430
+    },
+    {
+      "epoch": 0.4452405322415558,
+      "grad_norm": 1.1260899305343628,
+      "learning_rate": 8.89344262295082e-07,
+      "loss": 0.7861,
+      "step": 435
+    },
+    {
+      "epoch": 0.4503582395087001,
+      "grad_norm": 1.293816089630127,
+      "learning_rate": 8.995901639344262e-07,
+      "loss": 0.7826,
+      "step": 440
+    },
+    {
+      "epoch": 0.4554759467758444,
+      "grad_norm": 1.4861347675323486,
+      "learning_rate": 9.098360655737705e-07,
+      "loss": 0.7822,
+      "step": 445
+    },
+    {
+      "epoch": 0.46059365404298874,
+      "grad_norm": 1.378319501876831,
+      "learning_rate": 9.200819672131147e-07,
+      "loss": 0.778,
+      "step": 450
+    },
+    {
+      "epoch": 0.4657113613101331,
+      "grad_norm": 1.2947815656661987,
+      "learning_rate": 9.303278688524589e-07,
+      "loss": 0.7853,
+      "step": 455
+    },
+    {
+      "epoch": 0.47082906857727735,
+      "grad_norm": 0.9865773916244507,
+      "learning_rate": 9.405737704918032e-07,
+      "loss": 0.7797,
+      "step": 460
+    },
+    {
+      "epoch": 0.4759467758444217,
+      "grad_norm": 1.4883133172988892,
+      "learning_rate": 9.508196721311474e-07,
+      "loss": 0.7804,
+      "step": 465
+    },
+    {
+      "epoch": 0.481064483111566,
+      "grad_norm": 1.1394942998886108,
+      "learning_rate": 9.610655737704918e-07,
+      "loss": 0.7818,
+      "step": 470
+    },
+    {
+      "epoch": 0.48618219037871035,
+      "grad_norm": 1.104995846748352,
+      "learning_rate": 9.71311475409836e-07,
+      "loss": 0.7775,
+      "step": 475
+    },
+    {
+      "epoch": 0.49129989764585463,
+      "grad_norm": 1.258623719215393,
+      "learning_rate": 9.815573770491803e-07,
+      "loss": 0.7731,
+      "step": 480
+    },
+    {
+      "epoch": 0.49641760491299897,
+      "grad_norm": 1.4409220218658447,
+      "learning_rate": 9.918032786885245e-07,
+      "loss": 0.7811,
+      "step": 485
+    },
+    {
+      "epoch": 0.5015353121801432,
+      "grad_norm": 0.9952474236488342,
+      "learning_rate": 9.999994895105863e-07,
+      "loss": 0.7821,
+      "step": 490
+    },
+    {
+      "epoch": 0.5066530194472876,
+      "grad_norm": 1.2250083684921265,
+      "learning_rate": 9.99981622490561e-07,
+      "loss": 0.7822,
+      "step": 495
+    },
+    {
+      "epoch": 0.5117707267144319,
+      "grad_norm": 1.1539254188537598,
+      "learning_rate": 9.999382320422427e-07,
+      "loss": 0.776,
+      "step": 500
+    },
+    {
+      "epoch": 0.5117707267144319,
+      "eval_accuracy": 0.59523,
+      "eval_loss": 0.6936843991279602,
+      "eval_macro_f1": 0.5690192634397302,
+      "eval_precision": 0.6518208624514151,
+      "eval_recall": 0.6078906162164894,
+      "eval_runtime": 73.7478,
+      "eval_samples_per_second": 1355.972,
+      "eval_steps_per_second": 1.329,
+      "step": 500
+    },
+    {
+      "epoch": 0.5168884339815762,
+      "grad_norm": 1.2244267463684082,
+      "learning_rate": 9.998693203806588e-07,
+      "loss": 0.7771,
+      "step": 505
+    },
+    {
+      "epoch": 0.5220061412487206,
+      "grad_norm": 1.1900156736373901,
+      "learning_rate": 9.997748910236623e-07,
+      "loss": 0.7815,
+      "step": 510
+    },
+    {
+      "epoch": 0.5271238485158649,
+      "grad_norm": 1.2272601127624512,
+      "learning_rate": 9.996549487917522e-07,
+      "loss": 0.7829,
+      "step": 515
+    },
+    {
+      "epoch": 0.5322415557830092,
+      "grad_norm": 1.160675287246704,
+      "learning_rate": 9.995094998078276e-07,
+      "loss": 0.7785,
+      "step": 520
+    },
+    {
+      "epoch": 0.5373592630501536,
+      "grad_norm": 1.2759345769882202,
+      "learning_rate": 9.993385514968745e-07,
+      "loss": 0.7755,
+      "step": 525
+    },
+    {
+      "epoch": 0.5424769703172978,
+      "grad_norm": 1.0531632900238037,
+      "learning_rate": 9.99142112585588e-07,
+      "loss": 0.7781,
+      "step": 530
+    },
+    {
+      "epoch": 0.5475946775844421,
+      "grad_norm": 1.0040606260299683,
+      "learning_rate": 9.989201931019251e-07,
+      "loss": 0.7744,
+      "step": 535
+    },
+    {
+      "epoch": 0.5527123848515865,
+      "grad_norm": 1.2468197345733643,
+      "learning_rate": 9.98672804374595e-07,
+      "loss": 0.7712,
+      "step": 540
+    },
+    {
+      "epoch": 0.5578300921187308,
+      "grad_norm": 1.1564112901687622,
+      "learning_rate": 9.983999590324778e-07,
+      "loss": 0.7797,
+      "step": 545
+    },
+    {
+      "epoch": 0.5629477993858751,
+      "grad_norm": 0.8854450583457947,
+      "learning_rate": 9.981016710039832e-07,
+      "loss": 0.7723,
+      "step": 550
+    },
+    {
+      "epoch": 0.5680655066530195,
+      "grad_norm": 1.142919659614563,
+      "learning_rate": 9.977779555163369e-07,
+      "loss": 0.7739,
+      "step": 555
+    },
+    {
+      "epoch": 0.5731832139201638,
+      "grad_norm": 1.058153748512268,
+      "learning_rate": 9.974288290948042e-07,
+      "loss": 0.774,
+      "step": 560
+    },
+    {
+      "epoch": 0.5783009211873081,
+      "grad_norm": 1.1157392263412476,
+      "learning_rate": 9.970543095618468e-07,
+      "loss": 0.7742,
+      "step": 565
+    },
+    {
+      "epoch": 0.5834186284544524,
+      "grad_norm": 1.0850578546524048,
+      "learning_rate": 9.96654416036212e-07,
+      "loss": 0.7734,
+      "step": 570
+    },
+    {
+      "epoch": 0.5885363357215967,
+      "grad_norm": 0.9722121953964233,
+      "learning_rate": 9.96229168931958e-07,
+      "loss": 0.77,
+      "step": 575
+    },
+    {
+      "epoch": 0.593654042988741,
+      "grad_norm": 1.332795262336731,
+      "learning_rate": 9.957785899574102e-07,
+      "loss": 0.7725,
+      "step": 580
+    },
+    {
+      "epoch": 0.5987717502558854,
+      "grad_norm": 0.8639675378799438,
+      "learning_rate": 9.953027021140543e-07,
+      "loss": 0.7646,
+      "step": 585
+    },
+    {
+      "epoch": 0.6038894575230297,
+      "grad_norm": 0.9253244400024414,
+      "learning_rate": 9.948015296953623e-07,
+      "loss": 0.7743,
+      "step": 590
+    },
+    {
+      "epoch": 0.609007164790174,
+      "grad_norm": 0.8843643069267273,
+      "learning_rate": 9.942750982855503e-07,
+      "loss": 0.7717,
+      "step": 595
+    },
+    {
+      "epoch": 0.6141248720573184,
+      "grad_norm": 1.046048879623413,
+      "learning_rate": 9.937234347582753e-07,
+      "loss": 0.7721,
+      "step": 600
+    },
+    {
+      "epoch": 0.6192425793244627,
+      "grad_norm": 0.8906111717224121,
+      "learning_rate": 9.931465672752613e-07,
+      "loss": 0.7657,
+      "step": 605
+    },
+    {
+      "epoch": 0.6243602865916069,
+      "grad_norm": 0.9637787342071533,
+      "learning_rate": 9.925445252848621e-07,
+      "loss": 0.7666,
+      "step": 610
+    },
+    {
+      "epoch": 0.6294779938587513,
+      "grad_norm": 0.9004104733467102,
+      "learning_rate": 9.919173395205584e-07,
+      "loss": 0.7664,
+      "step": 615
+    },
+    {
+      "epoch": 0.6345957011258956,
+      "grad_norm": 1.4724570512771606,
+      "learning_rate": 9.912650419993893e-07,
+      "loss": 0.7679,
+      "step": 620
+    },
+    {
+      "epoch": 0.6397134083930399,
+      "grad_norm": 0.8644343614578247,
+      "learning_rate": 9.905876660203161e-07,
+      "loss": 0.7671,
+      "step": 625
+    },
+    {
+      "epoch": 0.6448311156601843,
+      "grad_norm": 0.8368955254554749,
+      "learning_rate": 9.898852461625245e-07,
+      "loss": 0.7717,
+      "step": 630
+    },
+    {
+      "epoch": 0.6499488229273286,
+      "grad_norm": 0.9413282871246338,
+      "learning_rate": 9.891578182836583e-07,
+      "loss": 0.7693,
+      "step": 635
+    },
+    {
+      "epoch": 0.6550665301944729,
+      "grad_norm": 0.9777762293815613,
+      "learning_rate": 9.884054195179886e-07,
+      "loss": 0.7656,
+      "step": 640
+    },
+    {
+      "epoch": 0.6601842374616171,
+      "grad_norm": 0.8983454704284668,
+      "learning_rate": 9.876280882745193e-07,
+      "loss": 0.7605,
+      "step": 645
+    },
+    {
+      "epoch": 0.6653019447287615,
+      "grad_norm": 0.8708799481391907,
+      "learning_rate": 9.868258642350254e-07,
+      "loss": 0.7673,
+      "step": 650
+    },
+    {
+      "epoch": 0.6704196519959058,
+      "grad_norm": 0.8354130387306213,
+      "learning_rate": 9.859987883520275e-07,
+      "loss": 0.767,
+      "step": 655
+    },
+    {
+      "epoch": 0.6755373592630501,
+      "grad_norm": 0.868485152721405,
+      "learning_rate": 9.851469028467015e-07,
+      "loss": 0.7647,
+      "step": 660
+    },
+    {
+      "epoch": 0.6806550665301945,
+      "grad_norm": 0.9445936679840088,
+      "learning_rate": 9.84270251206723e-07,
+      "loss": 0.7605,
+      "step": 665
+    },
+    {
+      "epoch": 0.6857727737973388,
+      "grad_norm": 0.7952156662940979,
+      "learning_rate": 9.833688781840475e-07,
+      "loss": 0.7664,
+      "step": 670
+    },
+    {
+      "epoch": 0.6908904810644831,
+      "grad_norm": 1.1992422342300415,
+      "learning_rate": 9.824428297926254e-07,
+      "loss": 0.7617,
+      "step": 675
+    },
+    {
+      "epoch": 0.6960081883316275,
+      "grad_norm": 0.8914986252784729,
+      "learning_rate": 9.81492153306054e-07,
+      "loss": 0.764,
+      "step": 680
+    },
+    {
+      "epoch": 0.7011258955987717,
+      "grad_norm": 0.7945632338523865,
+      "learning_rate": 9.80516897255163e-07,
+      "loss": 0.7617,
+      "step": 685
+    },
+    {
+      "epoch": 0.706243602865916,
+      "grad_norm": 0.7822641134262085,
+      "learning_rate": 9.795171114255384e-07,
+      "loss": 0.7613,
+      "step": 690
+    },
+    {
+      "epoch": 0.7113613101330604,
+      "grad_norm": 0.7989721298217773,
+      "learning_rate": 9.784928468549793e-07,
+      "loss": 0.7615,
+      "step": 695
+    },
+    {
+      "epoch": 0.7164790174002047,
+      "grad_norm": 0.7325178980827332,
+      "learning_rate": 9.77444155830895e-07,
+      "loss": 0.7572,
+      "step": 700
+    },
+    {
+      "epoch": 0.721596724667349,
+      "grad_norm": 0.8934036493301392,
+      "learning_rate": 9.763710918876329e-07,
+      "loss": 0.7589,
+      "step": 705
+    },
+    {
+      "epoch": 0.7267144319344934,
+      "grad_norm": 0.7769590616226196,
+      "learning_rate": 9.752737098037477e-07,
+      "loss": 0.7573,
+      "step": 710
+    },
+    {
+      "epoch": 0.7318321392016377,
+      "grad_norm": 1.0458475351333618,
+      "learning_rate": 9.741520655992047e-07,
+      "loss": 0.759,
+      "step": 715
+    },
+    {
+      "epoch": 0.736949846468782,
+      "grad_norm": 0.649872899055481,
+      "learning_rate": 9.730062165325185e-07,
+      "loss": 0.7607,
+      "step": 720
+    },
+    {
+      "epoch": 0.7420675537359263,
+      "grad_norm": 0.7517932057380676,
+      "learning_rate": 9.718362210978329e-07,
+      "loss": 0.7567,
+      "step": 725
+    },
+    {
+      "epoch": 0.7471852610030706,
+      "grad_norm": 0.9947759509086609,
+      "learning_rate": 9.706421390219315e-07,
+      "loss": 0.7593,
+      "step": 730
+    },
+    {
+      "epoch": 0.7523029682702149,
+      "grad_norm": 0.719109833240509,
+      "learning_rate": 9.694240312611917e-07,
+      "loss": 0.7615,
+      "step": 735
+    },
+    {
+      "epoch": 0.7574206755373593,
+      "grad_norm": 1.0175235271453857,
+      "learning_rate": 9.681819599984712e-07,
+      "loss": 0.7555,
+      "step": 740
+    },
+    {
+      "epoch": 0.7625383828045036,
+      "grad_norm": 0.8200032711029053,
+      "learning_rate": 9.66915988639934e-07,
+      "loss": 0.7565,
+      "step": 745
+    },
+    {
+      "epoch": 0.7676560900716479,
+      "grad_norm": 0.926680326461792,
+      "learning_rate": 9.656261818118139e-07,
+      "loss": 0.7628,
+      "step": 750
+    },
+    {
+      "epoch": 0.7727737973387923,
+      "grad_norm": 0.6904947757720947,
+      "learning_rate": 9.64312605357115e-07,
+      "loss": 0.7584,
+      "step": 755
+    },
+    {
+      "epoch": 0.7778915046059366,
+      "grad_norm": 0.7391018867492676,
+      "learning_rate": 9.62975326332251e-07,
+      "loss": 0.7582,
+      "step": 760
+    },
+    {
+      "epoch": 0.7830092118730808,
+      "grad_norm": 0.7193120121955872,
+      "learning_rate": 9.616144130036214e-07,
+      "loss": 0.7557,
+      "step": 765
+    },
+    {
+      "epoch": 0.7881269191402251,
+      "grad_norm": 0.8275336623191833,
+      "learning_rate": 9.602299348441277e-07,
+      "loss": 0.7575,
+      "step": 770
+    },
+    {
+      "epoch": 0.7932446264073695,
+      "grad_norm": 0.9943181276321411,
+      "learning_rate": 9.58821962529625e-07,
+      "loss": 0.7568,
+      "step": 775
+    },
+    {
+      "epoch": 0.7983623336745138,
+      "grad_norm": 0.7646188139915466,
+      "learning_rate": 9.573905679353166e-07,
+      "loss": 0.752,
+      "step": 780
+    },
+    {
+      "epoch": 0.8034800409416581,
+      "grad_norm": 0.7356329560279846,
+      "learning_rate": 9.55935824132082e-07,
+      "loss": 0.7552,
+      "step": 785
+    },
+    {
+      "epoch": 0.8085977482088025,
+      "grad_norm": 0.795838475227356,
+      "learning_rate": 9.544578053827495e-07,
+      "loss": 0.7543,
+      "step": 790
+    },
+    {
+      "epoch": 0.8137154554759468,
+      "grad_norm": 0.9953216314315796,
+      "learning_rate": 9.529565871383034e-07,
+      "loss": 0.7558,
+      "step": 795
+    },
+    {
+      "epoch": 0.8188331627430911,
+      "grad_norm": 0.797937273979187,
+      "learning_rate": 9.514322460340329e-07,
+      "loss": 0.7542,
+      "step": 800
+    },
+    {
+      "epoch": 0.8239508700102354,
+      "grad_norm": 0.7371375560760498,
+      "learning_rate": 9.498848598856198e-07,
+      "loss": 0.7532,
+      "step": 805
+    },
+    {
+      "epoch": 0.8290685772773797,
+      "grad_norm": 0.8336758613586426,
+      "learning_rate": 9.48314507685166e-07,
+      "loss": 0.756,
+      "step": 810
+    },
+    {
+      "epoch": 0.834186284544524,
+      "grad_norm": 0.7204869389533997,
+      "learning_rate": 9.467212695971619e-07,
+      "loss": 0.7564,
+      "step": 815
+    },
+    {
+      "epoch": 0.8393039918116684,
+      "grad_norm": 0.6758232712745667,
+      "learning_rate": 9.451052269543929e-07,
+      "loss": 0.7548,
+      "step": 820
+    },
+    {
+      "epoch": 0.8444216990788127,
+      "grad_norm": 0.7348074913024902,
+      "learning_rate": 9.434664622537883e-07,
+      "loss": 0.7535,
+      "step": 825
+    },
+    {
+      "epoch": 0.849539406345957,
+      "grad_norm": 0.747559130191803,
+      "learning_rate": 9.418050591522093e-07,
+      "loss": 0.752,
+      "step": 830
+    },
+    {
+      "epoch": 0.8546571136131014,
+      "grad_norm": 0.7392817735671997,
+      "learning_rate": 9.401211024621792e-07,
+      "loss": 0.7492,
+      "step": 835
+    },
+    {
+      "epoch": 0.8597748208802457,
+      "grad_norm": 0.6318978071212769,
+      "learning_rate": 9.384146781475533e-07,
+      "loss": 0.7577,
+      "step": 840
+    },
+    {
+      "epoch": 0.8648925281473899,
+      "grad_norm": 0.5832816362380981,
+      "learning_rate": 9.366858733191307e-07,
+      "loss": 0.7506,
+      "step": 845
+    },
+    {
+      "epoch": 0.8700102354145343,
+      "grad_norm": 0.6932022571563721,
+      "learning_rate": 9.349347762302071e-07,
+      "loss": 0.7523,
+      "step": 850
+    },
+    {
+      "epoch": 0.8751279426816786,
+      "grad_norm": 0.7047157287597656,
+      "learning_rate": 9.331614762720703e-07,
+      "loss": 0.7487,
+      "step": 855
+    },
+    {
+      "epoch": 0.8802456499488229,
+      "grad_norm": 0.6591235995292664,
+      "learning_rate": 9.313660639694358e-07,
+      "loss": 0.7538,
+      "step": 860
+    },
+    {
+      "epoch": 0.8853633572159673,
+      "grad_norm": 0.66665118932724,
+      "learning_rate": 9.295486309758269e-07,
+      "loss": 0.7518,
+      "step": 865
+    },
+    {
+      "epoch": 0.8904810644831116,
+      "grad_norm": 0.6165961027145386,
+      "learning_rate": 9.277092700688951e-07,
+      "loss": 0.7495,
+      "step": 870
+    },
+    {
+      "epoch": 0.8955987717502559,
+      "grad_norm": 0.7449588179588318,
+      "learning_rate": 9.258480751456838e-07,
+      "loss": 0.7515,
+      "step": 875
+    },
+    {
+      "epoch": 0.9007164790174002,
+      "grad_norm": 0.7553215622901917,
+      "learning_rate": 9.239651412178357e-07,
+      "loss": 0.7534,
+      "step": 880
+    },
+    {
+      "epoch": 0.9058341862845445,
+      "grad_norm": 0.747010350227356,
+      "learning_rate": 9.220605644067419e-07,
+      "loss": 0.7548,
+      "step": 885
+    },
+    {
+      "epoch": 0.9109518935516888,
+      "grad_norm": 0.7272236347198486,
+      "learning_rate": 9.20134441938635e-07,
+      "loss": 0.7531,
+      "step": 890
+    },
+    {
+      "epoch": 0.9160696008188332,
+      "grad_norm": 0.8726323246955872,
+      "learning_rate": 9.181868721396266e-07,
+      "loss": 0.7479,
+      "step": 895
+    },
+    {
+      "epoch": 0.9211873080859775,
+      "grad_norm": 0.7914009094238281,
+      "learning_rate": 9.16217954430687e-07,
+      "loss": 0.7522,
+      "step": 900
+    },
+    {
+      "epoch": 0.9263050153531218,
+      "grad_norm": 0.6367310285568237,
+      "learning_rate": 9.142277893225708e-07,
+      "loss": 0.7497,
+      "step": 905
+    },
+    {
+      "epoch": 0.9314227226202662,
+      "grad_norm": 0.8285405039787292,
+      "learning_rate": 9.122164784106842e-07,
+      "loss": 0.753,
+      "step": 910
+    },
+    {
+      "epoch": 0.9365404298874105,
+      "grad_norm": 0.7742036581039429,
+      "learning_rate": 9.101841243699015e-07,
+      "loss": 0.7534,
+      "step": 915
+    },
+    {
+      "epoch": 0.9416581371545547,
+      "grad_norm": 0.7512480020523071,
+      "learning_rate": 9.081308309493209e-07,
+      "loss": 0.747,
+      "step": 920
+    },
+    {
+      "epoch": 0.946775844421699,
+      "grad_norm": 0.5556691288948059,
+      "learning_rate": 9.060567029669699e-07,
+      "loss": 0.7465,
+      "step": 925
+    },
+    {
+      "epoch": 0.9518935516888434,
+      "grad_norm": 1.0232101678848267,
+      "learning_rate": 9.039618463044536e-07,
+      "loss": 0.7485,
+      "step": 930
+    },
+    {
+      "epoch": 0.9570112589559877,
+      "grad_norm": 0.8321600556373596,
+      "learning_rate": 9.018463679015505e-07,
+      "loss": 0.7488,
+      "step": 935
+    },
+    {
+      "epoch": 0.962128966223132,
+      "grad_norm": 0.7009038329124451,
+      "learning_rate": 8.997103757507521e-07,
+      "loss": 0.7483,
+      "step": 940
+    },
+    {
+      "epoch": 0.9672466734902764,
+      "grad_norm": 0.6939564347267151,
+      "learning_rate": 8.975539788917514e-07,
+      "loss": 0.7485,
+      "step": 945
+    },
+    {
+      "epoch": 0.9723643807574207,
+      "grad_norm": 0.7738851308822632,
+      "learning_rate": 8.953772874058757e-07,
+      "loss": 0.7479,
+      "step": 950
+    },
+    {
+      "epoch": 0.977482088024565,
+      "grad_norm": 0.5913597941398621,
+      "learning_rate": 8.931804124104672e-07,
+      "loss": 0.7473,
+      "step": 955
+    },
+    {
+      "epoch": 0.9825997952917093,
+      "grad_norm": 0.8486027717590332,
+      "learning_rate": 8.909634660532106e-07,
+      "loss": 0.7479,
+      "step": 960
+    },
+    {
+      "epoch": 0.9877175025588536,
+      "grad_norm": 0.6463382840156555,
+      "learning_rate": 8.887265615064083e-07,
+      "loss": 0.7486,
+      "step": 965
+    },
+    {
+      "epoch": 0.9928352098259979,
+      "grad_norm": 0.6264991164207458,
+      "learning_rate": 8.864698129612031e-07,
+      "loss": 0.7467,
+      "step": 970
+    },
+    {
+      "epoch": 0.9979529170931423,
+      "grad_norm": 0.7566510438919067,
+      "learning_rate": 8.841933356217488e-07,
+      "loss": 0.7463,
+      "step": 975
+    },
+    {
+      "epoch": 1.0030706243602865,
+      "grad_norm": 0.7290503978729248,
+      "learning_rate": 8.818972456993288e-07,
+      "loss": 0.7504,
+      "step": 980
+    },
+    {
+      "epoch": 1.008188331627431,
+      "grad_norm": 0.8277891874313354,
+      "learning_rate": 8.795816604064241e-07,
+      "loss": 0.7472,
+      "step": 985
+    },
+    {
+      "epoch": 1.0133060388945752,
+      "grad_norm": 0.6427952647209167,
+      "learning_rate": 8.772466979507302e-07,
+      "loss": 0.7487,
+      "step": 990
+    },
+    {
+      "epoch": 1.0184237461617196,
+      "grad_norm": 0.6775041818618774,
+      "learning_rate": 8.748924775291216e-07,
+      "loss": 0.745,
+      "step": 995
+    },
+    {
+      "epoch": 1.0235414534288638,
+      "grad_norm": 0.6815404891967773,
+      "learning_rate": 8.725191193215675e-07,
+      "loss": 0.7485,
+      "step": 1000
+    },
+    {
+      "epoch": 1.0235414534288638,
+      "eval_accuracy": 0.67557,
+      "eval_loss": 0.6936712265014648,
+      "eval_macro_f1": 0.6724504812400831,
+      "eval_precision": 0.6760463081581009,
+      "eval_recall": 0.6725003053739838,
+      "eval_runtime": 73.7408,
+      "eval_samples_per_second": 1356.102,
+      "eval_steps_per_second": 1.329,
+      "step": 1000
+    },
+    {
+      "epoch": 1.0286591606960083,
+      "grad_norm": 0.8586804866790771,
+      "learning_rate": 8.701267444849974e-07,
+      "loss": 0.7457,
+      "step": 1005
+    },
+    {
+      "epoch": 1.0337768679631525,
+      "grad_norm": 0.5989358425140381,
+      "learning_rate": 8.677154751471152e-07,
+      "loss": 0.7443,
+      "step": 1010
+    },
+    {
+      "epoch": 1.0388945752302967,
+      "grad_norm": 0.6888963580131531,
+      "learning_rate": 8.65285434400165e-07,
+      "loss": 0.7458,
+      "step": 1015
+    },
+    {
+      "epoch": 1.0440122824974412,
+      "grad_norm": 0.6407850384712219,
+      "learning_rate": 8.628367462946482e-07,
+      "loss": 0.7493,
+      "step": 1020
+    },
+    {
+      "epoch": 1.0491299897645854,
+      "grad_norm": 0.6202091574668884,
+      "learning_rate": 8.603695358329896e-07,
+      "loss": 0.7471,
+      "step": 1025
+    },
+    {
+      "epoch": 1.0542476970317298,
+      "grad_norm": 0.7456187605857849,
+      "learning_rate": 8.57883928963157e-07,
+      "loss": 0.7431,
+      "step": 1030
+    },
+    {
+      "epoch": 1.059365404298874,
+      "grad_norm": 0.6171067357063293,
+      "learning_rate": 8.553800525722317e-07,
+      "loss": 0.7435,
+      "step": 1035
+    },
+    {
+      "epoch": 1.0644831115660185,
+      "grad_norm": 0.8527712821960449,
+      "learning_rate": 8.528580344799305e-07,
+      "loss": 0.7453,
+      "step": 1040
+    },
+    {
+      "epoch": 1.0696008188331627,
+      "grad_norm": 0.6724162697792053,
+      "learning_rate": 8.503180034320816e-07,
+      "loss": 0.7467,
+      "step": 1045
+    },
+    {
+      "epoch": 1.0747185261003072,
+      "grad_norm": 0.581979513168335,
+      "learning_rate": 8.477600890940513e-07,
+      "loss": 0.7508,
+      "step": 1050
+    },
+    {
+      "epoch": 1.0798362333674514,
+      "grad_norm": 0.6551439166069031,
+      "learning_rate": 8.451844220441253e-07,
+      "loss": 0.7469,
+      "step": 1055
+    },
+    {
+      "epoch": 1.0849539406345956,
+      "grad_norm": 0.6437426209449768,
+      "learning_rate": 8.42591133766843e-07,
+      "loss": 0.7468,
+      "step": 1060
+    },
+    {
+      "epoch": 1.09007164790174,
+      "grad_norm": 0.5788704752922058,
+      "learning_rate": 8.39980356646285e-07,
+      "loss": 0.7424,
+      "step": 1065
+    },
+    {
+      "epoch": 1.0951893551688843,
+      "grad_norm": 0.5575606226921082,
+      "learning_rate": 8.373522239593149e-07,
+      "loss": 0.7396,
+      "step": 1070
+    },
+    {
+      "epoch": 1.1003070624360287,
+      "grad_norm": 0.737180769443512,
+      "learning_rate": 8.347068698687765e-07,
+      "loss": 0.744,
+      "step": 1075
+    },
+    {
+      "epoch": 1.105424769703173,
+      "grad_norm": 0.592766284942627,
+      "learning_rate": 8.320444294166439e-07,
+      "loss": 0.7469,
+      "step": 1080
+    },
+    {
+      "epoch": 1.1105424769703174,
+      "grad_norm": 0.63823401927948,
+      "learning_rate": 8.293650385171287e-07,
+      "loss": 0.7447,
+      "step": 1085
+    },
+    {
+      "epoch": 1.1156601842374616,
+      "grad_norm": 0.6114454865455627,
+      "learning_rate": 8.266688339497412e-07,
+      "loss": 0.7475,
+      "step": 1090
+    },
+    {
+      "epoch": 1.120777891504606,
+      "grad_norm": 0.53263258934021,
+      "learning_rate": 8.239559533523082e-07,
+      "loss": 0.7455,
+      "step": 1095
+    },
+    {
+      "epoch": 1.1258955987717503,
+      "grad_norm": 0.7016158699989319,
+      "learning_rate": 8.212265352139466e-07,
+      "loss": 0.742,
+      "step": 1100
+    },
+    {
+      "epoch": 1.1310133060388945,
+      "grad_norm": 0.6125472784042358,
+      "learning_rate": 8.184807188679939e-07,
+      "loss": 0.7383,
+      "step": 1105
+    },
+    {
+      "epoch": 1.136131013306039,
+      "grad_norm": 0.6008788347244263,
+      "learning_rate": 8.157186444848952e-07,
+      "loss": 0.7435,
+      "step": 1110
+    },
+    {
+      "epoch": 1.1412487205731832,
+      "grad_norm": 0.6357280015945435,
+      "learning_rate": 8.129404530650479e-07,
+      "loss": 0.7443,
+      "step": 1115
+    },
+    {
+      "epoch": 1.1463664278403276,
+      "grad_norm": 0.6422165036201477,
+      "learning_rate": 8.101462864316038e-07,
+      "loss": 0.7449,
+      "step": 1120
+    },
+    {
+      "epoch": 1.1514841351074718,
+      "grad_norm": 0.6852079629898071,
+      "learning_rate": 8.07336287223229e-07,
+      "loss": 0.7428,
+      "step": 1125
+    },
+    {
+      "epoch": 1.156601842374616,
+      "grad_norm": 0.5539452433586121,
+      "learning_rate": 8.045105988868224e-07,
+      "loss": 0.7455,
+      "step": 1130
+    },
+    {
+      "epoch": 1.1617195496417605,
+      "grad_norm": 0.5939313173294067,
+      "learning_rate": 8.016693656701931e-07,
+      "loss": 0.7376,
+      "step": 1135
+    },
+    {
+      "epoch": 1.1668372569089047,
+      "grad_norm": 0.7522106766700745,
+      "learning_rate": 7.98812732614697e-07,
+      "loss": 0.7464,
+      "step": 1140
+    },
+    {
+      "epoch": 1.1719549641760492,
+      "grad_norm": 0.6572809815406799,
+      "learning_rate": 7.959408455478313e-07,
+      "loss": 0.7448,
+      "step": 1145
+    },
+    {
+      "epoch": 1.1770726714431934,
+      "grad_norm": 0.5842403173446655,
+      "learning_rate": 7.93053851075792e-07,
+      "loss": 0.7396,
+      "step": 1150
+    },
+    {
+      "epoch": 1.1821903787103378,
+      "grad_norm": 0.5845000147819519,
+      "learning_rate": 7.901518965759888e-07,
+      "loss": 0.7438,
+      "step": 1155
+    },
+    {
+      "epoch": 1.187308085977482,
+      "grad_norm": 0.5873178839683533,
+      "learning_rate": 7.872351301895217e-07,
+      "loss": 0.7421,
+      "step": 1160
+    },
+    {
+      "epoch": 1.1924257932446265,
+      "grad_norm": 0.6385728120803833,
+      "learning_rate": 7.843037008136189e-07,
+      "loss": 0.7431,
+      "step": 1165
+    },
+    {
+      "epoch": 1.1975435005117707,
+      "grad_norm": 0.5818535685539246,
+      "learning_rate": 7.813577580940356e-07,
+      "loss": 0.7416,
+      "step": 1170
+    },
+    {
+      "epoch": 1.202661207778915,
+      "grad_norm": 0.5611526370048523,
+      "learning_rate": 7.783974524174149e-07,
+      "loss": 0.743,
+      "step": 1175
+    },
+    {
+      "epoch": 1.2077789150460594,
+      "grad_norm": 0.6002296805381775,
+      "learning_rate": 7.754229349036102e-07,
+      "loss": 0.7407,
+      "step": 1180
+    },
+    {
+      "epoch": 1.2128966223132036,
+      "grad_norm": 0.6006008982658386,
+      "learning_rate": 7.724343573979718e-07,
+      "loss": 0.7437,
+      "step": 1185
+    },
+    {
+      "epoch": 1.218014329580348,
+      "grad_norm": 0.6336845755577087,
+      "learning_rate": 7.694318724635945e-07,
+      "loss": 0.7405,
+      "step": 1190
+    },
+    {
+      "epoch": 1.2231320368474923,
+      "grad_norm": 0.6916839480400085,
+      "learning_rate": 7.664156333735293e-07,
+      "loss": 0.7468,
+      "step": 1195
+    },
+    {
+      "epoch": 1.2282497441146367,
+      "grad_norm": 0.5944891571998596,
+      "learning_rate": 7.633857941029602e-07,
+      "loss": 0.7485,
+      "step": 1200
+    },
+    {
+      "epoch": 1.233367451381781,
+      "grad_norm": 0.5755409598350525,
+      "learning_rate": 7.603425093213429e-07,
+      "loss": 0.7418,
+      "step": 1205
+    },
+    {
+      "epoch": 1.2384851586489254,
+      "grad_norm": 0.6128578186035156,
+      "learning_rate": 7.572859343845092e-07,
+      "loss": 0.7396,
+      "step": 1210
+    },
+    {
+      "epoch": 1.2436028659160696,
+      "grad_norm": 0.6123960614204407,
+      "learning_rate": 7.542162253267363e-07,
+      "loss": 0.7363,
+      "step": 1215
+    },
+    {
+      "epoch": 1.2487205731832138,
+      "grad_norm": 0.6969608664512634,
+      "learning_rate": 7.511335388527822e-07,
+      "loss": 0.7406,
+      "step": 1220
+    },
+    {
+      "epoch": 1.2538382804503583,
+      "grad_norm": 0.6491796970367432,
+      "learning_rate": 7.480380323298851e-07,
+      "loss": 0.7429,
+      "step": 1225
+    },
+    {
+      "epoch": 1.2589559877175025,
+      "grad_norm": 0.5883914828300476,
+      "learning_rate": 7.449298637797309e-07,
+      "loss": 0.7375,
+      "step": 1230
+    },
+    {
+      "epoch": 1.264073694984647,
+      "grad_norm": 0.6160842776298523,
+      "learning_rate": 7.418091918703854e-07,
+      "loss": 0.7393,
+      "step": 1235
+    },
+    {
+      "epoch": 1.2691914022517912,
+      "grad_norm": 0.5568389892578125,
+      "learning_rate": 7.386761759081954e-07,
+      "loss": 0.7387,
+      "step": 1240
+    },
+    {
+      "epoch": 1.2743091095189354,
+      "grad_norm": 0.532599151134491,
+      "learning_rate": 7.35530975829656e-07,
+      "loss": 0.741,
+      "step": 1245
+    },
+    {
+      "epoch": 1.2794268167860798,
+      "grad_norm": 0.5400995016098022,
+      "learning_rate": 7.323737521932457e-07,
+      "loss": 0.7367,
+      "step": 1250
+    },
+    {
+      "epoch": 1.2845445240532243,
+      "grad_norm": 0.5307775735855103,
+      "learning_rate": 7.292046661712307e-07,
+      "loss": 0.7399,
+      "step": 1255
+    },
+    {
+      "epoch": 1.2896622313203685,
+      "grad_norm": 0.5908007621765137,
+      "learning_rate": 7.260238795414366e-07,
+      "loss": 0.74,
+      "step": 1260
+    },
+    {
+      "epoch": 1.2947799385875127,
+      "grad_norm": 0.5410370826721191,
+      "learning_rate": 7.228315546789907e-07,
+      "loss": 0.7388,
+      "step": 1265
+    },
+    {
+      "epoch": 1.2998976458546572,
+      "grad_norm": 0.5406989455223083,
+      "learning_rate": 7.19627854548032e-07,
+      "loss": 0.7337,
+      "step": 1270
+    },
+    {
+      "epoch": 1.3050153531218014,
+      "grad_norm": 0.589767575263977,
+      "learning_rate": 7.164129426933927e-07,
+      "loss": 0.7426,
+      "step": 1275
+    },
+    {
+      "epoch": 1.3101330603889458,
+      "grad_norm": 0.5926154255867004,
+      "learning_rate": 7.131869832322496e-07,
+      "loss": 0.7374,
+      "step": 1280
+    },
+    {
+      "epoch": 1.31525076765609,
+      "grad_norm": 0.7507414817810059,
+      "learning_rate": 7.099501408457452e-07,
+      "loss": 0.7375,
+      "step": 1285
+    },
+    {
+      "epoch": 1.3203684749232343,
+      "grad_norm": 0.6162967681884766,
+      "learning_rate": 7.06702580770582e-07,
+      "loss": 0.7381,
+      "step": 1290
+    },
+    {
+      "epoch": 1.3254861821903787,
+      "grad_norm": 0.5118803977966309,
+      "learning_rate": 7.034444687905868e-07,
+      "loss": 0.7344,
+      "step": 1295
+    },
+    {
+      "epoch": 1.330603889457523,
+      "grad_norm": 0.5982370972633362,
+      "learning_rate": 7.001759712282478e-07,
+      "loss": 0.7382,
+      "step": 1300
+    },
+    {
+      "epoch": 1.3357215967246674,
+      "grad_norm": 0.6339845657348633,
+      "learning_rate": 6.968972549362238e-07,
+      "loss": 0.7386,
+      "step": 1305
+    },
+    {
+      "epoch": 1.3408393039918116,
+      "grad_norm": 0.5755071043968201,
+      "learning_rate": 6.936084872888271e-07,
+      "loss": 0.7349,
+      "step": 1310
+    },
+    {
+      "epoch": 1.345957011258956,
+      "grad_norm": 0.6089357137680054,
+      "learning_rate": 6.90309836173479e-07,
+      "loss": 0.7377,
+      "step": 1315
+    },
+    {
+      "epoch": 1.3510747185261003,
+      "grad_norm": 0.6137183308601379,
+      "learning_rate": 6.87001469982139e-07,
+      "loss": 0.7417,
+      "step": 1320
+    },
+    {
+      "epoch": 1.3561924257932447,
+      "grad_norm": 0.6864479184150696,
+      "learning_rate": 6.836835576027093e-07,
+      "loss": 0.7321,
+      "step": 1325
+    },
+    {
+      "epoch": 1.361310133060389,
+      "grad_norm": 0.5657494068145752,
+      "learning_rate": 6.803562684104125e-07,
+      "loss": 0.7411,
+      "step": 1330
+    },
+    {
+      "epoch": 1.3664278403275332,
+      "grad_norm": 0.6047109365463257,
+      "learning_rate": 6.770197722591456e-07,
+      "loss": 0.7399,
+      "step": 1335
+    },
+    {
+      "epoch": 1.3715455475946776,
+      "grad_norm": 0.5772355198860168,
+      "learning_rate": 6.736742394728097e-07,
+      "loss": 0.7374,
+      "step": 1340
+    },
+    {
+      "epoch": 1.3766632548618218,
+      "grad_norm": 0.7158586382865906,
+      "learning_rate": 6.703198408366142e-07,
+      "loss": 0.739,
+      "step": 1345
+    },
+    {
+      "epoch": 1.3817809621289663,
+      "grad_norm": 0.5718494057655334,
+      "learning_rate": 6.669567475883592e-07,
+      "loss": 0.7435,
+      "step": 1350
+    },
+    {
+      "epoch": 1.3868986693961105,
+      "grad_norm": 0.6494776606559753,
+      "learning_rate": 6.635851314096935e-07,
+      "loss": 0.7358,
+      "step": 1355
+    },
+    {
+      "epoch": 1.3920163766632547,
+      "grad_norm": 0.5958154201507568,
+      "learning_rate": 6.602051644173509e-07,
+      "loss": 0.7375,
+      "step": 1360
+    },
+    {
+      "epoch": 1.3971340839303992,
+      "grad_norm": 0.5509739518165588,
+      "learning_rate": 6.568170191543634e-07,
+      "loss": 0.7412,
+      "step": 1365
+    },
+    {
+      "epoch": 1.4022517911975436,
+      "grad_norm": 0.5368937253952026,
+      "learning_rate": 6.534208685812536e-07,
+      "loss": 0.7393,
+      "step": 1370
+    },
+    {
+      "epoch": 1.4073694984646878,
+      "grad_norm": 0.5369133353233337,
+      "learning_rate": 6.500168860672047e-07,
+      "loss": 0.7398,
+      "step": 1375
+    },
+    {
+      "epoch": 1.412487205731832,
+      "grad_norm": 0.5789251327514648,
+      "learning_rate": 6.466052453812111e-07,
+      "loss": 0.7371,
+      "step": 1380
+    },
+    {
+      "epoch": 1.4176049129989765,
+      "grad_norm": 0.5568552017211914,
+      "learning_rate": 6.431861206832069e-07,
+      "loss": 0.7363,
+      "step": 1385
+    },
+    {
+      "epoch": 1.4227226202661207,
+      "grad_norm": 0.5325226783752441,
+      "learning_rate": 6.397596865151752e-07,
+      "loss": 0.7348,
+      "step": 1390
+    },
+    {
+      "epoch": 1.4278403275332652,
+      "grad_norm": 0.5849957466125488,
+      "learning_rate": 6.363261177922388e-07,
+      "loss": 0.7363,
+      "step": 1395
+    },
+    {
+      "epoch": 1.4329580348004094,
+      "grad_norm": 0.6208518743515015,
+      "learning_rate": 6.328855897937303e-07,
+      "loss": 0.7365,
+      "step": 1400
+    },
+    {
+      "epoch": 1.4380757420675536,
+      "grad_norm": 0.5599240064620972,
+      "learning_rate": 6.294382781542445e-07,
+      "loss": 0.7371,
+      "step": 1405
+    },
+    {
+      "epoch": 1.443193449334698,
+      "grad_norm": 0.5623425841331482,
+      "learning_rate": 6.25984358854672e-07,
+      "loss": 0.74,
+      "step": 1410
+    },
+    {
+      "epoch": 1.4483111566018425,
+      "grad_norm": 0.6866716146469116,
+      "learning_rate": 6.225240082132172e-07,
+      "loss": 0.7383,
+      "step": 1415
+    },
+    {
+      "epoch": 1.4534288638689867,
+      "grad_norm": 0.5852178931236267,
+      "learning_rate": 6.190574028763952e-07,
+      "loss": 0.7381,
+      "step": 1420
+    },
+    {
+      "epoch": 1.458546571136131,
+      "grad_norm": 0.5319634079933167,
+      "learning_rate": 6.15584719810016e-07,
+      "loss": 0.7349,
+      "step": 1425
+    },
+    {
+      "epoch": 1.4636642784032754,
+      "grad_norm": 0.5798255205154419,
+      "learning_rate": 6.121061362901498e-07,
+      "loss": 0.7331,
+      "step": 1430
+    },
+    {
+      "epoch": 1.4687819856704196,
+      "grad_norm": 0.4803605079650879,
+      "learning_rate": 6.086218298940778e-07,
+      "loss": 0.7356,
+      "step": 1435
+    },
+    {
+      "epoch": 1.473899692937564,
+      "grad_norm": 0.7146285772323608,
+      "learning_rate": 6.051319784912261e-07,
+      "loss": 0.7384,
+      "step": 1440
+    },
+    {
+      "epoch": 1.4790174002047083,
+      "grad_norm": 0.47007301449775696,
+      "learning_rate": 6.016367602340868e-07,
+      "loss": 0.7332,
+      "step": 1445
+    },
+    {
+      "epoch": 1.4841351074718525,
+      "grad_norm": 0.6568506956100464,
+      "learning_rate": 5.981363535491233e-07,
+      "loss": 0.7378,
+      "step": 1450
+    },
+    {
+      "epoch": 1.489252814738997,
+      "grad_norm": 0.5178249478340149,
+      "learning_rate": 5.946309371276614e-07,
+      "loss": 0.7338,
+      "step": 1455
+    },
+    {
+      "epoch": 1.4943705220061412,
+      "grad_norm": 0.5785830616950989,
+      "learning_rate": 5.911206899167676e-07,
+      "loss": 0.7392,
+      "step": 1460
+    },
+    {
+      "epoch": 1.4994882292732856,
+      "grad_norm": 0.5021066665649414,
+      "learning_rate": 5.87605791110114e-07,
+      "loss": 0.7342,
+      "step": 1465
+    },
+    {
+      "epoch": 1.5046059365404298,
+      "grad_norm": 0.5594333410263062,
+      "learning_rate": 5.840864201388312e-07,
+      "loss": 0.7351,
+      "step": 1470
+    },
+    {
+      "epoch": 1.509723643807574,
+      "grad_norm": 0.5204704999923706,
+      "learning_rate": 5.805627566623475e-07,
+      "loss": 0.7375,
+      "step": 1475
+    },
+    {
+      "epoch": 1.5148413510747185,
+      "grad_norm": 0.6187242865562439,
+      "learning_rate": 5.770349805592185e-07,
+      "loss": 0.7351,
+      "step": 1480
+    },
+    {
+      "epoch": 1.519959058341863,
+      "grad_norm": 0.5294100046157837,
+      "learning_rate": 5.735032719179443e-07,
+      "loss": 0.7383,
+      "step": 1485
+    },
+    {
+      "epoch": 1.5250767656090072,
+      "grad_norm": 0.5450606942176819,
+      "learning_rate": 5.699678110277762e-07,
+      "loss": 0.7365,
+      "step": 1490
+    },
+    {
+      "epoch": 1.5301944728761514,
+      "grad_norm": 0.5091442465782166,
+      "learning_rate": 5.664287783695122e-07,
+      "loss": 0.7343,
+      "step": 1495
+    },
+    {
+      "epoch": 1.5353121801432958,
+      "grad_norm": 0.557119607925415,
+      "learning_rate": 5.628863546062856e-07,
+      "loss": 0.7298,
+      "step": 1500
+    },
+    {
+      "epoch": 1.5353121801432958,
+      "eval_accuracy": 0.67304,
+      "eval_loss": 0.6938837766647339,
+      "eval_macro_f1": 0.6609359830000188,
+      "eval_precision": 0.685850518502884,
+      "eval_recall": 0.6657447133221994,
+      "eval_runtime": 73.8645,
+      "eval_samples_per_second": 1353.83,
+      "eval_steps_per_second": 1.327,
+      "step": 1500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4885,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 3,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 1
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.0201035364007936e+17,
+  "train_batch_size": 512,
+  "trial_name": null,
+  "trial_params": null
+}

fourier-spectral-norm-classifier/checkpoint-1500/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff