Amit5674 committed 12 days ago

Commit

38a8c2b

verified ·

1 Parent(s): 233e740

Upload folder using huggingface_hub

Browse files

Files changed (20) hide show

.gitattributes +2 -0
checkpoint-2706/config.json +97 -0
checkpoint-2706/model.safetensors +3 -0
checkpoint-2706/optimizer.pt +3 -0
checkpoint-2706/rng_state.pth +3 -0
checkpoint-2706/scheduler.pt +3 -0
checkpoint-2706/special_tokens_map.json +51 -0
checkpoint-2706/tokenizer.json +0 -0
checkpoint-2706/tokenizer_config.json +66 -0
checkpoint-2706/trainer_state.json +1923 -0
checkpoint-2706/training_args.bin +3 -0
checkpoint-2706/vocab.txt +3 -0
config.json +97 -0
model.safetensors +3 -0
script_arguments.json +169 -0
special_tokens_map.json +51 -0
tokenizer.json +0 -0
tokenizer_config.json +66 -0
training_args.bin +3 -0
vocab.txt +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-2706/vocab.txt filter=lfs diff=lfs merge=lfs -text
+vocab.txt filter=lfs diff=lfs merge=lfs -text

checkpoint-2706/config.json ADDED Viewed

	@@ -0,0 +1,97 @@

+{
+  "_name_or_path": "dicta-il/neodictabert",
+  "architectures": [
+    "NeoBERTForSequenceClassification"
+  ],
+  "auto_map": {
+    "AutoConfig": "dicta-il/neodictabert--modeling_neobert.NeoBERTConfig",
+    "AutoModel": "dicta-il/neodictabert--modeling_neobert.NeoBERT",
+    "AutoModelForMaskedLM": "dicta-il/neodictabert--modeling_neobert.NeoBERTLMHead",
+    "AutoModelForQuestionAnswering": "dicta-il/neodictabert--modeling_neobert.NeoBERTForQuestionAnswering",
+    "AutoModelForSequenceClassification": "dicta-il/neodictabert--modeling_neobert.NeoBERTForSequenceClassification",
+    "AutoModelForTokenClassification": "dicta-il/neodictabert--modeling_neobert.NeoBERTForTokenClassification"
+  },
+  "decoder_init_range": 0.02,
+  "dim_head": 64,
+  "embedding_init_range": 0.02,
+  "encoder_init_range": 0.02,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "construct_state_confusion",
+    "1": "directional_preposition_swap",
+    "2": "entity_date_swap",
+    "3": "entity_event_swap",
+    "4": "entity_location_swap",
+    "5": "entity_organization_swap",
+    "6": "entity_person_swap",
+    "7": "entity_title_swap",
+    "8": "hebrew_root_pattern_confusion",
+    "9": "homographic_gender_errors",
+    "10": "idiom_collocation_corruption",
+    "11": "measure_unit_swap",
+    "12": "morphological_connective_confusion",
+    "13": "noun_gender_swap",
+    "14": "number_swap",
+    "15": "pronoun_reference_errors",
+    "16": "sentence_negation",
+    "17": "specificity_shift_errors",
+    "18": "verb_gender_swap",
+    "19": "verb_tense_swap"
+  },
+  "intermediate_size": 3072,
+  "kwargs": {
+    "_commit_hash": "9052b2e47fe3e615931563bb2a74b26df6e028a3",
+    "architectures": [
+      "NeoBERTLMHead"
+    ],
+    "attn_implementation": null,
+    "auto_map": {
+      "AutoConfig": "dicta-il/neodictabert--modeling_neobert.NeoBERTConfig",
+      "AutoModel": "dicta-il/neodictabert--modeling_neobert.NeoBERT",
+      "AutoModelForMaskedLM": "dicta-il/neodictabert--modeling_neobert.NeoBERTLMHead",
+      "AutoModelForQuestionAnswering": "dicta-il/neodictabert--modeling_neobert.NeoBERTForQuestionAnswering",
+      "AutoModelForSequenceClassification": "dicta-il/neodictabert--modeling_neobert.NeoBERTForSequenceClassification",
+      "AutoModelForTokenClassification": "dicta-il/neodictabert--modeling_neobert.NeoBERTForTokenClassification"
+    },
+    "decoder_init_range": 0.02,
+    "dim_head": 64,
+    "kwargs": {
+      "decoder_init_range": 0.02
+    },
+    "model_type": "neobert",
+    "torch_dtype": "bfloat16",
+    "transformers_version": "4.53.0"
+  },
+  "label2id": {
+    "construct_state_confusion": 0,
+    "directional_preposition_swap": 1,
+    "entity_date_swap": 2,
+    "entity_event_swap": 3,
+    "entity_location_swap": 4,
+    "entity_organization_swap": 5,
+    "entity_person_swap": 6,
+    "entity_title_swap": 7,
+    "hebrew_root_pattern_confusion": 8,
+    "homographic_gender_errors": 9,
+    "idiom_collocation_corruption": 10,
+    "measure_unit_swap": 11,
+    "morphological_connective_confusion": 12,
+    "noun_gender_swap": 13,
+    "number_swap": 14,
+    "pronoun_reference_errors": 15,
+    "sentence_negation": 16,
+    "specificity_shift_errors": 17,
+    "verb_gender_swap": 18,
+    "verb_tense_swap": 19
+  },
+  "max_length": 4096,
+  "model_type": "neobert",
+  "norm_eps": 1e-06,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "pad_token_id": 3,
+  "problem_type": "multi_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
+  "vocab_size": 128000
+}

checkpoint-2706/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:970fea6a79cfdd383e06cb000073b4a35151bf831d37f74b6451f0639b0ee9c7
+size 1452799672

checkpoint-2706/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7c5a2024c2f0044e805c6fb58269b7388bf8a8580ef0cba02d69837f336574c8
+size 2905706059

checkpoint-2706/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f891a83add9650f9e1a670038346801f14068cfeb3063b27d9c0d9752e96abe4
+size 14645

checkpoint-2706/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5d735340dbb649b58801033a109c8b47c8267a76c74aa768774287c4cd166b8
+size 1465

checkpoint-2706/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-2706/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2706/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "[BLANK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "eos_token": "[SEP]",
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 4096,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

checkpoint-2706/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1923 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9993075751280986,
+  "eval_steps": 50.0,
+  "global_step": 2706,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.007385865300281586,
+      "grad_norm": 56.16927719116211,
+      "learning_rate": 6.666666666666667e-07,
+      "loss": 2.2986,
+      "step": 10
+    },
+    {
+      "epoch": 0.014771730600563172,
+      "grad_norm": 52.235191345214844,
+      "learning_rate": 1.3333333333333334e-06,
+      "loss": 2.1946,
+      "step": 20
+    },
+    {
+      "epoch": 0.02215759590084476,
+      "grad_norm": 14.341564178466797,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 1.9498,
+      "step": 30
+    },
+    {
+      "epoch": 0.029543461201126345,
+      "grad_norm": 9.743363380432129,
+      "learning_rate": 2.666666666666667e-06,
+      "loss": 1.7599,
+      "step": 40
+    },
+    {
+      "epoch": 0.03692932650140793,
+      "grad_norm": 10.694592475891113,
+      "learning_rate": 3.3333333333333333e-06,
+      "loss": 1.77,
+      "step": 50
+    },
+    {
+      "epoch": 0.04431519180168952,
+      "grad_norm": 6.704069137573242,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 1.5609,
+      "step": 60
+    },
+    {
+      "epoch": 0.0517010571019711,
+      "grad_norm": 5.9378342628479,
+      "learning_rate": 4.666666666666667e-06,
+      "loss": 1.5134,
+      "step": 70
+    },
+    {
+      "epoch": 0.05908692240225269,
+      "grad_norm": 5.821998119354248,
+      "learning_rate": 5.333333333333334e-06,
+      "loss": 1.4795,
+      "step": 80
+    },
+    {
+      "epoch": 0.06647278770253427,
+      "grad_norm": 6.466773986816406,
+      "learning_rate": 6e-06,
+      "loss": 1.4666,
+      "step": 90
+    },
+    {
+      "epoch": 0.07385865300281585,
+      "grad_norm": 5.7971625328063965,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 1.4187,
+      "step": 100
+    },
+    {
+      "epoch": 0.08124451830309745,
+      "grad_norm": 19.75885581970215,
+      "learning_rate": 7.333333333333333e-06,
+      "loss": 1.4012,
+      "step": 110
+    },
+    {
+      "epoch": 0.08863038360337903,
+      "grad_norm": 6.692321300506592,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 1.3932,
+      "step": 120
+    },
+    {
+      "epoch": 0.09601624890366062,
+      "grad_norm": 8.816634178161621,
+      "learning_rate": 8.666666666666668e-06,
+      "loss": 1.3924,
+      "step": 130
+    },
+    {
+      "epoch": 0.1034021142039422,
+      "grad_norm": 6.486945152282715,
+      "learning_rate": 9.333333333333334e-06,
+      "loss": 1.3117,
+      "step": 140
+    },
+    {
+      "epoch": 0.1107879795042238,
+      "grad_norm": 8.362743377685547,
+      "learning_rate": 1e-05,
+      "loss": 1.2642,
+      "step": 150
+    },
+    {
+      "epoch": 0.11817384480450538,
+      "grad_norm": 7.534619331359863,
+      "learning_rate": 1.0666666666666667e-05,
+      "loss": 1.2891,
+      "step": 160
+    },
+    {
+      "epoch": 0.12555971010478698,
+      "grad_norm": 7.239850997924805,
+      "learning_rate": 1.1333333333333334e-05,
+      "loss": 1.2664,
+      "step": 170
+    },
+    {
+      "epoch": 0.13294557540506854,
+      "grad_norm": 6.650047779083252,
+      "learning_rate": 1.2e-05,
+      "loss": 1.2494,
+      "step": 180
+    },
+    {
+      "epoch": 0.14033144070535014,
+      "grad_norm": 5.859479904174805,
+      "learning_rate": 1.2666666666666667e-05,
+      "loss": 1.2844,
+      "step": 190
+    },
+    {
+      "epoch": 0.1477173060056317,
+      "grad_norm": 7.5547027587890625,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 1.2898,
+      "step": 200
+    },
+    {
+      "epoch": 0.1551031713059133,
+      "grad_norm": 8.316688537597656,
+      "learning_rate": 1.4e-05,
+      "loss": 1.1774,
+      "step": 210
+    },
+    {
+      "epoch": 0.1624890366061949,
+      "grad_norm": 7.763572692871094,
+      "learning_rate": 1.4666666666666666e-05,
+      "loss": 1.2881,
+      "step": 220
+    },
+    {
+      "epoch": 0.16987490190647647,
+      "grad_norm": 7.132694244384766,
+      "learning_rate": 1.5333333333333334e-05,
+      "loss": 1.2359,
+      "step": 230
+    },
+    {
+      "epoch": 0.17726076720675807,
+      "grad_norm": 6.167331218719482,
+      "learning_rate": 1.6000000000000003e-05,
+      "loss": 1.1939,
+      "step": 240
+    },
+    {
+      "epoch": 0.18464663250703967,
+      "grad_norm": 7.399999141693115,
+      "learning_rate": 1.6666666666666667e-05,
+      "loss": 1.2213,
+      "step": 250
+    },
+    {
+      "epoch": 0.19203249780732123,
+      "grad_norm": 5.161776065826416,
+      "learning_rate": 1.7333333333333336e-05,
+      "loss": 1.1747,
+      "step": 260
+    },
+    {
+      "epoch": 0.19941836310760283,
+      "grad_norm": 9.162799835205078,
+      "learning_rate": 1.8e-05,
+      "loss": 1.1751,
+      "step": 270
+    },
+    {
+      "epoch": 0.2068042284078844,
+      "grad_norm": 6.043332576751709,
+      "learning_rate": 1.866666666666667e-05,
+      "loss": 1.181,
+      "step": 280
+    },
+    {
+      "epoch": 0.214190093708166,
+      "grad_norm": 5.533137798309326,
+      "learning_rate": 1.9333333333333333e-05,
+      "loss": 1.1727,
+      "step": 290
+    },
+    {
+      "epoch": 0.2215759590084476,
+      "grad_norm": 4.7085862159729,
+      "learning_rate": 2e-05,
+      "loss": 1.3127,
+      "step": 300
+    },
+    {
+      "epoch": 0.22896182430872916,
+      "grad_norm": 5.2254815101623535,
+      "learning_rate": 1.9999147543290536e-05,
+      "loss": 1.1853,
+      "step": 310
+    },
+    {
+      "epoch": 0.23634768960901076,
+      "grad_norm": 5.015223503112793,
+      "learning_rate": 1.999659031849863e-05,
+      "loss": 1.1846,
+      "step": 320
+    },
+    {
+      "epoch": 0.24373355490929235,
+      "grad_norm": 6.505156993865967,
+      "learning_rate": 1.9992328761608965e-05,
+      "loss": 1.1572,
+      "step": 330
+    },
+    {
+      "epoch": 0.25111942020957395,
+      "grad_norm": 4.331061840057373,
+      "learning_rate": 1.99863635991801e-05,
+      "loss": 1.0744,
+      "step": 340
+    },
+    {
+      "epoch": 0.2585052855098555,
+      "grad_norm": 6.760496616363525,
+      "learning_rate": 1.997869584822058e-05,
+      "loss": 1.1019,
+      "step": 350
+    },
+    {
+      "epoch": 0.2658911508101371,
+      "grad_norm": 6.3948235511779785,
+      "learning_rate": 1.9969326816015556e-05,
+      "loss": 1.1073,
+      "step": 360
+    },
+    {
+      "epoch": 0.2732770161104187,
+      "grad_norm": 5.087249279022217,
+      "learning_rate": 1.9958258099903894e-05,
+      "loss": 1.0751,
+      "step": 370
+    },
+    {
+      "epoch": 0.2806628814107003,
+      "grad_norm": 10.829612731933594,
+      "learning_rate": 1.9945491587005867e-05,
+      "loss": 1.083,
+      "step": 380
+    },
+    {
+      "epoch": 0.28804874671098185,
+      "grad_norm": 5.7423176765441895,
+      "learning_rate": 1.9931029453901384e-05,
+      "loss": 1.0639,
+      "step": 390
+    },
+    {
+      "epoch": 0.2954346120112634,
+      "grad_norm": 4.613246440887451,
+      "learning_rate": 1.9914874166258927e-05,
+      "loss": 1.0604,
+      "step": 400
+    },
+    {
+      "epoch": 0.30282047731154504,
+      "grad_norm": 4.079463005065918,
+      "learning_rate": 1.9897028478415165e-05,
+      "loss": 1.0017,
+      "step": 410
+    },
+    {
+      "epoch": 0.3102063426118266,
+      "grad_norm": 4.641962051391602,
+      "learning_rate": 1.9877495432905363e-05,
+      "loss": 1.0263,
+      "step": 420
+    },
+    {
+      "epoch": 0.3175922079121082,
+      "grad_norm": 6.14805269241333,
+      "learning_rate": 1.9856278359944664e-05,
+      "loss": 1.0451,
+      "step": 430
+    },
+    {
+      "epoch": 0.3249780732123898,
+      "grad_norm": 5.665846824645996,
+      "learning_rate": 1.9833380876860305e-05,
+      "loss": 1.0361,
+      "step": 440
+    },
+    {
+      "epoch": 0.3323639385126714,
+      "grad_norm": 7.826813220977783,
+      "learning_rate": 1.9808806887474907e-05,
+      "loss": 0.9795,
+      "step": 450
+    },
+    {
+      "epoch": 0.33974980381295294,
+      "grad_norm": 4.955426216125488,
+      "learning_rate": 1.9782560581440894e-05,
+      "loss": 1.0433,
+      "step": 460
+    },
+    {
+      "epoch": 0.34713566911323457,
+      "grad_norm": 5.327470302581787,
+      "learning_rate": 1.97546464335262e-05,
+      "loss": 0.9605,
+      "step": 470
+    },
+    {
+      "epoch": 0.35452153441351614,
+      "grad_norm": 4.838713645935059,
+      "learning_rate": 1.972506920285136e-05,
+      "loss": 0.9935,
+      "step": 480
+    },
+    {
+      "epoch": 0.3619073997137977,
+      "grad_norm": 6.030056476593018,
+      "learning_rate": 1.969383393207813e-05,
+      "loss": 1.0043,
+      "step": 490
+    },
+    {
+      "epoch": 0.36929326501407933,
+      "grad_norm": 5.917972087860107,
+      "learning_rate": 1.9660945946549727e-05,
+      "loss": 0.9701,
+      "step": 500
+    },
+    {
+      "epoch": 0.3766791303143609,
+      "grad_norm": 4.341779708862305,
+      "learning_rate": 1.962641085338294e-05,
+      "loss": 0.9913,
+      "step": 510
+    },
+    {
+      "epoch": 0.38406499561464247,
+      "grad_norm": 4.399661064147949,
+      "learning_rate": 1.959023454051215e-05,
+      "loss": 0.9196,
+      "step": 520
+    },
+    {
+      "epoch": 0.39145086091492404,
+      "grad_norm": 4.028534412384033,
+      "learning_rate": 1.9552423175685478e-05,
+      "loss": 0.9369,
+      "step": 530
+    },
+    {
+      "epoch": 0.39883672621520566,
+      "grad_norm": 4.389466285705566,
+      "learning_rate": 1.9512983205413253e-05,
+      "loss": 1.0191,
+      "step": 540
+    },
+    {
+      "epoch": 0.40622259151548723,
+      "grad_norm": 5.277081489562988,
+      "learning_rate": 1.9471921353868932e-05,
+      "loss": 0.9399,
+      "step": 550
+    },
+    {
+      "epoch": 0.4136084568157688,
+      "grad_norm": 4.73630428314209,
+      "learning_rate": 1.9429244621742685e-05,
+      "loss": 0.9588,
+      "step": 560
+    },
+    {
+      "epoch": 0.4209943221160504,
+      "grad_norm": 3.3033573627471924,
+      "learning_rate": 1.938496028504784e-05,
+      "loss": 0.9038,
+      "step": 570
+    },
+    {
+      "epoch": 0.428380187416332,
+      "grad_norm": 7.80294942855835,
+      "learning_rate": 1.9339075893880382e-05,
+      "loss": 0.9403,
+      "step": 580
+    },
+    {
+      "epoch": 0.43576605271661356,
+      "grad_norm": 4.098162651062012,
+      "learning_rate": 1.9291599271131737e-05,
+      "loss": 0.9344,
+      "step": 590
+    },
+    {
+      "epoch": 0.4431519180168952,
+      "grad_norm": 3.7808070182800293,
+      "learning_rate": 1.9242538511155024e-05,
+      "loss": 0.8939,
+      "step": 600
+    },
+    {
+      "epoch": 0.45053778331717675,
+      "grad_norm": 4.160403728485107,
+      "learning_rate": 1.9191901978385048e-05,
+      "loss": 0.8786,
+      "step": 610
+    },
+    {
+      "epoch": 0.4579236486174583,
+      "grad_norm": 3.7800965309143066,
+      "learning_rate": 1.9139698305912227e-05,
+      "loss": 0.8977,
+      "step": 620
+    },
+    {
+      "epoch": 0.46530951391773995,
+      "grad_norm": 3.8200621604919434,
+      "learning_rate": 1.9085936394010733e-05,
+      "loss": 0.8793,
+      "step": 630
+    },
+    {
+      "epoch": 0.4726953792180215,
+      "grad_norm": 4.453779220581055,
+      "learning_rate": 1.903062540862107e-05,
+      "loss": 0.8813,
+      "step": 640
+    },
+    {
+      "epoch": 0.4800812445183031,
+      "grad_norm": 5.653434753417969,
+      "learning_rate": 1.897377477978736e-05,
+      "loss": 0.9544,
+      "step": 650
+    },
+    {
+      "epoch": 0.4874671098185847,
+      "grad_norm": 4.868826389312744,
+      "learning_rate": 1.8915394200049597e-05,
+      "loss": 0.8858,
+      "step": 660
+    },
+    {
+      "epoch": 0.4948529751188663,
+      "grad_norm": 4.187640190124512,
+      "learning_rate": 1.8855493622791163e-05,
+      "loss": 0.9077,
+      "step": 670
+    },
+    {
+      "epoch": 0.5022388404191479,
+      "grad_norm": 4.503122806549072,
+      "learning_rate": 1.8794083260541853e-05,
+      "loss": 0.9278,
+      "step": 680
+    },
+    {
+      "epoch": 0.5096247057194294,
+      "grad_norm": 4.902103900909424,
+      "learning_rate": 1.8731173583236737e-05,
+      "loss": 0.8281,
+      "step": 690
+    },
+    {
+      "epoch": 0.517010571019711,
+      "grad_norm": 4.273303031921387,
+      "learning_rate": 1.8666775316431113e-05,
+      "loss": 0.8054,
+      "step": 700
+    },
+    {
+      "epoch": 0.5243964363199927,
+      "grad_norm": 55.874359130859375,
+      "learning_rate": 1.8600899439471902e-05,
+      "loss": 0.8091,
+      "step": 710
+    },
+    {
+      "epoch": 0.5317823016202742,
+      "grad_norm": 4.271385192871094,
+      "learning_rate": 1.8533557183625773e-05,
+      "loss": 0.788,
+      "step": 720
+    },
+    {
+      "epoch": 0.5391681669205558,
+      "grad_norm": 5.59772253036499,
+      "learning_rate": 1.8464760030164287e-05,
+      "loss": 0.7942,
+      "step": 730
+    },
+    {
+      "epoch": 0.5465540322208374,
+      "grad_norm": 3.724728584289551,
+      "learning_rate": 1.8394519708406454e-05,
+      "loss": 0.8234,
+      "step": 740
+    },
+    {
+      "epoch": 0.5539398975211189,
+      "grad_norm": 5.2906036376953125,
+      "learning_rate": 1.8322848193718984e-05,
+      "loss": 0.8143,
+      "step": 750
+    },
+    {
+      "epoch": 0.5613257628214006,
+      "grad_norm": 5.114410877227783,
+      "learning_rate": 1.82497577054746e-05,
+      "loss": 0.7946,
+      "step": 760
+    },
+    {
+      "epoch": 0.5687116281216821,
+      "grad_norm": 4.730770587921143,
+      "learning_rate": 1.8175260704968716e-05,
+      "loss": 0.7771,
+      "step": 770
+    },
+    {
+      "epoch": 0.5760974934219637,
+      "grad_norm": 3.0836727619171143,
+      "learning_rate": 1.809936989329492e-05,
+      "loss": 0.739,
+      "step": 780
+    },
+    {
+      "epoch": 0.5834833587222453,
+      "grad_norm": 2.7664663791656494,
+      "learning_rate": 1.802209820917952e-05,
+      "loss": 0.731,
+      "step": 790
+    },
+    {
+      "epoch": 0.5908692240225268,
+      "grad_norm": 3.5617446899414062,
+      "learning_rate": 1.7943458826775646e-05,
+      "loss": 0.6807,
+      "step": 800
+    },
+    {
+      "epoch": 0.5982550893228085,
+      "grad_norm": 7.652963638305664,
+      "learning_rate": 1.786346515341712e-05,
+      "loss": 0.6883,
+      "step": 810
+    },
+    {
+      "epoch": 0.6056409546230901,
+      "grad_norm": 3.5472395420074463,
+      "learning_rate": 1.778213082733266e-05,
+      "loss": 0.6822,
+      "step": 820
+    },
+    {
+      "epoch": 0.6130268199233716,
+      "grad_norm": 4.652453899383545,
+      "learning_rate": 1.7699469715320663e-05,
+      "loss": 0.6508,
+      "step": 830
+    },
+    {
+      "epoch": 0.6204126852236532,
+      "grad_norm": 3.976405620574951,
+      "learning_rate": 1.7615495910385036e-05,
+      "loss": 0.6007,
+      "step": 840
+    },
+    {
+      "epoch": 0.6277985505239349,
+      "grad_norm": 3.0713090896606445,
+      "learning_rate": 1.7530223729332464e-05,
+      "loss": 0.6174,
+      "step": 850
+    },
+    {
+      "epoch": 0.6351844158242164,
+      "grad_norm": 4.036540508270264,
+      "learning_rate": 1.7443667710331523e-05,
+      "loss": 0.617,
+      "step": 860
+    },
+    {
+      "epoch": 0.642570281124498,
+      "grad_norm": 7.731866836547852,
+      "learning_rate": 1.7355842610434045e-05,
+      "loss": 0.6245,
+      "step": 870
+    },
+    {
+      "epoch": 0.6499561464247796,
+      "grad_norm": 4.550940036773682,
+      "learning_rate": 1.7266763403059162e-05,
+      "loss": 0.593,
+      "step": 880
+    },
+    {
+      "epoch": 0.6573420117250611,
+      "grad_norm": 2.5473084449768066,
+      "learning_rate": 1.7176445275440468e-05,
+      "loss": 0.5677,
+      "step": 890
+    },
+    {
+      "epoch": 0.6647278770253428,
+      "grad_norm": 2.1716835498809814,
+      "learning_rate": 1.7084903626036743e-05,
+      "loss": 0.5452,
+      "step": 900
+    },
+    {
+      "epoch": 0.6721137423256244,
+      "grad_norm": 4.398560523986816,
+      "learning_rate": 1.6992154061906637e-05,
+      "loss": 0.5599,
+      "step": 910
+    },
+    {
+      "epoch": 0.6794996076259059,
+      "grad_norm": 2.8742692470550537,
+      "learning_rate": 1.6898212396047788e-05,
+      "loss": 0.5004,
+      "step": 920
+    },
+    {
+      "epoch": 0.6868854729261875,
+      "grad_norm": 3.202517032623291,
+      "learning_rate": 1.6803094644700878e-05,
+      "loss": 0.5079,
+      "step": 930
+    },
+    {
+      "epoch": 0.6942713382264691,
+      "grad_norm": 5.449188232421875,
+      "learning_rate": 1.6706817024618966e-05,
+      "loss": 0.5122,
+      "step": 940
+    },
+    {
+      "epoch": 0.7016572035267507,
+      "grad_norm": 5.538541316986084,
+      "learning_rate": 1.6609395950302693e-05,
+      "loss": 0.5241,
+      "step": 950
+    },
+    {
+      "epoch": 0.7090430688270323,
+      "grad_norm": 3.380526304244995,
+      "learning_rate": 1.6510848031201755e-05,
+      "loss": 0.4631,
+      "step": 960
+    },
+    {
+      "epoch": 0.7164289341273139,
+      "grad_norm": 3.240527629852295,
+      "learning_rate": 1.6411190068883114e-05,
+      "loss": 0.5214,
+      "step": 970
+    },
+    {
+      "epoch": 0.7238147994275954,
+      "grad_norm": 16.668127059936523,
+      "learning_rate": 1.63104390541665e-05,
+      "loss": 0.5373,
+      "step": 980
+    },
+    {
+      "epoch": 0.731200664727877,
+      "grad_norm": 3.9278078079223633,
+      "learning_rate": 1.6208612164227605e-05,
+      "loss": 0.4789,
+      "step": 990
+    },
+    {
+      "epoch": 0.7385865300281587,
+      "grad_norm": 3.5258326530456543,
+      "learning_rate": 1.6105726759669534e-05,
+      "loss": 0.465,
+      "step": 1000
+    },
+    {
+      "epoch": 0.7459723953284402,
+      "grad_norm": 2.779311418533325,
+      "learning_rate": 1.600180038156298e-05,
+      "loss": 0.4501,
+      "step": 1010
+    },
+    {
+      "epoch": 0.7533582606287218,
+      "grad_norm": 3.857485771179199,
+      "learning_rate": 1.58968507484556e-05,
+      "loss": 0.4519,
+      "step": 1020
+    },
+    {
+      "epoch": 0.7607441259290034,
+      "grad_norm": 2.959052324295044,
+      "learning_rate": 1.579089575335117e-05,
+      "loss": 0.4357,
+      "step": 1030
+    },
+    {
+      "epoch": 0.7681299912292849,
+      "grad_norm": 1.8662097454071045,
+      "learning_rate": 1.568395346065899e-05,
+      "loss": 0.3633,
+      "step": 1040
+    },
+    {
+      "epoch": 0.7755158565295666,
+      "grad_norm": 5.543001174926758,
+      "learning_rate": 1.5576042103114043e-05,
+      "loss": 0.4111,
+      "step": 1050
+    },
+    {
+      "epoch": 0.7829017218298481,
+      "grad_norm": 6.083206653594971,
+      "learning_rate": 1.5467180078668485e-05,
+      "loss": 0.3764,
+      "step": 1060
+    },
+    {
+      "epoch": 0.7902875871301297,
+      "grad_norm": 2.5218305587768555,
+      "learning_rate": 1.5357385947354945e-05,
+      "loss": 0.374,
+      "step": 1070
+    },
+    {
+      "epoch": 0.7976734524304113,
+      "grad_norm": 4.317601680755615,
+      "learning_rate": 1.52466784281222e-05,
+      "loss": 0.3571,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8050593177306928,
+      "grad_norm": 2.0782041549682617,
+      "learning_rate": 1.5135076395643765e-05,
+      "loss": 0.3739,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8124451830309745,
+      "grad_norm": 2.443953037261963,
+      "learning_rate": 1.5022598877099913e-05,
+      "loss": 0.3607,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8198310483312561,
+      "grad_norm": 2.276827573776245,
+      "learning_rate": 1.4909265048933716e-05,
+      "loss": 0.3607,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8272169136315376,
+      "grad_norm": 2.808431386947632,
+      "learning_rate": 1.4795094233581616e-05,
+      "loss": 0.3387,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8346027789318192,
+      "grad_norm": 2.5325915813446045,
+      "learning_rate": 1.468010589617913e-05,
+      "loss": 0.3172,
+      "step": 1130
+    },
+    {
+      "epoch": 0.8419886442321008,
+      "grad_norm": 2.4943833351135254,
+      "learning_rate": 1.4564319641242202e-05,
+      "loss": 0.3193,
+      "step": 1140
+    },
+    {
+      "epoch": 0.8493745095323824,
+      "grad_norm": 2.2182066440582275,
+      "learning_rate": 1.4447755209324807e-05,
+      "loss": 0.3118,
+      "step": 1150
+    },
+    {
+      "epoch": 0.856760374832664,
+      "grad_norm": 1.920409083366394,
+      "learning_rate": 1.4330432473653369e-05,
+      "loss": 0.3246,
+      "step": 1160
+    },
+    {
+      "epoch": 0.8641462401329456,
+      "grad_norm": 3.2863781452178955,
+      "learning_rate": 1.4212371436738518e-05,
+      "loss": 0.3065,
+      "step": 1170
+    },
+    {
+      "epoch": 0.8715321054332271,
+      "grad_norm": 2.6266987323760986,
+      "learning_rate": 1.4093592226964863e-05,
+      "loss": 0.2813,
+      "step": 1180
+    },
+    {
+      "epoch": 0.8789179707335087,
+      "grad_norm": 2.526742935180664,
+      "learning_rate": 1.3974115095159273e-05,
+      "loss": 0.284,
+      "step": 1190
+    },
+    {
+      "epoch": 0.8863038360337904,
+      "grad_norm": 2.1190872192382812,
+      "learning_rate": 1.3853960411138272e-05,
+      "loss": 0.2865,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8936897013340719,
+      "grad_norm": 3.0260584354400635,
+      "learning_rate": 1.373314866023517e-05,
+      "loss": 0.3019,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9010755666343535,
+      "grad_norm": 4.537729740142822,
+      "learning_rate": 1.3611700439807503e-05,
+      "loss": 0.2946,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9084614319346351,
+      "grad_norm": 3.150209903717041,
+      "learning_rate": 1.3489636455725337e-05,
+      "loss": 0.2795,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9158472972349166,
+      "grad_norm": 1.6362818479537964,
+      "learning_rate": 1.336697751884111e-05,
+      "loss": 0.2815,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9232331625351983,
+      "grad_norm": 1.3282984495162964,
+      "learning_rate": 1.3243744541441578e-05,
+      "loss": 0.2679,
+      "step": 1250
+    },
+    {
+      "epoch": 0.9306190278354799,
+      "grad_norm": 4.261312961578369,
+      "learning_rate": 1.3119958533682417e-05,
+      "loss": 0.2634,
+      "step": 1260
+    },
+    {
+      "epoch": 0.9380048931357614,
+      "grad_norm": 2.1109001636505127,
+      "learning_rate": 1.2995640600006196e-05,
+      "loss": 0.2566,
+      "step": 1270
+    },
+    {
+      "epoch": 0.945390758436043,
+      "grad_norm": 2.4117610454559326,
+      "learning_rate": 1.2870811935544252e-05,
+      "loss": 0.2502,
+      "step": 1280
+    },
+    {
+      "epoch": 0.9527766237363247,
+      "grad_norm": 2.0748672485351562,
+      "learning_rate": 1.2745493822503096e-05,
+      "loss": 0.2422,
+      "step": 1290
+    },
+    {
+      "epoch": 0.9601624890366062,
+      "grad_norm": 3.0310394763946533,
+      "learning_rate": 1.261970762653598e-05,
+      "loss": 0.2508,
+      "step": 1300
+    },
+    {
+      "epoch": 0.9675483543368878,
+      "grad_norm": 2.0341477394104004,
+      "learning_rate": 1.2493474793100249e-05,
+      "loss": 0.2467,
+      "step": 1310
+    },
+    {
+      "epoch": 0.9749342196371694,
+      "grad_norm": 1.4582960605621338,
+      "learning_rate": 1.2366816843801066e-05,
+      "loss": 0.2479,
+      "step": 1320
+    },
+    {
+      "epoch": 0.9823200849374509,
+      "grad_norm": 3.3330225944519043,
+      "learning_rate": 1.2239755372722169e-05,
+      "loss": 0.2516,
+      "step": 1330
+    },
+    {
+      "epoch": 0.9897059502377326,
+      "grad_norm": 1.4349642992019653,
+      "learning_rate": 1.2112312042744263e-05,
+      "loss": 0.2153,
+      "step": 1340
+    },
+    {
+      "epoch": 0.9970918155380141,
+      "grad_norm": 2.073673725128174,
+      "learning_rate": 1.1984508581851694e-05,
+      "loss": 0.1858,
+      "step": 1350
+    },
+    {
+      "epoch": 1.0051701057101972,
+      "grad_norm": 4.247702598571777,
+      "learning_rate": 1.1856366779428008e-05,
+      "loss": 0.2183,
+      "step": 1360
+    },
+    {
+      "epoch": 1.0125559710104788,
+      "grad_norm": 4.242294788360596,
+      "learning_rate": 1.1727908482541048e-05,
+      "loss": 0.2059,
+      "step": 1370
+    },
+    {
+      "epoch": 1.0199418363107602,
+      "grad_norm": 2.2901999950408936,
+      "learning_rate": 1.1599155592218234e-05,
+      "loss": 0.2207,
+      "step": 1380
+    },
+    {
+      "epoch": 1.0273277016110418,
+      "grad_norm": 1.7798693180084229,
+      "learning_rate": 1.1470130059712607e-05,
+      "loss": 0.1898,
+      "step": 1390
+    },
+    {
+      "epoch": 1.0347135669113234,
+      "grad_norm": 1.9651380777359009,
+      "learning_rate": 1.1340853882760343e-05,
+      "loss": 0.1958,
+      "step": 1400
+    },
+    {
+      "epoch": 1.042099432211605,
+      "grad_norm": 1.8335607051849365,
+      "learning_rate": 1.1211349101830323e-05,
+      "loss": 0.2201,
+      "step": 1410
+    },
+    {
+      "epoch": 1.0494852975118867,
+      "grad_norm": 2.270725965499878,
+      "learning_rate": 1.1081637796366432e-05,
+      "loss": 0.1881,
+      "step": 1420
+    },
+    {
+      "epoch": 1.0568711628121683,
+      "grad_norm": 3.337350368499756,
+      "learning_rate": 1.0951742081023196e-05,
+      "loss": 0.2176,
+      "step": 1430
+    },
+    {
+      "epoch": 1.0642570281124497,
+      "grad_norm": 3.7382607460021973,
+      "learning_rate": 1.0821684101895429e-05,
+      "loss": 0.2043,
+      "step": 1440
+    },
+    {
+      "epoch": 1.0716428934127313,
+      "grad_norm": 1.3422726392745972,
+      "learning_rate": 1.0691486032742522e-05,
+      "loss": 0.1908,
+      "step": 1450
+    },
+    {
+      "epoch": 1.079028758713013,
+      "grad_norm": 3.4625842571258545,
+      "learning_rate": 1.0561170071207987e-05,
+      "loss": 0.1747,
+      "step": 1460
+    },
+    {
+      "epoch": 1.0864146240132946,
+      "grad_norm": 1.8566938638687134,
+      "learning_rate": 1.0430758435034985e-05,
+      "loss": 0.2003,
+      "step": 1470
+    },
+    {
+      "epoch": 1.0938004893135762,
+      "grad_norm": 4.041960716247559,
+      "learning_rate": 1.0300273358278362e-05,
+      "loss": 0.1716,
+      "step": 1480
+    },
+    {
+      "epoch": 1.1011863546138578,
+      "grad_norm": 1.5447806119918823,
+      "learning_rate": 1.016973708751395e-05,
+      "loss": 0.1911,
+      "step": 1490
+    },
+    {
+      "epoch": 1.1085722199141392,
+      "grad_norm": 1.8091706037521362,
+      "learning_rate": 1.003917187804572e-05,
+      "loss": 0.1687,
+      "step": 1500
+    },
+    {
+      "epoch": 1.1159580852144209,
+      "grad_norm": 1.5981247425079346,
+      "learning_rate": 9.908599990111438e-06,
+      "loss": 0.1706,
+      "step": 1510
+    },
+    {
+      "epoch": 1.1233439505147025,
+      "grad_norm": 1.5762553215026855,
+      "learning_rate": 9.778043685087488e-06,
+      "loss": 0.1896,
+      "step": 1520
+    },
+    {
+      "epoch": 1.130729815814984,
+      "grad_norm": 1.4694616794586182,
+      "learning_rate": 9.64752522169351e-06,
+      "loss": 0.1718,
+      "step": 1530
+    },
+    {
+      "epoch": 1.1381156811152657,
+      "grad_norm": 1.4669324159622192,
+      "learning_rate": 9.517066852197469e-06,
+      "loss": 0.1481,
+      "step": 1540
+    },
+    {
+      "epoch": 1.1455015464155474,
+      "grad_norm": 2.1808154582977295,
+      "learning_rate": 9.386690818621845e-06,
+      "loss": 0.1878,
+      "step": 1550
+    },
+    {
+      "epoch": 1.1528874117158288,
+      "grad_norm": 1.0794235467910767,
+      "learning_rate": 9.256419348951545e-06,
+      "loss": 0.1809,
+      "step": 1560
+    },
+    {
+      "epoch": 1.1602732770161104,
+      "grad_norm": 1.1634767055511475,
+      "learning_rate": 9.126274653344249e-06,
+      "loss": 0.1558,
+      "step": 1570
+    },
+    {
+      "epoch": 1.167659142316392,
+      "grad_norm": 3.980741024017334,
+      "learning_rate": 8.996278920343753e-06,
+      "loss": 0.1714,
+      "step": 1580
+    },
+    {
+      "epoch": 1.1750450076166736,
+      "grad_norm": 1.3018531799316406,
+      "learning_rate": 8.866454313097011e-06,
+      "loss": 0.1476,
+      "step": 1590
+    },
+    {
+      "epoch": 1.1824308729169553,
+      "grad_norm": 1.6033530235290527,
+      "learning_rate": 8.736822965575526e-06,
+      "loss": 0.1702,
+      "step": 1600
+    },
+    {
+      "epoch": 1.1898167382172367,
+      "grad_norm": 1.6837263107299805,
+      "learning_rate": 8.607406978801692e-06,
+      "loss": 0.1622,
+      "step": 1610
+    },
+    {
+      "epoch": 1.1972026035175183,
+      "grad_norm": 4.44855260848999,
+      "learning_rate": 8.478228417080749e-06,
+      "loss": 0.2111,
+      "step": 1620
+    },
+    {
+      "epoch": 1.2045884688178,
+      "grad_norm": 1.133955478668213,
+      "learning_rate": 8.349309304239033e-06,
+      "loss": 0.1407,
+      "step": 1630
+    },
+    {
+      "epoch": 1.2119743341180815,
+      "grad_norm": 2.430974006652832,
+      "learning_rate": 8.22067161986909e-06,
+      "loss": 0.1502,
+      "step": 1640
+    },
+    {
+      "epoch": 1.2193601994183632,
+      "grad_norm": 1.0593976974487305,
+      "learning_rate": 8.092337295582342e-06,
+      "loss": 0.1461,
+      "step": 1650
+    },
+    {
+      "epoch": 1.2267460647186448,
+      "grad_norm": 1.5466171503067017,
+      "learning_rate": 7.964328211269949e-06,
+      "loss": 0.1257,
+      "step": 1660
+    },
+    {
+      "epoch": 1.2341319300189264,
+      "grad_norm": 3.7850043773651123,
+      "learning_rate": 7.83666619137247e-06,
+      "loss": 0.1237,
+      "step": 1670
+    },
+    {
+      "epoch": 1.2415177953192078,
+      "grad_norm": 2.987395763397217,
+      "learning_rate": 7.709373001158989e-06,
+      "loss": 0.135,
+      "step": 1680
+    },
+    {
+      "epoch": 1.2489036606194894,
+      "grad_norm": 1.1026815176010132,
+      "learning_rate": 7.582470343016315e-06,
+      "loss": 0.1339,
+      "step": 1690
+    },
+    {
+      "epoch": 1.256289525919771,
+      "grad_norm": 0.8675901293754578,
+      "learning_rate": 7.455979852748926e-06,
+      "loss": 0.1187,
+      "step": 1700
+    },
+    {
+      "epoch": 1.2636753912200527,
+      "grad_norm": 1.0071134567260742,
+      "learning_rate": 7.3299230958902455e-06,
+      "loss": 0.1288,
+      "step": 1710
+    },
+    {
+      "epoch": 1.2710612565203343,
+      "grad_norm": 1.257807731628418,
+      "learning_rate": 7.2043215640259045e-06,
+      "loss": 0.1219,
+      "step": 1720
+    },
+    {
+      "epoch": 1.2784471218206157,
+      "grad_norm": 1.5844953060150146,
+      "learning_rate": 7.079196671129613e-06,
+      "loss": 0.1293,
+      "step": 1730
+    },
+    {
+      "epoch": 1.2858329871208973,
+      "grad_norm": 1.242968201637268,
+      "learning_rate": 6.954569749912268e-06,
+      "loss": 0.1242,
+      "step": 1740
+    },
+    {
+      "epoch": 1.293218852421179,
+      "grad_norm": 6.035883903503418,
+      "learning_rate": 6.8304620481849e-06,
+      "loss": 0.1324,
+      "step": 1750
+    },
+    {
+      "epoch": 1.3006047177214606,
+      "grad_norm": 1.1064496040344238,
+      "learning_rate": 6.706894725236118e-06,
+      "loss": 0.113,
+      "step": 1760
+    },
+    {
+      "epoch": 1.3079905830217422,
+      "grad_norm": 3.75222110748291,
+      "learning_rate": 6.583888848224628e-06,
+      "loss": 0.1402,
+      "step": 1770
+    },
+    {
+      "epoch": 1.3153764483220236,
+      "grad_norm": 2.064958333969116,
+      "learning_rate": 6.4614653885874564e-06,
+      "loss": 0.1354,
+      "step": 1780
+    },
+    {
+      "epoch": 1.3227623136223052,
+      "grad_norm": 1.2012087106704712,
+      "learning_rate": 6.339645218464521e-06,
+      "loss": 0.1162,
+      "step": 1790
+    },
+    {
+      "epoch": 1.3301481789225869,
+      "grad_norm": 3.533600330352783,
+      "learning_rate": 6.218449107140093e-06,
+      "loss": 0.114,
+      "step": 1800
+    },
+    {
+      "epoch": 1.3375340442228685,
+      "grad_norm": 1.0663248300552368,
+      "learning_rate": 6.097897717501829e-06,
+      "loss": 0.1102,
+      "step": 1810
+    },
+    {
+      "epoch": 1.34491990952315,
+      "grad_norm": 2.6653411388397217,
+      "learning_rate": 5.978011602517908e-06,
+      "loss": 0.1115,
+      "step": 1820
+    },
+    {
+      "epoch": 1.3523057748234317,
+      "grad_norm": 2.8922715187072754,
+      "learning_rate": 5.858811201732952e-06,
+      "loss": 0.1168,
+      "step": 1830
+    },
+    {
+      "epoch": 1.3596916401237134,
+      "grad_norm": 0.7805532813072205,
+      "learning_rate": 5.740316837783247e-06,
+      "loss": 0.0985,
+      "step": 1840
+    },
+    {
+      "epoch": 1.3670775054239948,
+      "grad_norm": 1.6969873905181885,
+      "learning_rate": 5.622548712931907e-06,
+      "loss": 0.115,
+      "step": 1850
+    },
+    {
+      "epoch": 1.3744633707242764,
+      "grad_norm": 1.0871217250823975,
+      "learning_rate": 5.50552690562457e-06,
+      "loss": 0.1077,
+      "step": 1860
+    },
+    {
+      "epoch": 1.381849236024558,
+      "grad_norm": 1.25892174243927,
+      "learning_rate": 5.389271367066193e-06,
+      "loss": 0.0974,
+      "step": 1870
+    },
+    {
+      "epoch": 1.3892351013248396,
+      "grad_norm": 0.6338607668876648,
+      "learning_rate": 5.273801917819552e-06,
+      "loss": 0.098,
+      "step": 1880
+    },
+    {
+      "epoch": 1.3966209666251213,
+      "grad_norm": 0.43911364674568176,
+      "learning_rate": 5.159138244425996e-06,
+      "loss": 0.0965,
+      "step": 1890
+    },
+    {
+      "epoch": 1.4040068319254027,
+      "grad_norm": 0.7171842455863953,
+      "learning_rate": 5.045299896049063e-06,
+      "loss": 0.1043,
+      "step": 1900
+    },
+    {
+      "epoch": 1.4113926972256843,
+      "grad_norm": 0.7495408058166504,
+      "learning_rate": 4.932306281141531e-06,
+      "loss": 0.1067,
+      "step": 1910
+    },
+    {
+      "epoch": 1.418778562525966,
+      "grad_norm": 0.6386808753013611,
+      "learning_rate": 4.82017666413643e-06,
+      "loss": 0.095,
+      "step": 1920
+    },
+    {
+      "epoch": 1.4261644278262475,
+      "grad_norm": 0.4710920751094818,
+      "learning_rate": 4.7089301621626285e-06,
+      "loss": 0.0946,
+      "step": 1930
+    },
+    {
+      "epoch": 1.4335502931265292,
+      "grad_norm": 2.0037851333618164,
+      "learning_rate": 4.598585741785529e-06,
+      "loss": 0.1343,
+      "step": 1940
+    },
+    {
+      "epoch": 1.4409361584268106,
+      "grad_norm": 0.731887936592102,
+      "learning_rate": 4.489162215773437e-06,
+      "loss": 0.1021,
+      "step": 1950
+    },
+    {
+      "epoch": 1.4483220237270924,
+      "grad_norm": 1.012526035308838,
+      "learning_rate": 4.380678239890128e-06,
+      "loss": 0.0986,
+      "step": 1960
+    },
+    {
+      "epoch": 1.4557078890273738,
+      "grad_norm": 1.7591279745101929,
+      "learning_rate": 4.273152309714231e-06,
+      "loss": 0.0921,
+      "step": 1970
+    },
+    {
+      "epoch": 1.4630937543276554,
+      "grad_norm": 0.5881451964378357,
+      "learning_rate": 4.166602757485865e-06,
+      "loss": 0.0889,
+      "step": 1980
+    },
+    {
+      "epoch": 1.470479619627937,
+      "grad_norm": 0.6772285103797913,
+      "learning_rate": 4.061047748981171e-06,
+      "loss": 0.0999,
+      "step": 1990
+    },
+    {
+      "epoch": 1.4778654849282187,
+      "grad_norm": 1.0633774995803833,
+      "learning_rate": 3.9565052804151925e-06,
+      "loss": 0.0929,
+      "step": 2000
+    },
+    {
+      "epoch": 1.4852513502285003,
+      "grad_norm": 0.5887898802757263,
+      "learning_rate": 3.852993175373679e-06,
+      "loss": 0.0929,
+      "step": 2010
+    },
+    {
+      "epoch": 1.4926372155287817,
+      "grad_norm": 0.9685658812522888,
+      "learning_rate": 3.7505290817743256e-06,
+      "loss": 0.0932,
+      "step": 2020
+    },
+    {
+      "epoch": 1.5000230808290633,
+      "grad_norm": 3.481058120727539,
+      "learning_rate": 3.6491304688579376e-06,
+      "loss": 0.1034,
+      "step": 2030
+    },
+    {
+      "epoch": 1.507408946129345,
+      "grad_norm": 1.2913931608200073,
+      "learning_rate": 3.5488146242101018e-06,
+      "loss": 0.0914,
+      "step": 2040
+    },
+    {
+      "epoch": 1.5147948114296266,
+      "grad_norm": 0.49071353673934937,
+      "learning_rate": 3.4495986508137847e-06,
+      "loss": 0.097,
+      "step": 2050
+    },
+    {
+      "epoch": 1.5221806767299082,
+      "grad_norm": 0.7845070362091064,
+      "learning_rate": 3.3514994641334274e-06,
+      "loss": 0.0895,
+      "step": 2060
+    },
+    {
+      "epoch": 1.5295665420301896,
+      "grad_norm": 0.7540778517723083,
+      "learning_rate": 3.254533789231008e-06,
+      "loss": 0.094,
+      "step": 2070
+    },
+    {
+      "epoch": 1.5369524073304714,
+      "grad_norm": 0.8221713900566101,
+      "learning_rate": 3.158718157914559e-06,
+      "loss": 0.0857,
+      "step": 2080
+    },
+    {
+      "epoch": 1.5443382726307529,
+      "grad_norm": 0.458886057138443,
+      "learning_rate": 3.0640689059196328e-06,
+      "loss": 0.0834,
+      "step": 2090
+    },
+    {
+      "epoch": 1.5517241379310345,
+      "grad_norm": 5.687739372253418,
+      "learning_rate": 2.9706021701242127e-06,
+      "loss": 0.0944,
+      "step": 2100
+    },
+    {
+      "epoch": 1.559110003231316,
+      "grad_norm": 0.609434962272644,
+      "learning_rate": 2.8783338857975087e-06,
+      "loss": 0.0926,
+      "step": 2110
+    },
+    {
+      "epoch": 1.5664958685315975,
+      "grad_norm": 3.346607208251953,
+      "learning_rate": 2.787279783883129e-06,
+      "loss": 0.087,
+      "step": 2120
+    },
+    {
+      "epoch": 1.5738817338318793,
+      "grad_norm": 2.047215700149536,
+      "learning_rate": 2.697455388317094e-06,
+      "loss": 0.0807,
+      "step": 2130
+    },
+    {
+      "epoch": 1.5812675991321608,
+      "grad_norm": 1.0655306577682495,
+      "learning_rate": 2.6088760133811418e-06,
+      "loss": 0.0857,
+      "step": 2140
+    },
+    {
+      "epoch": 1.5886534644324424,
+      "grad_norm": 1.1660749912261963,
+      "learning_rate": 2.5215567610917623e-06,
+      "loss": 0.08,
+      "step": 2150
+    },
+    {
+      "epoch": 1.596039329732724,
+      "grad_norm": 0.45875102281570435,
+      "learning_rate": 2.4355125186254547e-06,
+      "loss": 0.0931,
+      "step": 2160
+    },
+    {
+      "epoch": 1.6034251950330056,
+      "grad_norm": 1.5347977876663208,
+      "learning_rate": 2.3507579557805803e-06,
+      "loss": 0.083,
+      "step": 2170
+    },
+    {
+      "epoch": 1.6108110603332872,
+      "grad_norm": 1.1268221139907837,
+      "learning_rate": 2.26730752247629e-06,
+      "loss": 0.0841,
+      "step": 2180
+    },
+    {
+      "epoch": 1.6181969256335687,
+      "grad_norm": 0.4492045044898987,
+      "learning_rate": 2.1851754462889373e-06,
+      "loss": 0.0791,
+      "step": 2190
+    },
+    {
+      "epoch": 1.6255827909338505,
+      "grad_norm": 0.9329794645309448,
+      "learning_rate": 2.104375730026406e-06,
+      "loss": 0.0827,
+      "step": 2200
+    },
+    {
+      "epoch": 1.632968656234132,
+      "grad_norm": 0.4460253119468689,
+      "learning_rate": 2.024922149340748e-06,
+      "loss": 0.0812,
+      "step": 2210
+    },
+    {
+      "epoch": 1.6403545215344135,
+      "grad_norm": 3.0073747634887695,
+      "learning_rate": 1.9468282503795465e-06,
+      "loss": 0.0836,
+      "step": 2220
+    },
+    {
+      "epoch": 1.6477403868346951,
+      "grad_norm": 0.7037497758865356,
+      "learning_rate": 1.8701073474764342e-06,
+      "loss": 0.0757,
+      "step": 2230
+    },
+    {
+      "epoch": 1.6551262521349765,
+      "grad_norm": 2.326693058013916,
+      "learning_rate": 1.7947725208810962e-06,
+      "loss": 0.0743,
+      "step": 2240
+    },
+    {
+      "epoch": 1.6625121174352584,
+      "grad_norm": 0.2990873456001282,
+      "learning_rate": 1.720836614529211e-06,
+      "loss": 0.0799,
+      "step": 2250
+    },
+    {
+      "epoch": 1.6698979827355398,
+      "grad_norm": 0.4213595688343048,
+      "learning_rate": 1.648312233852666e-06,
+      "loss": 0.0802,
+      "step": 2260
+    },
+    {
+      "epoch": 1.6772838480358214,
+      "grad_norm": 0.5848265290260315,
+      "learning_rate": 1.5772117436304446e-06,
+      "loss": 0.0795,
+      "step": 2270
+    },
+    {
+      "epoch": 1.684669713336103,
+      "grad_norm": 0.6411451697349548,
+      "learning_rate": 1.5075472658805301e-06,
+      "loss": 0.0739,
+      "step": 2280
+    },
+    {
+      "epoch": 1.6920555786363847,
+      "grad_norm": 0.8654035925865173,
+      "learning_rate": 1.4393306777932192e-06,
+      "loss": 0.0796,
+      "step": 2290
+    },
+    {
+      "epoch": 1.6994414439366663,
+      "grad_norm": 0.7043092250823975,
+      "learning_rate": 1.3725736097061537e-06,
+      "loss": 0.0811,
+      "step": 2300
+    },
+    {
+      "epoch": 1.7068273092369477,
+      "grad_norm": 1.6693702936172485,
+      "learning_rate": 1.307287443121452e-06,
+      "loss": 0.094,
+      "step": 2310
+    },
+    {
+      "epoch": 1.7142131745372293,
+      "grad_norm": 0.33761119842529297,
+      "learning_rate": 1.2434833087652642e-06,
+      "loss": 0.0759,
+      "step": 2320
+    },
+    {
+      "epoch": 1.721599039837511,
+      "grad_norm": 0.9389520883560181,
+      "learning_rate": 1.181172084690072e-06,
+      "loss": 0.0727,
+      "step": 2330
+    },
+    {
+      "epoch": 1.7289849051377926,
+      "grad_norm": 0.2903837263584137,
+      "learning_rate": 1.120364394420087e-06,
+      "loss": 0.0743,
+      "step": 2340
+    },
+    {
+      "epoch": 1.7363707704380742,
+      "grad_norm": 0.325009822845459,
+      "learning_rate": 1.0610706051400165e-06,
+      "loss": 0.0801,
+      "step": 2350
+    },
+    {
+      "epoch": 1.7437566357383556,
+      "grad_norm": 0.9325069785118103,
+      "learning_rate": 1.0033008259275635e-06,
+      "loss": 0.0759,
+      "step": 2360
+    },
+    {
+      "epoch": 1.7511425010386374,
+      "grad_norm": 1.0802961587905884,
+      "learning_rate": 9.470649060299041e-07,
+      "loss": 0.0779,
+      "step": 2370
+    },
+    {
+      "epoch": 1.7585283663389188,
+      "grad_norm": 0.4947347939014435,
+      "learning_rate": 8.923724331844875e-07,
+      "loss": 0.0786,
+      "step": 2380
+    },
+    {
+      "epoch": 1.7659142316392005,
+      "grad_norm": 0.47125598788261414,
+      "learning_rate": 8.392327319843985e-07,
+      "loss": 0.0751,
+      "step": 2390
+    },
+    {
+      "epoch": 1.773300096939482,
+      "grad_norm": 0.3219301402568817,
+      "learning_rate": 7.876548622886038e-07,
+      "loss": 0.0702,
+      "step": 2400
+    },
+    {
+      "epoch": 1.7806859622397635,
+      "grad_norm": 0.602854311466217,
+      "learning_rate": 7.376476176773184e-07,
+      "loss": 0.0772,
+      "step": 2410
+    },
+    {
+      "epoch": 1.7880718275400453,
+      "grad_norm": 0.48326513171195984,
+      "learning_rate": 6.89219523952781e-07,
+      "loss": 0.0797,
+      "step": 2420
+    },
+    {
+      "epoch": 1.7954576928403267,
+      "grad_norm": 0.5595663189888,
+      "learning_rate": 6.423788376856765e-07,
+      "loss": 0.066,
+      "step": 2430
+    },
+    {
+      "epoch": 1.8028435581406084,
+      "grad_norm": 1.7976887226104736,
+      "learning_rate": 5.971335448074611e-07,
+      "loss": 0.0732,
+      "step": 2440
+    },
+    {
+      "epoch": 1.81022942344089,
+      "grad_norm": 1.282763957977295,
+      "learning_rate": 5.534913592488322e-07,
+      "loss": 0.0816,
+      "step": 2450
+    },
+    {
+      "epoch": 1.8176152887411716,
+      "grad_norm": 0.9589461088180542,
+      "learning_rate": 5.114597216245698e-07,
+      "loss": 0.0798,
+      "step": 2460
+    },
+    {
+      "epoch": 1.8250011540414532,
+      "grad_norm": 0.43628719449043274,
+      "learning_rate": 4.7104579796497405e-07,
+      "loss": 0.0835,
+      "step": 2470
+    },
+    {
+      "epoch": 1.8323870193417346,
+      "grad_norm": 0.49431607127189636,
+      "learning_rate": 4.3225647849411854e-07,
+      "loss": 0.074,
+      "step": 2480
+    },
+    {
+      "epoch": 1.8397728846420165,
+      "grad_norm": 0.9135465025901794,
+      "learning_rate": 3.9509837645513306e-07,
+      "loss": 0.0736,
+      "step": 2490
+    },
+    {
+      "epoch": 1.847158749942298,
+      "grad_norm": 0.6499918103218079,
+      "learning_rate": 3.595778269826966e-07,
+      "loss": 0.0723,
+      "step": 2500
+    },
+    {
+      "epoch": 1.8545446152425795,
+      "grad_norm": 1.299659013748169,
+      "learning_rate": 3.257008860229527e-07,
+      "loss": 0.0735,
+      "step": 2510
+    },
+    {
+      "epoch": 1.8619304805428611,
+      "grad_norm": 0.7049327492713928,
+      "learning_rate": 2.9347332930102503e-07,
+      "loss": 0.0713,
+      "step": 2520
+    },
+    {
+      "epoch": 1.8693163458431425,
+      "grad_norm": 0.29024580121040344,
+      "learning_rate": 2.6290065133630637e-07,
+      "loss": 0.0774,
+      "step": 2530
+    },
+    {
+      "epoch": 1.8767022111434244,
+      "grad_norm": 0.7386340498924255,
+      "learning_rate": 2.3398806450568577e-07,
+      "loss": 0.0739,
+      "step": 2540
+    },
+    {
+      "epoch": 1.8840880764437058,
+      "grad_norm": 0.5153611898422241,
+      "learning_rate": 2.067404981548915e-07,
+      "loss": 0.0702,
+      "step": 2550
+    },
+    {
+      "epoch": 1.8914739417439874,
+      "grad_norm": 1.2201671600341797,
+      "learning_rate": 1.811625977580722e-07,
+      "loss": 0.082,
+      "step": 2560
+    },
+    {
+      "epoch": 1.898859807044269,
+      "grad_norm": 0.7881399989128113,
+      "learning_rate": 1.5725872412579058e-07,
+      "loss": 0.0677,
+      "step": 2570
+    },
+    {
+      "epoch": 1.9062456723445507,
+      "grad_norm": 0.3312283456325531,
+      "learning_rate": 1.3503295266153903e-07,
+      "loss": 0.0756,
+      "step": 2580
+    },
+    {
+      "epoch": 1.9136315376448323,
+      "grad_norm": 0.4955926239490509,
+      "learning_rate": 1.14489072666919e-07,
+      "loss": 0.0692,
+      "step": 2590
+    },
+    {
+      "epoch": 1.9210174029451137,
+      "grad_norm": 0.45805656909942627,
+      "learning_rate": 9.563058669559755e-08,
+      "loss": 0.0753,
+      "step": 2600
+    },
+    {
+      "epoch": 1.9284032682453955,
+      "grad_norm": 0.5555469393730164,
+      "learning_rate": 7.846070995615518e-08,
+      "loss": 0.0716,
+      "step": 2610
+    },
+    {
+      "epoch": 1.935789133545677,
+      "grad_norm": 0.5252045392990112,
+      "learning_rate": 6.298236976391537e-08,
+      "loss": 0.0772,
+      "step": 2620
+    },
+    {
+      "epoch": 1.9431749988459586,
+      "grad_norm": 1.8346993923187256,
+      "learning_rate": 4.919820504186934e-08,
+      "loss": 0.0764,
+      "step": 2630
+    },
+    {
+      "epoch": 1.9505608641462402,
+      "grad_norm": 0.4004700481891632,
+      "learning_rate": 3.711056587075712e-08,
+      "loss": 0.0739,
+      "step": 2640
+    },
+    {
+      "epoch": 1.9579467294465216,
+      "grad_norm": 1.077645182609558,
+      "learning_rate": 2.672151308840243e-08,
+      "loss": 0.07,
+      "step": 2650
+    },
+    {
+      "epoch": 1.9653325947468034,
+      "grad_norm": 0.6247801184654236,
+      "learning_rate": 1.8032817938352653e-08,
+      "loss": 0.0666,
+      "step": 2660
+    },
+    {
+      "epoch": 1.9727184600470848,
+      "grad_norm": 0.4016879200935364,
+      "learning_rate": 1.1045961767904844e-08,
+      "loss": 0.0695,
+      "step": 2670
+    },
+    {
+      "epoch": 1.9801043253473665,
+      "grad_norm": 0.5175566673278809,
+      "learning_rate": 5.7621357755432984e-09,
+      "loss": 0.0722,
+      "step": 2680
+    },
+    {
+      "epoch": 1.987490190647648,
+      "grad_norm": 0.5656958222389221,
+      "learning_rate": 2.1822408078508994e-09,
+      "loss": 0.0728,
+      "step": 2690
+    },
+    {
+      "epoch": 1.9948760559479295,
+      "grad_norm": 0.5182742476463318,
+      "learning_rate": 3.068872059253103e-10,
+      "loss": 0.0727,
+      "step": 2700
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 2706,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.22919470739456e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-2706/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:656f5a16bf45095ee35164228e3326efc9da3df52162180e39e4ecfe778dcfd3
+size 5969

checkpoint-2706/vocab.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0fb90bfa35244d26f0065d1fcd0b5becc3da3d44d616a7e2aacaf6320b9fa2d0
+size 1500244

config.json ADDED Viewed

	@@ -0,0 +1,97 @@

+{
+  "_name_or_path": "dicta-il/neodictabert",
+  "architectures": [
+    "NeoBERTForSequenceClassification"
+  ],
+  "auto_map": {
+    "AutoConfig": "dicta-il/neodictabert--modeling_neobert.NeoBERTConfig",
+    "AutoModel": "dicta-il/neodictabert--modeling_neobert.NeoBERT",
+    "AutoModelForMaskedLM": "dicta-il/neodictabert--modeling_neobert.NeoBERTLMHead",
+    "AutoModelForQuestionAnswering": "dicta-il/neodictabert--modeling_neobert.NeoBERTForQuestionAnswering",
+    "AutoModelForSequenceClassification": "dicta-il/neodictabert--modeling_neobert.NeoBERTForSequenceClassification",
+    "AutoModelForTokenClassification": "dicta-il/neodictabert--modeling_neobert.NeoBERTForTokenClassification"
+  },
+  "decoder_init_range": 0.02,
+  "dim_head": 64,
+  "embedding_init_range": 0.02,
+  "encoder_init_range": 0.02,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "construct_state_confusion",
+    "1": "directional_preposition_swap",
+    "2": "entity_date_swap",
+    "3": "entity_event_swap",
+    "4": "entity_location_swap",
+    "5": "entity_organization_swap",
+    "6": "entity_person_swap",
+    "7": "entity_title_swap",
+    "8": "hebrew_root_pattern_confusion",
+    "9": "homographic_gender_errors",
+    "10": "idiom_collocation_corruption",
+    "11": "measure_unit_swap",
+    "12": "morphological_connective_confusion",
+    "13": "noun_gender_swap",
+    "14": "number_swap",
+    "15": "pronoun_reference_errors",
+    "16": "sentence_negation",
+    "17": "specificity_shift_errors",
+    "18": "verb_gender_swap",
+    "19": "verb_tense_swap"
+  },
+  "intermediate_size": 3072,
+  "kwargs": {
+    "_commit_hash": "9052b2e47fe3e615931563bb2a74b26df6e028a3",
+    "architectures": [
+      "NeoBERTLMHead"
+    ],
+    "attn_implementation": null,
+    "auto_map": {
+      "AutoConfig": "dicta-il/neodictabert--modeling_neobert.NeoBERTConfig",
+      "AutoModel": "dicta-il/neodictabert--modeling_neobert.NeoBERT",
+      "AutoModelForMaskedLM": "dicta-il/neodictabert--modeling_neobert.NeoBERTLMHead",
+      "AutoModelForQuestionAnswering": "dicta-il/neodictabert--modeling_neobert.NeoBERTForQuestionAnswering",
+      "AutoModelForSequenceClassification": "dicta-il/neodictabert--modeling_neobert.NeoBERTForSequenceClassification",
+      "AutoModelForTokenClassification": "dicta-il/neodictabert--modeling_neobert.NeoBERTForTokenClassification"
+    },
+    "decoder_init_range": 0.02,
+    "dim_head": 64,
+    "kwargs": {
+      "decoder_init_range": 0.02
+    },
+    "model_type": "neobert",
+    "torch_dtype": "bfloat16",
+    "transformers_version": "4.53.0"
+  },
+  "label2id": {
+    "construct_state_confusion": 0,
+    "directional_preposition_swap": 1,
+    "entity_date_swap": 2,
+    "entity_event_swap": 3,
+    "entity_location_swap": 4,
+    "entity_organization_swap": 5,
+    "entity_person_swap": 6,
+    "entity_title_swap": 7,
+    "hebrew_root_pattern_confusion": 8,
+    "homographic_gender_errors": 9,
+    "idiom_collocation_corruption": 10,
+    "measure_unit_swap": 11,
+    "morphological_connective_confusion": 12,
+    "noun_gender_swap": 13,
+    "number_swap": 14,
+    "pronoun_reference_errors": 15,
+    "sentence_negation": 16,
+    "specificity_shift_errors": 17,
+    "verb_gender_swap": 18,
+    "verb_tense_swap": 19
+  },
+  "max_length": 4096,
+  "model_type": "neobert",
+  "norm_eps": 1e-06,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "pad_token_id": 3,
+  "problem_type": "multi_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
+  "vocab_size": 128000
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:970fea6a79cfdd383e06cb000073b4a35151bf831d37f74b6451f0639b0ee9c7
+size 1452799672

script_arguments.json ADDED Viewed

	@@ -0,0 +1,169 @@

+{
+  "model_args": {
+    "model_name_or_path": "dicta-il/neodictabert",
+    "language": "he",
+    "train_language": "he",
+    "config_name": null,
+    "tokenizer_name": null,
+    "cache_dir": null,
+    "do_lower_case": false,
+    "use_fast_tokenizer": true,
+    "model_revision": "main",
+    "use_auth_token": false,
+    "ignore_mismatched_sizes": false
+  },
+  "data_args": {
+    "max_seq_length": 3072,
+    "overwrite_cache": false,
+    "pad_to_max_length": true,
+    "max_train_samples": null,
+    "max_eval_samples": null,
+    "max_predict_samples": null,
+    "train_file": "/home/avramit/classifier/04122025/datasets/classifier/articleclaimlevel/train_cleaned.csv",
+    "validation_file": "/home/avramit/classifier/01122025/datasets/classifiers/claimlevel/mafat_validation.csv",
+    "test_file": "/home/avramit/classifier/01122025/datasets/classifiers/claimlevel/mafat_test.csv",
+    "train_dataset_percentage": 1.0
+  },
+  "training_args": {
+    "output_dir": "/home/avramit/classifier/04122025/classifier/multilabel/neodictabert-setup2-claimlevel-finetuned-multilabel_error_types_041225",
+    "overwrite_output_dir": true,
+    "do_train": true,
+    "do_eval": true,
+    "do_predict": true,
+    "eval_strategy": "no",
+    "prediction_loss_only": false,
+    "per_device_train_batch_size": 2,
+    "per_device_eval_batch_size": 8,
+    "per_gpu_train_batch_size": null,
+    "per_gpu_eval_batch_size": null,
+    "gradient_accumulation_steps": 16,
+    "eval_accumulation_steps": null,
+    "eval_delay": 0,
+    "torch_empty_cache_steps": null,
+    "learning_rate": 2e-05,
+    "weight_decay": 0.01,
+    "adam_beta1": 0.9,
+    "adam_beta2": 0.999,
+    "adam_epsilon": 1e-08,
+    "max_grad_norm": 1.0,
+    "num_train_epochs": 2.0,
+    "max_steps": -1,
+    "lr_scheduler_type": "cosine",
+    "lr_scheduler_kwargs": {},
+    "warmup_ratio": 0.0,
+    "warmup_steps": 300,
+    "log_level": "passive",
+    "log_level_replica": "warning",
+    "log_on_each_node": true,
+    "logging_dir": "/home/avramit/classifier/04122025/classifier/multilabel/neodictabert-setup2-claimlevel-finetuned-multilabel_error_types_041225/runs/Dec04_02-57-23_ise-6000-08.auth.ad.bgu.ac.il",
+    "logging_strategy": "steps",
+    "logging_first_step": false,
+    "logging_steps": 10,
+    "logging_nan_inf_filter": true,
+    "save_strategy": "steps",
+    "save_steps": 100,
+    "save_total_limit": 1,
+    "save_safetensors": true,
+    "save_on_each_node": false,
+    "save_only_model": false,
+    "restore_callback_states_from_checkpoint": false,
+    "no_cuda": false,
+    "use_cpu": false,
+    "use_mps_device": false,
+    "seed": 42,
+    "data_seed": null,
+    "jit_mode_eval": false,
+    "use_ipex": false,
+    "bf16": false,
+    "fp16": false,
+    "fp16_opt_level": "O1",
+    "half_precision_backend": "auto",
+    "bf16_full_eval": false,
+    "fp16_full_eval": false,
+    "tf32": null,
+    "local_rank": 0,
+    "ddp_backend": null,
+    "tpu_num_cores": null,
+    "tpu_metrics_debug": false,
+    "debug": [],
+    "dataloader_drop_last": false,
+    "eval_steps": 50.0,
+    "dataloader_num_workers": 0,
+    "dataloader_prefetch_factor": null,
+    "past_index": -1,
+    "run_name": "/home/avramit/classifier/04122025/classifier/multilabel/neodictabert-setup2-claimlevel-finetuned-multilabel_error_types_041225",
+    "disable_tqdm": false,
+    "remove_unused_columns": true,
+    "label_names": null,
+    "load_best_model_at_end": false,
+    "metric_for_best_model": null,
+    "greater_is_better": null,
+    "ignore_data_skip": false,
+    "fsdp": [],
+    "fsdp_min_num_params": 0,
+    "fsdp_config": {
+      "min_num_params": 0,
+      "xla": false,
+      "xla_fsdp_v2": false,
+      "xla_fsdp_grad_ckpt": false
+    },
+    "fsdp_transformer_layer_cls_to_wrap": null,
+    "accelerator_config": "AcceleratorConfig(split_batches=False, dispatch_batches=None, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False)",
+    "deepspeed": null,
+    "label_smoothing_factor": 0.0,
+    "optim": "adamw_torch",
+    "optim_args": null,
+    "adafactor": false,
+    "group_by_length": false,
+    "length_column_name": "length",
+    "report_to": [],
+    "ddp_find_unused_parameters": null,
+    "ddp_bucket_cap_mb": null,
+    "ddp_broadcast_buffers": null,
+    "dataloader_pin_memory": true,
+    "dataloader_persistent_workers": false,
+    "skip_memory_metrics": true,
+    "use_legacy_prediction_loop": false,
+    "push_to_hub": false,
+    "resume_from_checkpoint": null,
+    "hub_model_id": null,
+    "hub_strategy": "every_save",
+    "hub_token": null,
+    "hub_private_repo": null,
+    "hub_always_push": false,
+    "gradient_checkpointing": false,
+    "gradient_checkpointing_kwargs": null,
+    "include_inputs_for_metrics": false,
+    "include_for_metrics": [],
+    "eval_do_concat_batches": true,
+    "fp16_backend": "auto",
+    "evaluation_strategy": null,
+    "push_to_hub_model_id": null,
+    "push_to_hub_organization": null,
+    "push_to_hub_token": null,
+    "mp_parameters": "",
+    "auto_find_batch_size": false,
+    "full_determinism": false,
+    "torchdynamo": null,
+    "ray_scope": "last",
+    "ddp_timeout": 1800,
+    "torch_compile": false,
+    "torch_compile_backend": null,
+    "torch_compile_mode": null,
+    "dispatch_batches": null,
+    "split_batches": null,
+    "include_tokens_per_second": false,
+    "include_num_input_tokens_seen": false,
+    "neftune_noise_alpha": null,
+    "optim_target_modules": null,
+    "batch_eval_metrics": false,
+    "eval_on_start": false,
+    "use_liger_kernel": false,
+    "eval_use_gather_object": false,
+    "average_tokens_across_devices": false,
+    "distributed_state": "Distributed environment: NO\nNum processes: 1\nProcess index: 0\nLocal process index: 0\nDevice: cuda\n",
+    "_n_gpu": 1,
+    "__cached__setup_devices": "cuda:0",
+    "deepspeed_plugin": null
+  }
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "[BLANK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "eos_token": "[SEP]",
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 4096,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:656f5a16bf45095ee35164228e3326efc9da3df52162180e39e4ecfe778dcfd3
+size 5969

vocab.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0fb90bfa35244d26f0065d1fcd0b5becc3da3d44d616a7e2aacaf6320b9fa2d0
+size 1500244