niobures commited on
Commit
1735ad5
·
verified ·
1 Parent(s): 5553459

DictaBERT

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +20 -0
  2. dictabert-ce/.gitattributes +36 -0
  3. dictabert-ce/README.md +94 -0
  4. dictabert-ce/config.json +33 -0
  5. dictabert-ce/model.safetensors +3 -0
  6. dictabert-ce/source.txt +1 -0
  7. dictabert-ce/special_tokens_map.json +37 -0
  8. dictabert-ce/tokenizer.json +0 -0
  9. dictabert-ce/tokenizer_config.json +70 -0
  10. dictabert-ce/vocab.txt +3 -0
  11. dictabert-char-spacefix/.gitattributes +35 -0
  12. dictabert-char-spacefix/README.md +78 -0
  13. dictabert-char-spacefix/config.json +26 -0
  14. dictabert-char-spacefix/model.safetensors +3 -0
  15. dictabert-char-spacefix/source.txt +1 -0
  16. dictabert-char-spacefix/special_tokens_map.json +37 -0
  17. dictabert-char-spacefix/tokenizer.json +1022 -0
  18. dictabert-char-spacefix/tokenizer_config.json +64 -0
  19. dictabert-char-spacefix/vocab.txt +0 -0
  20. dictabert-heq/.gitattributes +36 -0
  21. dictabert-heq/LICENSE +395 -0
  22. dictabert-heq/README.md +71 -0
  23. dictabert-heq/config.json +26 -0
  24. dictabert-heq/issues.txt +20 -0
  25. dictabert-heq/pytorch_model.bin +3 -0
  26. dictabert-heq/source.txt +1 -0
  27. dictabert-heq/speed.ipynb +220 -0
  28. dictabert-heq/tokenizer.json +0 -0
  29. dictabert-heq/tokenizer_config.json +13 -0
  30. dictabert-heq/vocab.txt +3 -0
  31. dictabert-joint/.gitattributes +36 -0
  32. dictabert-joint/BertForJointParsing.py +534 -0
  33. dictabert-joint/BertForMorphTagging.py +215 -0
  34. dictabert-joint/BertForPrefixMarking.py +266 -0
  35. dictabert-joint/BertForSyntaxParsing.py +315 -0
  36. dictabert-joint/README.md +521 -0
  37. dictabert-joint/config.json +93 -0
  38. dictabert-joint/model.safetensors +3 -0
  39. dictabert-joint/pytorch_model.bin +3 -0
  40. dictabert-joint/source.txt +1 -0
  41. dictabert-joint/special_tokens_map.json +37 -0
  42. dictabert-joint/tokenizer.json +0 -0
  43. dictabert-joint/tokenizer_config.json +63 -0
  44. dictabert-joint/vocab.txt +3 -0
  45. dictabert-large-char-menaked/.gitattributes +35 -0
  46. dictabert-large-char-menaked/BertForDiacritization.py +190 -0
  47. dictabert-large-char-menaked/README.md +69 -0
  48. dictabert-large-char-menaked/config.json +63 -0
  49. dictabert-large-char-menaked/issues.txt +35 -0
  50. dictabert-large-char-menaked/model.safetensors +3 -0
.gitattributes CHANGED
@@ -33,3 +33,23 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ dictabert-ce/vocab.txt filter=lfs diff=lfs merge=lfs -text
37
+ dictabert-heq/vocab.txt filter=lfs diff=lfs merge=lfs -text
38
+ dictabert-joint/vocab.txt filter=lfs diff=lfs merge=lfs -text
39
+ dictabert-large-heq/vocab.txt filter=lfs diff=lfs merge=lfs -text
40
+ dictabert-large-ner/vocab.txt filter=lfs diff=lfs merge=lfs -text
41
+ dictabert-large-parse/vocab.txt filter=lfs diff=lfs merge=lfs -text
42
+ dictabert-large/vocab.txt filter=lfs diff=lfs merge=lfs -text
43
+ dictabert-lex/vocab.txt filter=lfs diff=lfs merge=lfs -text
44
+ dictabert-morph/vocab.txt filter=lfs diff=lfs merge=lfs -text
45
+ dictabert-ner-handler/vocab.txt filter=lfs diff=lfs merge=lfs -text
46
+ dictabert-ner-ONNX/vocab.txt filter=lfs diff=lfs merge=lfs -text
47
+ dictabert-ner/vocab.txt filter=lfs diff=lfs merge=lfs -text
48
+ dictabert-parse/vocab.txt filter=lfs diff=lfs merge=lfs -text
49
+ dictabert-seg/vocab.txt filter=lfs diff=lfs merge=lfs -text
50
+ dictabert-sentiment/vocab.txt filter=lfs diff=lfs merge=lfs -text
51
+ dictabert-syntax/vocab.txt filter=lfs diff=lfs merge=lfs -text
52
+ dictabert-tiny-joint/vocab.txt filter=lfs diff=lfs merge=lfs -text
53
+ dictabert-tiny-parse/vocab.txt filter=lfs diff=lfs merge=lfs -text
54
+ dictabert-tiny/vocab.txt filter=lfs diff=lfs merge=lfs -text
55
+ dictabert/vocab.txt filter=lfs diff=lfs merge=lfs -text
dictabert-ce/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ vocab.txt filter=lfs diff=lfs merge=lfs -text
dictabert-ce/README.md ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ language:
4
+ - he
5
+ ---
6
+
7
+
8
+ ## Model Details
9
+
10
+ ### Model Description
11
+
12
+ This is the model card of a 🤗 transformers model that has been pushed on the Hub.
13
+
14
+ - **Model type:** CrossEncoder
15
+ - **Language(s) (NLP):** Hebrew
16
+ - **License:** [More Information Needed]
17
+ - **Finetuned from model [optional]:** [DictaBERT](https://huggingface.co/dicta-il/dictabert)
18
+
19
+
20
+ ## Uses
21
+
22
+ Model was trained for ranking task as a part of a Hebrew semantic search engine.
23
+
24
+ ## How to Get Started with the Model
25
+
26
+ Use the code below to get started with the model.
27
+
28
+ ```python
29
+ from sentence_transformers import CrossEncoder
30
+
31
+
32
+ query = "על מה לא הסכים דוד בן גוריון לוותר?"
33
+ doc1 = """
34
+ מלחמת סיני הסתיימה בתבוסה של הכוחות המצריים, אך ברית המועצות וארצות הברית הפעילו לחץ כבד על ישראל לסגת מחצי האי סיני.
35
+ ראש ממשלת ישראל, דוד בן-גוריון, הסכים, בעקבות הלחץ של שתי המעצמות,
36
+ לפנות את חצי האי סיני ורצועת עזה בתהליך שהסתיים במרץ 1957,
37
+ אך הודיע שסגירה של מצרי טיראן לשיט ישראלי תהווה עילה למלחמה.
38
+ ארצות הברית התחייבה לדאוג להבטחת חופש המעבר של ישראל במצרי טיראן.
39
+ כוח חירום בינלאומי של האו"ם הוצב בצד המצרי של הגבול עם ישראל ובשארם א-שייח' וכתוצאה מכך נשאר נתיב השיט במפרץ אילת פתוח לשיט הישראלי.
40
+ """
41
+ doc2 = """
42
+ ים סוף מהווה מוקד חשוב לתיירות מרחבי העולם.
43
+ מזג האוויר הנוח בעונת החורף, החופים היפים, הים הצלול ואתרי הצלילה המרהיבים לחופי סיני,
44
+ מצרים, וסודאן הופכים את חופי ים סוף ליעד תיירות מבוקש.
45
+ ראס מוחמד והחור הכחול בסיני, ידועים כאתרי צלילה מהמרהיבים בעולם.
46
+ מאז הסכם השלום בין ישראל למצרים פיתחה מצרים מאוד את התיירות לאורך חופי ים סוף,
47
+ ובמיוחד בסיני, ובנתה עשרות אתרי תיירות ומאות מלונות וכפרי נופש.
48
+ תיירות זו נפגעה קשות מאז המהפכה של 2011 במצרים,
49
+ עם עלייה חדה בתקריות טרור מצד ארגונים אסלאמיים קיצוניים בסיני.
50
+ """
51
+
52
+ model = CrossEncoder("haguy77/dictabert-ce")
53
+
54
+ scores = model.predict([[query, doc1], [query, doc2]]) # Note: query should ALWAYS be the first of each pair
55
+ # array([0.02000629, 0.00031683], dtype=float32)
56
+
57
+ results = model.rank(query, [doc2, doc1])
58
+ # [{'corpus_id': 1, 'score': 0.020006292}, {'corpus_id': 0, 'score': 0.00031683326}]
59
+ ```
60
+
61
+ ### Training Data
62
+
63
+ [Hebrew Question Answering Dataset (HeQ)](https://github.com/NNLP-IL/Hebrew-Question-Answering-Dataset)
64
+
65
+ ## Citation
66
+
67
+ **BibTeX:**
68
+
69
+ ```bibtex
70
+ @misc{shmidman2023dictabert,
71
+ title={DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew},
72
+ author={Shaltiel Shmidman and Avi Shmidman and Moshe Koppel},
73
+ year={2023},
74
+ eprint={2308.16687},
75
+ archivePrefix={arXiv},
76
+ primaryClass={cs.CL}
77
+ }
78
+ ```
79
+ ```bibtex
80
+ @inproceedings{cohen2023heq,
81
+ title={Heq: a large and diverse hebrew reading comprehension benchmark},
82
+ author={Cohen, Amir and Merhav-Fine, Hilla and Goldberg, Yoav and Tsarfaty, Reut},
83
+ booktitle={Findings of the Association for Computational Linguistics: EMNLP 2023},
84
+ pages={13693--13705},
85
+ year={2023}
86
+ }
87
+ ```
88
+
89
+ **APA:**
90
+ ```apa
91
+ Shmidman, S., Shmidman, A., & Koppel, M. (2023). DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew. arXiv preprint arXiv:2308.16687.
92
+
93
+ Cohen, A., Merhav-Fine, H., Goldberg, Y., & Tsarfaty, R. (2023, December). Heq: a large and diverse hebrew reading comprehension benchmark. In Findings of the Association for Computational Linguistics: EMNLP 2023 (pp. 13693-13705).
94
+ ```
dictabert-ce/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "dictabert-ce-heq-wiki",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "LABEL_0"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 3072,
17
+ "label2id": {
18
+ "LABEL_0": 0
19
+ },
20
+ "layer_norm_eps": 1e-12,
21
+ "max_position_embeddings": 512,
22
+ "model_type": "bert",
23
+ "newmodern": true,
24
+ "num_attention_heads": 12,
25
+ "num_hidden_layers": 12,
26
+ "pad_token_id": 0,
27
+ "position_embedding_type": "absolute",
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.38.2",
30
+ "type_vocab_size": 2,
31
+ "use_cache": true,
32
+ "vocab_size": 128000
33
+ }
dictabert-ce/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3386271f395a8888d16161407a4d05acebd63e2a83f4dd02e7298c89bf336aa2
3
+ size 737407996
dictabert-ce/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/haguy77/dictabert-ce
dictabert-ce/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
dictabert-ce/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
dictabert-ce/tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "[BLANK]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ }
51
+ },
52
+ "clean_up_tokenization_spaces": true,
53
+ "cls_token": "[CLS]",
54
+ "do_lower_case": true,
55
+ "mask_token": "[MASK]",
56
+ "max_length": 512,
57
+ "model_max_length": 512,
58
+ "pad_to_multiple_of": null,
59
+ "pad_token": "[PAD]",
60
+ "pad_token_type_id": 0,
61
+ "padding_side": "right",
62
+ "sep_token": "[SEP]",
63
+ "stride": 0,
64
+ "strip_accents": null,
65
+ "tokenize_chinese_chars": true,
66
+ "tokenizer_class": "BertTokenizer",
67
+ "truncation_side": "right",
68
+ "truncation_strategy": "longest_first",
69
+ "unk_token": "[UNK]"
70
+ }
dictabert-ce/vocab.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fb90bfa35244d26f0065d1fcd0b5becc3da3d44d616a7e2aacaf6320b9fa2d0
3
+ size 1500244
dictabert-char-spacefix/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
dictabert-char-spacefix/README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-4.0
3
+ language:
4
+ - he
5
+ base_model:
6
+ - dicta-il/dictabert-char
7
+ ---
8
+ # DictaBERT-char-spacefix: A finetuned BERT model for restoring missing spaces in Hebrew texts.
9
+
10
+ DictaBERT-char-spacefix is a finetuned BERT model based on [dicta-il/dictabert-char](https://huggingface.co/dicta-il/dictabert-char), for the task of restoring missing spaces in Hebrew text.
11
+
12
+ This model is released to the public in this 2025 W-NUT paper: Avi Shmidman and Shaltiel Shmidman, "Restoring Missing Spaces in Scraped Hebrew Social Media", The 10th Workshop on Noisy and User-generated Text (W-NUT), 2025
13
+
14
+ Sample usage:
15
+
16
+ ```python
17
+ from transformers import pipeline
18
+
19
+ oracle = pipeline('token-classification', model='dicta-il/dictabert-char-spacefix')
20
+
21
+ text = 'בשנת 1948 השליםאפרים קישון אתלימודיובפיסולמתכת ובתולדותהאמנות והחל לפרסםמאמרים הומוריסטיים'
22
+
23
+ raw_output = oracle(text)
24
+
25
+ # Classifier returns LABEL_1 if there should be a space before the character
26
+ text_output = ''.join((' ' if o['entity'] == 'LABEL_1' else '') + o['word'] for o in raw_output)
27
+ print(text_output)
28
+ ```
29
+
30
+ Output:
31
+
32
+ ```text
33
+ בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים
34
+ ```
35
+
36
+ ## Citation
37
+
38
+ If you use DictaBERT-char-spacefix
39
+ in your research, please cite ```Restoring Missing Spaces in Scraped Hebrew Social Media```
40
+
41
+ **BibTeX:**
42
+
43
+ ```bibtex
44
+ @inproceedings{shmidman-shmidman-2025-restoring,
45
+ title = "Restoring Missing Spaces in Scraped {H}ebrew Social Media",
46
+ author = "Shmidman, Avi and
47
+ Shmidman, Shaltiel",
48
+ editor = "Bak, JinYeong and
49
+ Goot, Rob van der and
50
+ Jang, Hyeju and
51
+ Buaphet, Weerayut and
52
+ Ramponi, Alan and
53
+ Xu, Wei and
54
+ Ritter, Alan",
55
+ booktitle = "Proceedings of the Tenth Workshop on Noisy and User-generated Text",
56
+ month = may,
57
+ year = "2025",
58
+ address = "Albuquerque, New Mexico, USA",
59
+ publisher = "Association for Computational Linguistics",
60
+ url = "https://aclanthology.org/2025.wnut-1.3/",
61
+ pages = "16--25",
62
+ ISBN = "979-8-89176-232-9",
63
+ abstract = "A formidable challenge regarding scraped corpora of social media is the omission of whitespaces, causing pairs of words to be conflated together as one. In order for the text to be properly parsed and analyzed, these missing spaces must be detected and restored. However, it is particularly hard to restore whitespace in languages such as Hebrew which are written without vowels, because a conflated form can often be split into multiple different pairs of valid words. Thus, a simple dictionary lookup is not feasible. In this paper, we present and evaluate a series of neural approaches to restore missing spaces in scraped Hebrew social media. Our best all-around method involved pretraining a new character-based BERT model for Hebrew, and then fine-tuning a space restoration model on top of this new BERT model. This method is blazing fast, high-performing, and open for unrestricted use, providing a practical solution to process huge Hebrew social media corpora with a consumer-grade GPU. We release the new BERT model and the fine-tuned space-restoration model to the NLP community."
64
+ }
65
+ ```
66
+
67
+ ## License
68
+
69
+ Shield: [![CC BY 4.0][cc-by-shield]][cc-by]
70
+
71
+ This work is licensed under a
72
+ [Creative Commons Attribution 4.0 International License][cc-by].
73
+
74
+ [![CC BY 4.0][cc-by-image]][cc-by]
75
+
76
+ [cc-by]: http://creativecommons.org/licenses/by/4.0/
77
+ [cc-by-image]: https://i.creativecommons.org/l/by/4.0/88x31.png
78
+ [cc-by-shield]: https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg
dictabert-char-spacefix/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "../TavFullParagraphBigModern/ckpt_31600/",
3
+ "architectures": [
4
+ "BertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 2048,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.47.0",
23
+ "type_vocab_size": 2,
24
+ "use_cache": true,
25
+ "vocab_size": 1024
26
+ }
dictabert-char-spacefix/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f564cf441cb623312a04cc69f013c5ecbd46b8db5d868aa8a77b159ee87d376
3
+ size 349696712
dictabert-char-spacefix/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/dicta-il/dictabert-char-spacefix
dictabert-char-spacefix/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
dictabert-char-spacefix/tokenizer.json ADDED
@@ -0,0 +1,1022 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[UNK]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[CLS]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[SEP]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "[PAD]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "[MASK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ },
51
+ {
52
+ "id": 5,
53
+ "content": "[BLANK]",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
+ }
60
+ ],
61
+ "normalizer": {
62
+ "type": "Sequence",
63
+ "normalizers": [
64
+ {
65
+ "type": "NFKC"
66
+ },
67
+ {
68
+ "type": "Lowercase"
69
+ },
70
+ {
71
+ "type": "StripAccents"
72
+ },
73
+ {
74
+ "type": "Replace",
75
+ "pattern": {
76
+ "String": "<foreign>"
77
+ },
78
+ "content": "[UNK]"
79
+ },
80
+ {
81
+ "type": "Replace",
82
+ "pattern": {
83
+ "Regex": "[^֐-׿\u0000-‌-‿₠-₿∀-⋿⅐-↋ff-ﭏ]+"
84
+ },
85
+ "content": "[UNK]"
86
+ }
87
+ ]
88
+ },
89
+ "pre_tokenizer": {
90
+ "type": "Split",
91
+ "pattern": {
92
+ "Regex": "(\\[UNK\\]|[\\s\\S])"
93
+ },
94
+ "behavior": "Removed",
95
+ "invert": true
96
+ },
97
+ "post_processor": {
98
+ "type": "TemplateProcessing",
99
+ "single": [
100
+ {
101
+ "SpecialToken": {
102
+ "id": "[CLS]",
103
+ "type_id": 0
104
+ }
105
+ },
106
+ {
107
+ "Sequence": {
108
+ "id": "A",
109
+ "type_id": 0
110
+ }
111
+ },
112
+ {
113
+ "SpecialToken": {
114
+ "id": "[SEP]",
115
+ "type_id": 0
116
+ }
117
+ }
118
+ ],
119
+ "pair": [
120
+ {
121
+ "SpecialToken": {
122
+ "id": "[CLS]",
123
+ "type_id": 0
124
+ }
125
+ },
126
+ {
127
+ "Sequence": {
128
+ "id": "A",
129
+ "type_id": 0
130
+ }
131
+ },
132
+ {
133
+ "SpecialToken": {
134
+ "id": "[SEP]",
135
+ "type_id": 0
136
+ }
137
+ },
138
+ {
139
+ "Sequence": {
140
+ "id": "B",
141
+ "type_id": 1
142
+ }
143
+ },
144
+ {
145
+ "SpecialToken": {
146
+ "id": "[SEP]",
147
+ "type_id": 1
148
+ }
149
+ }
150
+ ],
151
+ "special_tokens": {
152
+ "[CLS]": {
153
+ "id": "[CLS]",
154
+ "ids": [
155
+ 1
156
+ ],
157
+ "tokens": [
158
+ "[CLS]"
159
+ ]
160
+ },
161
+ "[SEP]": {
162
+ "id": "[SEP]",
163
+ "ids": [
164
+ 2
165
+ ],
166
+ "tokens": [
167
+ "[SEP]"
168
+ ]
169
+ }
170
+ }
171
+ },
172
+ "decoder": null,
173
+ "model": {
174
+ "type": "WordPiece",
175
+ "unk_token": "[UNK]",
176
+ "continuing_subword_prefix": "##",
177
+ "max_input_chars_per_word": 100,
178
+ "vocab": {
179
+ "[UNK]": 0,
180
+ "[CLS]": 1,
181
+ "[SEP]": 2,
182
+ "[PAD]": 3,
183
+ "[MASK]": 4,
184
+ "[BLANK]": 5,
185
+ "\u0000": 6,
186
+ "\u0001": 7,
187
+ "\u0002": 8,
188
+ "\u0003": 9,
189
+ "\u0004": 10,
190
+ "\u0005": 11,
191
+ "\u0006": 12,
192
+ "\u0007": 13,
193
+ "\b": 14,
194
+ "\t": 15,
195
+ "\n": 16,
196
+ "\u000b": 17,
197
+ "\u000e": 18,
198
+ "\u000f": 19,
199
+ "\u0010": 20,
200
+ "\u0011": 21,
201
+ "\u0012": 22,
202
+ "\u0013": 23,
203
+ "\u0014": 24,
204
+ "\u0015": 25,
205
+ "\u0016": 26,
206
+ "\u0017": 27,
207
+ "\u0018": 28,
208
+ "\u0019": 29,
209
+ "\u001a": 30,
210
+ "\u001b": 31,
211
+ "\u001c": 32,
212
+ "\u001d": 33,
213
+ "\u001e": 34,
214
+ "\u001f": 35,
215
+ " ": 36,
216
+ "!": 37,
217
+ "\"": 38,
218
+ "#": 39,
219
+ "$": 40,
220
+ "%": 41,
221
+ "&": 42,
222
+ "'": 43,
223
+ "(": 44,
224
+ ")": 45,
225
+ "*": 46,
226
+ "+": 47,
227
+ ",": 48,
228
+ "-": 49,
229
+ ".": 50,
230
+ "/": 51,
231
+ "0": 52,
232
+ "1": 53,
233
+ "2": 54,
234
+ "3": 55,
235
+ "4": 56,
236
+ "5": 57,
237
+ "6": 58,
238
+ "7": 59,
239
+ "8": 60,
240
+ "9": 61,
241
+ ":": 62,
242
+ ";": 63,
243
+ "<": 64,
244
+ "=": 65,
245
+ ">": 66,
246
+ "?": 67,
247
+ "@": 68,
248
+ "K": 69,
249
+ "N": 70,
250
+ "U": 71,
251
+ "[": 72,
252
+ "\\": 73,
253
+ "]": 74,
254
+ "^": 75,
255
+ "_": 76,
256
+ "`": 77,
257
+ "a": 78,
258
+ "b": 79,
259
+ "c": 80,
260
+ "d": 81,
261
+ "e": 82,
262
+ "f": 83,
263
+ "g": 84,
264
+ "h": 85,
265
+ "i": 86,
266
+ "j": 87,
267
+ "k": 88,
268
+ "l": 89,
269
+ "m": 90,
270
+ "n": 91,
271
+ "o": 92,
272
+ "p": 93,
273
+ "q": 94,
274
+ "r": 95,
275
+ "s": 96,
276
+ "t": 97,
277
+ "u": 98,
278
+ "v": 99,
279
+ "w": 100,
280
+ "x": 101,
281
+ "y": 102,
282
+ "z": 103,
283
+ "{": 104,
284
+ "|": 105,
285
+ "}": 106,
286
+ "~": 107,
287
+ "": 108,
288
+ "€": 109,
289
+ "": 110,
290
+ "‚": 111,
291
+ "ƒ": 112,
292
+ "„": 113,
293
+ "†": 114,
294
+ "ˆ": 115,
295
+ "‰": 116,
296
+ "Œ": 117,
297
+ "": 118,
298
+ "Ž": 119,
299
+ "": 120,
300
+ "": 121,
301
+ "‘": 122,
302
+ "’": 123,
303
+ "“": 124,
304
+ "”": 125,
305
+ "•": 126,
306
+ "–": 127,
307
+ "—": 128,
308
+ "˜": 129,
309
+ "™": 130,
310
+ "š": 131,
311
+ "›": 132,
312
+ "œ": 133,
313
+ "": 134,
314
+ "ž": 135,
315
+ "Ÿ": 136,
316
+ "¡": 137,
317
+ "¢": 138,
318
+ "£": 139,
319
+ "¤": 140,
320
+ "¥": 141,
321
+ "¦": 142,
322
+ "§": 143,
323
+ "©": 144,
324
+ "«": 145,
325
+ "¬": 146,
326
+ "­": 147,
327
+ "®": 148,
328
+ "°": 149,
329
+ "±": 150,
330
+ "¶": 151,
331
+ "·": 152,
332
+ "»": 153,
333
+ "¿": 154,
334
+ "×": 155,
335
+ "ß": 156,
336
+ "à": 157,
337
+ "á": 158,
338
+ "â": 159,
339
+ "ã": 160,
340
+ "ä": 161,
341
+ "å": 162,
342
+ "æ": 163,
343
+ "ç": 164,
344
+ "è": 165,
345
+ "é": 166,
346
+ "ê": 167,
347
+ "ë": 168,
348
+ "ì": 169,
349
+ "í": 170,
350
+ "î": 171,
351
+ "ï": 172,
352
+ "ð": 173,
353
+ "ñ": 174,
354
+ "ò": 175,
355
+ "ó": 176,
356
+ "ô": 177,
357
+ "õ": 178,
358
+ "ö": 179,
359
+ "÷": 180,
360
+ "ø": 181,
361
+ "ù": 182,
362
+ "ú": 183,
363
+ "û": 184,
364
+ "ü": 185,
365
+ "ý": 186,
366
+ "þ": 187,
367
+ "ÿ": 188,
368
+ "ȼ": 189,
369
+ "˖": 190,
370
+ "˗": 191,
371
+ "ͱ": 192,
372
+ "ͳ": 193,
373
+ "͵": 194,
374
+ "ӏ": 195,
375
+ "ԝ": 196,
376
+ "֎": 197,
377
+ "־": 198,
378
+ "׀": 199,
379
+ "׃": 200,
380
+ "׆": 201,
381
+ "׈": 202,
382
+ "׉": 203,
383
+ "׊": 204,
384
+ "׋": 205,
385
+ "׍": 206,
386
+ "׎": 207,
387
+ "׏": 208,
388
+ "א": 209,
389
+ "ב": 210,
390
+ "ג": 211,
391
+ "ד": 212,
392
+ "ה": 213,
393
+ "ו": 214,
394
+ "ז": 215,
395
+ "ח": 216,
396
+ "ט": 217,
397
+ "י": 218,
398
+ "ך": 219,
399
+ "כ": 220,
400
+ "ל": 221,
401
+ "ם": 222,
402
+ "מ": 223,
403
+ "ן": 224,
404
+ "נ": 225,
405
+ "ס": 226,
406
+ "ע": 227,
407
+ "ף": 228,
408
+ "פ": 229,
409
+ "ץ": 230,
410
+ "צ": 231,
411
+ "ק": 232,
412
+ "ר": 233,
413
+ "ש": 234,
414
+ "ת": 235,
415
+ "׫": 236,
416
+ "װ": 237,
417
+ "ױ": 238,
418
+ "ײ": 239,
419
+ "׳": 240,
420
+ "״": 241,
421
+ "׸": 242,
422
+ "׹": 243,
423
+ "׺": 244,
424
+ "׿": 245,
425
+ "،": 246,
426
+ "؛": 247,
427
+ "؟": 248,
428
+ "٪": 249,
429
+ "٭": 250,
430
+ "۔": 251,
431
+ "۝": 252,
432
+ "۞": 253,
433
+ "۩": 254,
434
+ "ߋ": 255,
435
+ "ߐ": 256,
436
+ "ߕ": 257,
437
+ "ߗ": 258,
438
+ "ߜ": 259,
439
+ "ߝ": 260,
440
+ "ߞ": 261,
441
+ "ߟ": 262,
442
+ "ߠ": 263,
443
+ "ߡ": 264,
444
+ "ߢ": 265,
445
+ "ߨ": 266,
446
+ "ߩ": 267,
447
+ "ߪ": 268,
448
+ "।": 269,
449
+ "฿": 270,
450
+ "๏": 271,
451
+ "፡": 272,
452
+ "ᤞ": 273,
453
+ "᧐": 274,
454
+ "ᨁ": 275,
455
+ "ᨅ": 276,
456
+ "ᨔ": 277,
457
+ "ᨕ": 278,
458
+ "‌": 279,
459
+ "‍": 280,
460
+ "‎": 281,
461
+ "‏": 282,
462
+ "‐": 283,
463
+ "‒": 284,
464
+ "–": 285,
465
+ "—": 286,
466
+ "―": 287,
467
+ "‖": 288,
468
+ "‘": 289,
469
+ "’": 290,
470
+ "‚": 291,
471
+ "‛": 292,
472
+ "“": 293,
473
+ "”": 294,
474
+ "„": 295,
475
+ "‟": 296,
476
+ "†": 297,
477
+ "‡": 298,
478
+ "•": 299,
479
+ "‣": 300,
480
+ "‧": 301,
481
+ "
": 302,
482
+ "
": 303,
483
+ "‪": 304,
484
+ "‫": 305,
485
+ "‬": 306,
486
+ "‭": 307,
487
+ "‮": 308,
488
+ "‰": 309,
489
+ "′": 310,
490
+ "‹": 311,
491
+ "›": 312,
492
+ "※": 313,
493
+ "‽": 314,
494
+ "‿": 315,
495
+ "⁃": 316,
496
+ "⁄": 317,
497
+ "⁎": 318,
498
+ "⁠": 319,
499
+ "⁣": 320,
500
+ "⁦": 321,
501
+ "⁧": 322,
502
+ "⁨": 323,
503
+ "⁩": 324,
504
+ "₡": 325,
505
+ "₣": 326,
506
+ "₤": 327,
507
+ "₦": 328,
508
+ "₩": 329,
509
+ "₪": 330,
510
+ "₫": 331,
511
+ "€": 332,
512
+ "₭": 333,
513
+ "₮": 334,
514
+ "₱": 335,
515
+ "₴": 336,
516
+ "₵": 337,
517
+ "₸": 338,
518
+ "₹": 339,
519
+ "₺": 340,
520
+ "₼": 341,
521
+ "₽": 342,
522
+ "₾": 343,
523
+ "₿": 344,
524
+ "ↄ": 345,
525
+ "←": 346,
526
+ "↑": 347,
527
+ "→": 348,
528
+ "↓": 349,
529
+ "↔": 350,
530
+ "↗": 351,
531
+ "↘": 352,
532
+ "↙": 353,
533
+ "↩": 354,
534
+ "↳": 355,
535
+ "↵": 356,
536
+ "⇌": 357,
537
+ "⇐": 358,
538
+ "⇒": 359,
539
+ "⇓": 360,
540
+ "⇔": 361,
541
+ "⇦": 362,
542
+ "⇧": 363,
543
+ "⇨": 364,
544
+ "⇱": 365,
545
+ "∀": 366,
546
+ "∂": 367,
547
+ "∃": 368,
548
+ "∅": 369,
549
+ "∆": 370,
550
+ "∇": 371,
551
+ "∈": 372,
552
+ "∉": 373,
553
+ "∍": 374,
554
+ "∎": 375,
555
+ "∏": 376,
556
+ "∐": 377,
557
+ "∑": 378,
558
+ "−": 379,
559
+ "∕": 380,
560
+ "∗": 381,
561
+ "∘": 382,
562
+ "∙": 383,
563
+ "√": 384,
564
+ "∛": 385,
565
+ "∝": 386,
566
+ "∞": 387,
567
+ "∟": 388,
568
+ "∠": 389,
569
+ "∢": 390,
570
+ "∧": 391,
571
+ "∨": 392,
572
+ "∩": 393,
573
+ "∪": 394,
574
+ "∫": 395,
575
+ "∴": 396,
576
+ "∼": 397,
577
+ "≅": 398,
578
+ "≈": 399,
579
+ "≋": 400,
580
+ "≟": 401,
581
+ "≠": 402,
582
+ "≡": 403,
583
+ "≤": 404,
584
+ "≥": 405,
585
+ "≦": 406,
586
+ "≧": 407,
587
+ "≪": 408,
588
+ "≫": 409,
589
+ "⊂": 410,
590
+ "⊃": 411,
591
+ "⊆": 412,
592
+ "⊇": 413,
593
+ "⊕": 414,
594
+ "⊗": 415,
595
+ "⊙": 416,
596
+ "⊞": 417,
597
+ "⊠": 418,
598
+ "⊢": 419,
599
+ "⊤": 420,
600
+ "⊦": 421,
601
+ "⋃": 422,
602
+ "⋄": 423,
603
+ "⋅": 424,
604
+ "⋆": 425,
605
+ "⋇": 426,
606
+ "⋧": 427,
607
+ "⋮": 428,
608
+ "⋯": 429,
609
+ "⌀": 430,
610
+ "⌂": 431,
611
+ "⌘": 432,
612
+ "⌚": 433,
613
+ "⌛": 434,
614
+ "⌥": 435,
615
+ "⎙": 436,
616
+ "⏎": 437,
617
+ "⏪": 438,
618
+ "⏮": 439,
619
+ "⏰": 440,
620
+ "⏱": 441,
621
+ "⏳": 442,
622
+ "⏺": 443,
623
+ "─": 444,
624
+ "│": 445,
625
+ "┐": 446,
626
+ "└": 447,
627
+ "┴": 448,
628
+ "╋": 449,
629
+ "║": 450,
630
+ "╬": 451,
631
+ "█": 452,
632
+ "▌": 453,
633
+ "░": 454,
634
+ "■": 455,
635
+ "□": 456,
636
+ "▪": 457,
637
+ "▫": 458,
638
+ "▲": 459,
639
+ "△": 460,
640
+ "▶": 461,
641
+ "▷": 462,
642
+ "▸": 463,
643
+ "►": 464,
644
+ "▼": 465,
645
+ "▽": 466,
646
+ "▾": 467,
647
+ "◀": 468,
648
+ "◁": 469,
649
+ "◂": 470,
650
+ "◃": 471,
651
+ "◄": 472,
652
+ "◆": 473,
653
+ "◇": 474,
654
+ "◈": 475,
655
+ "◉": 476,
656
+ "◊": 477,
657
+ "○": 478,
658
+ "◌": 479,
659
+ "◎": 480,
660
+ "●": 481,
661
+ "◕": 482,
662
+ "◘": 483,
663
+ "◙": 484,
664
+ "◡": 485,
665
+ "◥": 486,
666
+ "◦": 487,
667
+ "◴": 488,
668
+ "◻": 489,
669
+ "◼": 490,
670
+ "◽": 491,
671
+ "◾": 492,
672
+ "☀": 493,
673
+ "☁": 494,
674
+ "☂": 495,
675
+ "☃": 496,
676
+ "☄": 497,
677
+ "★": 498,
678
+ "☆": 499,
679
+ "☉": 500,
680
+ "☎": 501,
681
+ "☏": 502,
682
+ "☐": 503,
683
+ "☑": 504,
684
+ "☒": 505,
685
+ "☔": 506,
686
+ "☕": 507,
687
+ "☘": 508,
688
+ "☚": 509,
689
+ "☜": 510,
690
+ "☝": 511,
691
+ "☠": 512,
692
+ "☢": 513,
693
+ "☯": 514,
694
+ "☰": 515,
695
+ "☹": 516,
696
+ "☺": 517,
697
+ "☻": 518,
698
+ "☼": 519,
699
+ "♀": 520,
700
+ "♂": 521,
701
+ "♔": 522,
702
+ "♕": 523,
703
+ "♚": 524,
704
+ "♛": 525,
705
+ "♟": 526,
706
+ "♠": 527,
707
+ "♡": 528,
708
+ "♢": 529,
709
+ "♣": 530,
710
+ "♥": 531,
711
+ "♦": 532,
712
+ "♧": 533,
713
+ "♨": 534,
714
+ "♪": 535,
715
+ "♫": 536,
716
+ "♬": 537,
717
+ "♭": 538,
718
+ "♯": 539,
719
+ "♰": 540,
720
+ "♻": 541,
721
+ "♿": 542,
722
+ "⚇": 543,
723
+ "⚒": 544,
724
+ "⚓": 545,
725
+ "⚔": 546,
726
+ "⚖": 547,
727
+ "⚘": 548,
728
+ "⚛": 549,
729
+ "⚜": 550,
730
+ "⚠": 551,
731
+ "⚡": 552,
732
+ "⚧": 553,
733
+ "⚪": 554,
734
+ "⚫": 555,
735
+ "⚽": 556,
736
+ "⛔": 557,
737
+ "⛰": 558,
738
+ "✂": 559,
739
+ "✅": 560,
740
+ "✆": 561,
741
+ "✈": 562,
742
+ "✉": 563,
743
+ "✊": 564,
744
+ "✋": 565,
745
+ "✌": 566,
746
+ "✍": 567,
747
+ "✎": 568,
748
+ "✏": 569,
749
+ "✓": 570,
750
+ "✔": 571,
751
+ "✖": 572,
752
+ "✗": 573,
753
+ "✙": 574,
754
+ "✛": 575,
755
+ "✡": 576,
756
+ "✦": 577,
757
+ "✧": 578,
758
+ "✨": 579,
759
+ "✩": 580,
760
+ "✪": 581,
761
+ "✫": 582,
762
+ "✭": 583,
763
+ "✮": 584,
764
+ "✯": 585,
765
+ "✰": 586,
766
+ "✱": 587,
767
+ "✲": 588,
768
+ "✳": 589,
769
+ "✴": 590,
770
+ "✶": 591,
771
+ "✸": 592,
772
+ "✺": 593,
773
+ "✻": 594,
774
+ "✽": 595,
775
+ "✾": 596,
776
+ "✿": 597,
777
+ "❀": 598,
778
+ "❁": 599,
779
+ "❂": 600,
780
+ "❃": 601,
781
+ "❄": 602,
782
+ "❇": 603,
783
+ "❈": 604,
784
+ "❋": 605,
785
+ "❌": 606,
786
+ "❎": 607,
787
+ "❏": 608,
788
+ "❑": 609,
789
+ "❒": 610,
790
+ "❓": 611,
791
+ "❔": 612,
792
+ "❕": 613,
793
+ "❖": 614,
794
+ "❗": 615,
795
+ "❝": 616,
796
+ "❞": 617,
797
+ "❣": 618,
798
+ "❤": 619,
799
+ "❥": 620,
800
+ "❦": 621,
801
+ "❭": 622,
802
+ "❯": 623,
803
+ "❶": 624,
804
+ "❷": 625,
805
+ "❸": 626,
806
+ "➊": 627,
807
+ "➋": 628,
808
+ "➌": 629,
809
+ "➍": 630,
810
+ "➎": 631,
811
+ "➔": 632,
812
+ "➕": 633,
813
+ "➖": 634,
814
+ "➡": 635,
815
+ "➢": 636,
816
+ "➤": 637,
817
+ "➦": 638,
818
+ "⟨": 639,
819
+ "⟩": 640,
820
+ "⠀": 641,
821
+ "⤵": 642,
822
+ "⤶": 643,
823
+ "⦁": 644,
824
+ "⦿": 645,
825
+ "⧼": 646,
826
+ "⧽": 647,
827
+ "⬅": 648,
828
+ "⬆": 649,
829
+ "⬇": 650,
830
+ "⬛": 651,
831
+ "⬜": 652,
832
+ "⭐": 653,
833
+ "⭕": 654,
834
+ "ⰲ": 655,
835
+ "ⰽ": 656,
836
+ "ⰾ": 657,
837
+ "ⱀ": 658,
838
+ "ⱁ": 659,
839
+ "ⱄ": 660,
840
+ "ⱏ": 661,
841
+ "ⱐ": 662,
842
+ "ⱑ": 663,
843
+ "ⱥ": 664,
844
+ "ⲟ": 665,
845
+ "ⴰ": 666,
846
+ "ⴻ": 667,
847
+ "ⵍ": 668,
848
+ "ⵏ": 669,
849
+ "ⵔ": 670,
850
+ "ⵢ": 671,
851
+ "ⵣ": 672,
852
+ "、": 673,
853
+ "。": 674,
854
+ "〈": 675,
855
+ "〉": 676,
856
+ "《": 677,
857
+ "》": 678,
858
+ "「": 679,
859
+ "」": 680,
860
+ "【": 681,
861
+ "】": 682,
862
+ "ꙭ": 683,
863
+ "": 684,
864
+ "": 685,
865
+ "": 686,
866
+ "": 687,
867
+ "": 688,
868
+ "": 689,
869
+ "": 690,
870
+ "": 691,
871
+ "": 692,
872
+ "": 693,
873
+ "": 694,
874
+ "": 695,
875
+ "": 696,
876
+ "": 697,
877
+ "": 698,
878
+ "": 699,
879
+ "": 700,
880
+ "": 701,
881
+ "": 702,
882
+ "": 703,
883
+ "": 704,
884
+ "": 705,
885
+ "": 706,
886
+ "": 707,
887
+ "": 708,
888
+ "": 709,
889
+ "": 710,
890
+ "": 711,
891
+ "": 712,
892
+ "": 713,
893
+ "": 714,
894
+ "": 715,
895
+ "": 716,
896
+ "": 717,
897
+ "": 718,
898
+ "": 719,
899
+ "": 720,
900
+ "": 721,
901
+ "": 722,
902
+ "": 723,
903
+ "": 724,
904
+ "": 725,
905
+ "": 726,
906
+ "": 727,
907
+ "": 728,
908
+ "": 729,
909
+ "": 730,
910
+ "": 731,
911
+ "": 732,
912
+ "": 733,
913
+ "": 734,
914
+ "": 735,
915
+ "": 736,
916
+ "": 737,
917
+ "": 738,
918
+ "": 739,
919
+ "": 740,
920
+ "": 741,
921
+ "": 742,
922
+ "": 743,
923
+ "": 744,
924
+ "": 745,
925
+ "": 746,
926
+ "": 747,
927
+ "": 748,
928
+ "": 749,
929
+ "": 750,
930
+ "": 751,
931
+ "": 752,
932
+ "": 753,
933
+ "": 754,
934
+ "": 755,
935
+ "": 756,
936
+ "": 757,
937
+ "": 758,
938
+ "": 759,
939
+ "": 760,
940
+ "": 761,
941
+ "": 762,
942
+ "": 763,
943
+ "": 764,
944
+ "": 765,
945
+ "": 766,
946
+ "": 767,
947
+ "": 768,
948
+ "": 769,
949
+ "": 770,
950
+ "": 771,
951
+ "": 772,
952
+ "": 773,
953
+ "": 774,
954
+ "": 775,
955
+ "": 776,
956
+ "": 777,
957
+ "": 778,
958
+ "": 779,
959
+ "": 780,
960
+ "": 781,
961
+ "": 782,
962
+ "": 783,
963
+ "": 784,
964
+ "": 785,
965
+ "": 786,
966
+ "": 787,
967
+ "": 788,
968
+ "": 789,
969
+ "": 790,
970
+ "": 791,
971
+ "": 792,
972
+ "": 793,
973
+ "": 794,
974
+ "": 795,
975
+ "": 796,
976
+ "": 797,
977
+ "": 798,
978
+ "": 799,
979
+ "": 800,
980
+ "": 801,
981
+ "": 802,
982
+ "": 803,
983
+ "": 804,
984
+ "": 805,
985
+ "": 806,
986
+ "": 807,
987
+ "": 808,
988
+ "": 809,
989
+ "": 810,
990
+ "": 811,
991
+ "": 812,
992
+ "": 813,
993
+ "": 814,
994
+ "": 815,
995
+ "": 816,
996
+ "": 817,
997
+ "": 818,
998
+ "": 819,
999
+ "": 820,
1000
+ "": 821,
1001
+ "": 822,
1002
+ "": 823,
1003
+ "": 824,
1004
+ "": 825,
1005
+ "": 826,
1006
+ "": 827,
1007
+ "": 828,
1008
+ "": 829,
1009
+ "": 830,
1010
+ "": 831,
1011
+ "": 832,
1012
+ "": 833,
1013
+ "": 834,
1014
+ "": 835,
1015
+ "": 836,
1016
+ "": 837,
1017
+ "": 838,
1018
+ "": 839,
1019
+ "": 840
1020
+ }
1021
+ }
1022
+ }
dictabert-char-spacefix/tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "[BLANK]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ }
51
+ },
52
+ "clean_up_tokenization_spaces": true,
53
+ "cls_token": "[CLS]",
54
+ "do_lower_case": true,
55
+ "extra_special_tokens": {},
56
+ "mask_token": "[MASK]",
57
+ "model_max_length": 2048,
58
+ "pad_token": "[PAD]",
59
+ "sep_token": "[SEP]",
60
+ "strip_accents": null,
61
+ "tokenize_chinese_chars": true,
62
+ "tokenizer_class": "BertTokenizer",
63
+ "unk_token": "[UNK]"
64
+ }
dictabert-char-spacefix/vocab.txt ADDED
Binary file (3.01 kB). View file
 
dictabert-heq/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ vocab.txt filter=lfs diff=lfs merge=lfs -text
dictabert-heq/LICENSE ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Attribution 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More_considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution 4.0 International Public License
58
+
59
+ By exercising the Licensed Rights (defined below), You accept and agree
60
+ to be bound by the terms and conditions of this Creative Commons
61
+ Attribution 4.0 International Public License ("Public License"). To the
62
+ extent this Public License may be interpreted as a contract, You are
63
+ granted the Licensed Rights in consideration of Your acceptance of
64
+ these terms and conditions, and the Licensor grants You such rights in
65
+ consideration of benefits the Licensor receives from making the
66
+ Licensed Material available under these terms and conditions.
67
+
68
+
69
+ Section 1 -- Definitions.
70
+
71
+ a. Adapted Material means material subject to Copyright and Similar
72
+ Rights that is derived from or based upon the Licensed Material
73
+ and in which the Licensed Material is translated, altered,
74
+ arranged, transformed, or otherwise modified in a manner requiring
75
+ permission under the Copyright and Similar Rights held by the
76
+ Licensor. For purposes of this Public License, where the Licensed
77
+ Material is a musical work, performance, or sound recording,
78
+ Adapted Material is always produced where the Licensed Material is
79
+ synched in timed relation with a moving image.
80
+
81
+ b. Adapter's License means the license You apply to Your Copyright
82
+ and Similar Rights in Your contributions to Adapted Material in
83
+ accordance with the terms and conditions of this Public License.
84
+
85
+ c. Copyright and Similar Rights means copyright and/or similar rights
86
+ closely related to copyright including, without limitation,
87
+ performance, broadcast, sound recording, and Sui Generis Database
88
+ Rights, without regard to how the rights are labeled or
89
+ categorized. For purposes of this Public License, the rights
90
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
91
+ Rights.
92
+
93
+ d. Effective Technological Measures means those measures that, in the
94
+ absence of proper authority, may not be circumvented under laws
95
+ fulfilling obligations under Article 11 of the WIPO Copyright
96
+ Treaty adopted on December 20, 1996, and/or similar international
97
+ agreements.
98
+
99
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
100
+ any other exception or limitation to Copyright and Similar Rights
101
+ that applies to Your use of the Licensed Material.
102
+
103
+ f. Licensed Material means the artistic or literary work, database,
104
+ or other material to which the Licensor applied this Public
105
+ License.
106
+
107
+ g. Licensed Rights means the rights granted to You subject to the
108
+ terms and conditions of this Public License, which are limited to
109
+ all Copyright and Similar Rights that apply to Your use of the
110
+ Licensed Material and that the Licensor has authority to license.
111
+
112
+ h. Licensor means the individual(s) or entity(ies) granting rights
113
+ under this Public License.
114
+
115
+ i. Share means to provide material to the public by any means or
116
+ process that requires permission under the Licensed Rights, such
117
+ as reproduction, public display, public performance, distribution,
118
+ dissemination, communication, or importation, and to make material
119
+ available to the public including in ways that members of the
120
+ public may access the material from a place and at a time
121
+ individually chosen by them.
122
+
123
+ j. Sui Generis Database Rights means rights other than copyright
124
+ resulting from Directive 96/9/EC of the European Parliament and of
125
+ the Council of 11 March 1996 on the legal protection of databases,
126
+ as amended and/or succeeded, as well as other essentially
127
+ equivalent rights anywhere in the world.
128
+
129
+ k. You means the individual or entity exercising the Licensed Rights
130
+ under this Public License. Your has a corresponding meaning.
131
+
132
+
133
+ Section 2 -- Scope.
134
+
135
+ a. License grant.
136
+
137
+ 1. Subject to the terms and conditions of this Public License,
138
+ the Licensor hereby grants You a worldwide, royalty-free,
139
+ non-sublicensable, non-exclusive, irrevocable license to
140
+ exercise the Licensed Rights in the Licensed Material to:
141
+
142
+ a. reproduce and Share the Licensed Material, in whole or
143
+ in part; and
144
+
145
+ b. produce, reproduce, and Share Adapted Material.
146
+
147
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
148
+ Exceptions and Limitations apply to Your use, this Public
149
+ License does not apply, and You do not need to comply with
150
+ its terms and conditions.
151
+
152
+ 3. Term. The term of this Public License is specified in Section
153
+ 6(a).
154
+
155
+ 4. Media and formats; technical modifications allowed. The
156
+ Licensor authorizes You to exercise the Licensed Rights in
157
+ all media and formats whether now known or hereafter created,
158
+ and to make technical modifications necessary to do so. The
159
+ Licensor waives and/or agrees not to assert any right or
160
+ authority to forbid You from making technical modifications
161
+ necessary to exercise the Licensed Rights, including
162
+ technical modifications necessary to circumvent Effective
163
+ Technological Measures. For purposes of this Public License,
164
+ simply making modifications authorized by this Section 2(a)
165
+ (4) never produces Adapted Material.
166
+
167
+ 5. Downstream recipients.
168
+
169
+ a. Offer from the Licensor -- Licensed Material. Every
170
+ recipient of the Licensed Material automatically
171
+ receives an offer from the Licensor to exercise the
172
+ Licensed Rights under the terms and conditions of this
173
+ Public License.
174
+
175
+ b. No downstream restrictions. You may not offer or impose
176
+ any additional or different terms or conditions on, or
177
+ apply any Effective Technological Measures to, the
178
+ Licensed Material if doing so restricts exercise of the
179
+ Licensed Rights by any recipient of the Licensed
180
+ Material.
181
+
182
+ 6. No endorsement. Nothing in this Public License constitutes or
183
+ may be construed as permission to assert or imply that You
184
+ are, or that Your use of the Licensed Material is, connected
185
+ with, or sponsored, endorsed, or granted official status by,
186
+ the Licensor or others designated to receive attribution as
187
+ provided in Section 3(a)(1)(A)(i).
188
+
189
+ b. Other rights.
190
+
191
+ 1. Moral rights, such as the right of integrity, are not
192
+ licensed under this Public License, nor are publicity,
193
+ privacy, and/or other similar personality rights; however, to
194
+ the extent possible, the Licensor waives and/or agrees not to
195
+ assert any such rights held by the Licensor to the limited
196
+ extent necessary to allow You to exercise the Licensed
197
+ Rights, but not otherwise.
198
+
199
+ 2. Patent and trademark rights are not licensed under this
200
+ Public License.
201
+
202
+ 3. To the extent possible, the Licensor waives any right to
203
+ collect royalties from You for the exercise of the Licensed
204
+ Rights, whether directly or through a collecting society
205
+ under any voluntary or waivable statutory or compulsory
206
+ licensing scheme. In all other cases the Licensor expressly
207
+ reserves any right to collect such royalties.
208
+
209
+
210
+ Section 3 -- License Conditions.
211
+
212
+ Your exercise of the Licensed Rights is expressly made subject to the
213
+ following conditions.
214
+
215
+ a. Attribution.
216
+
217
+ 1. If You Share the Licensed Material (including in modified
218
+ form), You must:
219
+
220
+ a. retain the following if it is supplied by the Licensor
221
+ with the Licensed Material:
222
+
223
+ i. identification of the creator(s) of the Licensed
224
+ Material and any others designated to receive
225
+ attribution, in any reasonable manner requested by
226
+ the Licensor (including by pseudonym if
227
+ designated);
228
+
229
+ ii. a copyright notice;
230
+
231
+ iii. a notice that refers to this Public License;
232
+
233
+ iv. a notice that refers to the disclaimer of
234
+ warranties;
235
+
236
+ v. a URI or hyperlink to the Licensed Material to the
237
+ extent reasonably practicable;
238
+
239
+ b. indicate if You modified the Licensed Material and
240
+ retain an indication of any previous modifications; and
241
+
242
+ c. indicate the Licensed Material is licensed under this
243
+ Public License, and include the text of, or the URI or
244
+ hyperlink to, this Public License.
245
+
246
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
247
+ reasonable manner based on the medium, means, and context in
248
+ which You Share the Licensed Material. For example, it may be
249
+ reasonable to satisfy the conditions by providing a URI or
250
+ hyperlink to a resource that includes the required
251
+ information.
252
+
253
+ 3. If requested by the Licensor, You must remove any of the
254
+ information required by Section 3(a)(1)(A) to the extent
255
+ reasonably practicable.
256
+
257
+ 4. If You Share Adapted Material You produce, the Adapter's
258
+ License You apply must not prevent recipients of the Adapted
259
+ Material from complying with this Public License.
260
+
261
+
262
+ Section 4 -- Sui Generis Database Rights.
263
+
264
+ Where the Licensed Rights include Sui Generis Database Rights that
265
+ apply to Your use of the Licensed Material:
266
+
267
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
268
+ to extract, reuse, reproduce, and Share all or a substantial
269
+ portion of the contents of the database;
270
+
271
+ b. if You include all or a substantial portion of the database
272
+ contents in a database in which You have Sui Generis Database
273
+ Rights, then the database in which You have Sui Generis Database
274
+ Rights (but not its individual contents) is Adapted Material; and
275
+
276
+ c. You must comply with the conditions in Section 3(a) if You Share
277
+ all or a substantial portion of the contents of the database.
278
+
279
+ For the avoidance of doubt, this Section 4 supplements and does not
280
+ replace Your obligations under this Public License where the Licensed
281
+ Rights include other Copyright and Similar Rights.
282
+
283
+
284
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
285
+
286
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
287
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
288
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
289
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
290
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
291
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
292
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
293
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
294
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
295
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
296
+
297
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
298
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
299
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
300
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
301
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
302
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
303
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
304
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
305
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
306
+
307
+ c. The disclaimer of warranties and limitation of liability provided
308
+ above shall be interpreted in a manner that, to the extent
309
+ possible, most closely approximates an absolute disclaimer and
310
+ waiver of all liability.
311
+
312
+
313
+ Section 6 -- Term and Termination.
314
+
315
+ a. This Public License applies for the term of the Copyright and
316
+ Similar Rights licensed here. However, if You fail to comply with
317
+ this Public License, then Your rights under this Public License
318
+ terminate automatically.
319
+
320
+ b. Where Your right to use the Licensed Material has terminated under
321
+ Section 6(a), it reinstates:
322
+
323
+ 1. automatically as of the date the violation is cured, provided
324
+ it is cured within 30 days of Your discovery of the
325
+ violation; or
326
+
327
+ 2. upon express reinstatement by the Licensor.
328
+
329
+ For the avoidance of doubt, this Section 6(b) does not affect any
330
+ right the Licensor may have to seek remedies for Your violations
331
+ of this Public License.
332
+
333
+ c. For the avoidance of doubt, the Licensor may also offer the
334
+ Licensed Material under separate terms or conditions or stop
335
+ distributing the Licensed Material at any time; however, doing so
336
+ will not terminate this Public License.
337
+
338
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
339
+ License.
340
+
341
+
342
+ Section 7 -- Other Terms and Conditions.
343
+
344
+ a. The Licensor shall not be bound by any additional or different
345
+ terms or conditions communicated by You unless expressly agreed.
346
+
347
+ b. Any arrangements, understandings, or agreements regarding the
348
+ Licensed Material not stated herein are separate from and
349
+ independent of the terms and conditions of this Public License.
350
+
351
+
352
+ Section 8 -- Interpretation.
353
+
354
+ a. For the avoidance of doubt, this Public License does not, and
355
+ shall not be interpreted to, reduce, limit, restrict, or impose
356
+ conditions on any use of the Licensed Material that could lawfully
357
+ be made without permission under this Public License.
358
+
359
+ b. To the extent possible, if any provision of this Public License is
360
+ deemed unenforceable, it shall be automatically reformed to the
361
+ minimum extent necessary to make it enforceable. If the provision
362
+ cannot be reformed, it shall be severed from this Public License
363
+ without affecting the enforceability of the remaining terms and
364
+ conditions.
365
+
366
+ c. No term or condition of this Public License will be waived and no
367
+ failure to comply consented to unless expressly agreed to by the
368
+ Licensor.
369
+
370
+ d. Nothing in this Public License constitutes or may be interpreted
371
+ as a limitation upon, or waiver of, any privileges and immunities
372
+ that apply to the Licensor or You, including from the legal
373
+ processes of any jurisdiction or authority.
374
+
375
+
376
+ =======================================================================
377
+
378
+ Creative Commons is not a party to its public
379
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
380
+ its public licenses to material it publishes and in those instances
381
+ will be considered the “Licensor.” The text of the Creative Commons
382
+ public licenses is dedicated to the public domain under the CC0 Public
383
+ Domain Dedication. Except for the limited purpose of indicating that
384
+ material is shared under a Creative Commons public license or as
385
+ otherwise permitted by the Creative Commons policies published at
386
+ creativecommons.org/policies, Creative Commons does not authorize the
387
+ use of the trademark "Creative Commons" or any other trademark or logo
388
+ of Creative Commons without its prior written consent including,
389
+ without limitation, in connection with any unauthorized modifications
390
+ to any of its public licenses or any other arrangements,
391
+ understandings, or agreements concerning use of licensed material. For
392
+ the avoidance of doubt, this paragraph does not form part of the
393
+ public licenses.
394
+
395
+ Creative Commons may be contacted at creativecommons.org.
dictabert-heq/README.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-4.0
3
+ language:
4
+ - he
5
+ inference: false
6
+ ---
7
+ # DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew
8
+
9
+ State-of-the-art language model for Hebrew, released [here](https://arxiv.org/abs/2308.16687).
10
+
11
+ This is the fine-tuned model for the question-answering task using the [HeQ](https://u.cs.biu.ac.il/~yogo/heq.pdf) dataset.
12
+
13
+ For the bert-base models for other tasks, see [here](https://huggingface.co/collections/dicta-il/dictabert-6588e7cc08f83845fc42a18b).
14
+
15
+ Sample usage:
16
+
17
+ ```python
18
+ from transformers import pipeline
19
+
20
+ oracle = pipeline('question-answering', model='dicta-il/dictabert-heq')
21
+
22
+
23
+ context = 'בניית פרופילים של משתמשים נחשבת על ידי רבים כאיום פוטנציאלי על הפרטיות. מסיבה זו הגבילו חלק מהמדינות באמצעות חקיקה את המידע שניתן להשיג באמצעות עוגיות ואת אופן השימוש בעוגיות. ארצות הברית, למשל, קבעה חוקים נוקשים בכל הנוגע ליצירת עוגיות חדשות. חוקים אלו, אשר נקבעו בשנת 2000, נקבעו לאחר שנחשף כי המשרד ליישום המדיניות של הממשל האמריקאי נגד השימוש בסמים (ONDCP) בבית הלבן השתמש בעוגיות כדי לעקוב אחרי משתמשים שצפו בפרסומות נגד השימוש בסמים במטרה לבדוק האם משתמשים אלו נכנסו לאתרים התומכים בשימוש בסמים. דניאל בראנט, פעיל הדוגל בפרטיות המשתמשים באינטרנט, חשף כי ה-CIA שלח עוגיות קבועות למחשבי אזרחים במשך עשר שנים. ב-25 בדצמבר 2005 גילה בראנט כי הסוכנות לביטחון לאומי (ה-NSA) השאירה שתי עוגיות קבועות במחשבי מבקרים בגלל שדרוג תוכנה. לאחר שהנושא פורסם, הם ביטלו מיד את השימוש בהן.'
24
+ question = 'כיצד הוגבל המידע שניתן להשיג באמצעות העוגיות?'
25
+
26
+ oracle(question=question, context=context)
27
+ ```
28
+
29
+ Output:
30
+ ```json
31
+ {
32
+ "score": 0.998887836933136,
33
+ "start": 101,
34
+ "end": 114,
35
+ "answer": "באמצעות חקיקה"
36
+ }
37
+ ```
38
+
39
+ ## Citation
40
+
41
+ If you use DictaBERT in your research, please cite ```DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew```
42
+
43
+ **BibTeX:**
44
+
45
+ ```bibtex
46
+ @misc{shmidman2023dictabert,
47
+ title={DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew},
48
+ author={Shaltiel Shmidman and Avi Shmidman and Moshe Koppel},
49
+ year={2023},
50
+ eprint={2308.16687},
51
+ archivePrefix={arXiv},
52
+ primaryClass={cs.CL}
53
+ }
54
+ ```
55
+
56
+ ## License
57
+
58
+ Shield: [![CC BY 4.0][cc-by-shield]][cc-by]
59
+
60
+ This work is licensed under a
61
+ [Creative Commons Attribution 4.0 International License][cc-by].
62
+
63
+ [![CC BY 4.0][cc-by-image]][cc-by]
64
+
65
+ [cc-by]: http://creativecommons.org/licenses/by/4.0/
66
+ [cc-by-image]: https://i.creativecommons.org/l/by/4.0/88x31.png
67
+ [cc-by-shield]: https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg
68
+
69
+
70
+
71
+
dictabert-heq/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForQuestionAnswering"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "newmodern": true,
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.31.0",
23
+ "type_vocab_size": 2,
24
+ "use_cache": true,
25
+ "vocab_size": 128000
26
+ }
dictabert-heq/issues.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ------------------------------------------------------------------------------
2
+ #1 Adding `safetensors` variant of this model
3
+ ------------------------------------------------------------------------------
4
+
5
+ [SFconvertbot] Oct 27, 2024
6
+
7
+ This is an automated PR created with https://huggingface.co/spaces/safetensors/convert
8
+
9
+ This new file is equivalent to pytorch_model.bin but safe in the sense that
10
+ no arbitrary code can be put into it.
11
+
12
+ These files also happen to load much faster than their pytorch counterpart:
13
+ https://colab.research.google.com/github/huggingface/notebooks/blob/main/safetensors_doc/en/speed.ipynb
14
+
15
+ The widgets on your model page will run using this model even if this is not merged
16
+ making sure the file actually works.
17
+
18
+ If you find any issues: please report here: https://huggingface.co/spaces/safetensors/convert/discussions
19
+
20
+ Feel free to ignore this PR.
dictabert-heq/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3d14a6e3dabe859df3862034077116fa796cfdf963c39ad05e99c9d2b375681
3
+ size 735092905
dictabert-heq/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/dicta-il/dictabert-heq
dictabert-heq/speed.ipynb ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "Sm51_Do2Uh_y"
7
+ },
8
+ "source": [
9
+ "<!-- DISABLE-FRONTMATTER-SECTIONS -->"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {
15
+ "id": "hiKkSKRdUh_0"
16
+ },
17
+ "source": [
18
+ "# Speed Comparison"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "markdown",
23
+ "metadata": {
24
+ "id": "63f59hZcUh_0"
25
+ },
26
+ "source": [
27
+ "`Safetensors` is really fast. Let's compare it against `PyTorch` by loading [gpt2](https://huggingface.co/gpt2) weights. To run the [GPU benchmark](#gpu-benchmark), make sure your machine has GPU or you have selected `GPU runtime` if you are using Google Colab.\n",
28
+ "\n",
29
+ "Before you begin, make sure you have all the necessary libraries installed:"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "metadata": {
36
+ "id": "FVUx03_SUh_0"
37
+ },
38
+ "outputs": [],
39
+ "source": [
40
+ "!pip install safetensors huggingface_hub torch"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "markdown",
45
+ "metadata": {
46
+ "id": "lKRAmkNBUh_1"
47
+ },
48
+ "source": [
49
+ "Let's start by importing all the packages that will be used:"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": null,
55
+ "metadata": {
56
+ "id": "aj8sFZhlUh_1"
57
+ },
58
+ "outputs": [],
59
+ "source": [
60
+ "import os\n",
61
+ "import datetime\n",
62
+ "from huggingface_hub import hf_hub_download\n",
63
+ "from safetensors.torch import load_file\n",
64
+ "import torch"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "markdown",
69
+ "metadata": {
70
+ "id": "Ddd_qKtzUh_1"
71
+ },
72
+ "source": [
73
+ "Download safetensors & torch weights for gpt2:"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "metadata": {
80
+ "id": "-7ESrRyDUh_2"
81
+ },
82
+ "outputs": [],
83
+ "source": [
84
+ "sf_filename = hf_hub_download(\"gpt2\", filename=\"model.safetensors\")\n",
85
+ "pt_filename = hf_hub_download(\"gpt2\", filename=\"pytorch_model.bin\")"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "markdown",
90
+ "metadata": {
91
+ "id": "jeriUWJxUh_2"
92
+ },
93
+ "source": [
94
+ "### CPU benchmark"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": null,
100
+ "metadata": {
101
+ "id": "jclEP0Z8Uh_2",
102
+ "outputId": "3b057f7e-d98a-458f-ab19-df77a3e38b55"
103
+ },
104
+ "outputs": [
105
+ {
106
+ "data": {
107
+ "text/plain": [
108
+ "Loaded safetensors 0:00:00.004015\n",
109
+ "Loaded pytorch 0:00:00.307460\n",
110
+ "on CPU, safetensors is faster than pytorch by: 76.6 X"
111
+ ]
112
+ },
113
+ "execution_count": null,
114
+ "metadata": {},
115
+ "output_type": "execute_result"
116
+ }
117
+ ],
118
+ "source": [
119
+ "start_st = datetime.datetime.now()\n",
120
+ "weights = load_file(sf_filename, device=\"cpu\")\n",
121
+ "load_time_st = datetime.datetime.now() - start_st\n",
122
+ "print(f\"Loaded safetensors {load_time_st}\")\n",
123
+ "\n",
124
+ "start_pt = datetime.datetime.now()\n",
125
+ "weights = torch.load(pt_filename, map_location=\"cpu\")\n",
126
+ "load_time_pt = datetime.datetime.now() - start_pt\n",
127
+ "print(f\"Loaded pytorch {load_time_pt}\")\n",
128
+ "\n",
129
+ "print(f\"on CPU, safetensors is faster than pytorch by: {load_time_pt/load_time_st:.1f} X\")"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "markdown",
134
+ "metadata": {
135
+ "id": "bJ0AvxqSUh_2"
136
+ },
137
+ "source": [
138
+ "This speedup is due to the fact that this library avoids unnecessary copies by mapping the file directly. It is actually possible to do on [pure pytorch](https://gist.github.com/Narsil/3edeec2669a5e94e4707aa0f901d2282).\n",
139
+ "The currently shown speedup was gotten on:\n",
140
+ "* OS: Ubuntu 18.04.6 LTS\n",
141
+ "* CPU: Intel(R) Xeon(R) CPU @ 2.00GHz"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "markdown",
146
+ "metadata": {
147
+ "id": "THYSS8dZUh_3"
148
+ },
149
+ "source": [
150
+ "### GPU benchmark"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": null,
156
+ "metadata": {
157
+ "id": "B0uQ1T3oUh_3",
158
+ "outputId": "98bd8eb2-b82a-4200-f3d4-6c2399a28647"
159
+ },
160
+ "outputs": [
161
+ {
162
+ "data": {
163
+ "text/plain": [
164
+ "Loaded safetensors 0:00:00.165206\n",
165
+ "Loaded pytorch 0:00:00.353889\n",
166
+ "on GPU, safetensors is faster than pytorch by: 2.1 X"
167
+ ]
168
+ },
169
+ "execution_count": null,
170
+ "metadata": {},
171
+ "output_type": "execute_result"
172
+ }
173
+ ],
174
+ "source": [
175
+ "# This is required because this feature hasn't been fully verified yet, but\n",
176
+ "# it's been tested on many different environments\n",
177
+ "os.environ[\"SAFETENSORS_FAST_GPU\"] = \"1\"\n",
178
+ "\n",
179
+ "# CUDA startup out of the measurement\n",
180
+ "torch.zeros((2, 2)).cuda()\n",
181
+ "\n",
182
+ "start_st = datetime.datetime.now()\n",
183
+ "weights = load_file(sf_filename, device=\"cuda:0\")\n",
184
+ "load_time_st = datetime.datetime.now() - start_st\n",
185
+ "print(f\"Loaded safetensors {load_time_st}\")\n",
186
+ "\n",
187
+ "start_pt = datetime.datetime.now()\n",
188
+ "weights = torch.load(pt_filename, map_location=\"cuda:0\")\n",
189
+ "load_time_pt = datetime.datetime.now() - start_pt\n",
190
+ "print(f\"Loaded pytorch {load_time_pt}\")\n",
191
+ "\n",
192
+ "print(f\"on GPU, safetensors is faster than pytorch by: {load_time_pt/load_time_st:.1f} X\")"
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "markdown",
197
+ "metadata": {
198
+ "id": "0zjAzX_FUh_3"
199
+ },
200
+ "source": [
201
+ "The speedup works because this library is able to skip unnecessary CPU allocations. It is unfortunately not replicable in pure pytorch as far as we know. The library works by memory mapping the file, creating the tensor empty with pytorch and calling `cudaMemcpy` directly to move the tensor onto the GPU.\n",
202
+ "The currently shown speedup was gotten on:\n",
203
+ "* OS: Ubuntu 18.04.6 LTS.\n",
204
+ "* GPU: Tesla T4\n",
205
+ "* Driver Version: 460.32.03\n",
206
+ "* CUDA Version: 11.2"
207
+ ]
208
+ }
209
+ ],
210
+ "metadata": {
211
+ "language_info": {
212
+ "name": "python"
213
+ },
214
+ "colab": {
215
+ "provenance": []
216
+ }
217
+ },
218
+ "nbformat": 4,
219
+ "nbformat_minor": 0
220
+ }
dictabert-heq/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
dictabert-heq/tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "mask_token": "[MASK]",
6
+ "model_max_length": 512,
7
+ "pad_token": "[PAD]",
8
+ "sep_token": "[SEP]",
9
+ "strip_accents": null,
10
+ "tokenize_chinese_chars": true,
11
+ "tokenizer_class": "BertTokenizer",
12
+ "unk_token": "[UNK]"
13
+ }
dictabert-heq/vocab.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fb90bfa35244d26f0065d1fcd0b5becc3da3d44d616a7e2aacaf6320b9fa2d0
3
+ size 1500244
dictabert-joint/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ vocab.txt filter=lfs diff=lfs merge=lfs -text
dictabert-joint/BertForJointParsing.py ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ import re
3
+ from operator import itemgetter
4
+ import torch
5
+ from torch import nn
6
+ from typing import Any, Dict, List, Literal, Optional, Tuple, Union
7
+ from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast
8
+ from transformers.models.bert.modeling_bert import BertOnlyMLMHead
9
+ from transformers.utils import ModelOutput
10
+ from .BertForSyntaxParsing import BertSyntaxParsingHead, SyntaxLabels, SyntaxLogitsOutput, parse_logits as syntax_parse_logits
11
+ from .BertForPrefixMarking import BertPrefixMarkingHead, parse_logits as prefix_parse_logits, encode_sentences_for_bert_for_prefix_marking, get_prefixes_from_str
12
+ from .BertForMorphTagging import BertMorphTaggingHead, MorphLogitsOutput, MorphLabels, parse_logits as morph_parse_logits
13
+
14
+ import warnings
15
+
16
@dataclass
class JointParsingOutput(ModelOutput):
    """Aggregated output of BertForJointParsing.forward.

    When training, `loss`/`logits` hold the loss and predictions of the single
    head selected by `labels_type`; the per-head `*_logits` fields are filled
    for every head that ran. When no labels are given, `logits` is left None
    and only the per-head fields are populated.
    """
    loss: Optional[torch.FloatTensor] = None
    # logits will contain the optional predictions for the given labels
    logits: Optional[Union[SyntaxLogitsOutput, None]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    # if no labels are given, we will always include the syntax logits separately
    syntax_logits: Optional[SyntaxLogitsOutput] = None
    ner_logits: Optional[torch.FloatTensor] = None
    prefix_logits: Optional[torch.FloatTensor] = None
    lex_logits: Optional[torch.FloatTensor] = None
    morph_logits: Optional[MorphLogitsOutput] = None
29
+
30
class ModuleRef:
    """Unregistered handle around a torch.nn.Module.

    Storing the same module under several linked attributes of a parent model
    would register its parameters more than once; keeping it behind this plain
    wrapper avoids that while still letting callers invoke it transparently.
    """

    def __init__(self, module: torch.nn.Module):
        self.module = module

    def forward(self, *args, **kwargs):
        # Delegate explicitly to the wrapped module's forward().
        return self.module.forward(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        # Delegate the call protocol to the wrapped module itself.
        return self.module(*args, **kwargs)
41
+
42
+ class BertForJointParsing(BertPreTrainedModel):
43
+ _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]
44
+
45
+ def __init__(self, config, do_syntax=None, do_ner=None, do_prefix=None, do_lex=None, do_morph=None, syntax_head_size=64):
46
+ super().__init__(config)
47
+
48
+ self.bert = BertModel(config, add_pooling_layer=False)
49
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
50
+ # create all the heads as None, and then populate them as defined
51
+ self.syntax, self.ner, self.prefix, self.lex, self.morph = (None,)*5
52
+
53
+ if do_syntax is not None:
54
+ config.do_syntax = do_syntax
55
+ config.syntax_head_size = syntax_head_size
56
+ if do_ner is not None: config.do_ner = do_ner
57
+ if do_prefix is not None: config.do_prefix = do_prefix
58
+ if do_lex is not None: config.do_lex = do_lex
59
+ if do_morph is not None: config.do_morph = do_morph
60
+
61
+ # add all the individual heads
62
+ if config.do_syntax:
63
+ self.syntax = BertSyntaxParsingHead(config)
64
+ if config.do_ner:
65
+ self.num_labels = config.num_labels
66
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels) # name it same as in BertForTokenClassification
67
+ self.ner = ModuleRef(self.classifier)
68
+ if config.do_prefix:
69
+ self.prefix = BertPrefixMarkingHead(config)
70
+ if config.do_lex:
71
+ self.cls = BertOnlyMLMHead(config) # name it the same as in BertForMaskedLM
72
+ self.lex = ModuleRef(self.cls)
73
+ if config.do_morph:
74
+ self.morph = BertMorphTaggingHead(config)
75
+
76
+ # Initialize weights and apply final processing
77
+ self.post_init()
78
+
79
+ def get_output_embeddings(self):
80
+ return self.cls.predictions.decoder if self.lex is not None else None
81
+
82
+ def set_output_embeddings(self, new_embeddings):
83
+ if self.lex is not None:
84
+
85
+ self.cls.predictions.decoder = new_embeddings
86
+
87
+ def forward(
88
+ self,
89
+ input_ids: Optional[torch.Tensor] = None,
90
+ attention_mask: Optional[torch.Tensor] = None,
91
+ token_type_ids: Optional[torch.Tensor] = None,
92
+ position_ids: Optional[torch.Tensor] = None,
93
+ prefix_class_id_options: Optional[torch.Tensor] = None,
94
+ labels: Optional[Union[SyntaxLabels, MorphLabels, torch.Tensor]] = None,
95
+ labels_type: Optional[Literal['syntax', 'ner', 'prefix', 'lex', 'morph']] = None,
96
+ head_mask: Optional[torch.Tensor] = None,
97
+ inputs_embeds: Optional[torch.Tensor] = None,
98
+ output_attentions: Optional[bool] = None,
99
+ output_hidden_states: Optional[bool] = None,
100
+ return_dict: Optional[bool] = None,
101
+ compute_syntax_mst: Optional[bool] = None
102
+ ):
103
+ if return_dict is False:
104
+ warnings.warn("Specified `return_dict=False` but the flag is ignored and treated as always True in this model.")
105
+
106
+ if labels is not None and labels_type is None:
107
+ raise ValueError("Cannot specify labels without labels_type")
108
+
109
+ if labels_type == 'seg' and prefix_class_id_options is None:
110
+ raise ValueError('Cannot calculate prefix logits without prefix_class_id_options')
111
+
112
+ if compute_syntax_mst is not None and self.syntax is None:
113
+ raise ValueError("Cannot compute syntax MST when the syntax head isn't loaded")
114
+
115
+
116
+ bert_outputs = self.bert(
117
+ input_ids,
118
+ attention_mask=attention_mask,
119
+ token_type_ids=token_type_ids,
120
+ position_ids=position_ids,
121
+ head_mask=head_mask,
122
+ inputs_embeds=inputs_embeds,
123
+ output_attentions=output_attentions,
124
+ output_hidden_states=output_hidden_states,
125
+ return_dict=True,
126
+ )
127
+
128
+ # calculate the extended attention mask for any child that might need it
129
+ extended_attention_mask = None
130
+ if attention_mask is not None:
131
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_ids.size())
132
+
133
+ # extract the hidden states, and apply the dropout
134
+ hidden_states = self.dropout(bert_outputs[0])
135
+
136
+ logits = None
137
+ syntax_logits = None
138
+ ner_logits = None
139
+ prefix_logits = None
140
+ lex_logits = None
141
+ morph_logits = None
142
+
143
+ # Calculate the syntax
144
+ if self.syntax is not None and (labels is None or labels_type == 'syntax'):
145
+ # apply the syntax head
146
+ loss, syntax_logits = self.syntax(hidden_states, extended_attention_mask, labels, compute_syntax_mst)
147
+ logits = syntax_logits
148
+
149
+ # Calculate the NER
150
+ if self.ner is not None and (labels is None or labels_type == 'ner'):
151
+ ner_logits = self.ner(hidden_states)
152
+ logits = ner_logits
153
+ if labels is not None:
154
+ loss_fct = nn.CrossEntropyLoss()
155
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
156
+
157
+ # Calculate the segmentation
158
+ if self.prefix is not None and (labels is None or labels_type == 'prefix'):
159
+ loss, prefix_logits = self.prefix(hidden_states, prefix_class_id_options, labels)
160
+ logits = prefix_logits
161
+
162
+ # Calculate the lexeme
163
+ if self.lex is not None and (labels is None or labels_type == 'lex'):
164
+ lex_logits = self.lex(hidden_states)
165
+ logits = lex_logits
166
+ if labels is not None:
167
+ loss_fct = nn.CrossEntropyLoss() # -100 index = padding token
168
+ loss = loss_fct(lex_logits.view(-1, self.config.vocab_size), labels.view(-1))
169
+
170
+ if self.morph is not None and (labels is None or labels_type == 'morph'):
171
+ loss, morph_logits = self.morph(hidden_states, labels)
172
+ logits = morph_logits
173
+
174
+ # no labels => logits = None
175
+ if labels is None: logits = None
176
+
177
+ return JointParsingOutput(
178
+ loss,
179
+ logits,
180
+ hidden_states=bert_outputs.hidden_states,
181
+ attentions=bert_outputs.attentions,
182
+ # all the predicted logits section
183
+ syntax_logits=syntax_logits,
184
+ ner_logits=ner_logits,
185
+ prefix_logits=prefix_logits,
186
+ lex_logits=lex_logits,
187
+ morph_logits=morph_logits
188
+ )
189
+
190
+ def predict(self, sentences: Union[str, List[str]], tokenizer: BertTokenizerFast, padding='longest', truncation=True, compute_syntax_mst=True, per_token_ner=False, output_style: Literal['json', 'ud', 'iahlt_ud'] = 'json'):
191
+ is_single_sentence = isinstance(sentences, str)
192
+ if is_single_sentence:
193
+ sentences = [sentences]
194
+
195
+ if output_style not in ['json', 'ud', 'iahlt_ud']:
196
+ raise ValueError('output_style must be in json/ud/iahlt_ud')
197
+ if output_style in ['ud', 'iahlt_ud'] and (self.prefix is None or self.morph is None or self.syntax is None or self.lex is None):
198
+ raise ValueError("Cannot output UD format when any of the prefix,morph,syntax, and lex heads aren't loaded.")
199
+
200
+ # predict the logits for the sentence
201
+ if self.prefix is not None:
202
+ inputs = encode_sentences_for_bert_for_prefix_marking(tokenizer, self.config.prefix_cfg, sentences, padding)
203
+ else:
204
+ inputs = tokenizer(sentences, padding=padding, truncation=truncation, return_offsets_mapping=True, return_tensors='pt')
205
+
206
+ offset_mapping = inputs.pop('offset_mapping')
207
+ # Copy the tensors to the right device, and parse!
208
+ inputs = {k:v.to(self.device) for k,v in inputs.items()}
209
+ output = self.forward(**inputs, return_dict=True, compute_syntax_mst=compute_syntax_mst)
210
+
211
+ input_ids = inputs['input_ids'].tolist() # convert once
212
+ final_output = [dict(text=sentence, tokens=combine_token_wordpieces(ids, offsets, tokenizer)) for sentence, ids, offsets in zip(sentences, input_ids, offset_mapping)]
213
+ # Syntax logits: each sentence gets a dict(tree: List[dict(word,dep_head,dep_head_idx,dep_func)], root_idx: int)
214
+ if output.syntax_logits is not None:
215
+ for sent_idx,parsed in enumerate(syntax_parse_logits(input_ids, sentences, tokenizer, output.syntax_logits)):
216
+ merge_token_list(final_output[sent_idx]['tokens'], parsed['tree'], 'syntax')
217
+ final_output[sent_idx]['root_idx'] = parsed['root_idx']
218
+
219
+ # Prefix logits: each sentence gets a list([prefix_segment, word_without_prefix]) - **WITH CLS & SEP**
220
+ if output.prefix_logits is not None:
221
+ for sent_idx,parsed in enumerate(prefix_parse_logits(input_ids, sentences, tokenizer, output.prefix_logits, self.config.prefix_cfg)):
222
+ merge_token_list(final_output[sent_idx]['tokens'], map(tuple, parsed[1:-1]), 'seg')
223
+
224
+ # Lex logits each sentence gets a list(tuple(word, lexeme))
225
+ if output.lex_logits is not None:
226
+ for sent_idx, parsed in enumerate(lex_parse_logits(input_ids, sentences, tokenizer, output.lex_logits)):
227
+ merge_token_list(final_output[sent_idx]['tokens'], map(itemgetter(1), parsed), 'lex')
228
+
229
+ # morph logits each sentences get a dict(text=str, tokens=list(dict(token, pos, feats, prefixes, suffix, suffix_feats?)))
230
+ if output.morph_logits is not None:
231
+ for sent_idx,parsed in enumerate(morph_parse_logits(input_ids, sentences, tokenizer, output.morph_logits)):
232
+ merge_token_list(final_output[sent_idx]['tokens'], parsed['tokens'], 'morph')
233
+
234
+ # NER logits each sentence gets a list(tuple(word, ner))
235
+ if output.ner_logits is not None:
236
+ for sent_idx,parsed in enumerate(ner_parse_logits(input_ids, sentences, tokenizer, output.ner_logits, self.config.id2label)):
237
+ if per_token_ner:
238
+ merge_token_list(final_output[sent_idx]['tokens'], map(itemgetter(1), parsed), 'ner')
239
+ final_output[sent_idx]['ner_entities'] = aggregate_ner_tokens(final_output[sent_idx], parsed)
240
+
241
+ if output_style in ['ud', 'iahlt_ud']:
242
+ final_output = convert_output_to_ud(final_output, self.config, style='htb' if output_style == 'ud' else 'iahlt')
243
+
244
+ if is_single_sentence:
245
+ final_output = final_output[0]
246
+ return final_output
247
+
248
+
249
+
250
def aggregate_ner_tokens(final_output, parsed):
    """Group per-token BIO predictions into entity spans.

    `parsed` is a list of (word, tag) pairs aligned with
    `final_output['tokens']` (each token dict provides character offsets).
    Returns one dict per entity: phrase (words joined by spaces), label,
    character start/end, and token_start/token_end indices.
    """
    spans = []
    open_label = None
    for idx, (tok, (word, tag)) in enumerate(zip(final_output['tokens'], parsed)):
        if tag == 'O':
            # Outside any entity: close whatever span was open.
            open_label = None
        elif tag.startswith('B-') or tag[2:] != open_label:
            # Explicit begin, or an I- tag whose type doesn't continue the open span.
            open_label = tag[2:]
            meta = dict(
                label=open_label,
                start=tok['offsets']['start'],
                end=tok['offsets']['end'],
                token_start=idx,
                token_end=idx,
            )
            spans.append(([word], meta))
        else:
            # Continuation: extend the open span through this token.
            words, meta = spans[-1]
            words.append(word)
            meta['end'] = tok['offsets']['end']
            meta['token_end'] = idx

    return [dict(phrase=' '.join(words), **meta) for words, meta in spans]
266
+
267
def merge_token_list(src, update, key):
    """Attach each value from `update` to the positionally-paired dict in `src`
    under `key`. Mutates `src` in place; pairing stops at the shorter sequence.
    """
    for target, value in zip(src, update):
        target[key] = value
270
+
271
def combine_token_wordpieces(input_ids: List[int], offset_mapping: torch.Tensor, tokenizer: BertTokenizerFast):
    """Merge '##' wordpieces back into whole tokens with character offsets.

    Returns a list of dicts: {'token': str, 'offsets': {'start': int, 'end': int}}.
    Special tokens are dropped, except [UNK] and [MASK], which are deliberately
    removed from the skip list and therefore kept in the output.
    """
    skip_toks = tokenizer.all_special_tokens
    skip_toks.remove(tokenizer.unk_token)
    skip_toks.remove(tokenizer.mask_token)

    ret = []
    for token, (start, end) in zip(tokenizer.convert_ids_to_tokens(input_ids), offset_mapping.tolist()):
        if token in skip_toks:
            continue
        if token.startswith('##'):
            # Continuation piece: glue onto the previous token and extend its span.
            prev = ret[-1]
            prev['token'] += token[2:]
            prev['offsets']['end'] = end
        else:
            ret.append(dict(token=token, offsets=dict(start=start, end=end)))
    return ret
285
+
286
def ner_parse_logits(input_ids: List[List[int]], sentences: List[str], tokenizer: BertTokenizerFast, logits: torch.Tensor, id2label: Dict[int, str]):
    """Convert token-classification logits to per-word (word, label) pairs.

    A word's label is the argmax prediction at its FIRST wordpiece; trailing
    '##' pieces are dropped entirely. Special tokens are skipped, except
    [UNK] and [MASK], which are deliberately kept. Returns one list of
    (token, label) pairs per sentence.
    """
    pred_ids = torch.argmax(logits, dim=-1).tolist()

    skip_toks = tokenizer.all_special_tokens
    skip_toks.remove(tokenizer.unk_token)
    skip_toks.remove(tokenizer.mask_token)

    batch_ret = []
    for batch_idx in range(len(sentences)):
        sent_tokens = tokenizer.convert_ids_to_tokens(input_ids[batch_idx])
        sent_preds = pred_ids[batch_idx]
        batch_ret.append([
            (token, id2label[pred])
            for token, pred in zip(sent_tokens, sent_preds)
            if token not in skip_toks and not token.startswith('##')
        ])
    return batch_ret
313
+
314
def lex_parse_logits(input_ids: List[List[int]], sentences: List[str], tokenizer: BertTokenizerFast, logits: torch.Tensor):
    """Pick a lexeme for every word from the model's top-3 vocabulary predictions.

    Wordpieces ('##...') are first merged back into their head token; the
    candidate lexemes for a merged word are those predicted at its first piece.
    A candidate is accepted only if it overlaps the word on enough characters
    outside 'אהוי' (at least 2, capped by what the word/candidate can offer);
    otherwise the word falls back to '[BLANK]'.
    Returns one list of (word, lexeme) pairs per sentence.
    """
    top3 = torch.argsort(logits, dim=-1, descending=True)[..., :3].tolist()

    skip_toks = tokenizer.all_special_tokens
    skip_toks.remove(tokenizer.unk_token)
    skip_toks.remove(tokenizer.mask_token)

    batch_ret = []
    for batch_idx in range(len(sentences)):
        # Pass 1: merge wordpieces, keeping the candidates of the first piece.
        merged = []
        sent_toks = tokenizer.convert_ids_to_tokens(input_ids[batch_idx])
        for tok_idx, token in enumerate(sent_toks):
            if token in skip_toks:
                continue
            if token.startswith('##'):
                head, cands = merged[-1]
                merged[-1] = (head + token[2:], cands)
            else:
                merged.append((token, tokenizer.convert_ids_to_tokens(top3[batch_idx][tok_idx])))

        # Pass 2: accept the first candidate with sufficient character overlap.
        sent_ret = []
        for word, candidates in merged:
            anchor_lets = set(c for c in word if c not in 'אהוי')
            chosen = '[BLANK]'
            for cand in candidates:
                needed = min([2, len(anchor_lets), len([c for c in cand if c not in 'אהוי'])])
                if sum(c in anchor_lets for c in cand) >= needed:
                    chosen = cand
                    break
            sent_ret.append((word, chosen))
        batch_ret.append(sent_ret)

    return batch_ret
349
+
350
# Maps each recognized Hebrew prefix segment to its candidate UD POS tags,
# ordered by preference (the first valid one is chosen downstream).
ud_prefixes_to_pos = {
    'ש': ['SCONJ'],
    'מש': ['SCONJ'],
    'כש': ['SCONJ'],
    'לכש': ['SCONJ'],
    'בש': ['SCONJ'],
    'לש': ['SCONJ'],
    'ו': ['CCONJ'],
    'ל': ['ADP'],
    'ה': ['DET', 'SCONJ'],
    'מ': ['ADP', 'SCONJ'],
    'ב': ['ADP'],
    'כ': ['ADP', 'ADV'],
}
# Maps a UD suffix feature bundle to the HTB-style pronominal suffix string.
# NOTE: the original literal repeated the keys 'Gender=Masc|Number=Sing|Person=3'
# and 'Gender=Masc|Number=Plur|Person=3' with identical values; Python silently
# keeps only the last occurrence, so the duplicates are removed here with no
# behavior change.
ud_suffix_to_htb_str = {
    'Gender=Masc|Number=Sing|Person=3': '_הוא',
    'Gender=Masc|Number=Plur|Person=3': '_הם',
    'Gender=Fem|Number=Sing|Person=3': '_היא',
    'Gender=Fem|Number=Plur|Person=3': '_הן',
    'Gender=Fem,Masc|Number=Plur|Person=1': '_אנחנו',
    'Gender=Fem,Masc|Number=Sing|Person=1': '_אני',
    'Gender=Masc|Number=Plur|Person=2': '_אתם',
    'Gender=Masc|Number=Sing|Person=2': '_אתה',
    'Gender=Fem|Number=Sing|Person=2': '_את',
}
377
def convert_output_to_ud(output_sentences, model_cfg, style: Literal['htb', 'iahlt']):
    """Convert the joint model's per-sentence output into CoNLL-U lines.

    Args:
        output_sentences: list of dicts with 'text' and 'tokens'; each token
            carries 'token', 'seg', 'lex', 'morph' and 'syntax' entries.
        model_cfg: model config providing `prefix_cfg` for prefix segmentation.
        style: 'htb' or 'iahlt' - the two UD annotation conventions supported.

    Returns:
        A list (one per sentence) of lists of CoNLL-U formatted strings.
    """
    if style not in ['htb', 'iahlt']:
        raise ValueError('style must be htb/iahlt')

    final_output = []
    for sent_idx, sentence in enumerate(output_sentences):
        # go through each word and insert it in the UD format, stored in a
        # temporary structure for the post process
        intermediate_output = []
        ranges = []
        # mapping between each word index and the actual line it appears in
        idx_to_key = {-1: 0}
        for word_idx, word in enumerate(sentence['tokens']):
            try:
                # handle blank lexemes by falling back to the segmented surface form
                if word['lex'] == '[BLANK]':
                    word['lex'] = word['seg'][-1]
            except KeyError:
                # Fix: previously this dumped the sentence and called exit(0),
                # terminating the whole process with a SUCCESS status on
                # malformed input. Keep the diagnostic dump but re-raise.
                import json
                print(json.dumps(sentence, ensure_ascii=False, indent=2))
                raise

            start = len(intermediate_output)
            # Add in all the prefixes
            if len(word['seg']) > 1:
                for pre in get_prefixes_from_str(word['seg'][0], model_cfg.prefix_cfg, greedy=True):
                    # pos - just take the first valid pos that appears in the predicted prefixes list.
                    pos = next((pos for pos in ud_prefixes_to_pos[pre] if pos in word['morph']['prefixes']), ud_prefixes_to_pos[pre][0])
                    dep, func = ud_get_prefix_dep(pre, word, word_idx)
                    intermediate_output.append(dict(word=pre, lex=pre, pos=pos, dep=dep, func=func, feats='_'))

                    # if there was an implicit heh, add it in dependent on the method
                    if not 'ה' in pre and intermediate_output[-1]['pos'] == 'ADP' and 'DET' in word['morph']['prefixes']:
                        if style == 'htb':
                            intermediate_output.append(dict(word='ה_', lex='ה', pos='DET', dep=word_idx, func='det', feats='_'))
                        elif style == 'iahlt':
                            intermediate_output[-1]['feats'] = 'Definite=Def|PronType=Art'

            idx_to_key[word_idx] = len(intermediate_output) + 1
            # add the main word in!
            intermediate_output.append(dict(
                word=word['seg'][-1], lex=word['lex'], pos=word['morph']['pos'],
                dep=word['syntax']['dep_head_idx'], func=word['syntax']['dep_func'],
                feats='|'.join(f'{k}={v}' for k, v in word['morph']['feats'].items())))

            # if we have suffixes, this changes things
            if word['morph']['suffix']:
                # first determine the dependency info:
                # For adp, num, det - the main word points to here, and the suffix points to the dependency
                entry_to_assign_suf_dep = None
                if word['morph']['pos'] in ['ADP', 'NUM', 'DET']:
                    entry_to_assign_suf_dep = intermediate_output[-1]
                    intermediate_output[-1]['func'] = 'case'
                    dep = word['syntax']['dep_head_idx']
                    func = word['syntax']['dep_func']
                else:
                    # if pos is verb -> obj, num -> dep, default to -> nmod:poss
                    dep = word_idx
                    func = {'VERB': 'obj', 'NUM': 'dep'}.get(word['morph']['pos'], 'nmod:poss')

                s_word, s_lex = word['seg'][-1], word['lex']
                # update the word of the string and extract the string of the suffix!
                if style == 'iahlt':
                    # we need to shorten the main word and extract the suffix
                    # if it is longer than the lexeme - just take off the lexeme.
                    if len(s_word) > len(s_lex):
                        idx = len(s_lex)
                    # Otherwise, try to find the last letter of the lexeme, and failing that just take the last letter
                    else:
                        # take either len-1, or the last occurrence (which can be -1 === len-1)
                        idx = min([len(s_word) - 1, s_word.rfind(s_lex[-1])])
                    # extract the suffix and update the main word
                    suf = s_word[idx:]
                    intermediate_output[-1]['word'] = s_word[:idx]
                elif style == 'htb':
                    # main word becomes the lexeme, the suffix is based on the features
                    intermediate_output[-1]['word'] = (s_lex if s_lex != s_word else s_word[:-1]) + '_'
                    suf_feats = word['morph']['suffix_feats']
                    suf = ud_suffix_to_htb_str.get(f"Gender={suf_feats.get('Gender', 'Fem,Masc')}|Number={suf_feats.get('Number', 'Sing')}|Person={suf_feats.get('Person', '3')}", "_הוא")
                    # for HTB, if the function is poss, then add a shel pointing to the next word
                    if func == 'nmod:poss' and s_lex != 'של':
                        intermediate_output.append(dict(word='_של_', lex='של', pos='ADP', dep=len(intermediate_output) + 2, func='case', feats='_', absolute_dep=True))
                    # if the function is obj, then add a את pointing to the next word
                    elif func == 'obj' and s_lex != 'את':
                        intermediate_output.append(dict(word='_את_', lex='את', pos='ADP', dep=len(intermediate_output) + 2, func='case', feats='_', absolute_dep=True))

                # add the main suffix in
                intermediate_output.append(dict(word=suf, lex='הוא', pos='PRON', dep=dep, func=func, feats='|'.join(f'{k}={v}' for k, v in word['morph']['suffix_feats'].items())))
                if entry_to_assign_suf_dep:
                    entry_to_assign_suf_dep['dep'] = len(intermediate_output)
                    entry_to_assign_suf_dep['absolute_dep'] = True

            end = len(intermediate_output)
            ranges.append((start, end, word['token']))

        # now that we have the intermediate output, combine it to the final output
        cur_output = []
        final_output.append(cur_output)
        # first, add the headers
        cur_output.append(f'# sent_id = {sent_idx + 1}')
        cur_output.append(f'# text = {sentence["text"]}')

        # add in all the actual entries
        for start, end, token in ranges:
            # multi-word token line for words that split into several UD rows
            if end - start > 1:
                cur_output.append(f'{start + 1}-{end}\t{token}\t_\t_\t_\t_\t_\t_\t_\t_')
            for idx, output in enumerate(intermediate_output[start:end], start + 1):
                # compute the actual dependency location
                dep = output['dep'] if output.get('absolute_dep', False) else idx_to_key[output['dep']]
                func = normalize_dep_rel(output['func'], style)
                # and add the full ud string in
                cur_output.append('\t'.join([
                    str(idx),
                    output['word'],
                    output['lex'],
                    output['pos'],
                    output['pos'],
                    output['feats'],
                    str(dep),
                    func,
                    '_', '_'
                ]))
    return final_output
502
+
503
def normalize_dep_rel(dep, style: Literal['htb', 'iahlt']):
    """Collapse HTB-specific dependency-relation subtypes to their plain UD
    relation when emitting the 'iahlt' style; 'htb' keeps relations unchanged."""
    if style != 'iahlt':
        return dep
    collapsed = {
        'compound:smixut': 'compound',
        'nsubj:cop': 'nsubj',
        'mark:q': 'mark',
        'case:gen': 'case',
        'case:acc': 'case',
    }
    return collapsed.get(dep, dep)
510
+
511
+
512
def ud_get_prefix_dep(pre, word, word_idx):
    """Decide the UD head and relation for a prefix split off of `word`.

    Returns (head, func): head is the word's own syntactic head index when the
    prefix "follows the main word", otherwise word_idx itself.
    """
    syntax = word['syntax']
    main_pos = word['morph']['pos']

    if pre.endswith('ש'):
        # shin attaches to the main word's head, except on verbs / backward heads
        attach_to_head = main_pos != 'VERB' and syntax['dep_head_idx'] > word_idx
        rel = 'mark'
    elif pre == 'ו':
        # conjunctive vav attaches to the head unless the word opens its own clause
        # NOTE(review): 'flatccomp' below looks like 'flat'/'ccomp' fused into one
        # label; kept verbatim to preserve behavior - confirm against training labels.
        attach_to_head = syntax['dep_func'] not in ["conj", "acl:recl", "parataxis", "root", "acl", "amod", "list", "appos", "dep", "flatccomp"]
        rel = 'cc'
    else:
        # for adj, noun, propn, pron, verb - prefixes stay on the main word
        if main_pos in ["ADJ", "NOUN", "PROPN", "PRON", "VERB"]:
            attach_to_head = False
        # otherwise - the prefix follows the word when its function is in the list
        else:
            attach_to_head = syntax['dep_func'] in ["compound:affix", "det", "aux", "nummod", "advmod", "dep", "cop", "mark", "fixed"]

        rel = 'case'
        if pre == 'ה':
            rel = 'det' if 'DET' in word['morph']['prefixes'] else 'mark'

    head = syntax['dep_head_idx'] if attach_to_head else word_idx
    return head, rel
dictabert-joint/BertForMorphTagging.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+ from operator import itemgetter
3
+ from transformers.utils import ModelOutput
4
+ import torch
5
+ from torch import nn
6
+ from typing import Dict, List, Tuple, Optional
7
+ from dataclasses import dataclass
8
+ from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast
9
+
10
# UD part-of-speech inventory predicted for the main word of each token
ALL_POS = ['DET', 'NOUN', 'VERB', 'CCONJ', 'ADP', 'PRON', 'PUNCT', 'ADJ', 'ADV', 'SCONJ', 'NUM', 'PROPN', 'AUX', 'X', 'INTJ', 'SYM']
# POS classes a prefix may take (multi-label: a word can carry several prefixes)
ALL_PREFIX_POS = ['SCONJ', 'DET', 'ADV', 'CCONJ', 'ADP', 'NUM']
# suffix classes; 'none' marks a word without a pronominal suffix
ALL_SUFFIX_POS = ['none', 'ADP_PRON', 'PRON']
# morphological features and their value inventories ('none' = feature absent;
# parse_logits drops 'none' entries from the returned feats dict)
ALL_FEATURES = [
    ('Gender', ['none', 'Masc', 'Fem', 'Fem,Masc']),
    ('Number', ['none', 'Sing', 'Plur', 'Plur,Sing', 'Dual', 'Dual,Plur']),
    ('Person', ['none', '1', '2', '3', '1,2,3']),
    ('Tense', ['none', 'Past', 'Fut', 'Pres', 'Imp'])
]
19
+
20
@dataclass
class MorphLogitsOutput(ModelOutput):
    """Raw logits from the morph-tagging head, one field per sub-task."""
    prefix_logits: torch.FloatTensor = None
    pos_logits: torch.FloatTensor = None
    features_logits: List[torch.FloatTensor] = None
    suffix_logits: torch.FloatTensor = None
    suffix_features_logits: List[torch.FloatTensor] = None

    def detach(self):
        """Return a copy with every tensor detached from the autograd graph.

        Fix: the feature-list tensors previously called `.deatch()` (a typo),
        which raised AttributeError whenever detach() was invoked.
        """
        return MorphLogitsOutput(
            self.prefix_logits.detach(),
            self.pos_logits.detach(),
            [logits.detach() for logits in self.features_logits],
            self.suffix_logits.detach(),
            [logits.detach() for logits in self.suffix_features_logits],
        )
30
+
31
+
32
@dataclass
class MorphTaggingOutput(ModelOutput):
    """HF-style output container for BertForMorphTagging: optional training
    loss, the per-sub-task logits, and the usual BERT extras."""
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[MorphLogitsOutput] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
38
+
39
@dataclass
class MorphLabels(ModelOutput):
    """Gold labels for every morph sub-task, mirroring MorphLogitsOutput."""
    prefix_labels: Optional[torch.FloatTensor] = None
    pos_labels: Optional[torch.FloatTensor] = None
    features_labels: Optional[List[torch.FloatTensor]] = None
    suffix_labels: Optional[torch.FloatTensor] = None
    suffix_features_labels: Optional[List[torch.FloatTensor]] = None

    def detach(self):
        # detach every tensor, including each tensor inside the feature lists
        return MorphLabels(self.prefix_labels.detach(), self.pos_labels.detach(), [labels.detach() for labels in self.features_labels], self.suffix_labels.detach(), [labels.detach() for labels in self.suffix_features_labels])

    def to(self, device):
        # move every tensor to `device`, returning a new container
        return MorphLabels(self.prefix_labels.to(device), self.pos_labels.to(device), [feat.to(device) for feat in self.features_labels], self.suffix_labels.to(device), [feat.to(device) for feat in self.suffix_features_labels])
52
+
53
class BertMorphTaggingHead(nn.Module):
    """Multi-task classification head for Hebrew morphological tagging.

    For every token position it predicts: the set of prefix POS classes
    (multi-label), the main POS, four morphological features
    (Gender/Number/Person/Tense), a suffix class, and the same four features
    for the suffix.
    """
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.num_prefix_classes = len(ALL_PREFIX_POS)
        self.num_pos_classes = len(ALL_POS)
        self.num_suffix_classes = len(ALL_SUFFIX_POS)
        self.num_features_classes = list(map(len, map(itemgetter(1), ALL_FEATURES)))
        # we need a classifier for prefix cls and POS cls
        # the prefix will use BCEWithLogits for multiple labels cls
        self.prefix_cls = nn.Linear(config.hidden_size, self.num_prefix_classes)
        # and pos + feats will use good old cross entropy for single label
        self.pos_cls = nn.Linear(config.hidden_size, self.num_pos_classes)
        self.features_cls = nn.ModuleList([nn.Linear(config.hidden_size, len(features)) for _, features in ALL_FEATURES])
        # and suffix + feats will also be cross entropy
        self.suffix_cls = nn.Linear(config.hidden_size, self.num_suffix_classes)
        self.suffix_features_cls = nn.ModuleList([nn.Linear(config.hidden_size, len(features)) for _, features in ALL_FEATURES])

    def forward(
            self,
            hidden_states: torch.Tensor,
            labels: Optional[MorphLabels] = None):
        """Run all sub-task classifiers; returns (loss, MorphLogitsOutput).
        loss is None when no labels are provided; otherwise it is the sum of
        the per-sub-task losses."""
        # run each of the classifiers on the transformed output
        prefix_logits = self.prefix_cls(hidden_states)
        pos_logits = self.pos_cls(hidden_states)
        suffix_logits = self.suffix_cls(hidden_states)
        features_logits = [cls(hidden_states) for cls in self.features_cls]
        suffix_features_logits = [cls(hidden_states) for cls in self.suffix_features_cls]

        loss = None
        if labels is not None:
            # step 1: prefix labels loss
            # positions labeled -100 are masked out by giving them zero BCE weight
            loss_fct = nn.BCEWithLogitsLoss(weight=(labels.prefix_labels != -100).float())
            loss = loss_fct(prefix_logits, labels.prefix_labels)
            # step 2: pos labels loss (CrossEntropyLoss ignores -100 targets by default)
            loss_fct = nn.CrossEntropyLoss()
            loss += loss_fct(pos_logits.view(-1, self.num_pos_classes), labels.pos_labels.view(-1))
            # step 2b: features
            for feat_logits,feat_labels,num_features in zip(features_logits, labels.features_labels, self.num_features_classes):
                loss += loss_fct(feat_logits.view(-1, num_features), feat_labels.view(-1))
            # step 3: suffix logits loss
            loss += loss_fct(suffix_logits.view(-1, self.num_suffix_classes), labels.suffix_labels.view(-1))
            # step 3b: suffix features
            for feat_logits,feat_labels,num_features in zip(suffix_features_logits, labels.suffix_features_labels, self.num_features_classes):
                loss += loss_fct(feat_logits.view(-1, num_features), feat_labels.view(-1))

        return loss, MorphLogitsOutput(prefix_logits, pos_logits, features_logits, suffix_logits, suffix_features_logits)
101
+
102
class BertForMorphTagging(BertPreTrainedModel):
    """BERT encoder with a BertMorphTaggingHead on top, for Hebrew
    morphological tagging."""

    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.morph = BertMorphTaggingHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
            self,
            input_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            token_type_ids: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            labels: Optional[MorphLabels] = None,
            head_mask: Optional[torch.Tensor] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
        ):
        """Standard HF forward; when `labels` is given the summed multi-task
        loss is computed by the morph head. Returns MorphTaggingOutput (or a
        tuple when return_dict is False)."""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        bert_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = bert_outputs[0]
        hidden_states = self.dropout(hidden_states)

        loss, logits = self.morph(hidden_states, labels)

        if not return_dict:
            return (loss,logits) + bert_outputs[2:]

        return MorphTaggingOutput(
            loss=loss,
            logits=logits,
            hidden_states=bert_outputs.hidden_states,
            attentions=bert_outputs.attentions,
        )

    def predict(self, sentences: List[str], tokenizer: BertTokenizerFast, padding='longest'):
        """Tokenize `sentences`, run the model on this model's device, and
        decode the logits with the module-level parse_logits."""
        # tokenize the inputs and convert them to relevant device
        inputs = tokenizer(sentences, padding=padding, truncation=True, return_tensors='pt')
        inputs = {k:v.to(self.device) for k,v in inputs.items()}
        # calculate the logits
        logits = self.forward(**inputs, return_dict=True).logits
        return parse_logits(inputs['input_ids'].tolist(), sentences, tokenizer, logits)
163
+
164
def parse_logits(input_ids: List[List[int]], sentences: List[str], tokenizer: BertTokenizerFast, logits: MorphLogitsOutput):
    """Decode morph-tagging logits into per-sentence dicts.

    Returns one dict per sentence: { text, tokens }, where each token entry is
    { token, pos, feats, prefixes, suffix, [suffix_feats] }.
    """
    prefix_logits, pos_logits, feats_logits, suffix_logits, suffix_feats_logits = \
        logits.prefix_logits, logits.pos_logits, logits.features_logits, logits.suffix_logits, logits.suffix_features_logits

    # NOTE(review): these are raw logits from a BCEWithLogitsLoss-trained head,
    # so `> 0.5` corresponds to a probability of ~0.62, not 0.5 (which would be
    # `> 0`) - confirm this cutoff is intentional before changing it.
    prefix_predictions = (prefix_logits > 0.5).int().tolist() # Threshold at 0.5 for multi-label classification
    pos_predictions = pos_logits.argmax(axis=-1).tolist()
    suffix_predictions = suffix_logits.argmax(axis=-1).tolist()
    feats_predictions = [logits.argmax(axis=-1).tolist() for logits in feats_logits]
    suffix_feats_predictions = [logits.argmax(axis=-1).tolist() for logits in suffix_feats_logits]

    # create the return dictionary
    # for each sentence, return a dict object with the following files { text, tokens }
    # Where tokens is a list of dicts, where each dict is:
    # { pos: str, feats: dict, prefixes: List[str], suffix: str | bool, suffix_feats: dict | None}
    # skip special tokens, except [UNK]/[MASK] which may stand in for real words
    special_toks = tokenizer.all_special_tokens
    special_toks.remove(tokenizer.unk_token)
    special_toks.remove(tokenizer.mask_token)

    ret = []
    for sent_idx,sentence in enumerate(sentences):
        input_id_strs = tokenizer.convert_ids_to_tokens(input_ids[sent_idx])
        # iterate through each token in the sentence, ignoring special tokens
        tokens = []
        for token_idx,token_str in enumerate(input_id_strs):
            if token_str in special_toks: continue
            # continuation wordpieces extend the previous word's surface form;
            # predictions are taken from the word's first piece only
            if token_str.startswith('##'):
                tokens[-1]['token'] += token_str[2:]
                continue
            tokens.append(dict(
                token=token_str,
                pos=ALL_POS[pos_predictions[sent_idx][token_idx]],
                feats=get_features_dict_from_predictions(feats_predictions, (sent_idx, token_idx)),
                prefixes=[ALL_PREFIX_POS[idx] for idx,i in enumerate(prefix_predictions[sent_idx][token_idx]) if i > 0],
                suffix=get_suffix_or_false(ALL_SUFFIX_POS[suffix_predictions[sent_idx][token_idx]]),
            ))
            # only attach suffix features when a suffix was actually predicted
            if tokens[-1]['suffix']:
                tokens[-1]['suffix_feats'] = get_features_dict_from_predictions(suffix_feats_predictions, (sent_idx, token_idx))
        ret.append(dict(text=sentence, tokens=tokens))
    return ret
203
+
204
def get_suffix_or_false(suffix):
    """Map the sentinel class 'none' to False; pass any real suffix class through."""
    if suffix == 'none':
        return False
    return suffix
206
+
207
def get_features_dict_from_predictions(predictions, idx):
    """Assemble a {feature-name: value} dict for one token, dropping 'none'.

    `predictions` is a list parallel to ALL_FEATURES, each element indexed by
    [sentence][token]; `idx` is the (sentence, token) pair to read.
    """
    sent_i, tok_i = idx
    chosen = (
        (feat_name, feat_values[predictions[feat_pos][sent_i][tok_i]])
        for feat_pos, (feat_name, feat_values) in enumerate(ALL_FEATURES)
    )
    return {name: value for name, value in chosen if value != 'none'}
214
+
215
+
dictabert-joint/BertForPrefixMarking.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.utils import ModelOutput
2
+ import torch
3
+ from torch import nn
4
+ from typing import Dict, List, Tuple, Optional
5
+ from dataclasses import dataclass
6
+ from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast
7
+
8
# define the classes, and the possible prefixes for each class
POSSIBLE_PREFIX_CLASSES = [ ['לכש', 'כש', 'מש', 'בש', 'לש'], ['מ'], ['ש'], ['ה'], ['ו'], ['כ'], ['ל'], ['ב'] ]
# alternative inventory with additional prefix forms for rabbinic-era texts;
# not referenced by DEFAULT_PREFIX_CONFIG below
POSSIBLE_RABBINIC_PREFIX_CLASSES = [ ['לכש', 'כש', 'מש', 'בש', 'לש', 'לד', 'בד', 'מד', 'כד', 'לכד'], ['מ'], ['ש', 'ד'], ['ה'], ['ו'], ['כ'], ['ל'], ['ב'], ['א'], ['ק'] ]
11
+
12
class PrefixConfig(dict):
    """Inventory of prefix classes plus derived lookup tables.

    Subclasses dict so the raw class list round-trips through dict-based
    config serialization; the derived tables live as plain attributes.
    """

    def __init__(self, possible_classes, **kwargs):  # kwargs tolerated: older configs stored every field as a dict value
        super().__init__()
        self.possible_classes = possible_classes
        self.total_classes = len(possible_classes)
        # map every prefix string to the index of the class it belongs to
        mapping = {}
        for class_idx, prefixes in enumerate(possible_classes):
            for prefix in prefixes:
                mapping[prefix] = class_idx
        self.prefix_c2i = mapping
        # longest-first, so greedy matching prefers multi-letter prefixes
        self.all_prefix_items = sorted(mapping, key=len, reverse=True)

    @property
    def possible_classes(self) -> List[List[str]]:
        return self.get('possible_classes')

    @possible_classes.setter
    def possible_classes(self, value: List[List[str]]):
        self['possible_classes'] = value
27
+
28
# module-level default, used by BertPrefixMarkingHead when a model config carries no prefix inventory
DEFAULT_PREFIX_CONFIG = PrefixConfig(POSSIBLE_PREFIX_CLASSES)
29
+
30
def get_prefixes_from_str(s, cfg: PrefixConfig, greedy=False):
    """Yield the prefixes that can be peeled, left to right, off of `s`.

    At each step the longest matching prefix is taken. When `greedy` is False
    and that match is multi-letter, its single first letter is also yielded as
    an alternative segmentation - but scanning still advances by the full
    match, since if the next letters themselves form a prefix they must be
    part of the longest one.
    """
    remainder = s
    while remainder and remainder[0] in cfg.prefix_c2i:
        # all_prefix_items is sorted longest-first, so the first hit is the longest
        match = next((p for p in cfg.all_prefix_items if remainder.startswith(p)), None)
        if match is None:
            return
        yield match
        if not greedy and len(match) > 1:
            yield match[0]
        remainder = remainder[len(match):]
45
+
46
def get_prefix_classes_from_str(s, cfg: PrefixConfig, greedy=False):
    """Same as get_prefixes_from_str, but yields class indices instead of strings."""
    yield from (cfg.prefix_c2i[prefix] for prefix in get_prefixes_from_str(s, cfg, greedy))
49
+
50
@dataclass
class PrefixesClassifiersOutput(ModelOutput):
    """HF-style output container for BertForPrefixMarking; `logits` has shape
    batch x seq_len x total_classes x 2 (yes/no per prefix class)."""
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
56
+
57
class BertPrefixMarkingHead(nn.Module):
    """Head that, for each token, predicts yes/no for every prefix class,
    conditioned on which classes are orthographically possible for that token."""
    def __init__(self, config) -> None:
        super().__init__()
        self.config = config

        # fall back to the module default when the checkpoint config has no inventory,
        # and re-wrap plain dicts (deserialized configs) back into PrefixConfig
        if not hasattr(config, 'prefix_cfg') or config.prefix_cfg is None:
            setattr(config, 'prefix_cfg', DEFAULT_PREFIX_CONFIG)
        if isinstance(config.prefix_cfg, dict):
            config.prefix_cfg = PrefixConfig(config.prefix_cfg['possible_classes'])

        # an embedding table containing an embedding for each prefix class + 1 for NONE
        # we will concatenate either the embedding/NONE for each class - and we want the concatenated
        # size to be the hidden_size
        prefix_class_embed = config.hidden_size // config.prefix_cfg.total_classes
        self.prefix_class_embeddings = nn.Embedding(config.prefix_cfg.total_classes + 1, prefix_class_embed)

        # one layer for transformation, apply an activation, then another N classifiers for each prefix class
        self.transform = nn.Linear(config.hidden_size + prefix_class_embed * config.prefix_cfg.total_classes, config.hidden_size)
        self.activation = nn.Tanh()
        self.classifiers = nn.ModuleList([nn.Linear(config.hidden_size, 2) for _ in range(config.prefix_cfg.total_classes)])

    def forward(
            self,
            hidden_states: torch.Tensor,
            prefix_class_id_options: torch.Tensor,
            labels: Optional[torch.Tensor] = None) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
        """Returns (loss, logits); loss is None without labels, and logits has
        shape batch x seq_len x total_classes x 2."""
        # encode the prefix_class_id_options
        # If input_ids is batch x seq_len
        # Then sequence_output is batch x seq_len x hidden_dim
        # So prefix_class_id_options is batch x seq_len x total_classes
        # Looking up the embeddings should give us batch x seq_len x total_classes x hidden_dim / N
        possible_class_embed = self.prefix_class_embeddings(prefix_class_id_options)
        # then flatten the final dimension - now we have batch x seq_len x hidden_dim_2
        possible_class_embed = possible_class_embed.reshape(possible_class_embed.shape[:-2] + (-1,))

        # concatenate the new class embed into the sequence output before the transform
        pre_transform_output = torch.cat((hidden_states, possible_class_embed), dim=-1) # batch x seq_len x (hidden_dim + hidden_dim_2)
        pre_logits_output = self.activation(self.transform(pre_transform_output))# batch x seq_len x hidden_dim

        # run each of the classifiers on the transformed output
        logits = torch.cat([cls(pre_logits_output).unsqueeze(-2) for cls in self.classifiers], dim=-2)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, 2), labels.view(-1))

        return (loss, logits)
+ return (loss, logits)
106
+
107
+
108
+
109
class BertForPrefixMarking(BertPreTrainedModel):
    """BERT encoder with a BertPrefixMarkingHead on top: predicts, per token,
    which Hebrew prefixes should be split off of it."""

    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.prefix = BertPrefixMarkingHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
            self,
            input_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            token_type_ids: Optional[torch.Tensor] = None,
            prefix_class_id_options: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            labels: Optional[torch.Tensor] = None,
            head_mask: Optional[torch.Tensor] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
        ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        bert_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = bert_outputs[0]
        hidden_states = self.dropout(hidden_states)

        loss, logits = self.prefix.forward(hidden_states, prefix_class_id_options, labels)
        if not return_dict:
            return (loss,logits,) + bert_outputs[2:]

        return PrefixesClassifiersOutput(
            loss=loss,
            logits=logits,
            hidden_states=bert_outputs.hidden_states,
            attentions=bert_outputs.attentions,
        )

    def predict(self, sentences: List[str], tokenizer: BertTokenizerFast, padding='longest'):
        """Encode `sentences`, run the model, and return per-token prefix
        splits (see the module-level parse_logits)."""
        # step 1: encode the sentences through using the tokenizer, and get the input tensors + prefix id tensors
        inputs = encode_sentences_for_bert_for_prefix_marking(tokenizer, self.config.prefix_cfg, sentences, padding)
        # offset_mapping is only needed by other callers of the encoder; drop it before forward()
        inputs.pop('offset_mapping')
        inputs = {k:v.to(self.device) for k,v in inputs.items()}

        # run through bert
        logits = self.forward(**inputs, return_dict=True).logits
        return parse_logits(inputs['input_ids'].tolist(), sentences, tokenizer, logits, self.config.prefix_cfg)
+ return parse_logits(inputs['input_ids'].tolist(), sentences, tokenizer, logits, self.config.prefix_cfg)
176
+
177
def parse_logits(input_ids: List[List[int]], sentences: List[str], tokenizer: BertTokenizerFast, logits: torch.FloatTensor, config: PrefixConfig):
    """Decode prefix-marking logits into, per sentence, one entry per word:
    [token] when no prefix is predicted, or [prefix_str, remainder] when one is.

    NOTE(review): only the pad token is skipped here; other special tokens
    ([CLS]/[SEP]) fall through and appear in the output as plain tokens -
    presumably the caller aligns or strips them; confirm.
    """
    # extract the predictions by argmaxing the final dimension (batch x sequence x prefixes x prediction)
    logit_preds = torch.argmax(logits, axis=3).tolist()

    ret = []

    for sent_idx,sent_ids in enumerate(input_ids):
        tokens = tokenizer.convert_ids_to_tokens(sent_ids)
        ret.append([])
        for tok_idx,token in enumerate(tokens):
            # padding carries no prediction - skip it (`continue` rather than
            # `break`, so non-trailing pads are tolerated too)
            if token == tokenizer.pad_token: continue
            if token.startswith('##'): continue

            # combine the next tokens in? only if it's a breakup
            next_tok_idx = tok_idx + 1
            while next_tok_idx < len(tokens) and tokens[next_tok_idx].startswith('##'):
                token += tokens[next_tok_idx][2:]
                next_tok_idx += 1

            # predictions are read from the word's first wordpiece position
            prefix_len = get_predicted_prefix_len_from_logits(token, logit_preds[sent_idx][tok_idx], config)

            if not prefix_len:
                ret[-1].append([token])
            else:
                ret[-1].append([token[:prefix_len], token[prefix_len:]])
    return ret
+ return ret
204
+
205
def encode_sentences_for_bert_for_prefix_marking(tokenizer: BertTokenizerFast, config: PrefixConfig, sentences: List[str], padding='longest', truncation=True):
    """Tokenize `sentences` and attach a `prefix_class_id_options` tensor
    marking, per token, which prefix classes are orthographically possible.

    Cells default to `config.total_classes` - the extra "NONE" row of the
    head's embedding table; a possible class is set to its own index so the
    embedding lookup selects that class's vector.
    """
    inputs = tokenizer(sentences, padding=padding, truncation=truncation, return_offsets_mapping=True, return_tensors='pt')
    # create our prefix_id_options array which will be like the input ids shape but with an addtional
    # dimension containing for each prefix whether it can be for that word
    prefix_id_options = torch.full(inputs['input_ids'].shape + (config.total_classes,), config.total_classes, dtype=torch.long)

    # go through each token, and fill in the vector accordingly
    for sent_idx, sent_ids in enumerate(inputs['input_ids']):
        tokens = tokenizer.convert_ids_to_tokens(sent_ids)
        for tok_idx, token in enumerate(tokens):
            # if the first letter isn't a valid prefix letter, nothing to talk about
            if len(token) < 2 or not token[0] in config.prefix_c2i: continue

            # combine the next tokens in? only if it's a breakup
            next_tok_idx = tok_idx + 1
            while next_tok_idx < len(tokens) and tokens[next_tok_idx].startswith('##'):
                token += tokens[next_tok_idx][2:]
                next_tok_idx += 1

            # find all the possible prefixes - and mark each possible class with its own id for embed lookup
            for pre_class in get_prefix_classes_from_str(token, config):
                prefix_id_options[sent_idx, tok_idx, pre_class] = pre_class

    inputs['prefix_class_id_options'] = prefix_id_options
    return inputs
+ return inputs
230
+
231
def get_predicted_prefix_len_from_logits(token, token_logits, config: PrefixConfig):
    """Given one word and its per-prefix-class yes/no predictions, return how
    many leading characters of `token` are predicted to be prefixes (0 = none).
    """
    # Go through each possible prefix, and check if the prefix is yes - and if
    # so increase the counter of the matched length, otherwise break out. That will solve cases
    # of predicting prefix combinations that don't exist on the word.
    # For example, if we have the word ושכשהלכתי and the model predict ו & כש, then we will only
    # take the vuv because in order to get the כש we need the ש as well.
    # Two extra items:
    # 1] Don't allow the same prefix multiple times
    # 2] Always check that the word starts with that prefix - otherwise it's bad
    #     (except for the case of multi-letter prefix, where we force the next to be last)
    cur_len, skip_next, last_check, seen_prefixes = 0, False, False, set()
    for prefix in get_prefixes_from_str(token, config):
        # Are we skipping this prefix? This will be the case where we matched כש, don't allow ש
        if skip_next:
            skip_next = False
            continue
        # check for duplicate prefixes, we don't allow two of the same prefix
        # if it predicted two of the same, then we will break out
        if prefix in seen_prefixes: break
        seen_prefixes.add(prefix)

        # check if we predicted this prefix
        if token_logits[config.prefix_c2i[prefix]]:
            cur_len += len(prefix)
            if last_check: break
            skip_next = len(prefix) > 1
        # Otherwise, we predicted no. If we didn't, then this is the end of the prefix
        # and time to break out. *Except* if it's a multi letter prefix, then we allow
        # just the next letter - e.g., if כש doesn't match, then we allow כ, but then we know
        # the word continues with a ש, and if it's not כש, then it's not כ-ש- (invalid)
        elif len(prefix) > 1:
            last_check = True
        else:
            break

    return cur_len
dictabert-joint/BertForSyntaxParsing.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from transformers.utils import ModelOutput
3
+ import torch
4
+ from torch import nn
5
+ from typing import Dict, List, Tuple, Optional, Union
6
+ from dataclasses import dataclass
7
+ from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast
8
+
9
+ ALL_FUNCTION_LABELS = ["nsubj", "nsubj:cop", "punct", "mark", "mark:q", "case", "case:gen", "case:acc", "fixed", "obl", "det", "amod", "acl:relcl", "nmod", "cc", "conj", "root", "compound:smixut", "cop", "compound:affix", "advmod", "nummod", "appos", "nsubj:pass", "nmod:poss", "xcomp", "obj", "aux", "parataxis", "advcl", "ccomp", "csubj", "acl", "obl:tmod", "csubj:pass", "dep", "dislocated", "nmod:tmod", "nmod:npmod", "flat", "obl:npmod", "goeswith", "reparandum", "orphan", "list", "discourse", "iobj", "vocative", "expl", "flat:name"]
10
+
11
+ @dataclass
12
+ class SyntaxLogitsOutput(ModelOutput):
13
+ dependency_logits: torch.FloatTensor = None
14
+ function_logits: torch.FloatTensor = None
15
+ dependency_head_indices: torch.LongTensor = None
16
+
17
+ def detach(self):
18
+ return SyntaxTaggingOutput(self.dependency_logits.detach(), self.function_logits.detach(), self.dependency_head_indices.detach())
19
+
20
+ @dataclass
21
+ class SyntaxTaggingOutput(ModelOutput):
22
+ loss: Optional[torch.FloatTensor] = None
23
+ logits: Optional[SyntaxLogitsOutput] = None
24
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
25
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
26
+
27
+ @dataclass
28
+ class SyntaxLabels(ModelOutput):
29
+ dependency_labels: Optional[torch.LongTensor] = None
30
+ function_labels: Optional[torch.LongTensor] = None
31
+
32
+ def detach(self):
33
+ return SyntaxLabels(self.dependency_labels.detach(), self.function_labels.detach())
34
+
35
+ def to(self, device):
36
+ return SyntaxLabels(self.dependency_labels.to(device), self.function_labels.to(device))
37
+
38
+ class BertSyntaxParsingHead(nn.Module):
39
+ def __init__(self, config):
40
+ super().__init__()
41
+ self.config = config
42
+
43
+ # the attention query & key values
44
+ self.head_size = config.syntax_head_size# int(config.hidden_size / config.num_attention_heads * 2)
45
+ self.query = nn.Linear(config.hidden_size, self.head_size)
46
+ self.key = nn.Linear(config.hidden_size, self.head_size)
47
+ # the function classifier gets two encoding values and predicts the labels
48
+ self.num_function_classes = len(ALL_FUNCTION_LABELS)
49
+ self.cls = nn.Linear(config.hidden_size * 2, self.num_function_classes)
50
+
51
+ def forward(
52
+ self,
53
+ hidden_states: torch.Tensor,
54
+ extended_attention_mask: Optional[torch.Tensor],
55
+ labels: Optional[SyntaxLabels] = None,
56
+ compute_mst: bool = False) -> Tuple[torch.Tensor, SyntaxLogitsOutput]:
57
+
58
+ # Take the dot product between "query" and "key" to get the raw attention scores.
59
+ query_layer = self.query(hidden_states)
60
+ key_layer = self.key(hidden_states)
61
+ attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)) / math.sqrt(self.head_size)
62
+
63
+ # add in the attention mask
64
+ if extended_attention_mask is not None:
65
+ if extended_attention_mask.ndim == 4:
66
+ extended_attention_mask = extended_attention_mask.squeeze(1)
67
+ attention_scores += extended_attention_mask# batch x seq x seq
68
+
69
+ # At this point take the hidden_state of the word and of the dependency word, and predict the function
70
+ # If labels are provided, use the labels.
71
+ if self.training and labels is not None:
72
+ # Note that the labels can have -100, so just set those to zero with a max
73
+ dep_indices = labels.dependency_labels.clamp_min(0)
74
+ # Otherwise - check if he wants the MST or just the argmax
75
+ elif compute_mst:
76
+ dep_indices = compute_mst_tree(attention_scores, extended_attention_mask)
77
+ else:
78
+ dep_indices = torch.argmax(attention_scores, dim=-1)
79
+
80
+ # After we retrieved the dependency indicies, create a tensor of teh batch indices, and and retrieve the vectors of the heads to calculate the function
81
+ batch_indices = torch.arange(dep_indices.size(0)).view(-1, 1).expand(-1, dep_indices.size(1)).to(dep_indices.device)
82
+ dep_vectors = hidden_states[batch_indices, dep_indices, :] # batch x seq x dim
83
+
84
+ # concatenate that with the last hidden states, and send to the classifier output
85
+ cls_inputs = torch.cat((hidden_states, dep_vectors), dim=-1)
86
+ function_logits = self.cls(cls_inputs)
87
+
88
+ loss = None
89
+ if labels is not None:
90
+ loss_fct = nn.CrossEntropyLoss()
91
+ # step 1: dependency scores loss - this is applied to the attention scores
92
+ loss = loss_fct(attention_scores.view(-1, hidden_states.size(-2)), labels.dependency_labels.view(-1))
93
+ # step 2: function loss
94
+ loss += loss_fct(function_logits.view(-1, self.num_function_classes), labels.function_labels.view(-1))
95
+
96
+ return (loss, SyntaxLogitsOutput(attention_scores, function_logits, dep_indices))
97
+
98
+
99
+ class BertForSyntaxParsing(BertPreTrainedModel):
100
+
101
+ def __init__(self, config):
102
+ super().__init__(config)
103
+
104
+ self.bert = BertModel(config, add_pooling_layer=False)
105
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
106
+ self.syntax = BertSyntaxParsingHead(config)
107
+
108
+ # Initialize weights and apply final processing
109
+ self.post_init()
110
+
111
+ def forward(
112
+ self,
113
+ input_ids: Optional[torch.Tensor] = None,
114
+ attention_mask: Optional[torch.Tensor] = None,
115
+ token_type_ids: Optional[torch.Tensor] = None,
116
+ position_ids: Optional[torch.Tensor] = None,
117
+ labels: Optional[SyntaxLabels] = None,
118
+ head_mask: Optional[torch.Tensor] = None,
119
+ inputs_embeds: Optional[torch.Tensor] = None,
120
+ output_attentions: Optional[bool] = None,
121
+ output_hidden_states: Optional[bool] = None,
122
+ return_dict: Optional[bool] = None,
123
+ compute_syntax_mst: Optional[bool] = None,
124
+ ):
125
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
126
+
127
+ bert_outputs = self.bert(
128
+ input_ids,
129
+ attention_mask=attention_mask,
130
+ token_type_ids=token_type_ids,
131
+ position_ids=position_ids,
132
+ head_mask=head_mask,
133
+ inputs_embeds=inputs_embeds,
134
+ output_attentions=output_attentions,
135
+ output_hidden_states=output_hidden_states,
136
+ return_dict=return_dict,
137
+ )
138
+
139
+ extended_attention_mask = None
140
+ if attention_mask is not None:
141
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_ids.size())
142
+ # apply the syntax head
143
+ loss, logits = self.syntax(self.dropout(bert_outputs[0]), extended_attention_mask, labels, compute_syntax_mst)
144
+
145
+ if not return_dict:
146
+ return (loss,(logits.dependency_logits, logits.function_logits)) + bert_outputs[2:]
147
+
148
+ return SyntaxTaggingOutput(
149
+ loss=loss,
150
+ logits=logits,
151
+ hidden_states=bert_outputs.hidden_states,
152
+ attentions=bert_outputs.attentions,
153
+ )
154
+
155
+ def predict(self, sentences: Union[str, List[str]], tokenizer: BertTokenizerFast, compute_mst=True):
156
+ if isinstance(sentences, str):
157
+ sentences = [sentences]
158
+
159
+ # predict the logits for the sentence
160
+ inputs = tokenizer(sentences, padding='longest', truncation=True, return_tensors='pt')
161
+ inputs = {k:v.to(self.device) for k,v in inputs.items()}
162
+ logits = self.forward(**inputs, return_dict=True, compute_syntax_mst=compute_mst).logits
163
+ return parse_logits(inputs['input_ids'].tolist(), sentences, tokenizer, logits)
164
+
165
+ def parse_logits(input_ids: List[List[int]], sentences: List[str], tokenizer: BertTokenizerFast, logits: SyntaxLogitsOutput):
166
+ outputs = []
167
+
168
+ special_toks = tokenizer.all_special_tokens
169
+ special_toks.remove(tokenizer.unk_token)
170
+ special_toks.remove(tokenizer.mask_token)
171
+
172
+ for i in range(len(sentences)):
173
+ deps = logits.dependency_head_indices[i].tolist()
174
+ funcs = logits.function_logits.argmax(-1)[i].tolist()
175
+ toks = [tok for tok in tokenizer.convert_ids_to_tokens(input_ids[i]) if tok not in special_toks]
176
+
177
+ # first, go through the tokens and create a mapping between each dependency index and the index without wordpieces
178
+ # wordpieces. At the same time, append the wordpieces in
179
+ idx_mapping = {-1:-1} # default root
180
+ real_idx = -1
181
+ for i in range(len(toks)):
182
+ if not toks[i].startswith('##'):
183
+ real_idx += 1
184
+ idx_mapping[i] = real_idx
185
+
186
+ # build our tree, keeping tracking of the root idx
187
+ tree = []
188
+ root_idx = 0
189
+ for i in range(len(toks)):
190
+ if toks[i].startswith('##'):
191
+ tree[-1]['word'] += toks[i][2:]
192
+ continue
193
+
194
+ dep_idx = deps[i + 1] - 1 # increase 1 for cls, decrease 1 for cls
195
+ if dep_idx == len(toks): dep_idx = i - 1 # if he predicts sep, then just point to the previous word
196
+
197
+ dep_head = 'root' if dep_idx == -1 else toks[dep_idx]
198
+ dep_func = ALL_FUNCTION_LABELS[funcs[i + 1]]
199
+
200
+ if dep_head == 'root': root_idx = len(tree)
201
+ tree.append(dict(word=toks[i], dep_head_idx=idx_mapping[dep_idx], dep_func=dep_func))
202
+ # append the head word
203
+ for d in tree:
204
+ d['dep_head'] = tree[d['dep_head_idx']]['word']
205
+
206
+ outputs.append(dict(tree=tree, root_idx=root_idx))
207
+ return outputs
208
+
209
+
210
+ def compute_mst_tree(attention_scores: torch.Tensor, extended_attention_mask: torch.LongTensor):
211
+ # attention scores should be 3 dimensions - batch x seq x seq (if it is 2 - just unsqueeze)
212
+ if attention_scores.ndim == 2: attention_scores = attention_scores.unsqueeze(0)
213
+ if attention_scores.ndim != 3 or attention_scores.shape[1] != attention_scores.shape[2]:
214
+ raise ValueError(f'Expected attention scores to be of shape batch x seq x seq, instead got {attention_scores.shape}')
215
+
216
+ batch_size, seq_len, _ = attention_scores.shape
217
+ # start by softmaxing so the scores are comparable
218
+ attention_scores = attention_scores.softmax(dim=-1)
219
+
220
+ batch_indices = torch.arange(batch_size, device=attention_scores.device)
221
+ seq_indices = torch.arange(seq_len, device=attention_scores.device)
222
+
223
+ seq_lens = torch.full((batch_size,), seq_len)
224
+
225
+ if extended_attention_mask is not None:
226
+ seq_lens = torch.argmax((extended_attention_mask != 0).int(), dim=2).squeeze(1)
227
+ # zero out any padding
228
+ attention_scores[extended_attention_mask.squeeze(1) != 0] = 0
229
+
230
+ # set the values for the CLS and sep to all by very low, so they never get chosen as a replacement arc
231
+ attention_scores[:, 0, :] = 0
232
+ attention_scores[batch_indices, seq_lens - 1, :] = 0
233
+ attention_scores[batch_indices, :, seq_lens - 1] = 0 # can never predict sep
234
+ # set the values for each token pointing to itself be 0
235
+ attention_scores[:, seq_indices, seq_indices] = 0
236
+
237
+ # find the root, and make him super high so we never have a conflict
238
+ root_cands = torch.argsort(attention_scores[:, :, 0], dim=-1)
239
+ attention_scores[batch_indices.unsqueeze(1), root_cands, 0] = 0
240
+ attention_scores[batch_indices, root_cands[:, -1], 0] = 1.0
241
+
242
+ # we start by getting the argmax for each score, and then computing the cycles and contracting them
243
+ sorted_indices = torch.argsort(attention_scores, dim=-1, descending=True)
244
+ indices = sorted_indices[:, :, 0].clone() # take the argmax
245
+
246
+ attention_scores = attention_scores.tolist()
247
+ seq_lens = seq_lens.tolist()
248
+ sorted_indices = [[sub_l[:slen] for sub_l in l[:slen]] for l,slen in zip(sorted_indices.tolist(), seq_lens)]
249
+
250
+
251
+ # go through each batch item and make sure our tree works
252
+ for batch_idx in range(batch_size):
253
+ # We have one root - detect the cycles and contract them. A cycle can never contain the root so really
254
+ # for every cycle, we look at all the nodes, and find the highest arc out of the cycle for any values. Replace that and tada
255
+ has_cycle, cycle_nodes = detect_cycle(indices[batch_idx], seq_lens[batch_idx])
256
+ contracted_arcs = set()
257
+ while has_cycle:
258
+ base_idx, head_idx = choose_contracting_arc(indices[batch_idx], sorted_indices[batch_idx], cycle_nodes, contracted_arcs, seq_lens[batch_idx], attention_scores[batch_idx])
259
+ indices[batch_idx, base_idx] = head_idx
260
+ contracted_arcs.add(base_idx)
261
+ # find the next cycle
262
+ has_cycle, cycle_nodes = detect_cycle(indices[batch_idx], seq_lens[batch_idx])
263
+
264
+ return indices
265
+
266
+ def detect_cycle(indices: torch.LongTensor, seq_len: int):
267
+ # Simple cycle detection algorithm
268
+ # Returns a boolean indicating if a cycle is detected and the nodes involved in the cycle
269
+ visited = set()
270
+ for node in range(1, seq_len - 1): # ignore the CLS/SEP tokens
271
+ if node in visited:
272
+ continue
273
+ current_path = set()
274
+ while node not in visited:
275
+ visited.add(node)
276
+ current_path.add(node)
277
+ node = indices[node].item()
278
+ if node == 0: break # roots never point to anything
279
+ if node in current_path:
280
+ return True, current_path # Cycle detected
281
+ return False, None
282
+
283
+ def choose_contracting_arc(indices: torch.LongTensor, sorted_indices: List[List[int]], cycle_nodes: set, contracted_arcs: set, seq_len: int, scores: List[List[float]]):
284
+ # Chooses the highest-scoring, non-cycling arc from a graph. Iterates through 'cycle_nodes' to find
285
+ # the best arc based on 'scores', avoiding cycles and zero node connections.
286
+ # For each node, we only look at the next highest scoring non-cycling arc
287
+ best_base_idx, best_head_idx = -1, -1
288
+ score = 0
289
+
290
+ # convert the indices to a list once, to avoid multiple conversions (saves a few seconds)
291
+ currents = indices.tolist()
292
+ for base_node in cycle_nodes:
293
+ if base_node in contracted_arcs: continue
294
+ # we don't want to take anything that has a higher score than the current value - we can end up in an endless loop
295
+ # Since the indices are sorted, as soon as we find our current item, we can move on to the next.
296
+ current = currents[base_node]
297
+ found_current = False
298
+
299
+ for head_node in sorted_indices[base_node]:
300
+ if head_node == current:
301
+ found_current = True
302
+ continue
303
+ if head_node in contracted_arcs: continue
304
+ if not found_current or head_node in cycle_nodes or head_node == 0:
305
+ continue
306
+
307
+ current_score = scores[base_node][head_node]
308
+ if current_score > score:
309
+ best_base_idx, best_head_idx, score = base_node, head_node, current_score
310
+ break
311
+
312
+ if best_base_idx == -1:
313
+ raise ValueError('Stuck in endless loop trying to compute syntax mst. Please try again setting compute_syntax_mst=False')
314
+
315
+ return best_base_idx, best_head_idx
dictabert-joint/README.md ADDED
@@ -0,0 +1,521 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-4.0
3
+ language:
4
+ - he
5
+ inference: false
6
+ ---
7
+ # DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew
8
+
9
+ State-of-the-art language model for Hebrew, released [here](https://arxiv.org/abs/2403.06970).
10
+
11
+ This is the fine-tuned model for the joint parsing of the following tasks:
12
+
13
+ - Prefix Segmentation
14
+ - Morphological Disambiguation
15
+ - Lexicographical Analysis (Lemmatization)
16
+ - Syntactical Parsing (Dependency-Tree)
17
+ - Named-Entity Recognition
18
+
19
+ A live demo of the model with instant visualization of the syntax tree can be found [here](https://huggingface.co/spaces/dicta-il/joint-demo).
20
+
21
+ For a faster model, you can use the equivalent bert-tiny model for this task [here](https://huggingface.co/dicta-il/dictabert-tiny-joint).
22
+
23
+ For the bert-base models for other tasks, see [here](https://huggingface.co/collections/dicta-il/dictabert-6588e7cc08f83845fc42a18b).
24
+
25
+ ---
26
+
27
+ The model currently supports 3 types of output:
28
+
29
+ 1. **JSON**: The model returns a JSON object for each sentence in the input, where for each sentence we have the sentence text, the NER entities, and the list of tokens. For each token we include the output from each of the tasks.
30
+ ```python
31
+ model.predict(..., output_style='json')
32
+ ```
33
+
34
+ 1. **UD**: The model returns the full UD output for each sentence, according to the style of the Hebrew UD Treebank.
35
+ ```python
36
+ model.predict(..., output_style='ud')
37
+ ```
38
+
39
+ 1. **UD, in the style of IAHLT**: This model returns the full UD output, with slight modifications to match the style of IAHLT. These differences are mostly granularity of some dependency relations, how the suffix of a word is broken up, and implicit definite articles. The actual tagging behavior doesn't change.
40
+ ```python
41
+ model.predict(..., output_style='iahlt_ud')
42
+ ```
43
+
44
+ ---
45
+
46
+ If you only need the output for one of the tasks, you can tell the model to not initialize some of the heads, for example:
47
+ ```python
48
+ model = AutoModel.from_pretrained('dicta-il/dictabert-joint', trust_remote_code=True, do_lex=False)
49
+ ```
50
+
51
+ The list of options are: `do_lex`, `do_syntax`, `do_ner`, `do_prefix`, `do_morph`.
52
+
53
+ ---
54
+
55
+ Sample usage:
56
+
57
+ ```python
58
+ from transformers import AutoModel, AutoTokenizer
59
+
60
+ tokenizer = AutoTokenizer.from_pretrained('dicta-il/dictabert-joint')
61
+ model = AutoModel.from_pretrained('dicta-il/dictabert-joint', trust_remote_code=True)
62
+
63
+ model.eval()
64
+
65
+ sentence = 'בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים'
66
+ print(model.predict([sentence], tokenizer, output_style='json')) # see below for other return formats
67
+ ```
68
+
69
+ Output:
70
+ ```json
71
+ [
72
+ {
73
+ "text": "בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים",
74
+ "tokens": [
75
+ {
76
+ "token": "בשנת",
77
+ "syntax": {
78
+ "word": "בשנת",
79
+ "dep_head_idx": 2,
80
+ "dep_func": "obl",
81
+ "dep_head": "השלים"
82
+ },
83
+ "seg": [
84
+ "ב",
85
+ "שנת"
86
+ ],
87
+ "lex": "שנה",
88
+ "morph": {
89
+ "token": "בשנת",
90
+ "pos": "NOUN",
91
+ "feats": {
92
+ "Gender": "Fem",
93
+ "Number": "Sing"
94
+ },
95
+ "prefixes": [
96
+ "ADP"
97
+ ],
98
+ "suffix": false
99
+ }
100
+ },
101
+ {
102
+ "token": "1948",
103
+ "syntax": {
104
+ "word": "1948",
105
+ "dep_head_idx": 0,
106
+ "dep_func": "compound",
107
+ "dep_head": "בשנת"
108
+ },
109
+ "seg": [
110
+ "1948"
111
+ ],
112
+ "lex": "1948",
113
+ "morph": {
114
+ "token": "1948",
115
+ "pos": "NUM",
116
+ "feats": {},
117
+ "prefixes": [],
118
+ "suffix": false
119
+ }
120
+ },
121
+ {
122
+ "token": "השלים",
123
+ "syntax": {
124
+ "word": "השלים",
125
+ "dep_head_idx": -1,
126
+ "dep_func": "root",
127
+ "dep_head": "הומוריסטיים"
128
+ },
129
+ "seg": [
130
+ "השלים"
131
+ ],
132
+ "lex": "השלים",
133
+ "morph": {
134
+ "token": "השלים",
135
+ "pos": "VERB",
136
+ "feats": {
137
+ "Gender": "Masc",
138
+ "Number": "Sing",
139
+ "Person": "3",
140
+ "Tense": "Past"
141
+ },
142
+ "prefixes": [],
143
+ "suffix": false
144
+ }
145
+ },
146
+ {
147
+ "token": "אפרים",
148
+ "syntax": {
149
+ "word": "אפרים",
150
+ "dep_head_idx": 2,
151
+ "dep_func": "nsubj",
152
+ "dep_head": "השלים"
153
+ },
154
+ "seg": [
155
+ "אפרים"
156
+ ],
157
+ "lex": "אפרים",
158
+ "morph": {
159
+ "token": "אפרים",
160
+ "pos": "PROPN",
161
+ "feats": {},
162
+ "prefixes": [],
163
+ "suffix": false
164
+ }
165
+ },
166
+ {
167
+ "token": "קיש��ן",
168
+ "syntax": {
169
+ "word": "קישון",
170
+ "dep_head_idx": 3,
171
+ "dep_func": "flat",
172
+ "dep_head": "אפרים"
173
+ },
174
+ "seg": [
175
+ "קישון"
176
+ ],
177
+ "lex": "קישון",
178
+ "morph": {
179
+ "token": "קישון",
180
+ "pos": "PROPN",
181
+ "feats": {},
182
+ "prefixes": [],
183
+ "suffix": false
184
+ }
185
+ },
186
+ {
187
+ "token": "את",
188
+ "syntax": {
189
+ "word": "את",
190
+ "dep_head_idx": 6,
191
+ "dep_func": "case",
192
+ "dep_head": "לימודיו"
193
+ },
194
+ "seg": [
195
+ "את"
196
+ ],
197
+ "lex": "את",
198
+ "morph": {
199
+ "token": "את",
200
+ "pos": "ADP",
201
+ "feats": {},
202
+ "prefixes": [],
203
+ "suffix": false
204
+ }
205
+ },
206
+ {
207
+ "token": "לימודיו",
208
+ "syntax": {
209
+ "word": "לימודיו",
210
+ "dep_head_idx": 2,
211
+ "dep_func": "obj",
212
+ "dep_head": "השלים"
213
+ },
214
+ "seg": [
215
+ "לימודיו"
216
+ ],
217
+ "lex": "לימוד",
218
+ "morph": {
219
+ "token": "לימודיו",
220
+ "pos": "NOUN",
221
+ "feats": {
222
+ "Gender": "Masc",
223
+ "Number": "Plur"
224
+ },
225
+ "prefixes": [],
226
+ "suffix": "PRON",
227
+ "suffix_feats": {
228
+ "Gender": "Masc",
229
+ "Number": "Sing",
230
+ "Person": "3"
231
+ }
232
+ }
233
+ },
234
+ {
235
+ "token": "בפיסול",
236
+ "syntax": {
237
+ "word": "בפיסול",
238
+ "dep_head_idx": 6,
239
+ "dep_func": "nmod",
240
+ "dep_head": "לימודיו"
241
+ },
242
+ "seg": [
243
+ "ב",
244
+ "פיסול"
245
+ ],
246
+ "lex": "פיסול",
247
+ "morph": {
248
+ "token": "בפיסול",
249
+ "pos": "NOUN",
250
+ "feats": {
251
+ "Gender": "Masc",
252
+ "Number": "Sing"
253
+ },
254
+ "prefixes": [
255
+ "ADP"
256
+ ],
257
+ "suffix": false
258
+ }
259
+ },
260
+ {
261
+ "token": "מתכת",
262
+ "syntax": {
263
+ "word": "מתכת",
264
+ "dep_head_idx": 7,
265
+ "dep_func": "compound",
266
+ "dep_head": "בפיסול"
267
+ },
268
+ "seg": [
269
+ "מתכת"
270
+ ],
271
+ "lex": "מתכת",
272
+ "morph": {
273
+ "token": "מתכת",
274
+ "pos": "NOUN",
275
+ "feats": {
276
+ "Gender": "Fem",
277
+ "Number": "Sing"
278
+ },
279
+ "prefixes": [],
280
+ "suffix": false
281
+ }
282
+ },
283
+ {
284
+ "token": "ובתולדות",
285
+ "syntax": {
286
+ "word": "ובתולדות",
287
+ "dep_head_idx": 7,
288
+ "dep_func": "conj",
289
+ "dep_head": "בפיסול"
290
+ },
291
+ "seg": [
292
+ "וב",
293
+ "תולדות"
294
+ ],
295
+ "lex": "תולדה",
296
+ "morph": {
297
+ "token": "ובתולדות",
298
+ "pos": "NOUN",
299
+ "feats": {
300
+ "Gender": "Fem",
301
+ "Number": "Plur"
302
+ },
303
+ "prefixes": [
304
+ "CCONJ",
305
+ "ADP"
306
+ ],
307
+ "suffix": false
308
+ }
309
+ },
310
+ {
311
+ "token": "האמנות",
312
+ "syntax": {
313
+ "word": "האמנות",
314
+ "dep_head_idx": 9,
315
+ "dep_func": "compound",
316
+ "dep_head": "ובתולדות"
317
+ },
318
+ "seg": [
319
+ "ה",
320
+ "אמנות"
321
+ ],
322
+ "lex": "אומנות",
323
+ "morph": {
324
+ "token": "האמנות",
325
+ "pos": "NOUN",
326
+ "feats": {
327
+ "Gender": "Fem",
328
+ "Number": "Sing"
329
+ },
330
+ "prefixes": [
331
+ "DET"
332
+ ],
333
+ "suffix": false
334
+ }
335
+ },
336
+ {
337
+ "token": "והחל",
338
+ "syntax": {
339
+ "word": "והחל",
340
+ "dep_head_idx": 2,
341
+ "dep_func": "conj",
342
+ "dep_head": "השלים"
343
+ },
344
+ "seg": [
345
+ "ו",
346
+ "החל"
347
+ ],
348
+ "lex": "החל",
349
+ "morph": {
350
+ "token": "והחל",
351
+ "pos": "VERB",
352
+ "feats": {
353
+ "Gender": "Masc",
354
+ "Number": "Sing",
355
+ "Person": "3",
356
+ "Tense": "Past"
357
+ },
358
+ "prefixes": [
359
+ "CCONJ"
360
+ ],
361
+ "suffix": false
362
+ }
363
+ },
364
+ {
365
+ "token": "לפרסם",
366
+ "syntax": {
367
+ "word": "לפרסם",
368
+ "dep_head_idx": 11,
369
+ "dep_func": "xcomp",
370
+ "dep_head": "והחל"
371
+ },
372
+ "seg": [
373
+ "לפרסם"
374
+ ],
375
+ "lex": "פרסם",
376
+ "morph": {
377
+ "token": "לפרסם",
378
+ "pos": "VERB",
379
+ "feats": {},
380
+ "prefixes": [],
381
+ "suffix": false
382
+ }
383
+ },
384
+ {
385
+ "token": "מאמרים",
386
+ "syntax": {
387
+ "word": "מא��רים",
388
+ "dep_head_idx": 12,
389
+ "dep_func": "obj",
390
+ "dep_head": "לפרסם"
391
+ },
392
+ "seg": [
393
+ "מאמרים"
394
+ ],
395
+ "lex": "מאמר",
396
+ "morph": {
397
+ "token": "מאמרים",
398
+ "pos": "NOUN",
399
+ "feats": {
400
+ "Gender": "Masc",
401
+ "Number": "Plur"
402
+ },
403
+ "prefixes": [],
404
+ "suffix": false
405
+ }
406
+ },
407
+ {
408
+ "token": "הומוריסטיים",
409
+ "syntax": {
410
+ "word": "הומוריסטיים",
411
+ "dep_head_idx": 13,
412
+ "dep_func": "amod",
413
+ "dep_head": "מאמרים"
414
+ },
415
+ "seg": [
416
+ "הומוריסטיים"
417
+ ],
418
+ "lex": "הומוריסטי",
419
+ "morph": {
420
+ "token": "הומוריסטיים",
421
+ "pos": "ADJ",
422
+ "feats": {
423
+ "Gender": "Masc",
424
+ "Number": "Plur"
425
+ },
426
+ "prefixes": [],
427
+ "suffix": false
428
+ }
429
+ }
430
+ ],
431
+ "root_idx": 2,
432
+ "ner_entities": [
433
+ {
434
+ "phrase": "1948",
435
+ "label": "TIMEX"
436
+ },
437
+ {
438
+ "phrase": "אפרים קישון",
439
+ "label": "PER"
440
+ }
441
+ ]
442
+ }
443
+ ]
444
+ ```
445
+
446
+ You can also choose to get your response in UD format:
447
+
448
+ ```python
449
+ sentence = 'בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים'
450
+ print(model.predict([sentence], tokenizer, output_style='ud'))
451
+ ```
452
+
453
+ Results:
454
+ ```json
455
+ [
456
+ [
457
+ "# sent_id = 1",
458
+ "# text = בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים",
459
+ "1-2\tבשנת\t_\t_\t_\t_\t_\t_\t_\t_",
460
+ "1\tב\tב\tADP\tADP\t_\t2\tcase\t_\t_",
461
+ "2\tשנת\tשנה\tNOUN\tNOUN\tGender=Fem|Number=Sing\t4\tobl\t_\t_",
462
+ "3\t1948\t1948\tNUM\tNUM\t\t2\tcompound:smixut\t_\t_",
463
+ "4\tהשלים\tהשלים\tVERB\tVERB\tGender=Masc|Number=Sing|Person=3|Tense=Past\t0\troot\t_\t_",
464
+ "5\tאפרים\tאפרים\tPROPN\tPROPN\t\t4\tnsubj\t_\t_",
465
+ "6\tקישון\tקישון\tPROPN\tPROPN\t\t5\tflat\t_\t_",
466
+ "7\tאת\tאת\tADP\tADP\t\t8\tcase:acc\t_\t_",
467
+ "8-10\tלימודיו\t_\t_\t_\t_\t_\t_\t_\t_",
468
+ "8\tלימוד_\tלימוד\tNOUN\tNOUN\tGender=Masc|Number=Plur\t4\tobj\t_\t_",
469
+ "9\t_של_\tשל\tADP\tADP\t_\t10\tcase\t_\t_",
470
+ "10\t_הוא\tהוא\tPRON\tPRON\tGender=Masc|Number=Sing|Person=3\t8\tnmod:poss\t_\t_",
471
+ "11-12\tבפיסול\t_\t_\t_\t_\t_\t_\t_\t_",
472
+ "11\tב\tב\tADP\tADP\t_\t12\tcase\t_\t_",
473
+ "12\tפיסול\tפיסול\tNOUN\tNOUN\tGender=Masc|Number=Sing\t8\tnmod\t_\t_",
474
+ "13\tמתכת\tמתכת\tNOUN\tNOUN\tGender=Fem|Number=Sing\t12\tcompound:smixut\t_\t_",
475
+ "14-16\tובתולדות\t_\t_\t_\t_\t_\t_\t_\t_",
476
+ "14\tו\tו\tCCONJ\tCCONJ\t_\t16\tcc\t_\t_",
477
+ "15\tב\tב\tADP\tADP\t_\t16\tcase\t_\t_",
478
+ "16\tתולדות\tתולדה\tNOUN\tNOUN\tGender=Fem|Number=Plur\t12\tconj\t_\t_",
479
+ "17-18\tהאמנות\t_\t_\t_\t_\t_\t_\t_\t_",
480
+ "17\tה\tה\tDET\tDET\t_\t18\tdet\t_\t_",
481
+ "18\tאמנות\tאומנות\tNOUN\tNOUN\tGender=Fem|Number=Sing\t16\tcompound:smixut\t_\t_",
482
+ "19-20\tוהחל\t_\t_\t_\t_\t_\t_\t_\t_",
483
+ "19\tו\tו\tCCONJ\tCCONJ\t_\t20\tcc\t_\t_",
484
+ "20\tהחל\tהחל\tVERB\tVERB\tGender=Masc|Number=Sing|Person=3|Tense=Past\t4\tconj\t_\t_",
485
+ "21\tלפרסם\tפרסם\tVERB\tVERB\t\t20\txcomp\t_\t_",
486
+ "22\tמאמרים\tמאמר\tNOUN\tNOUN\tGender=Masc|Number=Plur\t21\tobj\t_\t_",
487
+ "23\tהומוריסטיים\tהומוריסטי\tADJ\tADJ\tGender=Masc|Number=Plur\t22\tamod\t_\t_"
488
+ ]
489
+ ]
490
+ ```
491
+
492
+
493
+ ## Citation
494
+
495
+ If you use DictaBERT-joint in your research, please cite ```MRL Parsing without Tears: The Case of Hebrew```
496
+
497
+ **BibTeX:**
498
+
499
+ ```bibtex
500
+ @misc{shmidman2024mrl,
501
+ title={MRL Parsing Without Tears: The Case of Hebrew},
502
+ author={Shaltiel Shmidman and Avi Shmidman and Moshe Koppel and Reut Tsarfaty},
503
+ year={2024},
504
+ eprint={2403.06970},
505
+ archivePrefix={arXiv},
506
+ primaryClass={cs.CL}
507
+ }
508
+ ```
509
+
510
+ ## License
511
+
512
+ Shield: [![CC BY 4.0][cc-by-shield]][cc-by]
513
+
514
+ This work is licensed under a
515
+ [Creative Commons Attribution 4.0 International License][cc-by].
516
+
517
+ [![CC BY 4.0][cc-by-image]][cc-by]
518
+
519
+ [cc-by]: http://creativecommons.org/licenses/by/4.0/
520
+ [cc-by-image]: https://i.creativecommons.org/l/by/4.0/88x31.png
521
+ [cc-by-shield]: https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg
dictabert-joint/config.json ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForJointParsing"
4
+ ],
5
+ "auto_map": {
6
+ "AutoModel": "BertForJointParsing.BertForJointParsing"
7
+ },
8
+ "attention_probs_dropout_prob": 0.1,
9
+ "classifier_dropout": null,
10
+ "do_lex": true,
11
+ "do_morph": true,
12
+ "do_ner": true,
13
+ "do_prefix": true,
14
+ "do_syntax": true,
15
+ "gradient_checkpointing": false,
16
+ "hidden_act": "gelu",
17
+ "hidden_dropout_prob": 0.1,
18
+ "hidden_size": 768,
19
+ "id2label": {
20
+ "0": "B-ANG",
21
+ "1": "B-DUC",
22
+ "2": "B-EVE",
23
+ "3": "B-FAC",
24
+ "4": "B-GPE",
25
+ "5": "B-LOC",
26
+ "6": "B-ORG",
27
+ "7": "B-PER",
28
+ "8": "B-WOA",
29
+ "9": "B-INFORMAL",
30
+ "10": "B-MISC",
31
+ "11": "B-TIMEX",
32
+ "12": "B-TTL",
33
+ "13": "I-DUC",
34
+ "14": "I-EVE",
35
+ "15": "I-FAC",
36
+ "16": "I-GPE",
37
+ "17": "I-LOC",
38
+ "18": "I-ORG",
39
+ "19": "I-PER",
40
+ "20": "I-WOA",
41
+ "21": "I-ANG",
42
+ "22": "I-INFORMAL",
43
+ "23": "I-MISC",
44
+ "24": "I-TIMEX",
45
+ "25": "I-TTL",
46
+ "26": "O"
47
+ },
48
+ "initializer_range": 0.02,
49
+ "intermediate_size": 3072,
50
+ "label2id": {
51
+ "B-ANG": 0,
52
+ "B-DUC": 1,
53
+ "B-EVE": 2,
54
+ "B-FAC": 3,
55
+ "B-GPE": 4,
56
+ "B-INFORMAL": 9,
57
+ "B-LOC": 5,
58
+ "B-MISC": 10,
59
+ "B-ORG": 6,
60
+ "B-PER": 7,
61
+ "B-TIMEX": 11,
62
+ "B-TTL": 12,
63
+ "B-WOA": 8,
64
+ "I-ANG": 21,
65
+ "I-DUC": 13,
66
+ "I-EVE": 14,
67
+ "I-FAC": 15,
68
+ "I-GPE": 16,
69
+ "I-INFORMAL": 22,
70
+ "I-LOC": 17,
71
+ "I-MISC": 23,
72
+ "I-ORG": 18,
73
+ "I-PER": 19,
74
+ "I-TIMEX": 24,
75
+ "I-TTL": 25,
76
+ "I-WOA": 20,
77
+ "O": 26
78
+ },
79
+ "layer_norm_eps": 1e-12,
80
+ "max_position_embeddings": 512,
81
+ "model_type": "bert",
82
+ "newmodern": true,
83
+ "num_attention_heads": 12,
84
+ "num_hidden_layers": 12,
85
+ "pad_token_id": 0,
86
+ "position_embedding_type": "absolute",
87
+ "syntax_head_size": 128,
88
+ "torch_dtype": "float32",
89
+ "transformers_version": "4.36.2",
90
+ "type_vocab_size": 2,
91
+ "use_cache": true,
92
+ "vocab_size": 128000
93
+ }
dictabert-joint/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c59db02c002b69fd3c072c310e3ba578d036386477248d1c8576d896cc06aa1c
3
+ size 744080096
dictabert-joint/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26ce128baa792901b22cecfdd6a7dc783307e60d4b766dcd3aa4d1eaeb3a36d2
3
+ size 744148153
dictabert-joint/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/dicta-il/dictabert-joint
dictabert-joint/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
dictabert-joint/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
dictabert-joint/tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "[BLANK]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ }
51
+ },
52
+ "clean_up_tokenization_spaces": true,
53
+ "cls_token": "[CLS]",
54
+ "do_lower_case": true,
55
+ "mask_token": "[MASK]",
56
+ "model_max_length": 512,
57
+ "pad_token": "[PAD]",
58
+ "sep_token": "[SEP]",
59
+ "strip_accents": null,
60
+ "tokenize_chinese_chars": true,
61
+ "tokenizer_class": "BertTokenizer",
62
+ "unk_token": "[UNK]"
63
+ }
dictabert-joint/vocab.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fb90bfa35244d26f0065d1fcd0b5becc3da3d44d616a7e2aacaf6320b9fa2d0
3
+ size 1500244
dictabert-large-char-menaked/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
dictabert-large-char-menaked/BertForDiacritization.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional, Tuple, Union
3
+ import torch
4
+ from torch import nn
5
+ from transformers.utils import ModelOutput
6
+ from transformers import BertPreTrainedModel, BertModel, BertTokenizerFast
7
+
8
+ # MAT_LECT => Matres Lectionis, known in Hebrew as Em Kriaa.
9
+ MAT_LECT_TOKEN = '<MAT_LECT>'
10
+ NIKUD_CLASSES = ['', MAT_LECT_TOKEN, '\u05BC', '\u05B0', '\u05B1', '\u05B2', '\u05B3', '\u05B4', '\u05B5', '\u05B6', '\u05B7', '\u05B8', '\u05B9', '\u05BA', '\u05BB', '\u05BC\u05B0', '\u05BC\u05B1', '\u05BC\u05B2', '\u05BC\u05B3', '\u05BC\u05B4', '\u05BC\u05B5', '\u05BC\u05B6', '\u05BC\u05B7', '\u05BC\u05B8', '\u05BC\u05B9', '\u05BC\u05BA', '\u05BC\u05BB', '\u05C7', '\u05BC\u05C7']
11
+ SHIN_CLASSES = ['\u05C1', '\u05C2'] # shin, sin
12
+
13
+ @dataclass
14
+ class MenakedLogitsOutput(ModelOutput):
15
+ nikud_logits: torch.FloatTensor = None
16
+ shin_logits: torch.FloatTensor = None
17
+
18
+ def detach(self):
19
+ return MenakedLogitsOutput(self.nikud_logits.detach(), self.shin_logits.detach())
20
+
21
+ @dataclass
22
+ class MenakedOutput(ModelOutput):
23
+ loss: Optional[torch.FloatTensor] = None
24
+ logits: Optional[MenakedLogitsOutput] = None
25
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
26
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
27
+
28
+ @dataclass
29
+ class MenakedLabels(ModelOutput):
30
+ nikud_labels: Optional[torch.FloatTensor] = None
31
+ shin_labels: Optional[torch.FloatTensor] = None
32
+
33
+ def detach(self):
34
+ return MenakedLabels(self.nikud_labels.detach(), self.shin_labels.detach())
35
+
36
+ def to(self, device):
37
+ return MenakedLabels(self.nikud_labels.to(device), self.shin_labels.to(device))
38
+
39
+ class BertMenakedHead(nn.Module):
40
+ def __init__(self, config):
41
+ super().__init__()
42
+ self.config = config
43
+
44
+ if not hasattr(config, 'nikud_classes'):
45
+ config.nikud_classes = NIKUD_CLASSES
46
+ config.shin_classes = SHIN_CLASSES
47
+ config.mat_lect_token = MAT_LECT_TOKEN
48
+
49
+ self.num_nikud_classes = len(config.nikud_classes)
50
+ self.num_shin_classes = len(config.shin_classes)
51
+
52
+ # create our classifiers
53
+ self.nikud_cls = nn.Linear(config.hidden_size, self.num_nikud_classes)
54
+ self.shin_cls = nn.Linear(config.hidden_size, self.num_shin_classes)
55
+
56
+ def forward(
57
+ self,
58
+ hidden_states: torch.Tensor,
59
+ labels: Optional[MenakedLabels] = None):
60
+
61
+ # run each of the classifiers on the transformed output
62
+ nikud_logits = self.nikud_cls(hidden_states)
63
+ shin_logits = self.shin_cls(hidden_states)
64
+
65
+ loss = None
66
+ if labels is not None:
67
+ loss_fct = nn.CrossEntropyLoss()
68
+ loss = loss_fct(nikud_logits.view(-1, self.num_nikud_classes), labels.nikud_labels.view(-1))
69
+ loss += loss_fct(shin_logits.view(-1, self.num_shin_classes), labels.shin_labels.view(-1))
70
+
71
+ return loss, MenakedLogitsOutput(nikud_logits, shin_logits)
72
+
73
+ class BertForDiacritization(BertPreTrainedModel):
74
+ def __init__(self, config):
75
+ super().__init__(config)
76
+ self.config = config
77
+ self.bert = BertModel(config, add_pooling_layer=False)
78
+
79
+ classifier_dropout = config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
80
+ self.dropout = nn.Dropout(classifier_dropout)
81
+
82
+ self.menaked = BertMenakedHead(config)
83
+
84
+ # Initialize weights and apply final processing
85
+ self.post_init()
86
+
87
+ def forward(
88
+ self,
89
+ input_ids: Optional[torch.Tensor] = None,
90
+ attention_mask: Optional[torch.Tensor] = None,
91
+ token_type_ids: Optional[torch.Tensor] = None,
92
+ position_ids: Optional[torch.Tensor] = None,
93
+ head_mask: Optional[torch.Tensor] = None,
94
+ inputs_embeds: Optional[torch.Tensor] = None,
95
+ labels: Optional[torch.Tensor] = None,
96
+ output_attentions: Optional[bool] = None,
97
+ output_hidden_states: Optional[bool] = None,
98
+ return_dict: Optional[bool] = None,
99
+ ) -> Union[Tuple[torch.Tensor], MenakedOutput]:
100
+
101
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
102
+
103
+ bert_outputs = self.bert(
104
+ input_ids,
105
+ attention_mask=attention_mask,
106
+ token_type_ids=token_type_ids,
107
+ position_ids=position_ids,
108
+ head_mask=head_mask,
109
+ inputs_embeds=inputs_embeds,
110
+ output_attentions=output_attentions,
111
+ output_hidden_states=output_hidden_states,
112
+ return_dict=return_dict,
113
+ )
114
+
115
+ hidden_states = bert_outputs[0]
116
+ hidden_states = self.dropout(hidden_states)
117
+
118
+ loss, logits = self.menaked(hidden_states, labels)
119
+
120
+ if not return_dict:
121
+ return (loss,logits) + bert_outputs[2:]
122
+
123
+ return MenakedOutput(
124
+ loss=loss,
125
+ logits=logits,
126
+ hidden_states=bert_outputs.hidden_states,
127
+ attentions=bert_outputs.attentions,
128
+ )
129
+
130
+ def predict(self, sentences: List[str], tokenizer: BertTokenizerFast, mark_matres_lectionis: str = None, padding='longest'):
131
+ sentences = [remove_nikkud(sentence) for sentence in sentences]
132
+ # assert the lengths aren't out of range
133
+ assert all(len(sentence) + 2 <= tokenizer.model_max_length for sentence in sentences), f'All sentences must be <= {tokenizer.model_max_length}, please segment and try again'
134
+
135
+ # tokenize the inputs and convert them to relevant device
136
+ inputs = tokenizer(sentences, padding=padding, truncation=True, return_tensors='pt', return_offsets_mapping=True)
137
+ offset_mapping = inputs.pop('offset_mapping')
138
+ inputs = {k:v.to(self.device) for k,v in inputs.items()}
139
+
140
+ # calculate the predictions
141
+ logits = self.forward(**inputs, return_dict=True).logits
142
+ nikud_predictions = logits.nikud_logits.argmax(dim=-1).tolist()
143
+ shin_predictions = logits.shin_logits.argmax(dim=-1).tolist()
144
+
145
+ ret = []
146
+ for sent_idx,(sentence,sent_offsets) in enumerate(zip(sentences, offset_mapping)):
147
+ # assign the nikud to each letter!
148
+ output = []
149
+ prev_index = 0
150
+ for idx,offsets in enumerate(sent_offsets):
151
+ # add in anything we missed
152
+ if offsets[0] > prev_index:
153
+ output.append(sentence[prev_index:offsets[0]])
154
+ if offsets[1] - offsets[0] != 1: continue
155
+
156
+ # get our next char
157
+ char = sentence[offsets[0]:offsets[1]]
158
+ prev_index = offsets[1]
159
+ if not is_hebrew_letter(char):
160
+ output.append(char)
161
+ continue
162
+
163
+ nikud = self.config.nikud_classes[nikud_predictions[sent_idx][idx]]
164
+ shin = '' if char != 'ש' else self.config.shin_classes[shin_predictions[sent_idx][idx]]
165
+
166
+ # check for matres lectionis
167
+ if nikud == self.config.mat_lect_token:
168
+ if not is_matres_letter(char): nikud = '' # don't allow matres on irrelevant letters
169
+ elif mark_matres_lectionis is not None: nikud = mark_matres_lectionis
170
+ else: continue
171
+
172
+ output.append(char + shin + nikud)
173
+ output.append(sentence[prev_index:])
174
+ ret.append(''.join(output))
175
+
176
+ return ret
177
+
178
+ ALEF_ORD = ord('א')
179
+ TAF_ORD = ord('ת')
180
+ def is_hebrew_letter(char):
181
+ return ALEF_ORD <= ord(char) <= TAF_ORD
182
+
183
+ MATRES_LETTERS = list('אוי')
184
+ def is_matres_letter(char):
185
+ return char in MATRES_LETTERS
186
+
187
+ import re
188
+ nikud_pattern = re.compile(r'[\u05B0-\u05BD\u05C1\u05C2\u05C7]')
189
+ def remove_nikkud(text):
190
+ return nikud_pattern.sub('', text)
dictabert-large-char-menaked/README.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-4.0
3
+ language:
4
+ - he
5
+ inference: false
6
+ ---
7
+ # DictaBERT-large-char-menaked: An open-source BERT-based model for adding diacritiziation marks ("nikud") to Hebrew texts
8
+
9
+ This model is a fine-tuned version of [DictaBERT-large-char](https://huggingface.co/dicta-il/dictabert-large-char), dedicated to the task of adding nikud (diacritics) to Hebrew text.
10
+
11
+ The model was trained on a corpus of modern Hebrew texts manually diacritized by linguistic experts.
12
+ As of 2025-03, this model provides SOTA performance on all modern Hebrew vocalization benchmarks as compared to all other open-source alternatives, as well as when compared with commercial generative LLMs.
13
+
14
+ Note: this model is trained to handle a wide variety of genres of modern Hebrew prose. However, it is not intended for earlier layers of Hebrew (e.g. Biblical, Rabbinic, Premodern), nor for poetic texts.
15
+
16
+ Sample usage:
17
+
18
+ ```python
19
+ from transformers import AutoModel, AutoTokenizer
20
+
21
+ tokenizer = AutoTokenizer.from_pretrained('dicta-il/dictabert-large-char-menaked')
22
+ model = AutoModel.from_pretrained('dicta-il/dictabert-large-char-menaked', trust_remote_code=True)
23
+
24
+ model.eval()
25
+
26
+ sentence = 'בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים'
27
+ print(model.predict([sentence], tokenizer))
28
+ ```
29
+
30
+ Output:
31
+ ```json
32
+ ['בִּשְׁנַת 1948 הִשְׁלִים אֶפְרַיִם קִישׁוֹן אֶת לִמּוּדָיו בְּפִסּוּל מַתֶּכֶת וּבְתוֹלְדוֹת הָאׇמָּנוּת וְהֵחֵל לְפַרְסֵם מַאֲמָרִים הוּמוֹרִיסְטִיִּים']
33
+ ```
34
+
35
+ ### Matres Lectionis (אימות קריאה)
36
+
37
+ As can be seen, the predict method automatically removed all the matres-lectionis (אימות קריאה). If you wish to keep them in, you can specify that to the predict function:
38
+
39
+ ```python
40
+ print(model.predict([sentence], tokenizer, mark_matres_lectionis = '*'))
41
+ ```
42
+
43
+ Output:
44
+
45
+ ```json
46
+ ['בִּשְׁנַת 1948 הִשְׁלִים אֶפְרַיִם קִישׁוֹן אֶת לִי*מּוּדָיו בְּפִי*סּוּל מַתֶּכֶת וּבְתוֹלְדוֹת הָאׇמָּנוּת וְהֵחֵל לְפַרְסֵם מַאֲמָרִים הוּמוֹרִיסְטִיִּים']
47
+ ```
48
+
49
+ ### Community Project
50
+
51
+ A third-party project, [dicta-onnx](https://github.com/thewh1teagle/dicta-onnx), offers a lightweight ONNX-based tool built on top of our model for adding Hebrew diacritics. We're not affiliated, but it's a cool and practical application you might find useful.
52
+
53
+ ## License
54
+
55
+ Shield: [![CC BY 4.0][cc-by-shield]][cc-by]
56
+
57
+ This work is licensed under a
58
+ [Creative Commons Attribution 4.0 International License][cc-by].
59
+
60
+ [![CC BY 4.0][cc-by-image]][cc-by]
61
+
62
+ [cc-by]: http://creativecommons.org/licenses/by/4.0/
63
+ [cc-by-image]: https://i.creativecommons.org/l/by/4.0/88x31.png
64
+ [cc-by-shield]: https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg
65
+
66
+
67
+
68
+
69
+
dictabert-large-char-menaked/config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForDiacritization"
4
+ ],
5
+ "auto_map": {
6
+ "AutoModel": "BertForDiacritization.BertForDiacritization"
7
+ },
8
+ "attention_probs_dropout_prob": 0.1,
9
+ "classifier_dropout": null,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 4096,
15
+ "layer_norm_eps": 1e-12,
16
+ "mat_lect_token": "<MAT_LECT>",
17
+ "max_position_embeddings": 2048,
18
+ "model_type": "bert",
19
+ "nikud_classes": [
20
+ "",
21
+ "<MAT_LECT>",
22
+ "\u05bc",
23
+ "\u05b0",
24
+ "\u05b1",
25
+ "\u05b2",
26
+ "\u05b3",
27
+ "\u05b4",
28
+ "\u05b5",
29
+ "\u05b6",
30
+ "\u05b7",
31
+ "\u05b8",
32
+ "\u05b9",
33
+ "\u05ba",
34
+ "\u05bb",
35
+ "\u05bc\u05b0",
36
+ "\u05bc\u05b1",
37
+ "\u05bc\u05b2",
38
+ "\u05bc\u05b3",
39
+ "\u05bc\u05b4",
40
+ "\u05bc\u05b5",
41
+ "\u05bc\u05b6",
42
+ "\u05bc\u05b7",
43
+ "\u05bc\u05b8",
44
+ "\u05bc\u05b9",
45
+ "\u05bc\u05ba",
46
+ "\u05bc\u05bb",
47
+ "\u05c7",
48
+ "\u05bc\u05c7"
49
+ ],
50
+ "num_attention_heads": 16,
51
+ "num_hidden_layers": 24,
52
+ "pad_token_id": 0,
53
+ "position_embedding_type": "absolute",
54
+ "shin_classes": [
55
+ "\u05c1",
56
+ "\u05c2"
57
+ ],
58
+ "torch_dtype": "float32",
59
+ "transformers_version": "4.42.4",
60
+ "type_vocab_size": 2,
61
+ "use_cache": true,
62
+ "vocab_size": 1024
63
+ }
dictabert-large-char-menaked/issues.txt ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ------------------------------------------------------------------------
2
+ #3 Different variations of diacritics
3
+ ------------------------------------------------------------------------
4
+
5
+ [thewh1teagle] Apr 3, 2025
6
+
7
+ I would like to get multiple variations of diacritics for sentence
8
+
9
+ For instance with 'Shalom Olam'
10
+ שלום עולם
11
+ The diacritics are 'Shlom Olam'
12
+ שְׁלוֹם עוֹלָם
13
+
14
+ I tried to implement beam search but couldn't get different variations
15
+ Thanks
16
+
17
+ thewh1teagle changed discussion title from Beam search example to Different variations of diacritics Apr 3, 2025
18
+
19
+ [johnlockejrr] Apr 3, 2025
20
+
21
+ Seems the vocalization is for peace of world not Hello world! :)
22
+
23
+ [Shaltiel, DICTA: The Israel Center for Text Analysis.org] Apr 8, 2025
24
+
25
+ Indeed, the current architecture does not allow retrieving multiple variations of diacritics for each word/the sentence. We are looking into training a model with a different architecture, but that is currently only in research.
26
+
27
+ [thewh1teagle] Apr 17, 2025
28
+
29
+ I noticed some differences in the nikud from Dicta website in terms of modernity
30
+ For instance when I hit שלום עולם in Dicta website it's Shalom Olam but in the model it's Shlom Olam, it's like the model nikud is a bit 'less modern' than Dicta website. That's why I asked for a way to get variations.
31
+
32
+ If you plan another one, I wish that you can include more modern nikud, and Shva Na and Atama'a! ; )
33
+ Thank you very much for the model. very appreciated.
34
+
35
+
dictabert-large-char-menaked/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2927643c61f408c7d5ff1652b605b322ed896fa07ded344bd508a02b76bf50e
3
+ size 1222010788