Instructions to use guymorlan/levanti_diacritics2translit with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use guymorlan/levanti_diacritics2translit with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("token-classification", model="guymorlan/levanti_diacritics2translit")
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification
tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_diacritics2translit")
model = AutoModelForTokenClassification.from_pretrained("guymorlan/levanti_diacritics2translit")
- Notebooks
- Google Colab
- Kaggle
Update README.md
Browse files
README.md
CHANGED
|
@@ -1,3 +1,133 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: cc-by-nc-4.0
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: cc-by-nc-4.0
|
| 3 |
+
language:
|
| 4 |
+
- ar
|
| 5 |
+
pipeline_tag: token-classification
|
| 6 |
+
datasets:
|
| 7 |
+
- guymorlan/levanti
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# Levanti Transliterator
|
| 11 |
+
|
| 12 |
+
This model converts diacritics in Palestinian colloquial Arabic to their estimated pronunciation via Hebrew vowels. It can be used to transliterate diacritized Palestinian Arabic text into Hebrew or English. The model is trained on a special subset of the Levanti dataset (to be released later).
|
| 13 |
+
The model is fine-tuned from Google's [CANINE-s](https://huggingface.co/google/canine-s) character-level language model, with a token classification head.
|
| 14 |
+
Each token (letter) of the input is classified into one of 7 classes: 'O' if not a diacritic, or one of 6 Hebrew vowels (see `model.config.id2label`).
|
| 15 |
+
|
| 16 |
+
# Example Usage
|
| 17 |
+
|
| 18 |
+
```python
|
| 19 |
+
from transformers import CanineForTokenClassification, AutoTokenizer
|
| 20 |
+
import torch
|
| 21 |
+
|
| 22 |
+
model = CanineForTokenClassification.from_pretrained("guymorlan/levanti_diacritics2translit")
|
| 23 |
+
tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_diacritics2translit")
|
| 24 |
+
|
| 25 |
+
def diacritics2hebrew_vowels(text, model, tokenizer):
    """Replace the diacritics in ``text`` with the model's predicted labels.

    Args:
        text: Input string. Each character is paired, in order, with one
            prediction from the model.
        model: Token-classification model. It is called with the tokenizer
            output as keyword arguments, must return an object with a
            ``logits`` attribute, and must expose ``config.label2id`` and
            ``config.id2label``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            to build the model inputs.

    Returns:
        A string in which every character whose predicted label is not ``"O"``
        is replaced by that label's string; other characters are kept as-is.
    """
    tokens = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        pred = model(**tokens)
    pred = pred.logits.argmax(-1).tolist()

    pred = pred[0][1:-1]  # remove CLS and SEP

    # Both lookups are loop-invariant, so resolve them once.
    outside_id = model.config.label2id["O"]
    id2label = model.config.id2label

    # zip() pairs predictions with characters positionally and stops at the
    # shorter of the two sequences.
    return "".join(
        c if p == outside_id else id2label[p] for p, c in zip(pred, text)
    )
|
| 40 |
+
|
| 41 |
+
# Convert Arabic diacritics to Hebrew diacritics (Tsere, Holam, Patah, Shva, Kubutz, Hiriq)
|
| 42 |
+
text = "لَازِم نِعْطِي رَشَّات وِقَائِيِّة لِلشَّجَر "
|
| 43 |
+
heb_vowels = diacritics2hebrew_vowels(text, model, tokenizer)
|
| 44 |
+
heb_vowels
|
| 45 |
+
```
|
| 46 |
+
```
|
| 47 |
+
Out[1]: 'لַازֵم نִعְطִي رַشַّات وִقַائִيֵّة لִلشַّجַر '
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
```python
|
| 51 |
+
# Arabic character -> Hebrew rendering used everywhere except at a word end.
arabic_to_hebrew = {
    # regular letters
    "ا": "א", "أ": "א", "إ": "א", "ء": "א", "ئ": "א", "ؤ": "א",
    "آ": "אא", "ى": "א", "ب": "ב", "ت": "ת", "ث": "ת'", "ج": "ג'",
    "ح": "ח", "خ": "ח'", "د": "ד", "ذ": "ד'", "ر": "ר", "ز": "ז",
    "س": "ס", "ش": "ש", "ص": "צ", "ض": "צ'", "ط": "ט", "ظ": "ט'",
    "ع": "ע", "غ": "ע'", "ف": "פ", "ق": "ק", "ك": "כ", "ل": "ל",
    "م": "מ", "ن": "נ", "ه": "ה", "و": "ו", "ي": "י", "ة": "ה",
    # special characters
    "،": ",", "َ": "ַ", "ُ": "ֻ", "ِ": "ִ",
}

# Hebrew word-final forms; these take precedence over arabic_to_hebrew when
# the letter is the last character of a word.
final_letters = {
    "ن": "ן", "م": "ם", "ص": "ץ", "ض": "ץ'", "ف": "ף",
}

# Characters that end a word when deciding whether a final form applies.
# Includes whitespace and common Latin/Arabic punctuation, not just the
# space, full stop and Arabic comma.
_WORD_BOUNDARIES = frozenset({
    " ", "\t", "\n", "\r", ".", ",", "،", "\u061b", "\u061f",
    "!", "?", ":", ";",
})


def to_taatik(arabic):
    """Transliterate Arabic letters to Hebrew letters (Taatik).

    Args:
        arabic: String to transliterate, one character at a time.

    Returns:
        A string where each character found in ``arabic_to_hebrew`` is
        replaced by its mapping, a character in ``final_letters`` that is
        last in the input or directly followed by a word boundary is replaced
        by its final form, and any other character is passed through
        unchanged.
    """
    taatik = []
    last_index = len(arabic) - 1
    for index, letter in enumerate(arabic):
        at_word_end = (
            index == last_index or arabic[index + 1] in _WORD_BOUNDARIES
        )
        if at_word_end and letter in final_letters:
            taatik.append(final_letters[letter])
        else:
            # Unmapped characters (including existing Hebrew vowel marks)
            # pass through as-is.
            taatik.append(arabic_to_hebrew.get(letter, letter))
    return "".join(taatik)
|
| 80 |
+
|
| 81 |
+
# Convert consonants and create the full Hebrew transliteration (Taatik)
|
| 82 |
+
to_taatik(heb_vowels)
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
```
|
| 86 |
+
Out[2]: "לַאזֵם נִעְטִי רַשַّאת וִקַאאִיֵّה לִלשַّג'ַר "
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
```python
|
| 90 |
+
# Arabic letters, Arabic/Hebrew vowel marks and punctuation -> Latin rendering.
# The value "SHADDA" is a sentinel handled specially by to_translit().
arabic_to_english = {
    "ا": "a", "أ": "a", "إ": "a", "ء": "a", "ئ": "a", "ؤ": "a",
    "آ": "aa", "ى": "a", "ب": "b", "ت": "t", "ث": "th", "ج": "j",
    "ح": "h", "خ": "kh", "د": "d", "ذ": "dh", "ر": "r", "ز": "z",
    "س": "s", "ش": "sh", "ص": "s", "ض": "d", "ط": "t", "ظ": "z",
    "ع": "a", "غ": "gh", "ف": "f", "ق": "q", "ك": "k", "ل": "l",
    "م": "m", "ن": "n", "ه": "h", "و": "w", "ي": "y", "ة": "h",
    "َ": "a", "ُ": "u", "ِ": "i",
    "،": ",",
    "ֹ": "o",  # holam
    "ַ": "a",  # patah
    "ִ": "i",  # hiriq
    "ְ": "",  # shva
    "ֻ": "u",  # kubutz
    'ֵ': "e",
    "ّ": "SHADDA",  # shadda
}

# Marks that may sit between a consonant and its shadda; when one of these
# directly precedes a shadda, the shadda is applied to the entry before it.
# Holam and shva are included so they are skipped like the other vowel marks.
vowels = ["،", ",", "َ", "ַ", "ُ", "ֻ", "ِ", "ִ", 'ֵ', "ֹ", "ְ"]


def to_translit(arabic):
    """Transliterate Arabic text (with Arabic or Hebrew vowel marks) to Latin.

    Args:
        arabic: String to transliterate, one character at a time.

    Returns:
        A string where each character found in ``arabic_to_english`` is
        replaced by its mapping and any other character is passed through.
        A shadda emits nothing itself; instead the output of the entry it
        applies to is upper-cased. That entry is the previous one, or the one
        before it when the previous character is in ``vowels``. A shadda with
        no such entry (e.g. at the start of the input) is ignored.
    """
    translit = []  # [source_char, rendered_text] pairs, in input order
    for letter in arabic:
        if letter not in arabic_to_english:
            translit.append([letter, letter])
            continue

        rendered = arabic_to_english[letter]
        if rendered != "SHADDA":
            translit.append([letter, rendered])
            continue

        # Locate the entry the shadda applies to, skipping one vowel mark.
        target = len(translit) - 1
        if target >= 0 and translit[target][0] in vowels:
            target -= 1
        # Guard against a shadda with nothing (or only a vowel) before it,
        # which previously raised IndexError.
        if target >= 0:
            translit[target][1] = translit[target][1].upper()

    return "".join([x[1] for x in translit])
|
| 127 |
+
|
| 128 |
+
# Convert letters to their Latin representation (English transliteration)
|
| 129 |
+
to_translit(heb_vowels)
|
| 130 |
+
```
|
| 131 |
+
```
|
| 132 |
+
Out[3]: 'laazem niatiy raSHaat wiqaaaiYeh lilSHajar '
|
| 133 |
+
```
|