# Spaces: Running  (Hugging Face Spaces status line captured in the page scrape)
#%%
from transformers import RobertaForTokenClassification, AutoTokenizer
# Load the Levanti Arabic diacritization model: a RoBERTa token classifier
# that scores five diacritic labels per token (see label2diacritic below).
# NOTE: from_pretrained downloads weights on first run — network side effect.
model = RobertaForTokenClassification.from_pretrained("guymorlan/levanti_arabic2diacritics")
tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_arabic2diacritics")
#%%
# Arabic diacritic characters keyed by the model's output label index.
# Order: shadda, fatha, kasra, damma, sukun.  Index 0 (shadda) is applied
# last in arabic2diacritics so it always follows any vowel mark.
_DIACRITIC_CHARS = ("ّ", "َ", "ِ", "ُ", "ْ")
label2diacritic = dict(enumerate(_DIACRITIC_CHARS))
def arabic2diacritics(text, model, tokenizer):
    """Diacritize *text* with a multi-label token-classification model.

    Each position gets five independent sigmoid scores (threshold 0.5),
    one per entry of ``label2diacritic``.  Vowel marks (labels 1-4) are
    emitted right after their character; shadda (label 0) is appended
    last so it follows any vowel mark on the same character.

    NOTE(review): pairs per-token predictions 1:1 with the characters of
    *text*, which assumes a character-level tokenizer — confirm against
    the model card.
    """
    encoding = tokenizer(text, return_tensors="pt")
    logits = model(**encoding).logits
    # Boolean decisions per position/label; [1:-1] drops the BOS/EOS slots.
    decisions = (logits.sigmoid() > 0.5)[0][1:-1]
    pieces = []
    for char, row in zip(text, decisions):
        pieces.append(char)
        pieces.extend(label2diacritic[lab] for lab in (1, 2, 3, 4) if row[lab])
        # Shadda is checked last so it lands after the vowel mark.
        if row[0]:
            pieces.append(label2diacritic[0])
    return "".join(pieces)
def diacritize(text):
    """Diacritize *text* using the module-level model and tokenizer."""
    result = arabic2diacritics(text, model, tokenizer)
    return result
def diacritize_if_not_already(text):
    """Return *text* diacritized, unless it already carries diacritics.

    A single diacritic character anywhere in *text* is taken as evidence
    the text is already diacritized, and it is returned unchanged without
    invoking the model.
    """
    # Build the membership set once, instead of re-materializing
    # label2diacritic.values() for every character of text.
    diacritics = set(label2diacritic.values())
    if any(c in diacritics for c in text):
        return text
    # Delegate to the sibling wrapper so both entry points share one path.
    return diacritize(text)
#%%
# text = "بديش اروح عالمدرسة بكرا"
# arabic2diacritics(text, model, tokenizer)
# %%