from typing import Dict, List, Any

import numpy as np
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Prompt tags prepended to the input so the model knows which Coptic
# dialect to produce.
SAHIDIC_TAG = "з"
BOHAIRIC_TAG = "б"

# Beam-search decoding settings shared by every translation request.
GENERATION_KWARGS = dict(
    max_new_tokens=128,
    min_new_tokens=1,
    early_stopping=True,
    num_beams=5,
    output_scores=True,  # per-step scores are needed for the confidence estimate
    return_dict_in_generate=True,
)
# Greek letters mapped to their corresponding Coptic letters, plus a few
# Latin letters standing in for the Coptic-only characters.
_GREEK = "αβγδεϛζηθικλμνξοπρστυφχψω"
_COPTIC = "ⲁⲃⲅⲇⲉⲋⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱ"
_LATIN = "sfkhjct"
_LATIN_COPTIC = "ϣϥϧϩϫϭϯ"

GREEK_TO_COPTIC = {
    **dict(zip(_GREEK, _COPTIC)),
    **dict(zip(_LATIN, _LATIN_COPTIC)),
}


def degreekify(greek_text: str) -> str:
    """Transliterate *greek_text* into the Coptic alphabet.

    Each character is lowercased first; characters without an entry in
    GREEK_TO_COPTIC are kept as their lowercased selves.
    """
    return "".join(
        GREEK_TO_COPTIC.get(low, low) for low in (ch.lower() for ch in greek_text)
    )
class EndpointHandler:
    """Inference-endpoint handler translating English text to Coptic."""

    def __init__(self, path: str = ""):
        # Load the seq2seq model and its tokenizer from the repo/path
        # supplied by the hosting environment.
        self.model = AutoModelForSeq2SeqLM.from_pretrained(path)
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Translate English text to Coptic.

        data args:
            inputs (:obj:`str`): English text to translate
            to_bohairic (:obj:`bool`, optional): Use Bohairic dialect (default: Sahidic)
            output_confidence (:obj:`bool`, optional): Include confidence score
        Return:
            A :obj:`list` of :obj:`dict`: translation results
        """
        inputs = data.pop("inputs", data)
        to_bohairic = data.pop("to_bohairic", False)
        output_confidence = data.pop("output_confidence", False)

        # The model selects the output dialect from a leading tag token.
        tag = BOHAIRIC_TAG if to_bohairic else SAHIDIC_TAG
        text = f"{tag} {inputs}"

        input_tensors = self.tokenizer.encode(text, return_tensors="pt")
        # Truncate to the tokenizer's max length so generation never sees
        # more positions than the model supports.
        outputs = self.model.generate(
            input_tensors[:, : self.tokenizer.model_max_length],
            **GENERATION_KWARGS,
        )
        translated_text = self.tokenizer.decode(
            outputs.sequences[0], skip_special_tokens=True
        )
        # The model emits Greek/Latin stand-in characters; map them to Coptic.
        translated_text = degreekify(translated_text)

        if not output_confidence:
            return [{"translation": translated_text}]

        # Per-step confidence: max softmax probability at each decoding step.
        # NOTE(review): under beam search these scores are a heuristic, not
        # exact token probabilities.
        confidences = [
            torch.softmax(score, dim=-1).max().item() for score in outputs.scores
        ]
        # Geometric-mean-style score normalized per output word. Clamp the
        # divisor to 1: a decode of only special tokens yields an empty
        # string, and the original division by len(...) raised
        # ZeroDivisionError in that case.
        num_words = max(len(translated_text.split()), 1)
        scaled_probability = np.exp(sum(np.log(confidences)) / num_words)
        return [{"translation": translated_text, "confidence": scaled_probability}]