| from typing import Dict, List, Any | |
| import numpy as np | |
| import torch | |
| from transformers import AutoModelForSeq2SeqLM, AutoTokenizer | |
# Dialect tags prepended to the input text so the model knows the source
# dialect (these are Cyrillic letters, matching the model's training data).
SAHIDIC_TAG = "з"   # Sahidic dialect marker
BOHAIRIC_TAG = "б"  # Bohairic dialect marker

# Decoding settings for model.generate(): beam search, with per-step scores
# returned so a confidence estimate can be computed from the logits.
GENERATION_KWARGS = {
    "max_new_tokens": 128,
    "min_new_tokens": 1,
    "early_stopping": True,
    "num_beams": 5,
    "output_scores": True,           # required for the confidence computation
    "return_dict_in_generate": True, # generate() returns an object with .sequences/.scores
}

# Mapping from lowercase Coptic letters to Greek equivalents; Coptic-only
# letters with no Greek counterpart map to ASCII approximations. The model
# expects "greekified" input text (see greekify()).
COPTIC_TO_GREEK = {
    "ⲁ": "α",
    "ⲃ": "β",
    "ⲅ": "γ",
    "ⲇ": "δ",
    "ⲉ": "ε",
    "ⲋ": "ϛ",
    "ⲍ": "ζ",
    "ⲏ": "η",
    "ⲑ": "θ",
    "ⲓ": "ι",
    "ⲕ": "κ",
    "ⲗ": "λ",
    "ⲙ": "μ",
    "ⲛ": "ν",
    "ⲝ": "ξ",
    "ⲟ": "ο",
    "ⲡ": "π",
    "ⲣ": "ρ",
    "ⲥ": "σ",
    "ⲧ": "τ",
    "ⲩ": "υ",
    "ⲫ": "φ",
    "ⲭ": "χ",
    "ⲯ": "ψ",
    "ⲱ": "ω",
    "ϣ": "s",
    "ϥ": "f",
    "ϧ": "k",
    "ϩ": "h",
    "ϫ": "j",
    "ϭ": "c",
    "ϯ": "t",
}
def greekify(coptic_text: str) -> str:
    """Lowercase *coptic_text* and map each Coptic letter to its Greek/ASCII equivalent.

    Characters not present in COPTIC_TO_GREEK pass through unchanged
    (after lowercasing).
    """
    lowered = (ch.lower() for ch in coptic_text)
    return "".join(COPTIC_TO_GREEK.get(ch, ch) for ch in lowered)
class EndpointHandler:
    """Inference-endpoint handler translating Coptic text to English."""

    def __init__(self, path: str = ""):
        # Load the fine-tuned seq2seq model and its tokenizer from `path`.
        self.model = AutoModelForSeq2SeqLM.from_pretrained(path)
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Translate Coptic text to English.
        data args:
            inputs (:obj:`str`): Coptic text to translate
            from_bohairic (:obj:`bool`, optional): Input is Bohairic dialect (default: Sahidic)
            output_confidence (:obj:`bool`, optional): Include confidence score
        Return:
            A :obj:`list` of :obj:`dict`: translation results
        Raises:
            ValueError: if ``inputs`` is missing or not a string.
        """
        inputs = data.pop("inputs", None)
        if not isinstance(inputs, str):
            # Fail loudly with a clear message instead of the opaque
            # AttributeError that dict.lower() would raise.
            raise ValueError("`inputs` must be a string of Coptic text")
        from_bohairic = data.pop("from_bohairic", False)
        output_confidence = data.pop("output_confidence", False)

        # The model was trained on greekified text, prefixed with a dialect tag.
        text = greekify(inputs.lower())
        tag = BOHAIRIC_TAG if from_bohairic else SAHIDIC_TAG
        text = f"{tag} {text}"

        input_tensors = self.tokenizer.encode(text, return_tensors="pt")
        outputs = self.model.generate(
            # Truncate to the model's maximum input length.
            input_tensors[:, : self.tokenizer.model_max_length],
            **GENERATION_KWARGS,
        )
        translated_text = self.tokenizer.decode(
            outputs.sequences[0], skip_special_tokens=True
        )
        if not output_confidence:
            return [{"translation": translated_text}]

        # Per-step confidence: maximum softmax probability of each generation
        # step. NOTE(review): with num_beams > 1 each score tensor stacks all
        # beams, so .max() is taken across every beam rather than only the
        # selected hypothesis — confirm this is the intended definition.
        confidences = [
            torch.softmax(score, dim=-1).max().item() for score in outputs.scores
        ]
        num_words = len(translated_text.split())
        if num_words == 0 or not confidences:
            # Empty translation: report zero confidence instead of raising
            # ZeroDivisionError on the division below.
            return [{"translation": translated_text, "confidence": 0.0}]

        # Length-normalised product of per-step probabilities, rescaled per
        # word: exp(sum(log p_i) / num_words).
        scaled_probability = np.exp(sum(np.log(confidences)) / num_words)
        return [{"translation": translated_text, "confidence": scaled_probability}]