from typing import Any, Dict, List

import numpy as np
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Dialect tags prepended to the source text so the model knows which
# Coptic dialect to generate.
SAHIDIC_TAG = "з"
BOHAIRIC_TAG = "б"

# Beam-search generation settings. output_scores + return_dict_in_generate
# are required so per-step token scores are available for the optional
# confidence estimate in __call__.
GENERATION_KWARGS = {
    "max_new_tokens": 128,
    "min_new_tokens": 1,
    "early_stopping": True,
    "num_beams": 5,
    "output_scores": True,
    "return_dict_in_generate": True,
}

# The model emits Greek letters (plus a few Latin stand-ins for sounds
# Greek lacks, e.g. "s" -> shai); map each to its Coptic counterpart.
GREEK_TO_COPTIC = {
    "α": "ⲁ",
    "β": "ⲃ",
    "γ": "ⲅ",
    "δ": "ⲇ",
    "ε": "ⲉ",
    "ϛ": "ⲋ",
    "ζ": "ⲍ",
    "η": "ⲏ",
    "θ": "ⲑ",
    "ι": "ⲓ",
    "κ": "ⲕ",
    "λ": "ⲗ",
    "μ": "ⲙ",
    "ν": "ⲛ",
    "ξ": "ⲝ",
    "ο": "ⲟ",
    "π": "ⲡ",
    "ρ": "ⲣ",
    "σ": "ⲥ",
    "τ": "ⲧ",
    "υ": "ⲩ",
    "φ": "ⲫ",
    "χ": "ⲭ",
    "ψ": "ⲯ",
    "ω": "ⲱ",
    "s": "ϣ",
    "f": "ϥ",
    "k": "ϧ",
    "h": "ϩ",
    "j": "ϫ",
    "c": "ϭ",
    "t": "ϯ",
}


def degreekify(greek_text: str) -> str:
    """Convert model output written in Greek script to Coptic script.

    Each character is lowercased, then mapped through GREEK_TO_COPTIC;
    characters without a mapping pass through lowercased unchanged.
    """
    return "".join(GREEK_TO_COPTIC.get(c.lower(), c.lower()) for c in greek_text)


class EndpointHandler:
    """Inference-endpoint handler that translates English text to Coptic."""

    def __init__(self, path: str = ""):
        # path: model repository / local directory holding both the
        # seq2seq weights and the tokenizer.
        self.model = AutoModelForSeq2SeqLM.from_pretrained(path)
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Translate English text to Coptic.

        data args:
            inputs (:obj:`str`): English text to translate
            to_bohairic (:obj:`bool`, optional): Use Bohairic dialect (default: Sahidic)
            output_confidence (:obj:`bool`, optional): Include confidence score
        Return:
            A :obj:`list` of :obj:`dict`: translation results
        """
        inputs = data.pop("inputs", data)
        to_bohairic = data.pop("to_bohairic", False)
        output_confidence = data.pop("output_confidence", False)

        # Prefix the dialect tag the model was trained with.
        tag = BOHAIRIC_TAG if to_bohairic else SAHIDIC_TAG
        text = f"{tag} {inputs}"

        input_tensors = self.tokenizer.encode(text, return_tensors="pt")
        # Inference only: no_grad avoids building the autograd graph.
        # Input is truncated to the tokenizer's maximum model length.
        with torch.no_grad():
            outputs = self.model.generate(
                input_tensors[:, : self.tokenizer.model_max_length],
                **GENERATION_KWARGS,
            )

        translated_text = self.tokenizer.decode(
            outputs.sequences[0], skip_special_tokens=True
        )
        translated_text = degreekify(translated_text)

        if not output_confidence:
            return [{"translation": translated_text}]

        # Per-step max softmax probability of the generated tokens.
        confidences = [
            torch.softmax(score, dim=-1).max().item() for score in outputs.scores
        ]
        # Geometric-mean-style score, normalized per *word* of the final
        # text rather than per token. max(..., 1) guards against an empty
        # translation, which would otherwise divide by zero. Cast to a
        # plain float so the response stays JSON-serializable.
        num_words = max(len(translated_text.split()), 1)
        scaled_probability = float(np.exp(sum(np.log(confidences)) / num_words))
        return [{"translation": translated_text, "confidence": scaled_probability}]