from typing import Dict, List, Any
import numpy as np
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Dialect tag tokens prepended to the source text so the model selects the
# target Coptic dialect (presumably single-token markers the model was
# fine-tuned with — TODO confirm against training data).
SAHIDIC_TAG = "з"
BOHAIRIC_TAG = "б"

# Decoding settings shared by every generate() call.
GENERATION_KWARGS = {
    "max_new_tokens": 128,  # hard cap on generated length
    "min_new_tokens": 1,  # force at least one generated token
    "early_stopping": True,  # stop beam search once enough beams finish
    "num_beams": 5,
    "output_scores": True,  # per-step logits, needed for the confidence score
    "return_dict_in_generate": True,  # expose .sequences / .scores on the output
}
# Maps lowercase Greek letters (plus a few ASCII stand-ins for the Coptic-only
# letters) to their Coptic counterparts. Characters absent from this table are
# left as-is by degreekify().
GREEK_TO_COPTIC = {
    "α": "ⲁ",
    "β": "ⲃ",
    "γ": "ⲅ",
    "δ": "ⲇ",
    "ε": "ⲉ",
    "ϛ": "ⲋ",
    "ζ": "ⲍ",
    "η": "ⲏ",
    "θ": "ⲑ",
    "ι": "ⲓ",
    "κ": "ⲕ",
    "λ": "ⲗ",
    "μ": "ⲙ",
    "ν": "ⲛ",
    "ξ": "ⲝ",
    "ο": "ⲟ",
    "π": "ⲡ",
    "ρ": "ⲣ",
    "σ": "ⲥ",
    "ς": "ⲥ",  # fix: final sigma previously passed through untranslated
    "τ": "ⲧ",
    "υ": "ⲩ",
    "φ": "ⲫ",
    "χ": "ⲭ",
    "ψ": "ⲯ",
    "ω": "ⲱ",
    "s": "ϣ",
    "f": "ϥ",
    "k": "ϧ",
    "h": "ϩ",
    "j": "ϫ",
    "c": "ϭ",
    "t": "ϯ",
}


def degreekify(greek_text: str) -> str:
    """Convert Greek-alphabet model output to the Coptic alphabet.

    Each character is lowercased individually, then mapped through
    GREEK_TO_COPTIC; characters without a mapping are kept (lowercased)
    unchanged. Lowercasing is done per character (not on the whole string)
    to keep the mapping context-free.
    """
    chars = []
    for ch in greek_text:
        low = ch.lower()
        chars.append(GREEK_TO_COPTIC.get(low, low))
    return "".join(chars)
class EndpointHandler:
    """Hugging Face Inference Endpoint handler for English→Coptic translation.

    Loads a fine-tuned seq2seq model plus tokenizer and exposes the standard
    `__call__(data)` endpoint interface.
    """

    def __init__(self, path: str = ""):
        # `path` is the model repository/directory the endpoint mounts.
        self.model = AutoModelForSeq2SeqLM.from_pretrained(path)
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Translate English text to Coptic.
        data args:
            inputs (:obj:`str`): English text to translate
            to_bohairic (:obj:`bool`, optional): Use Bohairic dialect (default: Sahidic)
            output_confidence (:obj:`bool`, optional): Include confidence score
        Return:
            A :obj:`list` of :obj:`dict`: translation results
        """
        inputs = data.pop("inputs", data)
        to_bohairic = data.pop("to_bohairic", False)
        output_confidence = data.pop("output_confidence", False)

        # Prepend the dialect tag the model was trained with.
        tag = BOHAIRIC_TAG if to_bohairic else SAHIDIC_TAG
        text = f"{tag} {inputs}"

        input_tensors = self.tokenizer.encode(text, return_tensors="pt")
        # Inference only: no_grad skips autograd bookkeeping during generation.
        with torch.no_grad():
            outputs = self.model.generate(
                # Truncate to the tokenizer's maximum supported length.
                input_tensors[:, : self.tokenizer.model_max_length],
                **GENERATION_KWARGS,
            )
        translated_text = self.tokenizer.decode(
            outputs.sequences[0], skip_special_tokens=True
        )
        translated_text = degreekify(translated_text)

        if not output_confidence:
            return [{"translation": translated_text}]

        # Per-step confidence: maximum softmax probability at each generation
        # step (a proxy for the generated token's probability).
        confidences = [
            torch.softmax(score, dim=-1).max().item() for score in outputs.scores
        ]
        # Length-normalized product of step probabilities, scaled by word
        # count. Guard against an empty translation (zero words) which would
        # otherwise raise ZeroDivisionError.
        num_words = max(len(translated_text.split()), 1)
        scaled_probability = np.exp(sum(np.log(confidences)) / num_words)
        return [{"translation": translated_text, "confidence": scaled_probability}]