File size: 2,868 Bytes
de0dd09
 
 
 
6eedef4
de0dd09
 
 
 
6eedef4
 
 
 
 
 
 
 
de0dd09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6eedef4
de0dd09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from typing import Dict, List, Any

import numpy as np
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Dialect tags prepended to the model input so the checkpoint knows which
# Coptic dialect it is translating from.
# NOTE(review): these are Cyrillic letters ("з"/"б"), presumably chosen to be
# outside the Coptic/Greek alphabets — confirm against the model's training
# preprocessing before changing them.
SAHIDIC_TAG = "з"
BOHAIRIC_TAG = "б"

# Keyword arguments for model.generate(): beam search with 5 beams, capped at
# 128 new tokens. Scores and the dict-style return are requested so that a
# per-step confidence can be computed from outputs.scores.
GENERATION_KWARGS = {
    "max_new_tokens": 128,
    "min_new_tokens": 1,
    "early_stopping": True,
    "num_beams": 5,
    "output_scores": True,
    "return_dict_in_generate": True,
}

# Single-character transliteration table. Coptic letters that share a Greek
# counterpart map to that Greek letter; the remaining seven Coptic-specific
# letters (ϣ ϥ ϧ ϩ ϫ ϭ ϯ) map to Latin ASCII stand-ins.
COPTIC_TO_GREEK = dict(
    zip(
        "ⲁⲃⲅⲇⲉⲋⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱϣϥϧϩϫϭϯ",
        "αβγδεϛζηθικλμνξοπρστυφχψωsfkhjct",
    )
)


def greekify(coptic_text: str) -> str:
    """Transliterate *coptic_text* into Greek/Latin characters.

    Each character is lowercased individually and looked up in
    COPTIC_TO_GREEK; characters not in the table (spaces, punctuation,
    Latin letters, digits) pass through unchanged.
    """
    lowered = (ch.lower() for ch in coptic_text)
    return "".join(COPTIC_TO_GREEK.get(ch, ch) for ch in lowered)


class EndpointHandler:
    """HuggingFace Inference Endpoints handler: Coptic → English translation."""

    def __init__(self, path: str = ""):
        # Load the fine-tuned seq2seq checkpoint and its tokenizer from `path`
        # (the repository/directory provided by the endpoint runtime).
        self.model = AutoModelForSeq2SeqLM.from_pretrained(path)
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Translate Coptic text to English.

        data args:
            inputs (:obj:`str`): Coptic text to translate
            from_bohairic (:obj:`bool`, optional): Input is Bohairic dialect (default: Sahidic)
            output_confidence (:obj:`bool`, optional): Include confidence score

        Return:
            A :obj:`list` of :obj:`dict`: translation results
        """
        inputs = data.pop("inputs", data)
        from_bohairic = data.pop("from_bohairic", False)
        output_confidence = data.pop("output_confidence", False)

        # Normalize the input: lowercase, then transliterate Coptic letters
        # into the Greek/Latin alphabet the model was trained on.
        text = greekify(inputs.lower())

        # Prepend the dialect tag the model expects.
        tag = BOHAIRIC_TAG if from_bohairic else SAHIDIC_TAG
        text = f"{tag} {text}"

        input_tensors = self.tokenizer.encode(text, return_tensors="pt")
        # Inference only: disable autograd to avoid building a graph and
        # wasting memory during generation.
        with torch.no_grad():
            outputs = self.model.generate(
                # Truncate to the tokenizer's maximum supported input length.
                input_tensors[:, : self.tokenizer.model_max_length],
                **GENERATION_KWARGS,
            )

        translated_text = self.tokenizer.decode(
            outputs.sequences[0], skip_special_tokens=True
        )

        if not output_confidence:
            return [{"translation": translated_text}]

        # Per-step confidence: the highest softmax probability at each
        # generation step. NOTE(review): the max is taken over all beams as
        # well as the vocabulary — a rough heuristic, kept as-is.
        confidences = [
            torch.softmax(score, dim=-1).max().item() for score in outputs.scores
        ]
        # Geometric-mean-style score scaled by the number of output *words*
        # rather than tokens. Guard against an empty/whitespace-only decoded
        # string, which previously raised ZeroDivisionError (num_words == 0).
        num_words = max(len(translated_text.split()), 1)
        scaled_probability = np.exp(sum(np.log(confidences)) / num_words)

        return [{"translation": translated_text, "confidence": scaled_probability}]