File size: 2,868 Bytes
de0dd09
 
 
 
6eedef4
de0dd09
 
 
 
6eedef4
 
 
 
 
 
 
 
de0dd09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6eedef4
de0dd09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from typing import Dict, List, Any

import numpy as np
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Dialect tags prepended to the model input so the checkpoint knows which
# Coptic dialect it is translating from.
# NOTE(review): these are Cyrillic letters ("з"/"б"), presumably chosen to be
# outside the Coptic/Greek alphabets — confirm against the model's training
# preprocessing before changing them.
SAHIDIC_TAG = "з"
BOHAIRIC_TAG = "б"

# Keyword arguments for model.generate(): beam search with 5 beams, capped at
# 128 new tokens. Scores and the dict-style return are requested so that a
# per-step confidence can be computed from outputs.scores.
GENERATION_KWARGS = {
    "max_new_tokens": 128,
    "min_new_tokens": 1,
    "early_stopping": True,
    "num_beams": 5,
    "output_scores": True,
    "return_dict_in_generate": True,
}

# Single-character transliteration table. Coptic letters that share a Greek
# counterpart map to that Greek letter; the remaining seven Coptic-specific
# letters (ϣ ϥ ϧ ϩ ϫ ϭ ϯ) map to Latin ASCII stand-ins.
COPTIC_TO_GREEK = dict(
    zip(
        "ⲁⲃⲅⲇⲉⲋⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱϣϥϧϩϫϭϯ",
        "αβγδεϛζηθικλμνξοπρστυφχψωsfkhjct",
    )
)


def greekify(coptic_text: str) -> str:
    """Transliterate *coptic_text* into Greek/Latin characters.

    Each character is lowercased individually and looked up in
    COPTIC_TO_GREEK; characters not in the table (spaces, punctuation,
    Latin letters, digits) pass through unchanged.
    """
    lowered = (ch.lower() for ch in coptic_text)
    return "".join(COPTIC_TO_GREEK.get(ch, ch) for ch in lowered)


class EndpointHandler:
    """HuggingFace Inference Endpoints handler: Coptic → English translation."""

    def __init__(self, path: str = ""):
        # Load the fine-tuned seq2seq checkpoint and its tokenizer from `path`
        # (the repository/directory provided by the endpoint runtime).
        self.model = AutoModelForSeq2SeqLM.from_pretrained(path)
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Translate Coptic text to English.

        data args:
            inputs (:obj:`str`): Coptic text to translate
            from_bohairic (:obj:`bool`, optional): Input is Bohairic dialect (default: Sahidic)
            output_confidence (:obj:`bool`, optional): Include confidence score

        Return:
            A :obj:`list` of :obj:`dict`: translation results
        """
        inputs = data.pop("inputs", data)
        from_bohairic = data.pop("from_bohairic", False)
        output_confidence = data.pop("output_confidence", False)

        # Normalize the input: lowercase, then transliterate Coptic letters
        # into the Greek/Latin alphabet the model was trained on.
        text = greekify(inputs.lower())

        # Prepend the dialect tag the model expects.
        tag = BOHAIRIC_TAG if from_bohairic else SAHIDIC_TAG
        text = f"{tag} {text}"

        input_tensors = self.tokenizer.encode(text, return_tensors="pt")
        # Inference only: disable autograd to avoid building a graph and
        # wasting memory during generation.
        with torch.no_grad():
            outputs = self.model.generate(
                # Truncate to the tokenizer's maximum supported input length.
                input_tensors[:, : self.tokenizer.model_max_length],
                **GENERATION_KWARGS,
            )

        translated_text = self.tokenizer.decode(
            outputs.sequences[0], skip_special_tokens=True
        )

        if not output_confidence:
            return [{"translation": translated_text}]

        # Per-step confidence: the highest softmax probability at each
        # generation step. NOTE(review): the max is taken over all beams as
        # well as the vocabulary — a rough heuristic, kept as-is.
        confidences = [
            torch.softmax(score, dim=-1).max().item() for score in outputs.scores
        ]
        # Geometric-mean-style score scaled by the number of output *words*
        # rather than tokens. Guard against an empty/whitespace-only decoded
        # string, which previously raised ZeroDivisionError (num_words == 0).
        num_words = max(len(translated_text.split()), 1)
        scaled_probability = np.exp(sum(np.log(confidences)) / num_words)

        return [{"translation": translated_text, "confidence": scaled_probability}]