"""Gradio app that transliterates romanized Nepali text to Devanagari using a fairseq Transformer model."""
import torch
from functools import partial
# Rebind torch.load so that calls made through the torch.load attribute
# default to weights_only=False. This must happen before fairseq is imported
# and before the checkpoint is loaded further down in this file. A caller can
# still pass weights_only explicitly to override the default.
# NOTE(review): presumably needed because the fairseq checkpoint contains
# pickled non-tensor objects that newer PyTorch releases reject under their
# weights_only=True default -- confirm against the installed torch version.
# SECURITY: weights_only=False allows arbitrary code execution during
# unpickling; only load checkpoint files that come from a trusted source.
original_torch_load = torch.load
torch.load = partial(original_torch_load, weights_only=False)

# Remaining imports come after the torch.load override above.
import gradio as gr
import unicodedata
from fairseq.models.transformer import TransformerModel



# ── Load model once at startup ──────────────────────────────────────────────
# Build the translation model a single time at import; the inference helpers
# below use this module-level `model` object for every request.
# The checkpoint file and the language list are referenced relative to the
# current directory (".").
model = TransformerModel.from_pretrained(
    model_name_or_path=".",
    checkpoint_file="fairseq_3lkh_best.pt",
    data_name_or_path=".",        # looks for dict.en.txt / dict.ne.txt here
    task="translation_multi_simple_epoch",
    # Direction is labelled en -> ne; the inputs fed to the model elsewhere in
    # this file are space-separated romanized characters.
    source_lang="en",
    target_lang="ne",
    lang_dict="lang_list.txt",
    lang_pairs="en-ne",
    beam=5,                       # beam width handed to the generator
)
# NOTE(review): presumably switches the model to inference mode (e.g. disables
# dropout), as torch.nn.Module.eval() does -- confirm for the returned object.
model.eval()


# ── Inference helpers ───────────────────────────────────────────────────────

def transliterate_word(word):
    """Return the Devanagari transliteration of one romanized word.

    The word is handed to the model as space-separated characters. The
    model output is expected to be space-separated as well, so its pieces
    are glued back together and the result is NFC-normalized.
    """
    # "g h a r" style input: one character per token.
    spaced_chars = " ".join(word.strip())
    raw_output = model.translate(spaced_chars)
    # split() with no arguments already ignores leading/trailing whitespace.
    pieces = raw_output.split()
    return unicodedata.normalize("NFC", "".join(pieces))


def transliterate_sentence(sentence):
    """
    Transliterate a romanized sentence to Devanagari, word by word.

    The input is split on whitespace. For each token, leading and trailing
    non-letter characters are peeled off by ``extract_punctuation``, the
    remaining core is passed to ``transliterate_word``, and the peeled
    characters are re-attached around the result. Tokens without any
    letters are kept unchanged.

    Tokens are rejoined with single spaces, so any run of whitespace in the
    input (including line breaks) becomes one space in the output.

    Returns an empty string for ``None``, empty, or whitespace-only input.
    """
    # Treat a missing value like empty text instead of failing on .strip().
    if not sentence or not sentence.strip():
        return ""

    transliterated = []
    # split() with no arguments already discards surrounding whitespace.
    for token in sentence.split():
        prefix, core, suffix = extract_punctuation(token)
        if core:
            transliterated.append(prefix + transliterate_word(core) + suffix)
        else:
            # Punctuation-only token: nothing to send to the model.
            transliterated.append(token)

    return " ".join(transliterated)


def extract_punctuation(word):
    """
    Split a token into (prefix, core, suffix).

    ``prefix`` is the leading run of non-alphabetic characters, ``suffix``
    the trailing run, and ``core`` everything in between, e.g.
    'ghar,' -> ('', 'ghar', ','). A token with no alphabetic character at
    all comes back entirely in ``prefix`` with empty core and suffix.
    """
    start = 0
    end = len(word)

    # Advance past leading characters that are not letters.
    while start < end and not word[start].isalpha():
        start += 1

    # Retreat over trailing characters that are not letters.
    while end > start and not word[end - 1].isalpha():
        end -= 1

    return word[:start], word[start:end], word[end:]


# ── Gradio UI ───────────────────────────────────────────────────────────────

def run(sentence):
    """Gradio callback: transliterate ``sentence``, or return the failure as text.

    Any exception raised during transliteration is caught and reported as an
    ``"Error: ..."`` string so the message appears in the output box.
    """
    try:
        output = transliterate_sentence(sentence)
    except Exception as exc:
        output = f"Error: {exc!s}"
    return output


# Input and output widgets are built first so the Interface call stays compact.
input_box = gr.Textbox(
    lines=3,
    placeholder="Type romanized Nepali sentence here... e.g. ma ghar janxu",
    label="Romanized Nepali (Input)",
)
output_box = gr.Textbox(lines=3, label="Devanagari (Output)")

# Sample inputs offered in the UI; each row holds the single textbox value.
example_sentences = [
    ["ma ghar janxu"],
    ["aama ra baa ghar ma xan"],
    ["nepali basa sajilo xa"],
]

iface = gr.Interface(
    fn=run,
    inputs=input_box,
    outputs=output_box,
    title="Nepali Transliteration",
    description="Type a sentence in romanized Nepali and get the Devanagari output.",
    examples=example_sentences,
)

# Serve on all network interfaces, port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)