YeMinNaing committed on
Commit
4e17f2c
·
verified ·
1 Parent(s): ece6197

dvb app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -0
app.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import spaces
3
+ import gradio as gr
4
+ from sacremoses import MosesPunctNormalizer
5
+ from stopes.pipelines.monolingual.utils.sentence_split import get_split_algo
6
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
+ from flores import code_mapping
8
+ import platform
9
+ import torch
10
+ import nltk
11
+ from functools import lru_cache
12
+
13
# Fetch the NLTK "punkt_tab" resource at import time (presumably required by
# the sentence splitters used further down -- confirm against stopes).
nltk.download("punkt_tab")

# Language names that are offered as a source but never as a target.
REMOVED_TARGET_LANGUAGES = {"Ligurian", "Lombard", "Sicilian"}

# macOS runs on CPU; every other platform is assumed to have CUDA.
device = "cpu" if platform.system() == "Darwin" else "cuda"
MODEL_NAME = "facebook/nllb-200-3.3B"

# Re-key the mapping in alphabetical order of language name so the dropdowns
# are sorted. Keys are unique, so plain tuple ordering equals ordering by key.
code_mapping = dict(sorted(code_mapping.items()))
flores_codes = list(code_mapping)
target_languages = [
    name for name in flores_codes if name not in REMOVED_TARGET_LANGUAGES
]
20
+
21
def load_model():
    """Load the seq2seq checkpoint named by ``MODEL_NAME`` onto ``device``.

    Returns:
        The model instance after it has been moved to ``device``.
    """
    loaded = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    loaded = loaded.to(device)
    print(f"Model loaded in {device}")
    return loaded
25
+
26
# Module-level singletons shared by every translation request.
model = load_model()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Punctuation is normalised with English rules whatever the source language.
punct_normalizer = MosesPunctNormalizer(lang="en")
29
+
30
@lru_cache(maxsize=202)
def get_language_specific_sentence_splitter(language_code):
    """Return the sentence splitter for ``language_code`` (memoised).

    Only the first three characters of the code are handed to
    ``get_split_algo`` (presumably the ISO 639-3 part of a FLORES code such
    as ``eng_Latn`` -- confirm against ``flores.code_mapping``).
    """
    return get_split_algo(language_code[:3], "default")
35
+
36
@lru_cache(maxsize=100)
def translate(text: str, src_lang: str, tgt_lang: str):
    """Validate the language selection, then delegate to ``_translate``.

    Results are memoised on the ``(text, src_lang, tgt_lang)`` triple.

    Raises:
        gr.Error: if either language is empty; the source language is
            checked first.
    """
    for role, selection in (("source", src_lang), ("target", tgt_lang)):
        if not selection:
            raise gr.Error(
                f"The {role} language is empty! Please choose it in the dropdown list."
            )
    return _translate(text, src_lang, tgt_lang)
43
+
44
@spaces.GPU
def _translate(text: str, src_lang: str, tgt_lang: str):
    """Translate ``text`` one sentence at a time, preserving line breaks.

    The text is punctuation-normalised, split on ``"\\n"`` into paragraphs,
    and each paragraph is split into sentences with the splitter for the
    source language. Sentences are translated independently, re-joined with
    single spaces, and paragraphs are re-joined with ``"\\n"``.

    Args:
        text: Text to translate.
        src_lang: Source language name; must be a key of ``code_mapping``.
        tgt_lang: Target language name; must be a key of ``code_mapping``.

    Returns:
        The translated text.

    Raises:
        KeyError: if ``src_lang`` or ``tgt_lang`` is not in ``code_mapping``.
    """
    src_code = code_mapping[src_lang]
    tgt_code = code_mapping[tgt_lang]
    tokenizer.src_lang = src_code
    tokenizer.tgt_lang = tgt_code

    # Both values depend only on the language pair, so resolve them once
    # instead of once per paragraph / sentence.
    splitter = get_language_specific_sentence_splitter(src_code)
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_code)

    text = punct_normalizer.normalize(text)
    translated_paragraphs = []
    for paragraph in text.split("\n"):
        translated_sentences = []
        for sentence in splitter(paragraph):
            # The inputs must live on the same device as the model weights.
            # Pinning them to "cpu" fails with a device mismatch whenever the
            # model was loaded onto CUDA.
            input_ids = tokenizer(sentence, return_tensors="pt").input_ids.to(
                model.device
            )
            generated = model.generate(
                input_ids=input_ids,
                forced_bos_token_id=forced_bos_token_id,
                # Allow the output to be up to 50 tokens longer than the input.
                max_length=input_ids.shape[1] + 50,
                num_return_sequences=1,
                num_beams=5,
                no_repeat_ngram_size=4,
                renormalize_logits=True,
            )
            translated_sentences.append(
                tokenizer.decode(
                    generated[0],
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=False,
                )
            )
        translated_paragraphs.append(" ".join(translated_sentences))
    return "\n".join(translated_paragraphs)
76
+
77
# Banner rendered at the top of the page through gr.Markdown.
description = """
<div style="text-align: center;">
<img src="https://burmese.dvb.no/logo-with-letters.png" alt="DVB Meta Hugging Face Banner" style="max-width: 800px; width: 100%; margin: 0 auto;">
<h1 style="color: #0077be;">DVB Language Translator, powered by Meta and Hugging Face</h1>
</div>
"""

# Disabled sample input, kept for reference:
# examples_inputs = [["The DVB, Scientific and Cultural Organization is a specialized agency of DVB with the aim of promoting world peace and security through international cooperation in education, arts, sciences and culture. ","English","Ayacucho Quechua"],]

with gr.Blocks() as demo:
    gr.Markdown(description)

    # Language pickers: every language may be a source, while the target
    # list is the filtered `target_languages`.
    with gr.Row():
        src_lang = gr.Dropdown(choices=flores_codes, label="Source Language")
        target_lang = gr.Dropdown(choices=target_languages, label="Target Language")

    with gr.Row():
        input_text = gr.Textbox(lines=6, label="Input Text")

    with gr.Row():
        btn = gr.Button("Translate text")

    with gr.Row():
        output = gr.Textbox(lines=6, label="Output Text")

    # Clicking the button runs `translate` and writes the result to `output`.
    btn.click(
        fn=translate,
        inputs=[input_text, src_lang, target_lang],
        outputs=output,
    )

    # examples = gr.Examples(examples=examples_inputs, inputs=[input_text, src_lang, target_lang], fn=translate, outputs=output, cache_examples=True)
    # with gr.Row():
    #     gr.Markdown(disclaimer)

demo.launch()