|
|
import spaces |
|
|
import gradio as gr |
|
|
from sacremoses import MosesPunctNormalizer |
|
|
from transformers import pipeline |
|
|
from cultural_model import CulturalM2M100 |
|
|
from cultural_tokenizer import CulturalTokenizer |
|
|
import platform |
|
|
import torch |
|
|
import nltk |
|
|
from functools import lru_cache |
|
|
from config import LANGUAGE_MAPPING, ENDANGERED_LANGS, MODEL_NAME |
|
|
|
|
|
|
|
|
# Fetch the Punkt sentence-tokenizer data that nltk.sent_tokenize relies on.
# Both resource names are requested, presumably so the app works with NLTK
# releases that look for either one (TODO confirm against the pinned NLTK).
nltk.download("punkt_tab")
nltk.download("punkt")
|
|
|
|
|
|
|
|
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
def load_model():
    """Load the pretrained translation model and place it on ``device``.

    The checkpoint is the one named by ``MODEL_NAME``; a one-line status
    message naming the device is printed once loading has finished.

    Returns:
        The ``CulturalM2M100`` instance produced by ``from_pretrained``,
        already moved to the module-level ``device``.
    """
    model = CulturalM2M100.from_pretrained(MODEL_NAME).to(device)
    print(f"Loaded UNESCO Translator on {device.upper()}")
    return model
|
|
|
|
|
# Module-level singletons, created once at import time and reused by
# every call to translate().
model = load_model()
tokenizer = CulturalTokenizer.from_pretrained(MODEL_NAME)
# Moses punctuation normalizer configured with English rules; translate()
# runs all input text through it regardless of the source language.
punct_normalizer = MosesPunctNormalizer(lang="en")
|
|
|
|
|
@lru_cache(maxsize=202)
def get_language_specific_sentence_splitter(language_code: str):
    """Return a sentence-splitting callable for ``language_code``.

    The returned callable takes one string and returns what
    ``nltk.sent_tokenize`` returns for it. Results are memoized per
    language code, so repeated lookups return the same callable.

    Args:
        language_code: Language code of the text that will be split.

    Returns:
        A callable that splits text into sentences. For ``"qu"``, ``"ay"``
        and ``"chr"`` the English Punkt model is requested explicitly; every
        other code gets ``nltk.sent_tokenize`` itself, called with its
        default language.
    """
    # NOTE(review): nltk.sent_tokenize's default language appears to be
    # English as well, so both branches may behave the same — confirm
    # whether per-language Punkt models were intended for the fallback.
    if language_code in ("qu", "ay", "chr"):
        return lambda text: nltk.sent_tokenize(text, language="english")

    return nltk.sent_tokenize
|
|
|
|
|
@spaces.GPU
def translate(text: str, src_lang: str, tgt_lang: str) -> str:
    """Translate ``text`` from ``src_lang`` into ``tgt_lang``.

    The input is punctuation-normalized, split into paragraphs on newlines,
    and each non-blank paragraph is split into sentences that are translated
    one at a time. Translated sentences are re-joined with single spaces and
    paragraphs with newlines, so blank lines in the input are preserved.

    Args:
        text: Text to translate. ``None``, empty, or whitespace-only input
            yields an empty string.
        src_lang: Source language name; must be a key of ``LANGUAGE_MAPPING``.
        tgt_lang: Target language name; must be a key of ``LANGUAGE_MAPPING``.

    Returns:
        The translated text with the input's paragraph layout.

    Raises:
        gr.Error: If either language name has no (non-empty) entry in
            ``LANGUAGE_MAPPING``.
    """
    # Treat a missing value like an empty box instead of raising on .strip().
    if not text or not text.strip():
        return ""

    src_info = LANGUAGE_MAPPING.get(src_lang)
    tgt_info = LANGUAGE_MAPPING.get(tgt_lang)
    if not src_info or not tgt_info:
        raise gr.Error("Invalid language selection")
    src_code = src_info["code"]
    tgt_code = tgt_info["code"]

    # Forwarded to model.generate below; enabled whenever the target is
    # listed in ENDANGERED_LANGS.
    cultural_preservation = tgt_lang in ENDANGERED_LANGS

    text = punct_normalizer.normalize(text)

    # Everything below depends only on the language pair, so it is set up
    # once per request rather than once per sentence.
    tokenizer.src_lang = src_code
    tokenizer.tgt_lang = tgt_code
    forced_bos_token_id = tokenizer.get_lang_id(tgt_code)
    splitter = get_language_specific_sentence_splitter(src_code)

    translated_paragraphs = []
    for paragraph in text.split("\n"):
        # Keep blank lines so the output mirrors the input's layout.
        if not paragraph.strip():
            translated_paragraphs.append("")
            continue

        translated_sentences = []
        for sentence in splitter(paragraph):
            inputs = tokenizer(
                sentence,
                return_tensors="pt",
                truncation=True,
                max_length=512,
            ).to(device)

            generated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=forced_bos_token_id,
                max_length=512,
                num_beams=5,
                no_repeat_ngram_size=3,
                cultural_preservation=cultural_preservation,
            )

            # One input sentence goes in, so only the first decoded
            # sequence is kept.
            translated = tokenizer.batch_decode(
                generated_tokens,
                skip_special_tokens=True,
            )[0]
            translated_sentences.append(translated)

        translated_paragraphs.append(" ".join(translated_sentences))

    return "\n".join(translated_paragraphs)
|
|
|
|
|
|
|
|
# HTML header rendered at the top of the page via gr.Markdown.
# NOTE(review): the "๐" after the title looks like a mis-encoded emoji;
# confirm the intended glyph before changing it.
description = """
<div style="text-align: center;">
<h1 style="color: #0066cc;">UNESCO Language Translator ๐</h1>
<img src="/file=unesco_logo.png" alt="UNESCO Logo" style="max-width: 200px; margin: 0 auto;">
<p>Preserving endangered languages through AI-powered translation</p>
</div>
"""

# Markdown footer rendered below the examples via gr.Markdown.
disclaimer = """
## Ethical Guidelines
- Always verify translations for cultural sensitivity
- Report inaccurate translations to help improve the system
- Use translations responsibly for cultural preservation
"""
|
|
|
|
|
|
|
|
# Dropdown choices, sorted alphabetically: every mapped language may be a
# source, while only the endangered languages are offered as targets.
source_langs = sorted(LANGUAGE_MAPPING.keys())
target_langs = sorted(ENDANGERED_LANGS)
|
|
|
|
|
# Page layout: header, a two-column row (source on the left, target on the
# right), the translate button, clickable examples, and the guidelines footer.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(description)

    with gr.Row():
        # Left column: what to translate and from which language.
        with gr.Column():
            src_lang = gr.Dropdown(
                label="Source Language",
                choices=source_langs,
                value="English",
            )
            input_text = gr.Textbox(
                label="Text to Translate",
                lines=5,
                placeholder="Enter text to translate",
            )

        # Right column: target language and the read-only result box.
        with gr.Column():
            tgt_lang = gr.Dropdown(
                label="Target Language",
                choices=target_langs,
                value="Quechua",
            )
            output_text = gr.Textbox(
                label="Translation",
                lines=5,
                interactive=False,
            )

    translate_btn = gr.Button("Translate", variant="primary")
    translate_btn.click(
        translate,
        inputs=[input_text, src_lang, tgt_lang],
        outputs=output_text,
    )

    # Each example row is (text, source language, target language), in the
    # same order as the `inputs` list.
    gr.Examples(
        examples=[
            ["Cultural heritage must be preserved for future generations", "English", "Quechua"],
            ["Traditional knowledge connects us to our ancestors", "English", "Aymara"],
            ["Language diversity is essential to human heritage", "French", "Cherokee"],
        ],
        inputs=[input_text, src_lang, tgt_lang],
        outputs=output_text,
    )

    gr.Markdown(disclaimer)
|
|
|
|
|
if __name__ == "__main__":
    # Start the Gradio server only when executed as a script, not on import.
    demo.launch()