# Navya-Sree's picture
# Update app.py
# 0597381 verified
import spaces
import gradio as gr
from sacremoses import MosesPunctNormalizer
from transformers import pipeline
from cultural_model import CulturalM2M100
from cultural_tokenizer import CulturalTokenizer
import platform
import torch
import nltk
from functools import lru_cache
from config import LANGUAGE_MAPPING, ENDANGERED_LANGS, MODEL_NAME
# Fetch the NLTK data packages this app asks for at startup.
nltk.download("punkt_tab")
nltk.download("punkt")

# Pick the torch device string: "cuda" when torch reports CUDA support,
# otherwise "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
def load_model():
    """Load the translation model and move it onto the configured device.

    Prints a one-line confirmation naming the device in upper case.

    Returns:
        The ``CulturalM2M100`` instance loaded from ``MODEL_NAME`` after its
        ``.to(device)`` call.
    """
    translator = CulturalM2M100.from_pretrained(MODEL_NAME)
    translator = translator.to(device)
    print(f"Loaded UNESCO Translator on {device.upper()}")
    return translator
# Module-level objects created once at import time and shared by every
# translate() call.

# Model loaded via load_model(), which also moves it to `device`.
model = load_model()
# Tokenizer loaded from the same MODEL_NAME as the model.
tokenizer = CulturalTokenizer.from_pretrained(MODEL_NAME)
# Punctuation normalizer configured with lang="en".
# NOTE(review): translate() applies it to every input regardless of the
# selected source language - confirm that is intended.
punct_normalizer = MosesPunctNormalizer(lang="en")
@lru_cache(maxsize=202)
def get_language_specific_sentence_splitter(language_code):
    """Look up the sentence-splitting callable for ``language_code``.

    The result is memoized per language code (up to 202 entries), so
    repeated requests for the same code return the same callable object.

    Args:
        language_code: Language code string, e.g. ``"qu"``.

    Returns:
        A callable that takes a text string and returns the result of
        ``nltk.sent_tokenize`` for it.
    """
    # Every code except "qu", "ay" and "chr" gets NLTK's tokenizer as-is.
    if language_code not in ("qu", "ay", "chr"):
        return nltk.sent_tokenize

    # "qu", "ay" and "chr" get a wrapper that explicitly requests the
    # "english" tokenizer.
    # NOTE(review): nltk.sent_tokenize appears to default to "english" as
    # well, so both branches likely split text identically - confirm.
    def split_with_english_model(text):
        return nltk.sent_tokenize(text, language="english")

    return split_with_english_model
@spaces.GPU
def translate(text: str, src_lang: str, tgt_lang: str):
    """Translate ``text`` from ``src_lang`` into ``tgt_lang``.

    The text is punctuation-normalized, split into paragraphs on newlines and
    then into sentences. Each sentence is translated on its own; translated
    sentences are re-joined with single spaces and paragraphs with newlines,
    so blank lines in the input are preserved in the output.

    Args:
        text: Text to translate.
        src_lang: Source language name; looked up in ``LANGUAGE_MAPPING``.
        tgt_lang: Target language name; looked up in ``LANGUAGE_MAPPING``.

    Returns:
        The translated text, or ``""`` when ``text`` is empty, ``None`` or
        whitespace-only.

    Raises:
        gr.Error: If either language name has no usable entry in
            ``LANGUAGE_MAPPING``.
    """
    # Treat a missing value the same as blank input instead of crashing on
    # ``None.strip()``.
    if not text or not text.strip():
        return ""

    src_info = LANGUAGE_MAPPING.get(src_lang)
    tgt_info = LANGUAGE_MAPPING.get(tgt_lang)
    if not src_info or not tgt_info:
        raise gr.Error("Invalid language selection")
    src_code = src_info["code"]
    tgt_code = tgt_info["code"]

    # Enable cultural preservation when the target is an endangered language.
    cultural_preservation = tgt_lang in ENDANGERED_LANGS

    # Normalize punctuation before any splitting.
    text = punct_normalizer.normalize(text)

    # The language pair is fixed for the whole request, so configure the
    # tokenizer and resolve the per-request lookups once rather than once
    # per sentence.
    tokenizer.src_lang = src_code
    tokenizer.tgt_lang = tgt_code
    forced_bos_token_id = tokenizer.get_lang_id(tgt_code)
    split_sentences = get_language_specific_sentence_splitter(src_code)

    translated_paragraphs = []
    for paragraph in text.split("\n"):
        # Keep blank lines so the output has the same paragraph layout.
        if not paragraph.strip():
            translated_paragraphs.append("")
            continue

        translated_sentences = []
        for sentence in split_sentences(paragraph):
            # Inputs longer than 512 tokens are truncated, not rejected.
            inputs = tokenizer(
                sentence,
                return_tensors="pt",
                truncation=True,
                max_length=512,
            ).to(device)

            # ``cultural_preservation`` is a project-specific keyword that
            # the CulturalM2M100 model is expected to accept in generate().
            generated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=forced_bos_token_id,
                max_length=512,
                num_beams=5,
                no_repeat_ngram_size=3,
                cultural_preservation=cultural_preservation,
            )

            # One sentence in, one sequence out: take the first decoded item.
            translated_sentences.append(
                tokenizer.batch_decode(
                    generated_tokens,
                    skip_special_tokens=True,
                )[0]
            )

        translated_paragraphs.append(" ".join(translated_sentences))

    return "\n".join(translated_paragraphs)
# UI Components

# HTML header shown at the top of the page. The heading previously ended in
# a mis-decoded byte sequence (mojibake of a 4-byte UTF-8 emoji); it is
# restored here as a globe emoji.
# NOTE(review): the exact original emoji could not be recovered - confirm.
# The markup is kept flush-left on purpose: indenting it would change the
# string that gets rendered.
description = """
<div style="text-align: center;">
<h1 style="color: #0066cc;">UNESCO Language Translator 🌍</h1>
<img src="/file=unesco_logo.png" alt="UNESCO Logo" style="max-width: 200px; margin: 0 auto;">
<p>Preserving endangered languages through AI-powered translation</p>
</div>
"""

# Markdown footer with usage guidelines, shown below the translator.
disclaimer = """
## Ethical Guidelines
- Always verify translations for cultural sensitivity
- Report inaccurate translations to help improve the system
- Use translations responsibly for cultural preservation
"""
# Alphabetically ordered dropdown choices: the keys of LANGUAGE_MAPPING for
# the source side, the entries of ENDANGERED_LANGS for the target side.
source_langs = sorted(LANGUAGE_MAPPING)
target_langs = sorted(ENDANGERED_LANGS)
# Page layout: header, a two-column row (source on the left, target on the
# right), the translate button, clickable examples and the guidelines footer.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(description)

    with gr.Row():
        # Left column: source language and the text to translate.
        with gr.Column():
            src_lang = gr.Dropdown(
                label="Source Language",
                choices=source_langs,
                value="English",
            )
            input_text = gr.Textbox(
                label="Text to Translate",
                lines=5,
                placeholder="Enter text to translate",
            )

        # Right column: target language and the read-only result.
        with gr.Column():
            tgt_lang = gr.Dropdown(
                label="Target Language",
                choices=target_langs,
                value="Quechua",
            )
            output_text = gr.Textbox(
                label="Translation",
                lines=5,
                interactive=False,
            )

    translate_btn = gr.Button("Translate", variant="primary")
    translate_btn.click(
        translate,
        inputs=[input_text, src_lang, tgt_lang],
        outputs=output_text,
    )

    # Each example row is (text, source language, target language). The
    # source language has to match the language the sample text is written
    # in; all three samples are English sentences.
    gr.Examples(
        examples=[
            ["Cultural heritage must be preserved for future generations", "English", "Quechua"],
            ["Traditional knowledge connects us to our ancestors", "English", "Aymara"],
            ["Language diversity is essential to human heritage", "English", "Cherokee"],
        ],
        inputs=[input_text, src_lang, tgt_lang],
        outputs=output_text,
    )

    gr.Markdown(disclaimer)
# Launch the Gradio app only when this file is run as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    demo.launch()