"""Gradio front end for the UNESCO endangered-language translator.

Loads the CulturalM2M100 model and its tokenizer once at import time, exposes
a ``translate`` handler, and wires it into a small Gradio Blocks UI.
"""

# NOTE(review): `spaces` stays the very first import, exactly as in the
# original file; presumably it must be loaded before torch on ZeroGPU
# hardware -- confirm before reordering.
import spaces

import platform
from functools import lru_cache

import gradio as gr
import nltk
import torch
from sacremoses import MosesPunctNormalizer
from transformers import pipeline

from config import ENDANGERED_LANGS, LANGUAGE_MAPPING, MODEL_NAME
from cultural_model import CulturalM2M100
from cultural_tokenizer import CulturalTokenizer

# Download required NLTK data (sentence tokenizer models).
nltk.download("punkt_tab")
nltk.download("punkt")

# Device configuration: use the GPU when torch can see one.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Token budget shared by input truncation and by generation.
_MAX_TOKENS = 512

# Language codes that are routed through the explicit-English splitter branch.
_ENDANGERED_SPLITTER_CODES = frozenset({"qu", "ay", "chr"})


def load_model():
    """Load the pretrained translation model and move it to ``device``.

    Returns:
        The ``CulturalM2M100`` instance created from ``MODEL_NAME``.
    """
    model = CulturalM2M100.from_pretrained(MODEL_NAME).to(device)
    print(f"Loaded UNESCO Translator on {device.upper()}")
    return model


model = load_model()
tokenizer = CulturalTokenizer.from_pretrained(MODEL_NAME)
punct_normalizer = MosesPunctNormalizer(lang="en")


@lru_cache(maxsize=202)
def get_language_specific_sentence_splitter(language_code):
    """Return a callable that splits a text into sentences.

    Both branches currently tokenize with NLTK's English model: the
    endangered-language branch requests it explicitly, and the fallback is
    ``nltk.sent_tokenize`` itself, whose ``language`` argument defaults to
    ``"english"``.  The branch is kept as the hook where a dedicated splitter
    can be plugged in later.

    Args:
        language_code: Language code taken from ``LANGUAGE_MAPPING``.

    Returns:
        A function mapping a string to a list of sentence strings.
    """
    if language_code in _ENDANGERED_SPLITTER_CODES:
        # NOTE(review): presumably no dedicated NLTK model exists for these
        # languages, hence the English fallback -- confirm.
        return lambda text: nltk.sent_tokenize(text, language="english")

    return nltk.sent_tokenize


def _translate_sentence(sentence, forced_bos_token_id, cultural_preservation):
    """Translate one sentence using the module-level model and tokenizer."""
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        max_length=_MAX_TOKENS,
    ).to(device)

    generated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=forced_bos_token_id,
        max_length=_MAX_TOKENS,
        num_beams=5,
        no_repeat_ngram_size=3,
        cultural_preservation=cultural_preservation,
    )

    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]


@spaces.GPU
def translate(text: str, src_lang: str, tgt_lang: str) -> str:
    """Translate ``text`` from ``src_lang`` into ``tgt_lang``.

    The text is punctuation-normalized, split on newlines into paragraphs and
    then into sentences; each sentence is translated on its own.  Sentences
    are re-joined with spaces and paragraphs with newlines, so blank lines in
    the input survive in the output.

    Args:
        text: Text to translate.  Empty, whitespace-only or ``None`` input
            yields an empty string.
        src_lang: Source language name, a key of ``LANGUAGE_MAPPING``.
        tgt_lang: Target language name, a key of ``LANGUAGE_MAPPING``.

    Returns:
        The translated text.

    Raises:
        gr.Error: If either language name is missing from
            ``LANGUAGE_MAPPING``.
    """
    # Also guards against None, which would otherwise fail on .strip().
    if not text or not text.strip():
        return ""

    src_info = LANGUAGE_MAPPING.get(src_lang)
    tgt_info = LANGUAGE_MAPPING.get(tgt_lang)
    if not src_info or not tgt_info:
        raise gr.Error("Invalid language selection")

    src_code = src_info["code"]
    tgt_code = tgt_info["code"]

    # Enable cultural preservation for endangered target languages.
    cultural_preservation = tgt_lang in ENDANGERED_LANGS

    # Normalize punctuation before any splitting.
    text = punct_normalizer.normalize(text)

    # Everything below is constant for the whole request, so it is set up
    # once here instead of once per sentence / paragraph.
    tokenizer.src_lang = src_code
    tokenizer.tgt_lang = tgt_code
    forced_bos_token_id = tokenizer.get_lang_id(tgt_code)
    splitter = get_language_specific_sentence_splitter(src_code)

    translated_paragraphs = []
    for paragraph in text.split("\n"):
        if not paragraph.strip():
            # Keep blank lines so the paragraph structure is preserved.
            translated_paragraphs.append("")
            continue

        translated_sentences = [
            _translate_sentence(sentence, forced_bos_token_id, cultural_preservation)
            for sentence in splitter(paragraph)
        ]
        translated_paragraphs.append(" ".join(translated_sentences))

    return "\n".join(translated_paragraphs)


# UI Components
description = """

UNESCO Language Translator 🌍

UNESCO Logo

Preserving endangered languages through AI-powered translation

"""

disclaimer = """
## Ethical Guidelines
- Always verify translations for cultural sensitivity
- Report inaccurate translations to help improve the system
- Use translations responsibly for cultural preservation
"""

# Language lists: any mapped language may be a source, only endangered
# languages are offered as targets.
source_langs = sorted(LANGUAGE_MAPPING)
target_langs = sorted(ENDANGERED_LANGS)

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            src_lang = gr.Dropdown(
                label="Source Language",
                choices=source_langs,
                value="English",
            )
            input_text = gr.Textbox(
                label="Text to Translate",
                lines=5,
                placeholder="Enter text to translate",
            )

        with gr.Column():
            tgt_lang = gr.Dropdown(
                label="Target Language",
                choices=target_langs,
                value="Quechua",
            )
            output_text = gr.Textbox(
                label="Translation",
                lines=5,
                interactive=False,
            )

    translate_btn = gr.Button("Translate", variant="primary")
    translate_btn.click(
        translate,
        inputs=[input_text, src_lang, tgt_lang],
        outputs=output_text,
    )

    gr.Examples(
        examples=[
            ["Cultural heritage must be preserved for future generations", "English", "Quechua"],
            ["Traditional knowledge connects us to our ancestors", "English", "Aymara"],
            ["Language diversity is essential to human heritage", "French", "Cherokee"],
        ],
        inputs=[input_text, src_lang, tgt_lang],
        outputs=output_text,
    )

    gr.Markdown(disclaimer)

if __name__ == "__main__":
    demo.launch()