import spaces import gradio as gr from sacremoses import MosesPunctNormalizer from transformers import pipeline from cultural_model import CulturalM2M100 from cultural_tokenizer import CulturalTokenizer import platform import torch import nltk from functools import lru_cache from config import LANGUAGE_MAPPING, ENDANGERED_LANGS, MODEL_NAME # Download required NLTK data nltk.download("punkt_tab") nltk.download("punkt") # Device configuration device = "cuda" if torch.cuda.is_available() else "cpu" def load_model(): model = CulturalM2M100.from_pretrained(MODEL_NAME).to(device) print(f"Loaded UNESCO Translator on {device.upper()}") return model model = load_model() tokenizer = CulturalTokenizer.from_pretrained(MODEL_NAME) punct_normalizer = MosesPunctNormalizer(lang="en") @lru_cache(maxsize=202) def get_language_specific_sentence_splitter(language_code): """Return a sentence splitter function for the given language""" # For endangered languages, use NLTK with language-specific tokenizer if language_code in ["qu", "ay", "chr"]: # Endangered language codes return lambda text: nltk.sent_tokenize(text, language="english") # For other languages, use NLTK with default tokenizer return nltk.sent_tokenize @spaces.GPU def translate(text: str, src_lang: str, tgt_lang: str): if not text.strip(): return "" src_info = LANGUAGE_MAPPING.get(src_lang) tgt_info = LANGUAGE_MAPPING.get(tgt_lang) if not src_info or not tgt_info: raise gr.Error("Invalid language selection") src_code = src_info["code"] tgt_code = tgt_info["code"] # Enable cultural preservation for endangered languages cultural_preservation = tgt_lang in ENDANGERED_LANGS # Normalize punctuation text = punct_normalizer.normalize(text) paragraphs = text.split("\n") translated_paragraphs = [] for paragraph in paragraphs: if not paragraph.strip(): translated_paragraphs.append("") continue splitter = get_language_specific_sentence_splitter(src_code) sentences = splitter(paragraph) translated_sentences = [] for sentence in sentences: # Set language context tokenizer.src_lang = src_code tokenizer.tgt_lang = tgt_code # Encode with cultural context inputs = tokenizer( sentence, return_tensors="pt", truncation=True, max_length=512 ).to(device) # Generate with cultural preservation generated_tokens = model.generate( **inputs, forced_bos_token_id=tokenizer.get_lang_id(tgt_code), max_length=512, num_beams=5, no_repeat_ngram_size=3, cultural_preservation=cultural_preservation ) translated = tokenizer.batch_decode( generated_tokens, skip_special_tokens=True )[0] translated_sentences.append(translated) translated_paragraph = " ".join(translated_sentences) translated_paragraphs.append(translated_paragraph) return "\n".join(translated_paragraphs) # UI Components description = """
Preserving endangered languages through AI-powered translation