import spaces
import gradio as gr
from sacremoses import MosesPunctNormalizer
from cultural_model import CulturalM2M100
from cultural_tokenizer import CulturalTokenizer
import torch
import nltk
from functools import lru_cache
from config import LANGUAGE_MAPPING, ENDANGERED_LANGS, MODEL_NAME
# Download required NLTK data
nltk.download("punkt_tab")
nltk.download("punkt")
# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
def load_model():
    """Load the cultural M2M100 model onto the configured device."""
    model = CulturalM2M100.from_pretrained(MODEL_NAME).to(device)
    print(f"Loaded UNESCO Translator on {device.upper()}")
    return model

model = load_model()
tokenizer = CulturalTokenizer.from_pretrained(MODEL_NAME)
punct_normalizer = MosesPunctNormalizer(lang="en")
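# Note: the English Moses normalizer above is applied to every source
# language; it standardizes quotes, dashes, and whitespace before splitting.
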
@lru_cache(maxsize=202)
def get_language_specific_sentence_splitter(language_code):
    """Return a sentence splitter function for the given language."""
    # NLTK ships no trained tokenizers for these endangered languages
    # (Quechua, Aymara, Cherokee), so fall back to the English splitter
    if language_code in ["qu", "ay", "chr"]:
        return lambda text: nltk.sent_tokenize(text, language="english")
    # For all other languages, use NLTK's default sentence tokenizer
    return nltk.sent_tokenize
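# Illustrative only ("de" is a hypothetical code here): for a language
# without special handling the returned splitter is plain nltk.sent_tokenize,
# so it splits on standard sentence punctuation, e.g.
#   splitter = get_language_specific_sentence_splitter("de")
#   splitter("One sentence. Another one.")  # -> ["One sentence.", "Another one."]
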
@spaces.GPU
def translate(text: str, src_lang: str, tgt_lang: str):
    """Translate text paragraph by paragraph, sentence by sentence."""
    if not text.strip():
        return ""
    src_info = LANGUAGE_MAPPING.get(src_lang)
    tgt_info = LANGUAGE_MAPPING.get(tgt_lang)
    if not src_info or not tgt_info:
        raise gr.Error("Invalid language selection")
    src_code = src_info["code"]
    tgt_code = tgt_info["code"]
    # Set the language context once; it is constant for the whole call
    tokenizer.src_lang = src_code
    tokenizer.tgt_lang = tgt_code
    # Enable cultural preservation when targeting an endangered language
    cultural_preservation = tgt_lang in ENDANGERED_LANGS
    # Normalize punctuation
    text = punct_normalizer.normalize(text)
    paragraphs = text.split("\n")
    translated_paragraphs = []
    splitter = get_language_specific_sentence_splitter(src_code)
    for paragraph in paragraphs:
        if not paragraph.strip():
            # Preserve blank lines between paragraphs
            translated_paragraphs.append("")
            continue
        sentences = splitter(paragraph)
        translated_sentences = []
        for sentence in sentences:
            # Encode with cultural context
            inputs = tokenizer(
                sentence,
                return_tensors="pt",
                truncation=True,
                max_length=512
            ).to(device)
            # Generate with cultural preservation
            generated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=tokenizer.get_lang_id(tgt_code),
                max_length=512,
                num_beams=5,
                no_repeat_ngram_size=3,
                cultural_preservation=cultural_preservation
            )
            translated = tokenizer.batch_decode(
                generated_tokens,
                skip_special_tokens=True
            )[0]
            translated_sentences.append(translated)
        translated_paragraphs.append(" ".join(translated_sentences))
    return "\n".join(translated_paragraphs)
# UI Components
description = """
<div style="text-align: center;">
<h1 style="color: #0066cc;">UNESCO Language Translator 🌍</h1>
<img src="/file=unesco_logo.png" alt="UNESCO Logo" style="max-width: 200px; margin: 0 auto;">
<p>Preserving endangered languages through AI-powered translation</p>
</div>
"""
disclaimer = """
## Ethical Guidelines
- Always verify translations for cultural sensitivity
- Report inaccurate translations to help improve the system
- Use translations responsibly for cultural preservation
"""
# Language lists
source_langs = sorted(LANGUAGE_MAPPING.keys())
target_langs = sorted(ENDANGERED_LANGS)
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            src_lang = gr.Dropdown(
                label="Source Language",
                choices=source_langs,
                value="English"
            )
            input_text = gr.Textbox(
                label="Text to Translate",
                lines=5,
                placeholder="Enter text to translate"
            )
        with gr.Column():
            tgt_lang = gr.Dropdown(
                label="Target Language",
                choices=target_langs,
                value="Quechua"
            )
            output_text = gr.Textbox(
                label="Translation",
                lines=5,
                interactive=False
            )
    translate_btn = gr.Button("Translate", variant="primary")
    translate_btn.click(
        translate,
        inputs=[input_text, src_lang, tgt_lang],
        outputs=output_text
    )
    gr.Examples(
        examples=[
            ["Cultural heritage must be preserved for future generations", "English", "Quechua"],
            ["Traditional knowledge connects us to our ancestors", "English", "Aymara"],
            ["Language diversity is essential to human heritage", "French", "Cherokee"]
        ],
        fn=translate,
        inputs=[input_text, src_lang, tgt_lang],
        outputs=output_text
    )
    gr.Markdown(disclaimer)
if __name__ == "__main__":
    demo.launch()