# s2s-translator / build_cache_local.py
# EnochQuayson's picture
# Upload build_cache_local.py
# a15b110 verified
"""
S2S Local Cache Builder - Runs INSIDE HuggingFace Space
Add this as a separate file in your HF Space repo.
It calls the translation functions directly without HTTP.
Run via: python build_cache_local.py
Output: translation_cache.json
"""
import json
import os
import time
from datetime import datetime
# ── Third-party libraries used to load and run the translation models ────────
import torch
from pathlib import Path
from transformers import MarianMTModel, MarianTokenizer
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
from transformers import VitsModel, AutoTokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from huggingface_hub import snapshot_download
import tempfile
import scipy.io.wavfile as wavfile
# JSON file the translation cache is read from and written to.
OUTPUT_FILE = "translation_cache.json"

# Number of newly added translations between checkpoint saves.
SAVE_EVERY = 50

# Display label for each target-language key.
LANG_LABEL_MAP = {
    "fr": "French 🇫🇷",
    "tw": "Asante Twi 🇬🇭",
    "ee": "Ewe 🇬🇭",
    "hau": "Hausa 🇬🇭",
    "fuv": "Fulani 🇬🇭",
}

# Target-language keys, in the same order as the label map above.
LANGUAGES = list(LANG_LABEL_MAP)
# English source phrases to pre-translate, loosely grouped by topic.
# Some phrases appear in more than one group (e.g. "Turn left"); the list is
# de-duplicated immediately after this literal.
PHRASES = [
    # Greetings, farewells and well-being
    "Hello", "Hi", "Hey", "Good morning", "Good afternoon", "Good evening",
    "Good night", "Goodbye", "Bye", "See you later", "See you tomorrow",
    "See you soon", "Take care", "Have a good day", "Have a nice day",
    "Have a good evening", "Have a good night", "Have a safe journey",
    "Safe travels", "Welcome", "Welcome back", "Come in", "Sit down",
    "How are you", "How are you doing", "How is your day",
    "How was your day", "How is the family", "How is your health",
    "I am fine", "I am good", "I am okay", "I am not well",
    "I am tired", "I am happy", "I am sad", "I am busy",
    "Nice to meet you", "It is nice to see you", "Long time no see",
    # Courtesy and praise
    "Thank you", "Thank you very much", "Thanks a lot", "Many thanks",
    "You are welcome", "No problem", "Do not worry",
    "Please", "Sorry", "I am sorry", "Excuse me", "Pardon me",
    "Congratulations", "Well done", "Good job", "Bravo",
    # Personal introductions, occupation and marital status
    "What is your name", "My name is John", "My name is Mary",
    "Where are you from", "I am from Ghana", "I am from Nigeria",
    "I am from France", "I live in Accra", "I live in Kumasi",
    "How old are you", "I am twenty years old", "I am thirty years old",
    "What do you do", "I am a doctor", "I am a teacher",
    "I am a nurse", "I am an engineer", "I am a farmer",
    "I am a student", "I am a businessman", "I am a trader",
    "I am retired", "I work in a hospital", "I work in a school",
    "Are you married", "I am married", "I am single",
    "I have children", "I have two children", "I have a son",
    "I have a daughter", "This is my husband", "This is my wife",
    # Family
    "My mother", "My father", "My parents", "My brother", "My sister",
    "My son", "My daughter", "My children", "My baby",
    "My husband", "My wife", "My family", "My friend",
    "My grandfather", "My grandmother", "My uncle", "My aunt",
    "My cousin", "My nephew", "My niece",
    "The baby is crying", "The child is sleeping",
    "My child is sick", "We are a big family", "I love my family",
    "The children are playing", "Feed the baby",
    "Take care of the children", "The baby needs milk",
    # Questions, locations and comprehension
    "What time is it", "What day is it today", "What is the date",
    "What is this", "What is that", "What happened",
    "Where are you", "Where are you going", "Where do you live",
    "Where is the toilet", "Where is the bathroom",
    "Where is the hospital", "Where is the pharmacy",
    "Where is the police station", "Where is the market",
    "Where is the church", "Where is the mosque",
    "Where is the school", "Where is the office",
    "Where is the hotel", "Where is the restaurant",
    "Where is the bus station", "Where is the airport",
    "Where is the bank", "Where is the ATM",
    "When does it open", "When does it close",
    "How far is it", "How long will it take",
    "How much does it cost", "How many do you want",
    "Can you help me", "Can I help you",
    "Do you understand", "Do you speak English",
    "I do not understand", "I do not know",
    "Can you repeat that", "Please speak slowly",
    # Health and medical
    "I am sick", "I am not feeling well", "I feel weak",
    "I need a doctor", "I need a nurse", "Call an ambulance",
    "Take me to the hospital", "Please call for help",
    "I have a headache", "I have a stomachache", "I have a backache",
    "I have a toothache", "I have chest pain", "I have a fever",
    "I have a cold", "I have a cough", "I have diarrhea",
    "I am vomiting", "I feel dizzy", "I cannot breathe",
    "I am bleeding", "I am in pain", "The pain is severe",
    "I broke my arm", "I broke my leg", "I sprained my ankle",
    "I was in an accident", "I fell down",
    "I am pregnant", "I am due soon", "I need a midwife",
    "The baby is coming", "I need pain relief",
    "I am allergic to penicillin", "I am diabetic",
    "I have malaria", "I have typhoid", "I have high blood pressure",
    "I need my medication", "What is this medicine for",
    "How many times a day", "Take it with water",
    "Take it after meals", "Take it before sleeping",
    "I need a blood test", "What is my diagnosis",
    "Will I be okay", "I want to go home",
    "The patient needs rest", "This is urgent",
    # Food and drink
    "I am hungry", "I am very hungry", "I am starving",
    "I am thirsty", "I want water", "I want food",
    "I want to eat", "I want to drink",
    "I want rice", "I want bread", "I want soup",
    "I want chicken", "I want fish", "I want beef",
    "I want vegetables", "I want fruit", "I want eggs",
    "I want porridge", "I want fufu", "I want banku",
    "I want kenkey", "I want jollof rice", "I want waakye",
    "I want yam", "I want plantain", "I want cassava",
    "I want groundnut soup", "I want palm nut soup",
    "Can I have tea", "Can I have coffee", "Can I have milk",
    "I am vegetarian", "I do not eat pork", "I do not eat meat",
    "The food is delicious", "This is too spicy", "This is too salty",
    "The bill please", "How much is it", "Keep the change",
    # Shopping and payment
    "How much is this", "What is the price", "That is too expensive",
    "Can you reduce the price", "Do you have a discount",
    "I want to buy this", "I will take it",
    "Do you have this in another colour", "Do you have a bigger size",
    "Do you have a smaller size", "I am just looking",
    "Can I pay by card", "Do you accept mobile money",
    "I want a receipt", "I want to return this",
    "This is damaged", "I want a refund",
    # Travel, directions and accommodation
    "Where is the airport", "I need to go to the airport",
    "Where is the bus station", "I need a taxi",
    "Take me to Accra", "Take me to Kumasi",
    "How much is the fare", "How long is the journey",
    "When does the bus leave", "When does it arrive",
    "I missed my bus", "I missed my flight",
    "My luggage is lost", "I need to check in",
    "I am a tourist", "I am visiting family",
    "I am here for work", "I have a visa",
    "I am lost", "Can you show me on the map",
    "Turn left", "Turn right", "Go straight",
    "Stop here", "Wait for me",
    "Where is my hotel", "I have a reservation",
    "Check in please", "Check out please",
    # Work and business
    "I have a meeting", "I am looking for work",
    "I need a job", "I am the manager",
    "I want to place an order", "When can you deliver",
    "We need it urgently", "Please sign here",
    "This is the invoice", "This is the receipt",
    "The payment has been made", "I need a bank transfer",
    "I need a quotation", "Send me the proposal",
    "I agree to the terms", "The deal is done",
    "What is your phone number", "What is your email address",
    "I will call you back", "I am running late",
    # Education
    "I am a student", "I want to learn", "I want to study",
    "I do not understand the lesson", "Can you explain again",
    "I have homework", "When is the exam",
    "I passed the exam", "I failed the exam",
    "I graduated", "I have a degree", "I need school fees",
    # Religion, holidays and ceremonies
    "God bless you", "God is good", "Praise God",
    "Peace be with you", "Have a blessed day",
    "I am Christian", "I am Muslim", "I am going to church",
    "I am fasting", "Happy Easter", "Happy Christmas",
    "Happy New Year", "Happy birthday", "Happy anniversary",
    "I am praying for you", "God will provide",
    "The funeral is tomorrow", "I am sorry for your loss",
    "May their soul rest in peace",
    "The wedding is on Saturday", "Congratulations on your wedding",
    "We are celebrating", "This is our tradition",
    # Emergencies and safety
    "Help", "Help me", "I need help", "Emergency",
    "Fire", "There is a fire", "Call the fire service",
    "Call the police", "I need the police",
    "I have been robbed", "My phone was stolen",
    "My wallet was stolen", "I lost my passport",
    "There has been an accident", "Someone is hurt",
    "The car broke down", "I have a flat tyre",
    "I am stuck", "The road is blocked",
    "Stay inside", "It is not safe outside",
    "I am in danger", "Please save me",
    # Feelings and encouragement
    "I am happy", "I am very happy", "I am excited",
    "I am sad", "I am very sad", "I am crying",
    "I am angry", "I am frustrated", "I am disappointed",
    "I am scared", "I am worried", "I am stressed",
    "I am nervous", "I am confused", "I am shocked",
    "I am tired", "I am exhausted", "I am bored",
    "I am lonely", "I miss you", "I love you",
    "I care about you", "I am proud of you",
    "Do not give up", "Stay strong", "Everything will be fine",
    "I believe in you", "You can do it",
    # Weather, farming and environment
    "It is raining", "It is sunny", "It is very hot",
    "It is cold today", "The weather is nice",
    "There is a flood", "There is strong wind",
    "Plant the seeds", "Water the plants",
    "The harvest is good", "Feed the animals",
    "Do not litter", "Keep the environment clean",
    "Plant more trees", "Save water",
    # Phone and internet
    "My phone is dead", "I need to charge my phone",
    "Do you have wifi", "What is the wifi password",
    "The internet is slow", "My battery is low",
    "I need to make a call", "I need to send a message",
    "Send me on whatsapp", "I will call you later",
    "Take a photo of me", "Send me the photo",
    # Days, months and time
    "Today", "Tomorrow", "Yesterday",
    "This week", "Next week", "This month",
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday",
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
    "Do not be late", "I will wait for you",
    "It will take one hour", "I need more time",
    # Numbers and money
    "One", "Two", "Three", "Four", "Five",
    "Six", "Seven", "Eight", "Nine", "Ten",
    "Twenty", "Thirty", "Fifty", "One hundred", "One thousand",
    "I have no money", "I need money", "Pay me back",
    "It is free", "Mobile money", "Pay cash",
    # Directions and places
    "Turn left", "Turn right", "Go straight",
    "It is nearby", "It is far away",
    "Upstairs", "Downstairs", "Inside", "Outside",
    "Behind the market", "In front of the school",
    "Next to the church", "Across the road",
    "You have arrived", "This is the place",
    # Adjectives
    "Good", "Bad", "Very good", "Big", "Small",
    "Long", "Short", "Heavy", "Light", "Full", "Empty",
    "Hot", "Cold", "Fast", "Slow", "New", "Old",
    "Clean", "Dirty", "Beautiful", "Strong", "Weak",
    "Rich", "Poor", "Cheap", "Expensive",
    "Easy", "Difficult", "Safe", "Dangerous",
    "Correct", "Wrong", "True", "False", "Ready",
    # Verbs
    "Come", "Go", "Stop", "Wait", "Run", "Walk",
    "Sit", "Stand", "Sleep", "Wake up", "Eat", "Drink",
    "Cook", "Buy", "Sell", "Give", "Take", "Send",
    "Work", "Rest", "Play", "Sing", "Dance",
    "Read", "Write", "Listen", "Speak", "Call",
    "Open", "Close", "Lock", "Help", "Fix", "Clean",
    "Wash", "Cut", "Build", "Start", "Finish",
    "Return", "Leave", "Arrive", "Enter", "Exit",
    "Know", "Understand", "Remember", "Forget", "Learn",
    "Think", "Believe", "Want", "Need", "Love", "Like",
    # Encouragement and sayings
    "You can do it", "Do not give up", "Keep trying",
    "Stay strong", "Be brave", "Be patient",
    "Work hard", "Study hard", "Try your best",
    "You are talented", "You are smart", "You are beautiful",
    "Be kind", "Be honest", "Be respectful",
    "Health is wealth", "Education is the key",
    "Every day is a blessing", "All shall be well",
    "Trust the process", "Never stop learning",
    # Daily routine and household
    "Wake up early", "Brush your teeth", "Take a bath",
    "Get dressed", "Eat breakfast", "Go to school",
    "Go to work", "Come home early", "Eat dinner",
    "Do your homework", "Go to bed early",
    "Lock the door", "Turn off the lights",
    "Do the laundry", "Wash the dishes",
    "Sweep the house", "Buy groceries", "Prepare the food",
    "Feed the children", "Pay the bills",
    "Exercise daily", "Drink more water",
    "Get enough sleep", "Take your medication",
    "Call your parents", "Spend time with family",
    # Community and civic values
    "We help each other", "Teamwork is important",
    "Let us work together", "Respect the elderly",
    "Take care of the children", "Stand up for justice",
    "Do not bribe", "Pay your taxes", "Obey the law",
    "Keep the community clean", "Save electricity",
    "Use water wisely", "Support local businesses",
    "Our culture is our identity", "Preserve our traditions",
    "Teach children our language", "Pass on our values",
]
# Deduplicate while keeping the first occurrence of each phrase in its
# original position. dict keys are unique and preserve insertion order, so
# this avoids hiding a side effect (set.add) inside a comprehension condition.
PHRASES = list(dict.fromkeys(PHRASES))
def load_models():
    """Load the English→French model and the NLLB multilingual model.

    Downloads the ``EnochQuayson/s2s-onnx-model`` snapshot into
    ``./models_cache`` and tries to load the French ONNX model and its
    tokenizer from there. If that load fails for any ordinary reason, the
    PyTorch ``Helsinki-NLP/opus-mt-tc-big-en-fr`` checkpoint is used instead.
    The NLLB tokenizer and model are always loaded from
    ``facebook/nllb-200-distilled-600M``.

    Returns:
        A 4-tuple ``(mt_fr_tokenizer, mt_fr_model, mt_nllb_tokenizer,
        mt_nllb_model)``.
    """
    print("Loading models...")
    model_repo = "EnochQuayson/s2s-onnx-model"
    models_dir = Path("./models_cache")
    snapshot_download(repo_id=model_repo, local_dir=str(models_dir), repo_type="model")

    # French MT: prefer the ONNX export shipped in the snapshot.
    mt_fr_path = str(models_dir / "mt/opus-mt-tc-big-en-fr")
    try:
        mt_fr_model = ORTModelForSeq2SeqLM.from_pretrained(mt_fr_path)
        mt_fr_tokenizer = MarianTokenizer.from_pretrained(mt_fr_path)
    except Exception as exc:
        # Catch Exception rather than using a bare ``except`` so Ctrl-C and
        # SystemExit still stop the script, and report why the fallback is used.
        print(f"ONNX French model unavailable ({exc}); falling back to Helsinki-NLP/opus-mt-tc-big-en-fr")
        mt_fr_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-fr")
        mt_fr_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-fr")

    # NLLB multilingual
    mt_nllb_tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
    mt_nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

    print("Models loaded!")
    return mt_fr_tokenizer, mt_fr_model, mt_nllb_tokenizer, mt_nllb_model
# Maps this script's language keys to the language-code tokens handed to the
# NLLB tokenizer in translate_text. "fr" has no entry because French is
# translated by the separate French model instead.
LANG_CODES = {
    "tw": "twi_Latn",
    "ee": "ewe_Latn",
    "hau": "hau_Latn",
    "fuv": "fuv_Latn",
}
def translate_text(text, lang, mt_fr_tokenizer, mt_fr_model, mt_nllb_tokenizer, mt_nllb_model):
    """Translate one English string into ``lang`` with the preloaded models.

    ``"fr"`` is handled by the French tokenizer/model pair; every other
    language key is looked up in ``LANG_CODES`` and translated with the NLLB
    pair, forcing the target-language token as the first generated token.

    Returns:
        The decoded translation, or ``None`` when ``lang`` has no NLLB code
        or when any exception is raised (the error is printed).
    """
    try:
        if lang != "fr":
            target_code = LANG_CODES.get(lang)
            if not target_code:
                return None
            tokenizer, model = mt_nllb_tokenizer, mt_nllb_model
            bos_id = tokenizer.convert_tokens_to_ids(target_code)
            tokenizer.src_lang = "eng_Latn"
            encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
            generation_options = {
                "forced_bos_token_id": bos_id,
                "max_length": 256,
                "num_beams": 4,
                "early_stopping": True,
            }
        else:
            tokenizer, model = mt_fr_tokenizer, mt_fr_model
            encoded = tokenizer([text], return_tensors="pt", padding=True, truncation=True)
            generation_options = {}

        # Inference only: no gradients needed.
        with torch.no_grad():
            generated = model.generate(**encoded, **generation_options)
        return tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as e:
        print(f" Error translating '{text}' to {lang}: {e}")
        return None
def load_cache(filepath):
    """Load a previously saved translation cache from ``filepath``.

    Args:
        filepath: Path of the JSON cache file.

    Returns:
        The cached mapping, or an empty dict when the file does not exist,
        cannot be read or parsed, or does not contain a JSON object. A
        warning is printed in the unreadable/invalid cases so a discarded
        cache does not go unnoticed.
    """
    if not os.path.exists(filepath):
        return {}
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Warning: could not read cache '{filepath}' ({exc}); starting with an empty cache")
        return {}
    if not isinstance(data, dict):
        # Callers assign cache[key] = {...}, which needs a mapping.
        print(f"Warning: cache '{filepath}' is not a JSON object; starting with an empty cache")
        return {}
    return data
def save_cache(cache, filepath):
    """Write ``cache`` to ``filepath`` as indented UTF-8 JSON.

    The data is first written to a sibling ``<filepath>.tmp`` file and then
    moved over the destination with ``os.replace``, so an interruption in the
    middle of a write cannot leave a truncated cache file behind.

    Args:
        cache: JSON-serialisable mapping to persist.
        filepath: Destination path of the cache file.
    """
    tmp_path = f"{filepath}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(cache, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, filepath)
    finally:
        # Only present if the dump or the replace failed; do not leave it behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def main():
    """Translate every phrase into every language and write the JSON cache.

    Loads the models once, resumes from any existing cache in
    ``OUTPUT_FILE`` (pairs already cached are skipped), checkpoints the cache
    after every ``SAVE_EVERY`` new translations, and prints a summary when
    finished. The cache is also saved if the loop is interrupted, so at most
    the translation in progress is lost.
    """
    total = len(PHRASES) * len(LANGUAGES)
    print("\nS2S Local Cache Builder")
    print(f"Phrases: {len(PHRASES)} | Languages: {len(LANGUAGES)}")
    print(f"Total: {total}")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60)

    # Load models once
    mt_fr_tok, mt_fr_mod, nllb_tok, nllb_mod = load_models()
    cache = load_cache(OUTPUT_FILE)
    print(f"Existing cache: {len(cache)} entries\n")

    new_count = 0
    failed = 0
    pairs = ((phrase, lang) for phrase in PHRASES for lang in LANGUAGES)
    try:
        for done, (phrase, lang) in enumerate(pairs, start=1):
            key = f"{phrase}|{lang}"
            if key in cache:
                continue

            pct = (done / total) * 100
            print(f"[{done}/{total} {pct:.1f}%] '{phrase}' → {lang} ...", end=" ", flush=True)
            result = translate_text(phrase, lang, mt_fr_tok, mt_fr_mod, nllb_tok, nllb_mod)
            if not result:
                failed += 1
                print("✗ Failed")
                continue

            cache[key] = {
                "source_text": phrase,
                "translated_text": result,
                "target_language": lang,
                "audio_url": None,
                "cached_at": datetime.now().isoformat()
            }
            new_count += 1
            print(f"✓ '{result[:50]}'")

            # Periodic checkpoint so a long run can be resumed.
            if new_count % SAVE_EVERY == 0:
                save_cache(cache, OUTPUT_FILE)
                kb = os.path.getsize(OUTPUT_FILE) / 1024
                print(f" → Saved {len(cache)} entries ({kb:.0f} KB)")
    finally:
        # Runs on normal completion and on Ctrl-C / unexpected errors alike,
        # so translations made since the last checkpoint are never dropped.
        save_cache(cache, OUTPUT_FILE)

    kb = os.path.getsize(OUTPUT_FILE) / 1024
    print("\n" + "=" * 60)
    print(f"COMPLETE! {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Total: {len(cache)} | New: {new_count} | Failed: {failed}")
    print(f"File: {OUTPUT_FILE} ({kb:.0f} KB)")
    print(f"\nDownload {OUTPUT_FILE} from the Files tab in your HF Space.")
# Run the cache build only when this file is executed as a script
# (python build_cache_local.py), not when it is imported.
if __name__ == "__main__":
    main()