# File size: 18,176 Bytes
# a15b110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
"""
S2S Local Cache Builder - Runs INSIDE HuggingFace Space
Add this as a separate file in your HF Space repo.
It calls the translation functions directly without HTTP.
Run via: python build_cache_local.py
Output: translation_cache.json
"""

import json
import os
import time
from datetime import datetime

# ── Third-party libraries used to load and run the translation models directly ──
import torch
from pathlib import Path
from transformers import MarianMTModel, MarianTokenizer
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
from transformers import VitsModel, AutoTokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from huggingface_hub import snapshot_download
import tempfile
import scipy.io.wavfile as wavfile

# Path of the JSON cache that main() loads at start-up and writes back to.
OUTPUT_FILE = "translation_cache.json"
# main() flushes the cache to disk after this many new translations.
SAVE_EVERY = 50

# Target-language codes; main() translates every phrase into each of these.
LANGUAGES = ["fr", "tw", "ee", "hau", "fuv"]

# Display label per language code (language name followed by a flag emoji).
# The flags are spelled as regional-indicator escape sequences: the raw emoji
# bytes had been mis-decoded into mojibake, and escapes keep these literals
# pure ASCII so they survive any later re-encoding of the file.
LANG_LABEL_MAP = {
    "fr": "French \U0001F1EB\U0001F1F7",        # regional indicators F + R
    "tw": "Asante Twi \U0001F1EC\U0001F1ED",    # regional indicators G + H
    "ee": "Ewe \U0001F1EC\U0001F1ED",           # regional indicators G + H
    "hau": "Hausa \U0001F1EC\U0001F1ED",        # regional indicators G + H
    "fuv": "Fulani \U0001F1EC\U0001F1ED",       # regional indicators G + H
}

# English source phrases to pre-translate, grouped loosely by topic.
# Some phrases occur more than once across topics (e.g. "Turn left",
# "I am happy"); the list is used as written here and repeats are not
# removed inside this literal.
PHRASES = [
    # Greetings and farewells
    "Hello", "Hi", "Hey", "Good morning", "Good afternoon", "Good evening",
    "Good night", "Goodbye", "Bye", "See you later", "See you tomorrow",
    "See you soon", "Take care", "Have a good day", "Have a nice day",
    "Have a good evening", "Have a good night", "Have a safe journey",
    "Safe travels", "Welcome", "Welcome back", "Come in", "Sit down",
    # Small talk, thanks, apologies, congratulations
    "How are you", "How are you doing", "How is your day",
    "How was your day", "How is the family", "How is your health",
    "I am fine", "I am good", "I am okay", "I am not well",
    "I am tired", "I am happy", "I am sad", "I am busy",
    "Nice to meet you", "It is nice to see you", "Long time no see",
    "Thank you", "Thank you very much", "Thanks a lot", "Many thanks",
    "You are welcome", "No problem", "Do not worry",
    "Please", "Sorry", "I am sorry", "Excuse me", "Pardon me",
    "Congratulations", "Well done", "Good job", "Bravo",
    # Personal introductions
    "What is your name", "My name is John", "My name is Mary",
    "Where are you from", "I am from Ghana", "I am from Nigeria",
    "I am from France", "I live in Accra", "I live in Kumasi",
    "How old are you", "I am twenty years old", "I am thirty years old",
    "What do you do", "I am a doctor", "I am a teacher",
    "I am a nurse", "I am an engineer", "I am a farmer",
    "I am a student", "I am a businessman", "I am a trader",
    "I am retired", "I work in a hospital", "I work in a school",
    "Are you married", "I am married", "I am single",
    "I have children", "I have two children", "I have a son",
    "I have a daughter", "This is my husband", "This is my wife",
    # Family
    "My mother", "My father", "My parents", "My brother", "My sister",
    "My son", "My daughter", "My children", "My baby",
    "My husband", "My wife", "My family", "My friend",
    "My grandfather", "My grandmother", "My uncle", "My aunt",
    "My cousin", "My nephew", "My niece",
    "The baby is crying", "The child is sleeping",
    "My child is sick", "We are a big family", "I love my family",
    "The children are playing", "Feed the baby",
    "Take care of the children", "The baby needs milk",
    # Common questions
    "What time is it", "What day is it today", "What is the date",
    "What is this", "What is that", "What happened",
    "Where are you", "Where are you going", "Where do you live",
    "Where is the toilet", "Where is the bathroom",
    "Where is the hospital", "Where is the pharmacy",
    "Where is the police station", "Where is the market",
    "Where is the church", "Where is the mosque",
    "Where is the school", "Where is the office",
    "Where is the hotel", "Where is the restaurant",
    "Where is the bus station", "Where is the airport",
    "Where is the bank", "Where is the ATM",
    "When does it open", "When does it close",
    "How far is it", "How long will it take",
    "How much does it cost", "How many do you want",
    "Can you help me", "Can I help you",
    "Do you understand", "Do you speak English",
    "I do not understand", "I do not know",
    "Can you repeat that", "Please speak slowly",
    # Health and medical
    "I am sick", "I am not feeling well", "I feel weak",
    "I need a doctor", "I need a nurse", "Call an ambulance",
    "Take me to the hospital", "Please call for help",
    "I have a headache", "I have a stomachache", "I have a backache",
    "I have a toothache", "I have chest pain", "I have a fever",
    "I have a cold", "I have a cough", "I have diarrhea",
    "I am vomiting", "I feel dizzy", "I cannot breathe",
    "I am bleeding", "I am in pain", "The pain is severe",
    "I broke my arm", "I broke my leg", "I sprained my ankle",
    "I was in an accident", "I fell down",
    "I am pregnant", "I am due soon", "I need a midwife",
    "The baby is coming", "I need pain relief",
    "I am allergic to penicillin", "I am diabetic",
    "I have malaria", "I have typhoid", "I have high blood pressure",
    "I need my medication", "What is this medicine for",
    "How many times a day", "Take it with water",
    "Take it after meals", "Take it before sleeping",
    "I need a blood test", "What is my diagnosis",
    "Will I be okay", "I want to go home",
    "The patient needs rest", "This is urgent",
    # Food and drink
    "I am hungry", "I am very hungry", "I am starving",
    "I am thirsty", "I want water", "I want food",
    "I want to eat", "I want to drink",
    "I want rice", "I want bread", "I want soup",
    "I want chicken", "I want fish", "I want beef",
    "I want vegetables", "I want fruit", "I want eggs",
    "I want porridge", "I want fufu", "I want banku",
    "I want kenkey", "I want jollof rice", "I want waakye",
    "I want yam", "I want plantain", "I want cassava",
    "I want groundnut soup", "I want palm nut soup",
    "Can I have tea", "Can I have coffee", "Can I have milk",
    "I am vegetarian", "I do not eat pork", "I do not eat meat",
    "The food is delicious", "This is too spicy", "This is too salty",
    "The bill please", "How much is it", "Keep the change",
    # Shopping
    "How much is this", "What is the price", "That is too expensive",
    "Can you reduce the price", "Do you have a discount",
    "I want to buy this", "I will take it",
    "Do you have this in another colour", "Do you have a bigger size",
    "Do you have a smaller size", "I am just looking",
    "Can I pay by card", "Do you accept mobile money",
    "I want a receipt", "I want to return this",
    "This is damaged", "I want a refund",
    # Travel and accommodation
    "Where is the airport", "I need to go to the airport",
    "Where is the bus station", "I need a taxi",
    "Take me to Accra", "Take me to Kumasi",
    "How much is the fare", "How long is the journey",
    "When does the bus leave", "When does it arrive",
    "I missed my bus", "I missed my flight",
    "My luggage is lost", "I need to check in",
    "I am a tourist", "I am visiting family",
    "I am here for work", "I have a visa",
    "I am lost", "Can you show me on the map",
    "Turn left", "Turn right", "Go straight",
    "Stop here", "Wait for me",
    "Where is my hotel", "I have a reservation",
    "Check in please", "Check out please",
    # Work and business
    "I have a meeting", "I am looking for work",
    "I need a job", "I am the manager",
    "I want to place an order", "When can you deliver",
    "We need it urgently", "Please sign here",
    "This is the invoice", "This is the receipt",
    "The payment has been made", "I need a bank transfer",
    "I need a quotation", "Send me the proposal",
    "I agree to the terms", "The deal is done",
    "What is your phone number", "What is your email address",
    "I will call you back", "I am running late",
    # Education
    "I am a student", "I want to learn", "I want to study",
    "I do not understand the lesson", "Can you explain again",
    "I have homework", "When is the exam",
    "I passed the exam", "I failed the exam",
    "I graduated", "I have a degree", "I need school fees",
    # Religion, celebrations and ceremonies
    "God bless you", "God is good", "Praise God",
    "Peace be with you", "Have a blessed day",
    "I am Christian", "I am Muslim", "I am going to church",
    "I am fasting", "Happy Easter", "Happy Christmas",
    "Happy New Year", "Happy birthday", "Happy anniversary",
    "I am praying for you", "God will provide",
    "The funeral is tomorrow", "I am sorry for your loss",
    "May their soul rest in peace",
    "The wedding is on Saturday", "Congratulations on your wedding",
    "We are celebrating", "This is our tradition",
    # Emergencies and safety
    "Help", "Help me", "I need help", "Emergency",
    "Fire", "There is a fire", "Call the fire service",
    "Call the police", "I need the police",
    "I have been robbed", "My phone was stolen",
    "My wallet was stolen", "I lost my passport",
    "There has been an accident", "Someone is hurt",
    "The car broke down", "I have a flat tyre",
    "I am stuck", "The road is blocked",
    "Stay inside", "It is not safe outside",
    "I am in danger", "Please save me",
    # Emotions and support
    "I am happy", "I am very happy", "I am excited",
    "I am sad", "I am very sad", "I am crying",
    "I am angry", "I am frustrated", "I am disappointed",
    "I am scared", "I am worried", "I am stressed",
    "I am nervous", "I am confused", "I am shocked",
    "I am tired", "I am exhausted", "I am bored",
    "I am lonely", "I miss you", "I love you",
    "I care about you", "I am proud of you",
    "Do not give up", "Stay strong", "Everything will be fine",
    "I believe in you", "You can do it",
    # Weather, farming and environment
    "It is raining", "It is sunny", "It is very hot",
    "It is cold today", "The weather is nice",
    "There is a flood", "There is strong wind",
    "Plant the seeds", "Water the plants",
    "The harvest is good", "Feed the animals",
    "Do not litter", "Keep the environment clean",
    "Plant more trees", "Save water",
    # Phones and internet
    "My phone is dead", "I need to charge my phone",
    "Do you have wifi", "What is the wifi password",
    "The internet is slow", "My battery is low",
    "I need to make a call", "I need to send a message",
    "Send me on whatsapp", "I will call you later",
    "Take a photo of me", "Send me the photo",
    # Time, days and months
    "Today", "Tomorrow", "Yesterday",
    "This week", "Next week", "This month",
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday",
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
    "Do not be late", "I will wait for you",
    "It will take one hour", "I need more time",
    # Numbers and money
    "One", "Two", "Three", "Four", "Five",
    "Six", "Seven", "Eight", "Nine", "Ten",
    "Twenty", "Thirty", "Fifty", "One hundred", "One thousand",
    "I have no money", "I need money", "Pay me back",
    "It is free", "Mobile money", "Pay cash",
    # Directions and places
    "Turn left", "Turn right", "Go straight",
    "It is nearby", "It is far away",
    "Upstairs", "Downstairs", "Inside", "Outside",
    "Behind the market", "In front of the school",
    "Next to the church", "Across the road",
    "You have arrived", "This is the place",
    # Single-word adjectives
    "Good", "Bad", "Very good", "Big", "Small",
    "Long", "Short", "Heavy", "Light", "Full", "Empty",
    "Hot", "Cold", "Fast", "Slow", "New", "Old",
    "Clean", "Dirty", "Beautiful", "Strong", "Weak",
    "Rich", "Poor", "Cheap", "Expensive",
    "Easy", "Difficult", "Safe", "Dangerous",
    "Correct", "Wrong", "True", "False", "Ready",
    # Single-word verbs
    "Come", "Go", "Stop", "Wait", "Run", "Walk",
    "Sit", "Stand", "Sleep", "Wake up", "Eat", "Drink",
    "Cook", "Buy", "Sell", "Give", "Take", "Send",
    "Work", "Rest", "Play", "Sing", "Dance",
    "Read", "Write", "Listen", "Speak", "Call",
    "Open", "Close", "Lock", "Help", "Fix", "Clean",
    "Wash", "Cut", "Build", "Start", "Finish",
    "Return", "Leave", "Arrive", "Enter", "Exit",
    "Know", "Understand", "Remember", "Forget", "Learn",
    "Think", "Believe", "Want", "Need", "Love", "Like",
    # Encouragement and sayings
    "You can do it", "Do not give up", "Keep trying",
    "Stay strong", "Be brave", "Be patient",
    "Work hard", "Study hard", "Try your best",
    "You are talented", "You are smart", "You are beautiful",
    "Be kind", "Be honest", "Be respectful",
    "Health is wealth", "Education is the key",
    "Every day is a blessing", "All shall be well",
    "Trust the process", "Never stop learning",
    # Daily routine and chores
    "Wake up early", "Brush your teeth", "Take a bath",
    "Get dressed", "Eat breakfast", "Go to school",
    "Go to work", "Come home early", "Eat dinner",
    "Do your homework", "Go to bed early",
    "Lock the door", "Turn off the lights",
    "Do the laundry", "Wash the dishes",
    "Sweep the house", "Buy groceries", "Prepare the food",
    "Feed the children", "Pay the bills",
    "Exercise daily", "Drink more water",
    "Get enough sleep", "Take your medication",
    "Call your parents", "Spend time with family",
    # Community and civic values
    "We help each other", "Teamwork is important",
    "Let us work together", "Respect the elderly",
    "Take care of the children", "Stand up for justice",
    "Do not bribe", "Pay your taxes", "Obey the law",
    "Keep the community clean", "Save electricity",
    "Use water wisely", "Support local businesses",
    "Our culture is our identity", "Preserve our traditions",
    "Teach children our language", "Pass on our values",
]

# Remove repeated phrases while keeping the first occurrence of each, so the
# original ordering is preserved (dict keys are unique and insertion-ordered).
PHRASES = list(dict.fromkeys(PHRASES))
# Set of every distinct phrase, kept as a module-level name.
seen = set(PHRASES)


def load_models():
    """Download the model snapshot and load both translation model pairs.

    The ``EnochQuayson/s2s-onnx-model`` repository is downloaded into
    ``./models_cache``.  The French pair is first loaded from the
    ``mt/opus-mt-tc-big-en-fr`` folder of that snapshot via
    ``ORTModelForSeq2SeqLM``; if that raises, the reason is printed and the
    ``Helsinki-NLP/opus-mt-tc-big-en-fr`` checkpoint is loaded with
    ``MarianMTModel`` instead.  The NLLB pair always comes from
    ``facebook/nllb-200-distilled-600M``.

    Returns:
        A 4-tuple ``(mt_fr_tokenizer, mt_fr_model, mt_nllb_tokenizer,
        mt_nllb_model)``.
    """
    print("Loading models...")
    model_repo = "EnochQuayson/s2s-onnx-model"
    models_dir = Path("./models_cache")

    snapshot_download(repo_id=model_repo, local_dir=str(models_dir), repo_type="model")

    # French MT
    mt_fr_path = str(models_dir / "mt/opus-mt-tc-big-en-fr")
    fallback_id = "Helsinki-NLP/opus-mt-tc-big-en-fr"
    try:
        mt_fr_model = ORTModelForSeq2SeqLM.from_pretrained(mt_fr_path)
        mt_fr_tokenizer = MarianTokenizer.from_pretrained(mt_fr_path)
    except Exception as exc:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # abort the run, and say why the fallback was taken instead of
        # switching models silently.
        print(f"  Could not load French model from '{mt_fr_path}' ({exc}); "
              f"falling back to {fallback_id}")
        mt_fr_tokenizer = MarianTokenizer.from_pretrained(fallback_id)
        mt_fr_model = MarianMTModel.from_pretrained(fallback_id)

    # NLLB multilingual
    nllb_id = "facebook/nllb-200-distilled-600M"
    mt_nllb_tokenizer = NllbTokenizer.from_pretrained(nllb_id)
    mt_nllb_model = AutoModelForSeq2SeqLM.from_pretrained(nllb_id)

    print("Models loaded!")
    return mt_fr_tokenizer, mt_fr_model, mt_nllb_tokenizer, mt_nllb_model


# Language tag handed to the NLLB tokenizer for each non-French language code.
# Every tag is a three-letter code followed by the "_Latn" suffix.
LANG_CODES = {
    short_code: f"{three_letter}_Latn"
    for short_code, three_letter in (
        ("tw", "twi"),
        ("ee", "ewe"),
        ("hau", "hau"),
        ("fuv", "fuv"),
    )
}


def translate_text(text, lang, mt_fr_tokenizer, mt_fr_model, mt_nllb_tokenizer, mt_nllb_model):
    """Translate ``text`` into ``lang`` using the already-loaded models.

    ``"fr"`` goes through the French tokenizer/model pair.  Any other
    language is looked up in ``LANG_CODES`` and sent through the NLLB pair
    with the source language set to ``eng_Latn`` and the target tag forced as
    the first generated token.

    Returns:
        The decoded translation, or ``None`` when ``lang`` has no entry in
        ``LANG_CODES`` or when any exception is raised on the way (the
        exception is printed, not propagated).
    """
    try:
        if lang != "fr":
            target_tag = LANG_CODES.get(lang)
            if not target_tag:
                return None
            forced_bos = mt_nllb_tokenizer.convert_tokens_to_ids(target_tag)
            mt_nllb_tokenizer.src_lang = "eng_Latn"
            encoded = mt_nllb_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
            generation_options = {
                "forced_bos_token_id": forced_bos,
                "max_length": 256,
                "num_beams": 4,
                "early_stopping": True,
            }
            with torch.no_grad():
                generated = mt_nllb_model.generate(**encoded, **generation_options)
            return mt_nllb_tokenizer.decode(generated[0], skip_special_tokens=True)

        # French branch: the text is passed as a one-element batch.
        encoded = mt_fr_tokenizer([text], return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            generated = mt_fr_model.generate(**encoded)
        return mt_fr_tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as exc:
        print(f"  Error translating '{text}' to {lang}: {exc}")
        return None


def load_cache(filepath):
    """Load a previously written cache file.

    Args:
        filepath: Path of the JSON cache file.

    Returns:
        The decoded cache dict.  An empty dict is returned when the file does
        not exist, cannot be read, is not valid JSON, or does not hold a JSON
        object at the top level; every case except "does not exist" prints a
        warning first.
    """
    if not os.path.exists(filepath):
        return {}

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            cache = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        # Warn instead of passing silently: the caller will start from an
        # empty cache and later overwrite this file.
        print(f"  Warning: could not read cache '{filepath}' ({exc}); starting with an empty cache")
        return {}

    if not isinstance(cache, dict):
        # The caller does `key in cache` and `cache[key] = ...`, which only
        # works as intended on a mapping.
        print(f"  Warning: cache '{filepath}' is not a JSON object; starting with an empty cache")
        return {}
    return cache


def save_cache(cache, filepath):
    """Write ``cache`` to ``filepath`` as indented UTF-8 JSON.

    The data is written to ``<filepath>.tmp`` first and then moved over the
    destination with ``os.replace``, so a crash or interrupt during the write
    leaves the previous cache file untouched instead of truncated.

    Args:
        cache: JSON-serialisable object to store.
        filepath: Destination path of the cache file.

    Raises:
        TypeError, ValueError: If ``cache`` cannot be serialised by ``json``.
        OSError: If the temporary file cannot be written or moved.
    """
    tmp_path = f"{filepath}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(cache, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, filepath)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up the partial file left behind by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def main():
    """Translate every phrase into every language and write the JSON cache.

    Entries already present in ``OUTPUT_FILE`` are skipped, so re-running the
    script resumes where a previous run stopped.  Each entry is keyed by
    ``"<phrase>|<lang>"``.  The cache is written after every ``SAVE_EVERY``
    new translations and once more when the loop ends, including when it ends
    because of an exception or Ctrl-C.  A translation that comes back empty
    or ``None`` is counted as failed and not stored.
    """
    # Status symbols are written as escapes so they survive a re-encoding of
    # this file (they had previously been turned into mojibake).
    arrow, tick, cross = "\u2192", "\u2713", "\u2717"

    total = len(PHRASES) * len(LANGUAGES)

    print("\nS2S Local Cache Builder")
    print(f"Phrases: {len(PHRASES)} | Languages: {len(LANGUAGES)}")
    print(f"Total: {total}")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60)

    # Load models once
    mt_fr_tok, mt_fr_mod, nllb_tok, nllb_mod = load_models()

    cache = load_cache(OUTPUT_FILE)
    print(f"Existing cache: {len(cache)} entries\n")

    done = 0
    new_count = 0
    failed = 0

    try:
        for phrase in PHRASES:
            for lang in LANGUAGES:
                key = f"{phrase}|{lang}"
                # Counts cached pairs too, so the percentage reflects the
                # position in the full phrase x language grid.
                done += 1

                if key in cache:
                    continue

                pct = (done / total) * 100
                print(f"[{done}/{total} {pct:.1f}%] '{phrase}' {arrow} {lang} ...", end=" ", flush=True)

                result = translate_text(phrase, lang, mt_fr_tok, mt_fr_mod, nllb_tok, nllb_mod)

                if not result:
                    failed += 1
                    print(f"{cross} Failed")
                    continue

                cache[key] = {
                    "source_text": phrase,
                    "translated_text": result,
                    "target_language": lang,
                    "audio_url": None,
                    "cached_at": datetime.now().isoformat()
                }
                new_count += 1
                print(f"{tick} '{result[:50]}'")

                if new_count % SAVE_EVERY == 0:
                    save_cache(cache, OUTPUT_FILE)
                    kb = os.path.getsize(OUTPUT_FILE) / 1024
                    print(f"  {arrow} Saved {len(cache)} entries ({kb:.0f} KB)")
    finally:
        # Persist whatever has been translated so far, even when the run is
        # interrupted between two periodic saves.
        save_cache(cache, OUTPUT_FILE)

    kb = os.path.getsize(OUTPUT_FILE) / 1024

    print("\n" + "=" * 60)
    print(f"COMPLETE! {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Total: {len(cache)} | New: {new_count} | Failed: {failed}")
    print(f"File: {OUTPUT_FILE} ({kb:.0f} KB)")
    print(f"\nDownload {OUTPUT_FILE} from the Files tab in your HF Space.")


if __name__ == "__main__":
    main()