pythonprincess committed on
Commit
f583ae5
·
verified ·
1 Parent(s): 4d0d018

Upload translation_utils.py

Browse files
models/translation/translation_utils.py ADDED
@@ -0,0 +1,621 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # models/translation/translation_utils.py
2
+
3
+ """
4
+ Translation Model Utilities for PENNY Project
5
+ Handles multilingual translation using NLLB-200 for civic engagement accessibility.
6
+ Provides async translation with structured error handling and language code normalization.
7
+ """
8
+
9
+ import asyncio
10
+ import time
11
+ from typing import Dict, Any, Optional, List
12
+
13
+ # --- Logging Imports ---
14
+ from app.logging_utils import log_interaction, sanitize_for_logging
15
+
16
+ # --- Model Loader Import ---
17
+ try:
18
+ from app.model_loader import load_model_pipeline
19
+ MODEL_LOADER_AVAILABLE = True
20
+ except ImportError:
21
+ MODEL_LOADER_AVAILABLE = False
22
+ import logging
23
+ logging.getLogger(__name__).warning("Could not import load_model_pipeline. Translation service unavailable.")
24
+
25
# Global variable to store the loaded pipeline for re-use.
# Stays None until _initialize_translation_pipeline() loads a pipeline.
TRANSLATION_PIPELINE: Optional[Any] = None
# Name handed to load_model_pipeline() to select the translation model.
AGENT_NAME: str = "penny-translate-agent"
# Set to True on the first initialization attempt so loading is tried only once.
INITIALIZATION_ATTEMPTED: bool = False
29
+
30
# NLLB-200 Language Code Mapping (Common languages for civic engagement)
# Maps language names and short codes to NLLB-200 codes. Keys are written in
# lower case because normalize_language_code() lower-cases its input before
# looking it up here.
LANGUAGE_CODES: Dict[str, str] = {
    # English variants
    "english": "eng_Latn",
    "en": "eng_Latn",

    # Spanish variants
    "spanish": "spa_Latn",
    "es": "spa_Latn",
    "español": "spa_Latn",

    # French
    "french": "fra_Latn",
    "fr": "fra_Latn",
    "français": "fra_Latn",

    # Mandarin Chinese
    "chinese": "zho_Hans",
    "mandarin": "zho_Hans",
    "zh": "zho_Hans",

    # Arabic
    "arabic": "arb_Arab",
    "ar": "arb_Arab",

    # Hindi
    "hindi": "hin_Deva",
    "hi": "hin_Deva",

    # Portuguese
    "portuguese": "por_Latn",
    "pt": "por_Latn",

    # Russian
    "russian": "rus_Cyrl",
    "ru": "rus_Cyrl",

    # German
    "german": "deu_Latn",
    "de": "deu_Latn",

    # Vietnamese
    "vietnamese": "vie_Latn",
    "vi": "vie_Latn",

    # Tagalog
    "tagalog": "tgl_Latn",
    "tl": "tgl_Latn",

    # Urdu
    "urdu": "urd_Arab",
    "ur": "urd_Arab",

    # Swahili
    "swahili": "swh_Latn",
    "sw": "swh_Latn",
}
87
+
88
# Pre-translated civic phrases for common queries
# Outer keys are NLLB-200 language codes; inner keys are the phrase identifiers
# that get_civic_phrase() looks up. Only English and Spanish are provided here.
CIVIC_PHRASES: Dict[str, Dict[str, str]] = {
    "eng_Latn": {
        "voting_location": "Where is my polling place?",
        "voter_registration": "How do I register to vote?",
        "city_services": "What city services are available?",
        "report_issue": "I want to report a problem.",
        "contact_city": "How do I contact city hall?",
    },
    "spa_Latn": {
        "voting_location": "¿Dónde está mi lugar de votación?",
        "voter_registration": "¿Cómo me registro para votar?",
        "city_services": "¿Qué servicios de la ciudad están disponibles?",
        "report_issue": "Quiero reportar un problema.",
        "contact_city": "¿Cómo contacto al ayuntamiento?",
    }
}
105
+
106
+
107
def _initialize_translation_pipeline() -> bool:
    """
    Load the translation pipeline into the module-level cache, at most once.

    The first call marks INITIALIZATION_ATTEMPTED and tries to load the model
    named by AGENT_NAME. Every later call skips loading and only reports
    whether a pipeline is cached, so a failed first attempt is not retried.

    Returns:
        bool: True if a pipeline is loaded, False otherwise.
    """
    global TRANSLATION_PIPELINE, INITIALIZATION_ATTEMPTED

    intent = "translation_initialization"

    # Already attempted: report the cached outcome instead of loading again.
    if INITIALIZATION_ATTEMPTED:
        return TRANSLATION_PIPELINE is not None

    INITIALIZATION_ATTEMPTED = True

    # The loader import failed at module import time; nothing to load with.
    if not MODEL_LOADER_AVAILABLE:
        log_interaction(intent=intent, success=False, error="model_loader unavailable")
        return False

    try:
        log_interaction(intent=intent, success=None, details=f"Loading {AGENT_NAME}")

        TRANSLATION_PIPELINE = load_model_pipeline(AGENT_NAME)
        loaded = TRANSLATION_PIPELINE is not None

        if loaded:
            log_interaction(
                intent=intent,
                success=True,
                details=f"Model {AGENT_NAME} loaded successfully",
            )
        else:
            log_interaction(intent=intent, success=False, error="Pipeline returned None")

        return loaded

    except Exception as exc:
        # Any failure while loading or logging is recorded and reported as
        # "not available" rather than propagated to the importer.
        log_interaction(intent=intent, success=False, error=str(exc))
        return False
160
+
161
+
162
# Attempt initialization at module load. The boolean result is discarded
# here; translate_text() checks is_translation_available() on each call.
_initialize_translation_pipeline()
164
+
165
+
166
def is_translation_available() -> bool:
    """
    Report whether the translation pipeline has been loaded.

    Returns:
        bool: True when a pipeline object is cached in TRANSLATION_PIPELINE,
        False while it is still None.
    """
    pipeline_missing = TRANSLATION_PIPELINE is None
    return not pipeline_missing
174
+
175
+
176
def normalize_language_code(lang: str) -> str:
    """
    Converts common language names/codes to NLLB-200 format.

    NLLB-200 codes are case-sensitive and shaped "<language>_<Script>"
    (e.g. "eng_Latn"). Input that already contains an underscore is therefore
    re-cased to that shape instead of being lower-cased wholesale, which would
    turn a valid "eng_Latn" into "eng_latn".

    Args:
        lang: Language name or code (e.g., "spanish", "es", "español",
            "spa_Latn"). Matching ignores case and surrounding whitespace.

    Returns:
        NLLB-200 language code (e.g., "spa_Latn"). Missing, non-string or
        blank input defaults to "eng_Latn". A name that is not in
        LANGUAGE_CODES is returned stripped and lower-cased, unchanged
        otherwise.
    """
    default_code = "eng_Latn"

    if not lang or not isinstance(lang, str):
        return default_code  # Default to English

    cleaned = lang.strip()
    if not cleaned:
        # Whitespace-only input carries no language; treat it like empty input.
        return default_code

    # Already in NLLB format (contains underscore): restore canonical casing,
    # lower-case language part and capitalised script part.
    if "_" in cleaned:
        language, _, script = cleaned.partition("_")
        return f"{language.lower()}_{script.capitalize()}"

    # Look up in mapping; fall back to the cleaned input when unknown.
    key = cleaned.lower()
    return LANGUAGE_CODES.get(key, key)
197
+
198
+
199
def get_supported_languages() -> List[str]:
    """
    Get list of supported language codes.

    Several names map to the same code in LANGUAGE_CODES, so the values are
    de-duplicated. The result is sorted so the order is the same on every
    call and every run; a bare set would be ordered by string hashes, which
    vary between interpreter runs.

    Returns:
        Alphabetically sorted list of the unique NLLB-200 language codes
        supported by PENNY.
    """
    return sorted(set(LANGUAGE_CODES.values()))
207
+
208
+
209
async def translate_text(
    text: str,
    source_language: str = "eng_Latn",
    target_language: str = "spa_Latn",
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Translates text from source language to target language using NLLB-200.

    Failure paths log the problem via log_interaction and return a result
    dictionary with an "error" entry instead of raising. Except for invalid
    input, the original text is returned as the fallback translation. Task
    cancellation is logged and re-raised.

    Args:
        text: The text to translate.
        source_language: Source language code (e.g., "eng_Latn", "spanish", "es")
        target_language: Target language code (e.g., "spa_Latn", "french", "fr")
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
            - translated_text (str): The translated text
            - source_lang (str): Source language code (normalized once input
              validation has passed; as supplied on earlier failures)
            - target_lang (str): Target language code (same rule as above)
            - original_text (str): The input text
            - available (bool): Whether the service was available
            - error (str, optional): Error message if translation failed
            - skipped (bool, optional): True when source and target match
            - response_time_ms (int, optional): Translation time in milliseconds

    Raises:
        asyncio.CancelledError: Re-raised after logging if the task is cancelled.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments while the request is in flight.
    started = time.perf_counter()

    def _elapsed_ms() -> int:
        """Milliseconds elapsed since this call started."""
        return int((time.perf_counter() - started) * 1000)

    def _result(translated: Any, src: str, tgt: str, available: bool, **extra: Any) -> Dict[str, Any]:
        """Build the common result dictionary, appending any extra entries."""
        payload: Dict[str, Any] = {
            "translated_text": translated,
            "source_lang": src,
            "target_lang": tgt,
            "original_text": text,
            "available": available,
        }
        payload.update(extra)
        return payload

    # Check availability
    if not is_translation_available():
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Translation pipeline not available",
            fallback_used=True
        )
        # Return original text as fallback
        return _result(
            text, source_language, target_language, False,
            error="Translation service is temporarily unavailable."
        )

    # Validate input
    if not text or not isinstance(text, str):
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Invalid text input"
        )
        invalid = _result(
            "", source_language, target_language, True,
            error="Invalid text input provided."
        )
        if not isinstance(text, str):
            # Never echo a non-string value back as the original text.
            invalid["original_text"] = ""
        return invalid

    # Check text length (prevent processing extremely long texts)
    if len(text) > 5000:  # 5k character limit for translation
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error=f"Text too long: {len(text)} characters",
            text_preview=sanitize_for_logging(text[:100])
        )
        return _result(
            text, source_language, target_language, True,
            error="Text is too long for translation (max 5,000 characters)."
        )

    # Normalize language codes
    src_lang = normalize_language_code(source_language)
    tgt_lang = normalize_language_code(target_language)

    # Skip translation if source and target are the same
    if src_lang == tgt_lang:
        log_interaction(
            intent="translation_skipped",
            tenant_id=tenant_id,
            success=True,
            details="Source and target languages are identical"
        )
        return _result(text, src_lang, tgt_lang, True, skipped=True)

    # Pin the pipeline object so the worker thread uses the instance that
    # passed the availability check above.
    pipeline = TRANSLATION_PIPELINE

    try:
        # We are inside a coroutine, so a loop is guaranteed to be running.
        loop = asyncio.get_running_loop()

        # Run model inference in thread executor
        # load_model_pipeline returns a wrapper that calls client.predict()
        # predict() returns: {"translation": "...", "source_lang": "...", "target_lang": "...", "success": True}
        result_dict = await loop.run_in_executor(
            None,
            lambda: pipeline(
                text,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
        )

        response_time_ms = _elapsed_ms()

        # Validate results - check if predict() returned an error
        if not result_dict or not isinstance(result_dict, dict):
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error="Empty or invalid model output",
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
            # Fallback to original
            return _result(
                text, src_lang, tgt_lang, True,
                error="Translation returned unexpected format."
            )

        # Check for error in result
        if not result_dict.get("success", False) or "error" in result_dict:
            error_msg = result_dict.get("error", "Translation failed")
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error=error_msg,
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
            # Fallback to original
            return _result(text, src_lang, tgt_lang, False, error=error_msg)

        # Extract translation from predict() result format.
        # A missing or non-string value is treated as an empty translation so
        # it is reported as such instead of failing on .strip().
        raw_translation = result_dict.get('translation', '')
        translated = raw_translation.strip() if isinstance(raw_translation, str) else ""

        if not translated:
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error="Empty translation result",
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
            # Fallback to original
            return _result(
                text, src_lang, tgt_lang, True,
                error="Translation produced empty result."
            )

        # Log slow translations
        if response_time_ms > 5000:  # 5 seconds
            log_interaction(
                intent="translation_slow",
                tenant_id=tenant_id,
                success=True,
                response_time_ms=response_time_ms,
                details="Slow translation detected",
                source_lang=src_lang,
                target_lang=tgt_lang,
                text_length=len(text)
            )

        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=True,
            response_time_ms=response_time_ms,
            source_lang=src_lang,
            target_lang=tgt_lang,
            text_length=len(text)
        )

        return _result(
            translated, src_lang, tgt_lang, True,
            response_time_ms=response_time_ms
        )

    except asyncio.CancelledError:
        # Record the cancellation, then let it propagate so the caller's
        # cancellation semantics are preserved.
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Translation cancelled",
            source_lang=src_lang,
            target_lang=tgt_lang
        )
        raise

    except Exception as e:
        response_time_ms = _elapsed_ms()

        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error=str(e),
            response_time_ms=response_time_ms,
            source_lang=src_lang,
            target_lang=tgt_lang,
            text_preview=sanitize_for_logging(text[:100]),
            fallback_used=True
        )

        # Fallback to original
        return _result(
            text, src_lang, tgt_lang, False,
            error=str(e),
            response_time_ms=response_time_ms
        )
460
+
461
+
462
async def detect_and_translate(
    text: str,
    target_language: str = "eng_Latn",
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Guess the source language from the characters in the text, then translate.

    Note: detection is a simple character heuristic. Spanish punctuation and
    accented letters are checked first, then a fixed sequence of Unicode
    script ranges; anything else is assumed to be English. For production,
    consider integrating a dedicated language detection model.

    Args:
        text: The text to translate
        target_language: Target language code
        tenant_id: Optional tenant identifier for logging

    Returns:
        The dictionary returned by translate_text with a "detected_lang"
        entry added. Empty or non-string input yields an error dictionary
        with "detected_lang" set to "unknown" and no translation attempted.
    """
    if not text or not isinstance(text, str):
        return {
            "translated_text": "",
            "detected_lang": "unknown",
            "target_lang": target_language,
            "original_text": text if isinstance(text, str) else "",
            "available": True,
            "error": "Invalid text input."
        }

    # Characters taken as evidence of Spanish.
    spanish_markers = ('¿', '¡', 'ñ', 'á', 'é', 'í', 'ó', 'ú')

    # (first code point, last code point, language) checked in this order;
    # the first range containing any character of the text wins.
    script_ranges = (
        ('\u4e00', '\u9fff', "zho_Hans"),  # Chinese characters
        ('\u0600', '\u06ff', "arb_Arab"),  # Arabic script
        ('\u0400', '\u04ff', "rus_Cyrl"),  # Cyrillic (Russian)
        ('\u0900', '\u097f', "hin_Deva"),  # Devanagari (Hindi)
    )

    detected_lang = "eng_Latn"  # Default assumption
    if any(marker in text for marker in spanish_markers):
        detected_lang = "spa_Latn"
    else:
        for first, last, code in script_ranges:
            if any(first <= ch <= last for ch in text):
                detected_lang = code
                break

    log_interaction(
        intent="language_detection",
        tenant_id=tenant_id,
        success=True,
        detected_lang=detected_lang,
        text_preview=sanitize_for_logging(text[:50])
    )

    outcome = await translate_text(text, detected_lang, target_language, tenant_id)
    outcome["detected_lang"] = detected_lang
    return outcome
522
+
523
+
524
async def batch_translate(
    texts: List[str],
    source_language: str = "eng_Latn",
    target_language: str = "spa_Latn",
    tenant_id: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Translate a list of texts one after another with translate_text.

    Entries that are not strings or are blank are dropped before translating,
    and at most the first 50 remaining entries are processed. The returned
    list therefore lines up with the filtered input, not necessarily with
    the original list.

    Args:
        texts: List of strings to translate
        source_language: Source language code
        target_language: Target language code
        tenant_id: Optional tenant identifier for logging

    Returns:
        List of translation result dictionaries, one per translated text.
        An empty list is returned when the input is not a non-empty list or
        contains no usable text.
    """
    max_batch = 50

    if not texts or not isinstance(texts, list):
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=False,
            error="Invalid texts input"
        )
        return []

    # Keep only non-blank strings, then cap the batch size.
    usable = [item for item in texts if isinstance(item, str) and item.strip()]
    if len(usable) > max_batch:
        usable = usable[:max_batch]
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=None,
            details="Batch size limited to 50 texts"
        )

    if not usable:
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=False,
            error="No valid texts in batch"
        )
        return []

    batch_started = time.time()

    # Each translation is awaited before the next one starts.
    results = [
        await translate_text(item, source_language, target_language, tenant_id)
        for item in usable
    ]

    elapsed_ms = int((time.time() - batch_started) * 1000)

    log_interaction(
        intent="batch_translation",
        tenant_id=tenant_id,
        success=True,
        response_time_ms=elapsed_ms,
        batch_size=len(usable),
        source_lang=normalize_language_code(source_language),
        target_lang=normalize_language_code(target_language)
    )

    return results
591
+
592
+
593
def get_civic_phrase(
    phrase_key: str,
    language: str = "eng_Latn"
) -> str:
    """
    Look up a pre-translated civic phrase in CIVIC_PHRASES.

    The language is passed through normalize_language_code before the lookup.
    A successful lookup is logged via log_interaction; a miss is not.

    Args:
        phrase_key: Key for the civic phrase (e.g., "voting_location")
        language: Target language code

    Returns:
        The stored phrase, or an empty string when the key is empty or not a
        string, or when no phrase exists for that language and key.
    """
    if not (phrase_key and isinstance(phrase_key, str)):
        return ""

    lang_code = normalize_language_code(language)
    phrases_for_language = CIVIC_PHRASES.get(lang_code, {})
    phrase = phrases_for_language.get(phrase_key, "")

    # Nothing stored for this language/key: return the empty result unlogged.
    if not phrase:
        return phrase

    log_interaction(
        intent="civic_phrase_lookup",
        success=True,
        phrase_key=phrase_key,
        language=lang_code
    )
    return phrase