pythonprincess committed on
Commit
31a15c5
·
verified ·
1 Parent(s): f2ac8aa

Delete translation_utils.py

Browse files
Files changed (1) hide show
  1. translation_utils.py +0 -598
translation_utils.py DELETED
@@ -1,598 +0,0 @@
1
- # models/translation/translation_utils.py
2
-
3
- """
4
- Translation Model Utilities for PENNY Project
5
- Handles multilingual translation using NLLB-200 for civic engagement accessibility.
6
- Provides async translation with structured error handling and language code normalization.
7
- """
8
-
9
- import asyncio
10
- import time
11
- from typing import Dict, Any, Optional, List
12
-
13
- # --- Logging Imports ---
14
- from app.logging_utils import log_interaction, sanitize_for_logging
15
-
16
- # --- Model Loader Import ---
17
- try:
18
- from app.model_loader import load_model_pipeline
19
- MODEL_LOADER_AVAILABLE = True
20
- except ImportError:
21
- MODEL_LOADER_AVAILABLE = False
22
- import logging
23
- logging.getLogger(__name__).warning("Could not import load_model_pipeline. Translation service unavailable.")
24
-
25
# Module-level state shared by the translation helpers in this file.
AGENT_NAME = "penny-translate-agent"  # identifier handed to load_model_pipeline
TRANSLATION_PIPELINE: Optional[Any] = None  # cached pipeline; stays None until a load succeeds
INITIALIZATION_ATTEMPTED = False  # set to True by the first initialization attempt
29
-
30
# NLLB-200 codes paired with the names/abbreviations accepted for each one
# (common languages for civic engagement).
_NLLB_ALIASES = (
    ("eng_Latn", ("english", "en")),
    ("spa_Latn", ("spanish", "es", "español")),
    ("fra_Latn", ("french", "fr", "français")),
    ("zho_Hans", ("chinese", "mandarin", "zh")),  # Mandarin Chinese
    ("arb_Arab", ("arabic", "ar")),
    ("hin_Deva", ("hindi", "hi")),
    ("por_Latn", ("portuguese", "pt")),
    ("rus_Cyrl", ("russian", "ru")),
    ("deu_Latn", ("german", "de")),
    ("vie_Latn", ("vietnamese", "vi")),
    ("tgl_Latn", ("tagalog", "tl")),
    ("urd_Arab", ("urdu", "ur")),
    ("swh_Latn", ("swahili", "sw")),
)

# Flat alias -> NLLB-200 code lookup table.
LANGUAGE_CODES = {
    alias: nllb_code
    for nllb_code, aliases in _NLLB_ALIASES
    for alias in aliases
}
87
-
88
# Ready-made translations of frequent civic questions, keyed first by
# NLLB-200 language code and then by phrase key.
CIVIC_PHRASES = {
    "eng_Latn": dict(
        voting_location="Where is my polling place?",
        voter_registration="How do I register to vote?",
        city_services="What city services are available?",
        report_issue="I want to report a problem.",
        contact_city="How do I contact city hall?",
    ),
    "spa_Latn": dict(
        voting_location="¿Dónde está mi lugar de votación?",
        voter_registration="¿Cómo me registro para votar?",
        city_services="¿Qué servicios de la ciudad están disponibles?",
        report_issue="Quiero reportar un problema.",
        contact_city="¿Cómo contacto al ayuntamiento?",
    ),
}
105
-
106
-
107
def _initialize_translation_pipeline() -> bool:
    """
    Load the translation pipeline, at most once per process.

    The first call records that an attempt was made and tries to load the
    model named by AGENT_NAME. Every later call skips loading and simply
    reports whether a pipeline is currently stored, so a failed first
    attempt is not retried.

    Returns:
        bool: True if a pipeline is loaded, False otherwise.
    """
    global TRANSLATION_PIPELINE, INITIALIZATION_ATTEMPTED

    # An earlier attempt (successful or not) is final; just report its outcome.
    if INITIALIZATION_ATTEMPTED:
        return TRANSLATION_PIPELINE is not None
    INITIALIZATION_ATTEMPTED = True

    event = "translation_initialization"

    if not MODEL_LOADER_AVAILABLE:
        log_interaction(intent=event, success=False, error="model_loader unavailable")
        return False

    try:
        log_interaction(intent=event, success=None, details=f"Loading {AGENT_NAME}")
        TRANSLATION_PIPELINE = load_model_pipeline(AGENT_NAME)

        loaded = TRANSLATION_PIPELINE is not None
        if loaded:
            log_interaction(
                intent=event,
                success=True,
                details=f"Model {AGENT_NAME} loaded successfully"
            )
        else:
            log_interaction(intent=event, success=False, error="Pipeline returned None")
        return loaded

    except Exception as load_error:
        # Any failure while loading (or logging) leaves the service unavailable.
        log_interaction(intent=event, success=False, error=str(load_error))
        return False


# Try to load the model as soon as this module is imported.
_initialize_translation_pipeline()
164
-
165
-
166
def is_translation_available() -> bool:
    """
    Report whether a translation pipeline is currently loaded.

    Returns:
        bool: True when TRANSLATION_PIPELINE holds a pipeline object,
        False while it is still None.
    """
    pipeline_loaded = TRANSLATION_PIPELINE is not None
    return pipeline_loaded
174
-
175
-
176
def normalize_language_code(lang: str) -> str:
    """
    Converts common language names/codes to NLLB-200 format.

    Codes that already look like NLLB-200 codes (they contain an underscore)
    are returned with canonical casing: lower-case language part and
    capitalized script part, e.g. "ENG_LATN" or "eng_latn" -> "eng_Latn".
    This is the casing used by the values of LANGUAGE_CODES and the keys of
    CIVIC_PHRASES, so the result can be compared against them directly.

    Args:
        lang: Language name or code (e.g., "spanish", "es", "español",
            "spa_Latn"). Surrounding whitespace and letter case are ignored.

    Returns:
        NLLB-200 language code (e.g., "spa_Latn"). Falls back to "eng_Latn"
        for empty or non-string input, and to the lower-cased input itself
        when the name is not in LANGUAGE_CODES.
    """
    if not lang or not isinstance(lang, str):
        return "eng_Latn"  # Default to English

    cleaned = lang.strip()

    # Already in NLLB format: fix the casing instead of lower-casing the
    # whole code, which would turn "eng_Latn" into the unknown "eng_latn".
    if "_" in cleaned:
        language_part, _, script_part = cleaned.partition("_")
        return f"{language_part.lower()}_{script_part.capitalize()}"

    # Plain name or short code: look it up, passing unknown values through.
    lookup_key = cleaned.lower()
    return LANGUAGE_CODES.get(lookup_key, lookup_key)
197
-
198
-
199
def get_supported_languages() -> List[str]:
    """
    Get list of supported language codes.

    Several aliases in LANGUAGE_CODES map to the same code, so the values
    are de-duplicated. The result is sorted so that callers get the same
    order on every call and in every process (a bare set has no defined
    order).

    Returns:
        Alphabetically sorted list of the distinct NLLB-200 language codes
        found in LANGUAGE_CODES.
    """
    return sorted(set(LANGUAGE_CODES.values()))
207
-
208
-
209
def _translation_result(
    translated_text: str,
    source_lang: str,
    target_lang: str,
    original_text: str,
    available: bool = True,
    **extra: Any
) -> Dict[str, Any]:
    """Assemble the response dictionary returned by translate_text."""
    result: Dict[str, Any] = {
        "translated_text": translated_text,
        "source_lang": source_lang,
        "target_lang": target_lang,
        "original_text": original_text,
        "available": available,
    }
    result.update(extra)
    return result


async def translate_text(
    text: str,
    source_language: str = "eng_Latn",
    target_language: str = "spa_Latn",
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Translates text from source language to target language using NLLB-200.

    The function never raises for translation failures: every failure path
    logs the problem and returns a result dictionary carrying an "error"
    entry, normally with the original text as the fallback translation.
    Only task cancellation is re-raised.

    Args:
        text: The text to translate (non-empty string, at most 5,000 characters).
        source_language: Source language code (e.g., "eng_Latn", "spanish", "es")
        target_language: Target language code (e.g., "spa_Latn", "french", "fr")
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
            - translated_text (str): The translated text (the original text,
              or "" for invalid input, when translation did not happen)
            - source_lang (str): Source language code (normalized once input
              validation has passed; as supplied by the caller before that)
            - target_lang (str): Target language code (same rule as source_lang)
            - original_text (str): The input text
            - available (bool): Whether the service was available
            - error (str, optional): Error message if translation failed
            - skipped (bool, optional): True when source and target are identical
            - response_time_ms (int, optional): Translation time in milliseconds

    Raises:
        asyncio.CancelledError: Re-raised after logging if the task is cancelled.
    """
    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # wall-clock adjustments.
    started = time.perf_counter()

    def elapsed_ms() -> int:
        return int((time.perf_counter() - started) * 1000)

    # Check availability
    if not is_translation_available():
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Translation pipeline not available",
            fallback_used=True
        )
        return _translation_result(
            text,  # Return original text as fallback
            source_language,
            target_language,
            text,
            available=False,
            error="Translation service is temporarily unavailable."
        )

    # Validate input
    if not text or not isinstance(text, str):
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Invalid text input"
        )
        return _translation_result(
            "",
            source_language,
            target_language,
            text if isinstance(text, str) else "",
            error="Invalid text input provided."
        )

    # Check text length (prevent processing extremely long texts)
    if len(text) > 5000:  # 5k character limit for translation
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error=f"Text too long: {len(text)} characters",
            text_preview=sanitize_for_logging(text[:100])
        )
        return _translation_result(
            text,
            source_language,
            target_language,
            text,
            error="Text is too long for translation (max 5,000 characters)."
        )

    # Normalize language codes
    src_lang = normalize_language_code(source_language)
    tgt_lang = normalize_language_code(target_language)

    # Skip translation if source and target are the same
    if src_lang == tgt_lang:
        log_interaction(
            intent="translation_skipped",
            tenant_id=tenant_id,
            success=True,
            details="Source and target languages are identical"
        )
        return _translation_result(text, src_lang, tgt_lang, text, skipped=True)

    try:
        # Inside a coroutine the running loop is the one to use.
        loop = asyncio.get_running_loop()

        # Bind the pipeline to a local so the worker thread calls the object
        # that is loaded now, even if the module global is reassigned later.
        pipeline = TRANSLATION_PIPELINE

        # Run model inference in thread executor
        # NLLB pipeline expects text and language parameters
        results = await loop.run_in_executor(
            None,
            lambda: pipeline(
                text,
                src_lang=src_lang,
                tgt_lang=tgt_lang
            )
        )

        response_time_ms = elapsed_ms()

        # Validate results
        if not results or not isinstance(results, list) or len(results) == 0:
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error="Empty or invalid model output",
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
            return _translation_result(
                text,  # Fallback to original
                src_lang,
                tgt_lang,
                text,
                error="Translation returned unexpected format."
            )

        # NLLB returns format: [{'translation_text': '...'}]
        translated = results[0].get('translation_text', '').strip()

        if not translated:
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error="Empty translation result",
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
            return _translation_result(
                text,  # Fallback to original
                src_lang,
                tgt_lang,
                text,
                error="Translation produced empty result."
            )

        # Log slow translations
        if response_time_ms > 5000:  # 5 seconds
            log_interaction(
                intent="translation_slow",
                tenant_id=tenant_id,
                success=True,
                response_time_ms=response_time_ms,
                details="Slow translation detected",
                source_lang=src_lang,
                target_lang=tgt_lang,
                text_length=len(text)
            )

        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=True,
            response_time_ms=response_time_ms,
            source_lang=src_lang,
            target_lang=tgt_lang,
            text_length=len(text)
        )

        return _translation_result(
            translated,
            src_lang,
            tgt_lang,
            text,
            response_time_ms=response_time_ms
        )

    except asyncio.CancelledError:
        # Record the cancellation, then let it propagate to the caller.
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Translation cancelled",
            source_lang=src_lang,
            target_lang=tgt_lang
        )
        raise

    except Exception as e:
        response_time_ms = elapsed_ms()

        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error=str(e),
            response_time_ms=response_time_ms,
            source_lang=src_lang,
            target_lang=tgt_lang,
            text_preview=sanitize_for_logging(text[:100]),
            fallback_used=True
        )

        return _translation_result(
            text,  # Fallback to original
            src_lang,
            tgt_lang,
            text,
            available=False,
            error=str(e),
            response_time_ms=response_time_ms
        )
437
-
438
-
439
# Characters taken as a sign of Spanish text; compared against lower-cased
# input so that upper-case forms such as "Ñ" or "Á" are recognised too.
_SPANISH_MARKERS = frozenset("¿¡ñáéíóú")

# (language code, first code point, last code point), tried in this order.
_SCRIPT_RANGES = (
    ("zho_Hans", "\u4e00", "\u9fff"),  # Chinese characters
    ("arb_Arab", "\u0600", "\u06ff"),  # Arabic script
    ("rus_Cyrl", "\u0400", "\u04ff"),  # Cyrillic (Russian)
    ("hin_Deva", "\u0900", "\u097f"),  # Devanagari (Hindi)
)


def _guess_source_language(text: str) -> str:
    """Guess the NLLB-200 code of text from the characters it contains."""
    if not _SPANISH_MARKERS.isdisjoint(text.lower()):
        return "spa_Latn"

    for lang_code, first, last in _SCRIPT_RANGES:
        if any(first <= char <= last for char in text):
            return lang_code

    return "eng_Latn"  # Default assumption


async def detect_and_translate(
    text: str,
    target_language: str = "eng_Latn",
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Attempts to detect the source language and translate to target.

    Note: This is a simplified heuristic-based detection. It looks for
    Spanish punctuation/accented letters first (in either letter case),
    then for Chinese, Arabic, Cyrillic and Devanagari characters, and
    otherwise assumes English. For production, consider integrating a
    dedicated language detection model.

    Args:
        text: The text to translate
        target_language: Target language code
        tenant_id: Optional tenant identifier for logging

    Returns:
        Translation result dictionary from translate_text with an added
        "detected_lang" entry. For empty or non-string input, a dictionary
        with an "error" entry and "detected_lang" set to "unknown" is
        returned without attempting translation.
    """
    if not text or not isinstance(text, str):
        return {
            "translated_text": "",
            "detected_lang": "unknown",
            "target_lang": target_language,
            "original_text": text if isinstance(text, str) else "",
            "available": True,
            "error": "Invalid text input."
        }

    detected_lang = _guess_source_language(text)

    log_interaction(
        intent="language_detection",
        tenant_id=tenant_id,
        success=True,
        detected_lang=detected_lang,
        text_preview=sanitize_for_logging(text[:50])
    )

    result = await translate_text(text, detected_lang, target_language, tenant_id)
    result["detected_lang"] = detected_lang

    return result
499
-
500
-
501
async def batch_translate(
    texts: List[str],
    source_language: str = "eng_Latn",
    target_language: str = "spa_Latn",
    tenant_id: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Translate a list of texts one after another.

    Entries that are not strings or are blank are dropped before
    translation, and only the first 50 remaining entries are processed, so
    the returned list can be shorter than the input list.

    Args:
        texts: List of strings to translate
        source_language: Source language code
        target_language: Target language code
        tenant_id: Optional tenant identifier for logging

    Returns:
        List of translation result dictionaries, one per translated entry.
        An empty list is returned when the input is not a non-empty list or
        contains no usable text.
    """
    max_batch = 50  # Batch size limit

    if not texts or not isinstance(texts, list):
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=False,
            error="Invalid texts input"
        )
        return []

    # Keep only non-blank strings, then cap the batch.
    usable = [entry for entry in texts if isinstance(entry, str) and entry.strip()]
    if len(usable) > max_batch:
        usable = usable[:max_batch]
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=None,
            details="Batch size limited to 50 texts"
        )

    if not usable:
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=False,
            error="No valid texts in batch"
        )
        return []

    began = time.time()

    # Each translation is awaited before the next one starts.
    translations = [
        await translate_text(entry, source_language, target_language, tenant_id)
        for entry in usable
    ]

    elapsed_ms = int((time.time() - began) * 1000)

    log_interaction(
        intent="batch_translation",
        tenant_id=tenant_id,
        success=True,
        response_time_ms=elapsed_ms,
        batch_size=len(usable),
        source_lang=normalize_language_code(source_language),
        target_lang=normalize_language_code(target_language)
    )

    return translations
568
-
569
-
570
def get_civic_phrase(
    phrase_key: str,
    language: str = "eng_Latn"
) -> str:
    """
    Get a pre-translated civic phrase for common queries.

    The language is normalized with normalize_language_code and then
    matched against the keys of CIVIC_PHRASES. If there is no exact match,
    the keys are compared ignoring letter case, so a code such as
    "eng_latn" still finds the "eng_Latn" phrases.

    Args:
        phrase_key: Key for the civic phrase (e.g., "voting_location")
        language: Target language name or code

    Returns:
        Translated phrase or empty string if not found
    """
    if not phrase_key or not isinstance(phrase_key, str):
        return ""

    lang_code = normalize_language_code(language)
    phrases = CIVIC_PHRASES.get(lang_code)

    if phrases is None:
        # Codes may arrive in a different letter case than the table keys
        # (e.g. "eng_latn"); fall back to a case-insensitive match.
        wanted = lang_code.lower()
        phrases = {}
        for known_code, known_phrases in CIVIC_PHRASES.items():
            if known_code.lower() == wanted:
                lang_code, phrases = known_code, known_phrases
                break

    phrase = phrases.get(phrase_key, "")

    # Only successful lookups are logged.
    if phrase:
        log_interaction(
            intent="civic_phrase_lookup",
            success=True,
            phrase_key=phrase_key,
            language=lang_code
        )

    return phrase