pythonprincess committed on
Commit
37a090b
·
verified ·
1 Parent(s): 247ae88

Delete translation_utils.py

Browse files
Files changed (1) hide show
  1. translation_utils.py +0 -578
translation_utils.py DELETED
@@ -1,578 +0,0 @@
1
- # models/translation/translation_utils.py
2
-
3
- """
4
- Translation Model Utilities for PENNY Project
5
- Handles multilingual translation using NLLB-200 for civic engagement accessibility.
6
- Provides async translation with structured error handling and language code normalization.
7
- """
8
-
9
- import asyncio
10
- import time
11
- import os
12
- import httpx
13
- from typing import Dict, Any, Optional, List
14
-
15
- # --- Logging Imports ---
16
- from app.logging_utils import log_interaction, sanitize_for_logging
17
-
18
# --- Hugging Face API Configuration ---
HF_API_URL = "https://api-inference.huggingface.co/models/facebook/nllb-200-distilled-600M"

# Strip surrounding whitespace before use: the token is interpolated into the
# Authorization header, and secrets read from files or CI variables often carry
# a trailing newline. An unset or blank variable is stored as None.
HF_TOKEN = (os.getenv("HF_TOKEN") or "").strip() or None

AGENT_NAME = "penny-translate-agent"
SERVICE_AVAILABLE = True  # Assume available since we're using API

# NLLB-200 Language Code Mapping (Common languages for civic engagement)
# Keys are lower-case names or short codes; values are NLLB-200 codes in the
# case-sensitive "<language>_<Script>" form.
LANGUAGE_CODES = {
    # English variants
    "english": "eng_Latn",
    "en": "eng_Latn",

    # Spanish variants
    "spanish": "spa_Latn",
    "es": "spa_Latn",
    "español": "spa_Latn",

    # French
    "french": "fra_Latn",
    "fr": "fra_Latn",
    "français": "fra_Latn",

    # Mandarin Chinese
    "chinese": "zho_Hans",
    "mandarin": "zho_Hans",
    "zh": "zho_Hans",

    # Arabic
    "arabic": "arb_Arab",
    "ar": "arb_Arab",

    # Hindi
    "hindi": "hin_Deva",
    "hi": "hin_Deva",

    # Portuguese
    "portuguese": "por_Latn",
    "pt": "por_Latn",

    # Russian
    "russian": "rus_Cyrl",
    "ru": "rus_Cyrl",

    # German
    "german": "deu_Latn",
    "de": "deu_Latn",

    # Vietnamese
    "vietnamese": "vie_Latn",
    "vi": "vie_Latn",

    # Tagalog
    "tagalog": "tgl_Latn",
    "tl": "tgl_Latn",

    # Urdu
    "urdu": "urd_Arab",
    "ur": "urd_Arab",

    # Swahili
    "swahili": "swh_Latn",
    "sw": "swh_Latn",
}

# Pre-translated civic phrases for common queries
# Outer keys are NLLB-200 codes; inner keys are phrase identifiers shared by
# every language.
CIVIC_PHRASES = {
    "eng_Latn": {
        "voting_location": "Where is my polling place?",
        "voter_registration": "How do I register to vote?",
        "city_services": "What city services are available?",
        "report_issue": "I want to report a problem.",
        "contact_city": "How do I contact city hall?",
    },
    "spa_Latn": {
        "voting_location": "¿Dónde está mi lugar de votación?",
        "voter_registration": "¿Cómo me registro para votar?",
        "city_services": "¿Qué servicios de la ciudad están disponibles?",
        "report_issue": "Quiero reportar un problema.",
        "contact_city": "¿Cómo contacto al ayuntamiento?",
    },
}
100
-
101
-
102
def is_translation_available() -> bool:
    """
    Check if translation service is available.

    Availability is decided purely from the module-level ``HF_TOKEN``; no
    network request is made. A token that is missing, empty, or made up only
    of whitespace counts as "not configured".

    Returns:
        bool: True if translation API is configured and ready.
    """
    return isinstance(HF_TOKEN, str) and bool(HF_TOKEN.strip())
110
-
111
-
112
def normalize_language_code(lang: str) -> str:
    """
    Converts common language names/codes to NLLB-200 format.

    NLLB-200 codes are case-sensitive and look like ``eng_Latn``: a lower-case
    language part, an underscore, and a capitalized script part. Input that
    already contains an underscore is re-cased into that shape instead of
    being lower-cased wholesale, so ``"eng_Latn"`` round-trips unchanged and
    ``"SPA_LATN"`` becomes ``"spa_Latn"``.

    Args:
        lang: Language name or code (e.g., "spanish", "es", "español")

    Returns:
        NLLB-200 language code (e.g., "spa_Latn"). Missing, non-string, or
        blank input falls back to "eng_Latn". A name that is not in
        ``LANGUAGE_CODES`` is returned lower-cased and otherwise unchanged.
    """
    if not lang or not isinstance(lang, str):
        return "eng_Latn"  # Default to English

    cleaned = lang.strip()
    if not cleaned:
        return "eng_Latn"  # Whitespace-only input is treated like missing input

    # Already in NLLB format (contains underscore): fix the casing only.
    if "_" in cleaned:
        language, _, script = cleaned.partition("_")
        return f"{language.lower()}_{script.capitalize()}"

    # Look up in mapping
    key = cleaned.lower()
    return LANGUAGE_CODES.get(key, key)
133
-
134
-
135
def get_supported_languages() -> List[str]:
    """
    Get list of supported language codes.

    The codes are the distinct values of ``LANGUAGE_CODES``, sorted so that
    the result is the same on every call and in every process.

    Returns:
        List of NLLB-200 language codes supported by PENNY.
    """
    return sorted(set(LANGUAGE_CODES.values()))
143
-
144
-
145
async def translate_text(
    text: str,
    source_language: str = "eng_Latn",
    target_language: str = "spa_Latn",
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Translates text from source language to target language using NLLB-200.

    The request goes to the Hugging Face Inference API at ``HF_API_URL``.
    Every failure mode except cancellation is reported through the returned
    dictionary rather than raised, and in those cases ``translated_text``
    falls back to the original text (or "" for invalid input).

    Args:
        text: The text to translate.
        source_language: Source language code (e.g., "eng_Latn", "spanish", "es")
        target_language: Target language code (e.g., "spa_Latn", "french", "fr")
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
            - translated_text (str): The translated text
            - source_lang (str): Normalized source language code
            - target_lang (str): Normalized target language code
            - original_text (str): The input text
            - available (bool): Whether the service was available
            - error (str, optional): Error message if translation failed
            - response_time_ms (int, optional): Translation time in
              milliseconds, present whenever a request was attempted
            - skipped (bool, optional): True when source and target
              languages are identical and no request was made

    Raises:
        asyncio.CancelledError: Re-raised after logging when the task is
            cancelled while the request is in flight.
    """
    # perf_counter is monotonic, so the measured duration cannot go negative
    # or jump if the system clock is adjusted mid-request.
    start_time = time.perf_counter()

    def elapsed_ms() -> int:
        """Milliseconds elapsed since this call started."""
        return int((time.perf_counter() - start_time) * 1000)

    # Normalize up front so every branch reports the same codes.
    src_lang = normalize_language_code(source_language)
    tgt_lang = normalize_language_code(target_language)

    def build_result(
        translated_text: Any,
        available: bool,
        error: Optional[str] = None,
        response_time_ms: Optional[int] = None,
        original_text: Any = text,
    ) -> Dict[str, Any]:
        """Assemble the response dict; optional keys appear only when set."""
        result: Dict[str, Any] = {
            "translated_text": translated_text,
            "source_lang": src_lang,
            "target_lang": tgt_lang,
            "original_text": original_text,
            "available": available,
        }
        if error is not None:
            result["error"] = error
        if response_time_ms is not None:
            result["response_time_ms"] = response_time_ms
        return result

    # Check availability
    if not is_translation_available():
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Translation API not configured (missing HF_TOKEN)",
            fallback_used=True
        )
        # Return original text as fallback
        return build_result(
            text,
            available=False,
            error="Translation service is temporarily unavailable."
        )

    # Validate input
    if not text or not isinstance(text, str):
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Invalid text input"
        )
        return build_result(
            "",
            available=True,
            error="Invalid text input provided.",
            original_text=text if isinstance(text, str) else ""
        )

    # Check text length (prevent processing extremely long texts)
    if len(text) > 5000:  # 5k character limit for translation
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error=f"Text too long: {len(text)} characters",
            text_preview=sanitize_for_logging(text[:100])
        )
        return build_result(
            text,
            available=True,
            error="Text is too long for translation (max 5,000 characters)."
        )

    # Skip translation if source and target are the same
    if src_lang == tgt_lang:
        log_interaction(
            intent="translation_skipped",
            tenant_id=tenant_id,
            success=True,
            details="Source and target languages are identical"
        )
        skipped_result = build_result(text, available=True)
        skipped_result["skipped"] = True
        return skipped_result

    try:
        # Prepare API request
        headers = {"Authorization": f"Bearer {HF_TOKEN}"}
        payload = {
            "inputs": text,
            "parameters": {
                "src_lang": src_lang,
                "tgt_lang": tgt_lang
            }
        }

        # Call Hugging Face Inference API
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(HF_API_URL, json=payload, headers=headers)

        response_time_ms = elapsed_ms()

        if response.status_code != 200:
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error=f"API returned status {response.status_code}",
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang,
                fallback_used=True
            )
            return build_result(
                text,  # Fallback to original
                available=False,
                error=f"Translation API error: {response.status_code}",
                response_time_ms=response_time_ms
            )

        results = response.json()

        # Validate results. The expected shape is [{'translation_text': '...'}];
        # anything else (empty list, error dict, non-dict first item) is
        # reported as an unexpected format instead of tripping the catch-all.
        first_item = results[0] if isinstance(results, list) and results else None
        if not isinstance(first_item, dict):
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error="Empty or invalid model output",
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
            return build_result(
                text,  # Fallback to original
                available=True,
                error="Translation returned unexpected format.",
                response_time_ms=response_time_ms
            )

        raw_translation = first_item.get('translation_text', '')
        translated = raw_translation.strip() if isinstance(raw_translation, str) else ''

        if not translated:
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error="Empty translation result",
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
            return build_result(
                text,  # Fallback to original
                available=True,
                error="Translation produced empty result.",
                response_time_ms=response_time_ms
            )

        # Log slow translations
        if response_time_ms > 5000:  # 5 seconds
            log_interaction(
                intent="translation_slow",
                tenant_id=tenant_id,
                success=True,
                response_time_ms=response_time_ms,
                details="Slow translation detected",
                source_lang=src_lang,
                target_lang=tgt_lang,
                text_length=len(text)
            )

        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=True,
            response_time_ms=response_time_ms,
            source_lang=src_lang,
            target_lang=tgt_lang,
            text_length=len(text)
        )

        return build_result(
            translated,
            available=True,
            response_time_ms=response_time_ms
        )

    except httpx.TimeoutException:
        response_time_ms = elapsed_ms()
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Translation request timed out",
            response_time_ms=response_time_ms,
            source_lang=src_lang,
            target_lang=tgt_lang,
            fallback_used=True
        )
        return build_result(
            text,  # Fallback to original
            available=False,
            error="Translation request timed out.",
            response_time_ms=response_time_ms
        )

    except asyncio.CancelledError:
        # Record the cancellation, then let it propagate so the caller's
        # cancellation still takes effect.
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Translation cancelled",
            source_lang=src_lang,
            target_lang=tgt_lang
        )
        raise

    except Exception as e:
        # Boundary handler: any other failure (network error, invalid JSON,
        # ...) degrades to returning the original text.
        response_time_ms = elapsed_ms()

        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error=str(e),
            response_time_ms=response_time_ms,
            source_lang=src_lang,
            target_lang=tgt_lang,
            text_preview=sanitize_for_logging(text[:100]),
            fallback_used=True
        )

        return build_result(
            text,  # Fallback to original
            available=False,
            error=str(e),
            response_time_ms=response_time_ms
        )
417
-
418
-
419
def _guess_source_language(text: str) -> str:
    """Guess an NLLB-200 code for *text* from the characters it contains."""
    # Lower-case first so upper-case text such as "¿DÓNDE ESTÁ...?" matches too.
    if not set("¿¡ñáéíóú").isdisjoint(text.lower()):
        return "spa_Latn"

    # Checked in order; the first Unicode range with any character wins.
    script_ranges = (
        ('\u4e00', '\u9fff', "zho_Hans"),  # Chinese characters
        ('\u0600', '\u06ff', "arb_Arab"),  # Arabic script
        ('\u0400', '\u04ff', "rus_Cyrl"),  # Cyrillic (Russian)
        ('\u0900', '\u097f', "hin_Deva"),  # Devanagari (Hindi)
    )
    for low, high, code in script_ranges:
        if any(low <= char <= high for char in text):
            return code

    return "eng_Latn"  # Default assumption


async def detect_and_translate(
    text: str,
    target_language: str = "eng_Latn",
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Attempts to detect the source language and translate to target.

    Note: This is a simplified heuristic-based detection. For production,
    consider integrating a dedicated language detection model. The heuristic
    only looks for marker characters: Spanish punctuation and accented letters
    (in either letter case), then the Chinese, Arabic, Cyrillic and Devanagari
    Unicode ranges. Text with none of these is assumed to be English, and
    languages that share a script (e.g. Arabic and Urdu) are not told apart.

    Args:
        text: The text to translate
        target_language: Target language code
        tenant_id: Optional tenant identifier for logging

    Returns:
        Translation result dictionary from ``translate_text`` with an added
        ``detected_lang`` key. For empty or non-string input no translation is
        attempted and ``detected_lang`` is "unknown".
    """
    if not text or not isinstance(text, str):
        return {
            "translated_text": "",
            "detected_lang": "unknown",
            "target_lang": target_language,
            "original_text": text if isinstance(text, str) else "",
            "available": True,
            "error": "Invalid text input."
        }

    detected_lang = _guess_source_language(text)

    log_interaction(
        intent="language_detection",
        tenant_id=tenant_id,
        success=True,
        detected_lang=detected_lang,
        text_preview=sanitize_for_logging(text[:50])
    )

    result = await translate_text(text, detected_lang, target_language, tenant_id)
    result["detected_lang"] = detected_lang

    return result
479
-
480
-
481
async def batch_translate(
    texts: List[str],
    source_language: str = "eng_Latn",
    target_language: str = "spa_Latn",
    tenant_id: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Translate multiple texts at once.

    Entries that are not strings or are blank are dropped before translation,
    and at most 50 of the remaining texts are processed, so the returned list
    can be shorter than ``texts`` and is not index-aligned with it. Texts are
    translated one after another, not concurrently.

    Args:
        texts: List of strings to translate
        source_language: Source language code
        target_language: Target language code
        tenant_id: Optional tenant identifier for logging

    Returns:
        List of translation result dictionaries, one per translated text in
        input order; an empty list if ``texts`` is not a non-empty list or
        contains no usable text.
    """
    max_batch_size = 50  # Batch size limit

    if not texts or not isinstance(texts, list):
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=False,
            error="Invalid texts input"
        )
        return []

    # Filter valid texts and limit batch size
    valid_texts = [t for t in texts if isinstance(t, str) and t.strip()]
    if len(valid_texts) > max_batch_size:
        valid_texts = valid_texts[:max_batch_size]
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=None,
            details=f"Batch size limited to {max_batch_size} texts"
        )

    if not valid_texts:
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=False,
            error="No valid texts in batch"
        )
        return []

    # Monotonic clock: unaffected by system clock adjustments.
    start_time = time.perf_counter()
    results = []

    for text in valid_texts:
        result = await translate_text(text, source_language, target_language, tenant_id)
        results.append(result)

    response_time_ms = int((time.perf_counter() - start_time) * 1000)

    # translate_text reports failures through an "error" key instead of
    # raising, so count those to keep the batch summary honest.
    failed_count = sum(1 for result in results if "error" in result)

    log_interaction(
        intent="batch_translation",
        tenant_id=tenant_id,
        success=failed_count == 0,
        response_time_ms=response_time_ms,
        details=f"{failed_count} of {len(results)} translations failed",
        batch_size=len(valid_texts),
        source_lang=normalize_language_code(source_language),
        target_lang=normalize_language_code(target_language)
    )

    return results
548
-
549
-
550
def get_civic_phrase(
    phrase_key: str,
    language: str = "eng_Latn"
) -> str:
    """
    Get a pre-translated civic phrase for common queries.

    The language is normalized with ``normalize_language_code`` and then
    matched against the keys of ``CIVIC_PHRASES`` without regard to letter
    case. A successful lookup is logged; a miss is not.

    Args:
        phrase_key: Key for the civic phrase (e.g., "voting_location")
        language: Target language code

    Returns:
        Translated phrase or empty string if not found
    """
    if not phrase_key or not isinstance(phrase_key, str):
        return ""

    lang_code = normalize_language_code(language)

    # Compare case-insensitively so the lookup does not depend on the letter
    # case of the normalized code ("eng_latn" must still find "eng_Latn").
    wanted = lang_code.lower()
    phrases = {}
    for table_code, table in CIVIC_PHRASES.items():
        if table_code.lower() == wanted:
            lang_code = table_code  # Log the canonical table key
            phrases = table
            break

    phrase = phrases.get(phrase_key, "")

    if phrase:
        log_interaction(
            intent="civic_phrase_lookup",
            success=True,
            phrase_key=phrase_key,
            language=lang_code
        )

    return phrase