pythonprincess committed on
Commit
fa1c85b
·
verified ·
1 Parent(s): ef745a7

Upload 2 files

Browse files
models/translation/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Translation Model Package
2
+
models/translation/translation_utils.py ADDED
@@ -0,0 +1,578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # models/translation/translation_utils.py
2
+
3
+ """
4
+ Translation Model Utilities for PENNY Project
5
+ Handles multilingual translation using NLLB-200 for civic engagement accessibility.
6
+ Provides async translation with structured error handling and language code normalization.
7
+ """
8
+
9
+ import asyncio
10
+ import time
11
+ import os
12
+ import httpx
13
+ from typing import Dict, Any, Optional, List
14
+
15
+ # --- Logging Imports ---
16
+ from app.logging_utils import log_interaction, sanitize_for_logging
17
+
18
# --- Hugging Face Inference API settings ---
# Endpoint of the hosted NLLB-200 (distilled, 600M) model used for translation.
HF_API_URL = "https://api-inference.huggingface.co/models/facebook/nllb-200-distilled-600M"
# Bearer token read from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Name identifying this agent.
AGENT_NAME = "penny-translate-agent"
# Static flag: assumed available because translation goes through the hosted API.
SERVICE_AVAILABLE = True
24
+
25
# NLLB-200 language codes with the names/short codes accepted for each one
# (common languages for civic engagement). Listed in lookup-table order.
_LANGUAGE_ALIASES = (
    ("eng_Latn", ("english", "en")),
    ("spa_Latn", ("spanish", "es", "español")),
    ("fra_Latn", ("french", "fr", "français")),
    ("zho_Hans", ("chinese", "mandarin", "zh")),  # Mandarin Chinese
    ("arb_Arab", ("arabic", "ar")),
    ("hin_Deva", ("hindi", "hi")),
    ("por_Latn", ("portuguese", "pt")),
    ("rus_Cyrl", ("russian", "ru")),
    ("deu_Latn", ("german", "de")),
    ("vie_Latn", ("vietnamese", "vi")),
    ("tgl_Latn", ("tagalog", "tl")),
    ("urd_Arab", ("urdu", "ur")),
    ("swh_Latn", ("swahili", "sw")),
)

# Flat alias -> NLLB-200 code mapping used for lookups.
LANGUAGE_CODES = {
    alias: nllb_code
    for nllb_code, aliases in _LANGUAGE_ALIASES
    for alias in aliases
}
82
+
83
# Ready-made civic phrases for common queries, keyed first by NLLB-200
# language code and then by phrase key.
CIVIC_PHRASES = {
    "eng_Latn": dict(
        voting_location="Where is my polling place?",
        voter_registration="How do I register to vote?",
        city_services="What city services are available?",
        report_issue="I want to report a problem.",
        contact_city="How do I contact city hall?",
    ),
    "spa_Latn": dict(
        voting_location="¿Dónde está mi lugar de votación?",
        voter_registration="¿Cómo me registro para votar?",
        city_services="¿Qué servicios de la ciudad están disponibles?",
        report_issue="Quiero reportar un problema.",
        contact_city="¿Cómo contacto al ayuntamiento?",
    ),
}
100
+
101
+
102
def is_translation_available() -> bool:
    """
    Report whether the translation API can be called.

    Returns:
        bool: True when HF_TOKEN is set to a non-empty value, False otherwise.
    """
    token = HF_TOKEN
    if token is None:
        return False
    return len(token) > 0
110
+
111
+
112
def normalize_language_code(lang: str) -> str:
    """
    Converts common language names/codes to NLLB-200 format.

    NLLB-200 codes are case-sensitive and follow the ``xxx_Yyyy`` pattern
    (lower-case language, capitalized script), so input that already contains
    an underscore is re-cased into that pattern instead of being lower-cased
    wholesale.

    Args:
        lang: Language name or code (e.g., "spanish", "es", "español",
            "spa_Latn", "SPA_latn").

    Returns:
        NLLB-200 language code (e.g., "spa_Latn"). Empty, whitespace-only or
        non-string input yields "eng_Latn". A name that is not in
        LANGUAGE_CODES is returned lower-cased and otherwise unchanged.
    """
    if not lang or not isinstance(lang, str):
        return "eng_Latn"  # Default to English

    cleaned = lang.strip()
    if not cleaned:
        # Whitespace-only input carries no language information.
        return "eng_Latn"

    # Already in NLLB format (contains underscore): restore canonical casing,
    # e.g. "eng_latn" / "ENG_LATN" -> "eng_Latn".
    if "_" in cleaned:
        language_part, script_part = cleaned.split("_", 1)
        return f"{language_part.lower()}_{script_part.capitalize()}"

    # Look up friendly names and short codes in the alias mapping.
    lookup_key = cleaned.lower()
    return LANGUAGE_CODES.get(lookup_key, lookup_key)
133
+
134
+
135
def get_supported_languages() -> List[str]:
    """
    Get list of supported language codes.

    Returns:
        The distinct NLLB-200 codes found in LANGUAGE_CODES, sorted
        alphabetically so the order is the same on every call and in every
        process (iterating a bare set of strings is not).
    """
    return sorted(set(LANGUAGE_CODES.values()))
143
+
144
+
145
def _translation_result(
    translated_text: str,
    source_lang: str,
    target_lang: str,
    original_text: str,
    available: bool,
    **extra: Any
) -> Dict[str, Any]:
    """Assemble the result dictionary shared by every exit path of translate_text."""
    result: Dict[str, Any] = {
        "translated_text": translated_text,
        "source_lang": source_lang,
        "target_lang": target_lang,
        "original_text": original_text,
        "available": available,
    }
    # Optional keys such as "error", "response_time_ms" or "skipped".
    result.update(extra)
    return result


async def translate_text(
    text: str,
    source_language: str = "eng_Latn",
    target_language: str = "spa_Latn",
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Translates text from source language to target language using NLLB-200.

    The request is sent to the Hugging Face Inference API at HF_API_URL with a
    30 second timeout. Every failure path returns the original text as a
    fallback (except invalid input, which returns an empty string) and is
    reported through log_interaction; only task cancellation is re-raised.

    Args:
        text: The text to translate (at most 5,000 characters).
        source_language: Source language code (e.g., "eng_Latn", "spanish", "es")
        target_language: Target language code (e.g., "spa_Latn", "french", "fr")
        tenant_id: Optional tenant identifier for logging.

    Returns:
        A dictionary containing:
            - translated_text (str): The translated text
            - source_lang (str): Source language code (normalized once the
              input checks have passed, otherwise as supplied)
            - target_lang (str): Target language code (same rule as above)
            - original_text (str): The input text
            - available (bool): Whether the service was available
            - error (str, optional): Error message if translation failed
            - response_time_ms (int, optional): Translation time in milliseconds
            - skipped (bool, optional): True when source and target match

    Raises:
        asyncio.CancelledError: Re-raised after logging when the task is cancelled.
    """
    started = time.perf_counter()

    def elapsed_ms() -> int:
        # perf_counter is monotonic, so the measured duration cannot jump or
        # go negative if the system clock is adjusted during the request.
        return int((time.perf_counter() - started) * 1000)

    # Check availability
    if not is_translation_available():
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Translation API not configured (missing HF_TOKEN)",
            fallback_used=True
        )
        return _translation_result(
            text, source_language, target_language, text, False,
            error="Translation service is temporarily unavailable."
        )

    # Validate input
    if not text or not isinstance(text, str):
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Invalid text input"
        )
        return _translation_result(
            "", source_language, target_language,
            text if isinstance(text, str) else "", True,
            error="Invalid text input provided."
        )

    # Check text length (prevent processing extremely long texts)
    if len(text) > 5000:  # 5k character limit for translation
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error=f"Text too long: {len(text)} characters",
            text_preview=sanitize_for_logging(text[:100])
        )
        return _translation_result(
            text, source_language, target_language, text, True,
            error="Text is too long for translation (max 5,000 characters)."
        )

    # Normalize language codes
    src_lang = normalize_language_code(source_language)
    tgt_lang = normalize_language_code(target_language)

    # Skip translation if source and target are the same
    if src_lang == tgt_lang:
        log_interaction(
            intent="translation_skipped",
            tenant_id=tenant_id,
            success=True,
            details="Source and target languages are identical"
        )
        return _translation_result(
            text, src_lang, tgt_lang, text, True,
            skipped=True
        )

    try:
        # Prepare API request
        headers = {"Authorization": f"Bearer {HF_TOKEN}"}
        payload = {
            "inputs": text,
            "parameters": {
                "src_lang": src_lang,
                "tgt_lang": tgt_lang
            }
        }

        # Call Hugging Face Inference API
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(HF_API_URL, json=payload, headers=headers)

        response_time_ms = elapsed_ms()

        if response.status_code != 200:
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error=f"API returned status {response.status_code}",
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang,
                fallback_used=True
            )
            return _translation_result(
                text, src_lang, tgt_lang, text, False,
                error=f"Translation API error: {response.status_code}",
                response_time_ms=response_time_ms
            )

        results = response.json()

        # Expected format: [{'translation_text': '...'}]. Anything else
        # (empty, not a list, first element not a dict) is rejected here so a
        # malformed payload never surfaces as a raw AttributeError message.
        first_item = results[0] if isinstance(results, list) and results else None
        if not isinstance(first_item, dict):
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error="Empty or invalid model output",
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
            return _translation_result(
                text, src_lang, tgt_lang, text, True,
                error="Translation returned unexpected format."
            )

        raw_translation = first_item.get('translation_text', '')
        # A missing or non-string value is treated the same as an empty one.
        translated = raw_translation.strip() if isinstance(raw_translation, str) else ""

        if not translated:
            log_interaction(
                intent="translation",
                tenant_id=tenant_id,
                success=False,
                error="Empty translation result",
                response_time_ms=response_time_ms,
                source_lang=src_lang,
                target_lang=tgt_lang
            )
            return _translation_result(
                text, src_lang, tgt_lang, text, True,
                error="Translation produced empty result."
            )

        # Log slow translations
        if response_time_ms > 5000:  # 5 seconds
            log_interaction(
                intent="translation_slow",
                tenant_id=tenant_id,
                success=True,
                response_time_ms=response_time_ms,
                details="Slow translation detected",
                source_lang=src_lang,
                target_lang=tgt_lang,
                text_length=len(text)
            )

        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=True,
            response_time_ms=response_time_ms,
            source_lang=src_lang,
            target_lang=tgt_lang,
            text_length=len(text)
        )

        return _translation_result(
            translated, src_lang, tgt_lang, text, True,
            response_time_ms=response_time_ms
        )

    except httpx.TimeoutException:
        response_time_ms = elapsed_ms()
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Translation request timed out",
            response_time_ms=response_time_ms,
            source_lang=src_lang,
            target_lang=tgt_lang,
            fallback_used=True
        )
        return _translation_result(
            text, src_lang, tgt_lang, text, False,
            error="Translation request timed out.",
            response_time_ms=response_time_ms
        )

    except asyncio.CancelledError:
        # Record the cancellation, then let it propagate to the caller.
        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error="Translation cancelled",
            source_lang=src_lang,
            target_lang=tgt_lang
        )
        raise

    except Exception as e:
        response_time_ms = elapsed_ms()

        log_interaction(
            intent="translation",
            tenant_id=tenant_id,
            success=False,
            error=str(e),
            response_time_ms=response_time_ms,
            source_lang=src_lang,
            target_lang=tgt_lang,
            text_preview=sanitize_for_logging(text[:100]),
            fallback_used=True
        )

        return _translation_result(
            text, src_lang, tgt_lang, text, False,
            error=str(e),
            response_time_ms=response_time_ms
        )
417
+
418
+
419
async def detect_and_translate(
    text: str,
    target_language: str = "eng_Latn",
    tenant_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Attempts to detect the source language and translate to target.

    Note: This is a simplified heuristic-based detection. For production,
    consider integrating a dedicated language detection model. Text is assumed
    to be English unless it contains a Spanish marker character (checked
    case-insensitively) or a character from one of the script ranges below;
    the first matching rule wins.

    Args:
        text: The text to translate
        target_language: Target language code
        tenant_id: Optional tenant identifier for logging

    Returns:
        Translation result dictionary from translate_text with an added
        "detected_lang" key. Invalid input returns an error dictionary
        without calling translate_text.
    """
    if not text or not isinstance(text, str):
        return {
            "translated_text": "",
            "detected_lang": "unknown",
            "target_lang": target_language,
            "original_text": text if isinstance(text, str) else "",
            "available": True,
            "error": "Invalid text input."
        }

    # Spanish punctuation / accented letters, in lower case.
    spanish_markers = ('¿', '¡', 'ñ', 'á', 'é', 'í', 'ó', 'ú')
    # (first code point, last code point, NLLB code), checked in this order.
    script_ranges = (
        ('\u4e00', '\u9fff', "zho_Hans"),  # Chinese characters
        ('\u0600', '\u06ff', "arb_Arab"),  # Arabic script
        ('\u0400', '\u04ff', "rus_Cyrl"),  # Cyrillic (Russian)
        ('\u0900', '\u097f', "hin_Deva"),  # Devanagari (Hindi)
    )

    detected_lang = "eng_Latn"  # Default assumption

    # Compare against the lower-cased text so upper-case input such as
    # "ESPAÑOL" or "ÚLTIMO" is recognised as well.
    lowered = text.lower()
    if any(marker in lowered for marker in spanish_markers):
        detected_lang = "spa_Latn"
    else:
        for first, last, code in script_ranges:
            if any(first <= char <= last for char in text):
                detected_lang = code
                break

    log_interaction(
        intent="language_detection",
        tenant_id=tenant_id,
        success=True,
        detected_lang=detected_lang,
        text_preview=sanitize_for_logging(text[:50])
    )

    result = await translate_text(text, detected_lang, target_language, tenant_id)
    result["detected_lang"] = detected_lang

    return result
479
+
480
+
481
async def batch_translate(
    texts: List[str],
    source_language: str = "eng_Latn",
    target_language: str = "spa_Latn",
    tenant_id: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Translate several texts, one after another, with translate_text.

    Entries that are not strings or are blank are dropped, and only the first
    50 remaining entries are translated.

    Args:
        texts: List of strings to translate
        source_language: Source language code
        target_language: Target language code
        tenant_id: Optional tenant identifier for logging

    Returns:
        One translate_text result dictionary per translated entry, in order;
        an empty list when the input is not a non-empty list or holds no
        usable text.
    """
    if not (texts and isinstance(texts, list)):
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=False,
            error="Invalid texts input"
        )
        return []

    # Keep only non-blank strings, then cap the batch size.
    candidates = [
        entry for entry in texts
        if isinstance(entry, str) and entry.strip()
    ]
    if len(candidates) > 50:
        candidates = candidates[:50]
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=None,
            details="Batch size limited to 50 texts"
        )

    if not candidates:
        log_interaction(
            intent="batch_translation",
            tenant_id=tenant_id,
            success=False,
            error="No valid texts in batch"
        )
        return []

    batch_started = time.time()

    # Awaited one at a time, so requests are issued sequentially.
    translations = [
        await translate_text(entry, source_language, target_language, tenant_id)
        for entry in candidates
    ]

    elapsed_ms = int((time.time() - batch_started) * 1000)

    log_interaction(
        intent="batch_translation",
        tenant_id=tenant_id,
        success=True,
        response_time_ms=elapsed_ms,
        batch_size=len(candidates),
        source_lang=normalize_language_code(source_language),
        target_lang=normalize_language_code(target_language)
    )

    return translations
548
+
549
+
550
def get_civic_phrase(
    phrase_key: str,
    language: str = "eng_Latn"
) -> str:
    """
    Look up a pre-translated civic phrase for a common query.

    Args:
        phrase_key: Key for the civic phrase (e.g., "voting_location")
        language: Target language code

    Returns:
        The stored phrase, or an empty string when the key is invalid or no
        phrase exists for that language/key pair.
    """
    if not (phrase_key and isinstance(phrase_key, str)):
        return ""

    code = normalize_language_code(language)
    phrases_for_language = CIVIC_PHRASES.get(code, {})
    found = phrases_for_language.get(phrase_key, "")

    # Only successful lookups are logged.
    if not found:
        return found

    log_interaction(
        intent="civic_phrase_lookup",
        success=True,
        phrase_key=phrase_key,
        language=code
    )
    return found