"""
Text utilities for the Puja Verification Service.

Provides LLM-powered translation of:
  - English names / DOB / gotra → Hindi (via Groq)
  - Sanskrit transcript text    → Hindi (via Groq)

Purely numeric strings (e.g. a date such as "15/01/1995") are converted with
the Devanagari digit table and are never sent to the LLM.  A lightweight
rule-based Latin→Devanagari transliteration is kept as a legacy helper.

NOTE(review): errors raised by the Groq client (including the RuntimeError
for a missing GROQ_API_KEY) propagate to the caller; nothing in this module
substitutes the rule-based transliteration when the LLM call fails.
"""

import re
import json
import logging

from groq import Groq

from app.config import GROQ_API_KEY

logger = logging.getLogger(__name__)

# ── Groq client (shared with llm_matching_service) ──────────────────────────

_groq_client: Groq | None = None

# Model used for every translation request issued by this module.
_GROQ_MODEL = "llama-3.1-8b-instant"


def _get_groq_client() -> Groq:
    """Return the module-wide Groq client, creating it on first use.

    Raises:
        RuntimeError: if ``GROQ_API_KEY`` is empty or unset.
    """
    global _groq_client
    if _groq_client is None:
        if not GROQ_API_KEY:
            raise RuntimeError("GROQ_API_KEY is not set")
        _groq_client = Groq(api_key=GROQ_API_KEY)
    return _groq_client


def _chat_complete(system_prompt: str, user_prompt: str, max_tokens: int) -> str:
    """Send one system + user exchange to Groq and return the stripped reply.

    Returns an empty string when the response carries no choices or the
    message content is empty, so callers can fall back to their input.
    Exceptions raised by the client are not caught here.
    """
    client = _get_groq_client()
    response = client.chat.completions.create(
        model=_GROQ_MODEL,
        temperature=0,
        max_tokens=max_tokens,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    choices = response.choices
    if not choices:
        return ""
    return (choices[0].message.content or "").strip()


# ── Devanagari digit table (kept for fast digit conversion) ──────────────────

DEVANAGARI_DIGITS = str.maketrans("0123456789", "०१२३४५६७८९")

# Quote pairs the LLM may wrap its answer in.  The prompt's own examples use
# single quotes, so those are just as likely as double quotes.
_WRAPPING_QUOTES = (
    ('"', '"'),
    ("'", "'"),
    ("\u201c", "\u201d"),
    ("\u2018", "\u2019"),
)


def _strip_wrapping_quotes(value: str) -> str:
    """Remove one layer of matching quotes surrounding *value*, if present."""
    for opening, closing in _WRAPPING_QUOTES:
        if value.startswith(opening) and value.endswith(closing):
            # A lone quote character collapses to "", which the caller
            # treats as "no usable answer".
            return value[1:-1].strip()
    return value


def _is_numeric_only(text: str) -> bool:
    """Return True when *text* has at least one digit and no letters."""
    has_digit = any(ch.isdigit() for ch in text)
    has_letter = any(ch.isalpha() for ch in text)
    return has_digit and not has_letter


# ─────────────────────────────────────────────────────────────────────────────
# LLM-based English → Hindi translation
# ─────────────────────────────────────────────────────────────────────────────

def _translate_english_to_hindi(text: str) -> str:
    """
    Translate an English string (name, date-of-birth, or gotra) into Hindi.

    Blank input is returned unchanged.  Input made of digits and separators
    only (no letters) is converted locally with ``DEVANAGARI_DIGITS`` and
    does not reach the LLM.  Everything else is sent to the Groq LLM, which
    is instructed to transliterate proper nouns phonetically and to render
    dates with Devanagari numerals.

    Returns the Hindi string, or the original *text* when the LLM reply is
    empty.  Errors from the Groq client propagate.
    """
    if not text or not text.strip():
        return text

    # Deterministic path for numeric dates: same output the prompt asks the
    # model for ('15/01/1995' → '१५/०१/१९९५'), without a network round-trip.
    if _is_numeric_only(text):
        return text.strip().translate(DEVANAGARI_DIGITS)

    prompt = (
        "You are a professional English-to-Hindi translator. "
        "Translate the following text into Hindi (Devanagari script). "
        "Rules:\n"
        "- For proper nouns (person names, gotra names), transliterate them "
        " phonetically into Devanagari (e.g., 'Rahul Sharma' → 'राहुल शर्मा').\n"
        "- For dates, convert to Hindi format with Devanagari numerals "
        " (e.g., '15 January 1995' → '१५ जनवरी १९९५', '15/01/1995' → '१५/०१/१९९५').\n"
        "- Return ONLY the translated Hindi text, nothing else.\n\n"
        f"Text: {text}"
    )

    reply = _chat_complete(
        "You are a translator. Return only the Hindi translation, no explanation.",
        prompt,
        max_tokens=256,
    )

    # Strip surrounding quotes the LLM sometimes adds
    result = _strip_wrapping_quotes(reply)
    return result if result else text


def convert_api_fields_to_hindi(name: str) -> dict:
    """
    Convert raw API input fields (English) to Hindi using LLM translation.

    Returns a dict with keys: name_hindi.
    """
    name_hindi = _translate_english_to_hindi(name)

    logger.info(
        "Translated fields → name: %s → %s",
        name, name_hindi
    )

    return {
        "name_hindi": name_hindi,
    }


# ─────────────────────────────────────────────────────────────────────────────
# LLM-based Sanskrit → Hindi translation
# ─────────────────────────────────────────────────────────────────────────────

def sanskrit_to_hindi(text: str) -> str:
    """
    Translate a Sanskrit transcript into fluent Hindi using Groq LLM.

    The prompt asks the model to leave proper nouns, dates and numbers
    untouched so that a downstream matcher can still search the translation
    for name / DOB / gotra occurrences.

    Blank input is returned unchanged; the original *text* is also returned
    when the LLM reply is empty.  Errors from the Groq client propagate.
    """
    if not text or not text.strip():
        return text

    prompt = (
        "You are an expert Sanskrit-to-Hindi translator. "
        "Translate the following Sanskrit text into clear, natural Hindi. "
        "Rules:\n"
        "- Keep all proper nouns (person names, gotra names, deity names) exactly as-is in Devanagari.\n"
        "- Keep dates and numbers exactly as-is.\n"
        "- Translate the rest into simple, fluent Hindi.\n"
        "- Return ONLY the translated Hindi text, no explanation or commentary.\n\n"
        f"Sanskrit text:\n{text}"
    )

    result = _chat_complete(
        "You are a Sanskrit-Hindi translator. Return only the Hindi translation.",
        prompt,
        max_tokens=2048,
    )
    logger.info("Sanskrit→Hindi translation complete (%d chars → %d chars)", len(text), len(result))
    return result if result else text


# ─────────────────────────────────────────────────────────────────────────────
# Legacy helpers (kept for backward compatibility)
# ─────────────────────────────────────────────────────────────────────────────

LATIN_TO_DEVANAGARI = {
    "a": "अ", "b": "ब", "c": "क", "d": "द", "e": "ए", "f": "फ",
    "g": "ग", "h": "ह", "i": "इ", "j": "ज", "k": "क", "l": "ल",
    "m": "म", "n": "न", "o": "ओ", "p": "प", "q": "क", "r": "र",
    "s": "स", "t": "त", "u": "उ", "v": "व", "w": "व", "x": "क्स",
    "y": "य", "z": "ज",
}


def _to_devanagari(text: str) -> str:
    """Rule-based Latin→Devanagari character transliteration (legacy).

    Each Latin letter is mapped case-insensitively through
    ``LATIN_TO_DEVANAGARI``; any other character is kept as-is, after which
    ASCII digits are replaced by Devanagari digits.
    """
    transliterated = "".join(
        LATIN_TO_DEVANAGARI.get(char.lower(), char) for char in text
    )
    return transliterated.translate(DEVANAGARI_DIGITS)


def build_sanskrit_details(name: str) -> str:
    """Build the "नाम: …" details line using the rule-based transliteration."""
    name_sa = _to_devanagari(name)
    return f"नाम: {name_sa}"


def build_hindi_details(name: str) -> str:
    """Build Hindi-formatted details from English input (uses LLM)."""
    fields = convert_api_fields_to_hindi(name)
    return f"नाम: {fields['name_hindi']}"


# Compiled once at import time; patterns match what normalize_text removes.
_NON_ASCII_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_NON_DEVANAGARI_RE = re.compile(r'[^\u0900-\u097F\s]')


def normalize_text(text: str, language: str = "en") -> str:
    """Normalize *text* for comparison according to *language*.

    - ``"en"``: lower-case, then drop everything except ``a-z``, ``0-9`` and
      whitespace.
    - ``"sa"``: drop everything outside the Devanagari block
      (U+0900–U+097F) and whitespace.
    - any other value: the text is returned unchanged.
    """
    if language == "en":
        return _NON_ASCII_ALNUM_RE.sub('', text.lower())
    if language == "sa":
        return _NON_DEVANAGARI_RE.sub('', text)  # Keep only Devanagari
    return text