Spaces:
Running
Running
| """ | |
| Text utilities for the Puja Verification Service. | |
| Provides LLM-powered translation of: | |
| - English names / DOB / gotra → Hindi (via Groq) | |
| - Sanskrit transcript text → Hindi (via Groq) | |
| Falls back to a lightweight rule-based transliteration when the LLM is | |
| unavailable or for purely numeric strings (dates). | |
| """ | |
| import re | |
| import json | |
| import logging | |
| from groq import Groq | |
| from app.config import GROQ_API_KEY | |
| logger = logging.getLogger(__name__) | |
| # ── Groq client (shared with llm_matching_service) ────────────────────────── | |
# Module-level cache for the Groq client; populated on first use.
_groq_client: Groq | None = None


def _get_groq_client() -> Groq:
    """Return the cached Groq client, constructing it on the first call.

    Returns:
        The module-wide ``Groq`` instance built from ``GROQ_API_KEY``.

    Raises:
        RuntimeError: If a client has to be created and ``GROQ_API_KEY`` is
            empty or unset.
    """
    global _groq_client

    # Fast path: a client was already built by an earlier call.
    if _groq_client is not None:
        return _groq_client

    if not GROQ_API_KEY:
        raise RuntimeError("GROQ_API_KEY is not set")

    _groq_client = Groq(api_key=GROQ_API_KEY)
    return _groq_client
| # ── Devanagari digit table (kept for fast digit conversion) ────────────────── | |
# Translation table mapping ASCII digits 0-9 to the Devanagari digits
# U+0966..U+096F, for use with ``str.translate``. Both arguments to
# ``str.maketrans`` must have the same length, so the target string has to be
# exactly these ten code points.
DEVANAGARI_DIGITS = str.maketrans("0123456789", "०१२३४५६७८९")
| # ───────────────────────────────────────────────────────────────────────────── | |
| # LLM-based English → Hindi translation | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def _translate_english_to_hindi(text: str) -> str:
    """Translate an English string (name, date of birth, or gotra) to Hindi.

    Sends ``text`` to the Groq chat-completions API with a prompt that asks
    for phonetic transliteration of proper nouns and Devanagari numerals in
    dates.

    Args:
        text: The English string to translate.

    Returns:
        The model's reply with surrounding whitespace and one enclosing pair
        of double quotes removed. ``text`` is returned unchanged when it is
        empty or whitespace-only, or when the cleaned reply is empty.

    Raises:
        RuntimeError: From ``_get_groq_client`` when ``GROQ_API_KEY`` is not
            set. Exceptions raised by the API call itself are not caught.
    """
    if not text or not text.strip():
        return text

    # The examples below must be genuine Devanagari: they are the only
    # in-prompt demonstration of the expected output script and numerals.
    prompt = (
        "You are a professional English-to-Hindi translator. "
        "Translate the following text into Hindi (Devanagari script). "
        "Rules:\n"
        "- For proper nouns (person names, gotra names), transliterate them "
        " phonetically into Devanagari (e.g., 'Rahul Sharma' → 'राहुल शर्मा').\n"
        "- For dates, convert to Hindi format with Devanagari numerals "
        " (e.g., '15 January 1995' → '१५ जनवरी १९९५', '15/01/1995' → '१५/०१/१९९५').\n"
        "- Return ONLY the translated Hindi text, nothing else.\n\n"
        f"Text: {text}"
    )

    client = _get_groq_client()
    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        temperature=0,
        max_tokens=256,
        messages=[
            {"role": "system", "content": "You are a translator. Return only the Hindi translation, no explanation."},
            {"role": "user", "content": prompt},
        ],
    )

    # ``content`` may be None; treat that the same as an empty reply.
    result = (response.choices[0].message.content or "").strip()

    # Strip surrounding quotes the LLM sometimes adds.
    if result.startswith('"') and result.endswith('"'):
        result = result[1:-1]

    # Fall back to the untranslated input rather than returning an empty string.
    return result if result else text
def convert_api_fields_to_hindi(name: str) -> dict:
    """Convert raw English API input fields to Hindi using LLM translation.

    Args:
        name: The English name to translate.

    Returns:
        A dict with the single key ``"name_hindi"`` holding the value
        returned by ``_translate_english_to_hindi(name)``.
    """
    name_hindi = _translate_english_to_hindi(name)

    # NOTE(review): the separators in this message were mis-encoded in the
    # source; "—" and "→" are the presumed originals — confirm upstream.
    logger.info(
        "Translated fields — name: %s → %s",
        name, name_hindi
    )

    return {
        "name_hindi": name_hindi,
    }
| # ───────────────────────────────────────────────────────────────────────────── | |
| # LLM-based Sanskrit → Hindi translation | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def sanskrit_to_hindi(text: str) -> str:
    """Translate a Sanskrit transcript into Hindi using the Groq LLM.

    The prompt instructs the model to keep proper nouns, dates and numbers
    as-is and to translate the rest into simple Hindi, so that a downstream
    matcher can search the result for name / DOB / gotra occurrences.

    Args:
        text: Sanskrit text to translate.

    Returns:
        The model's reply with surrounding whitespace removed. ``text`` is
        returned unchanged when it is empty or whitespace-only, or when the
        reply is empty.

    Raises:
        RuntimeError: From ``_get_groq_client`` when ``GROQ_API_KEY`` is not
            set. Exceptions raised by the API call itself are not caught.
    """
    if not text or not text.strip():
        return text

    prompt = (
        "You are an expert Sanskrit-to-Hindi translator. "
        "Translate the following Sanskrit text into clear, natural Hindi. "
        "Rules:\n"
        "- Keep all proper nouns (person names, gotra names, deity names) exactly as-is in Devanagari.\n"
        "- Keep dates and numbers exactly as-is.\n"
        "- Translate the rest into simple, fluent Hindi.\n"
        "- Return ONLY the translated Hindi text, no explanation or commentary.\n\n"
        f"Sanskrit text:\n{text}"
    )

    client = _get_groq_client()
    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        temperature=0,
        max_tokens=2048,
        messages=[
            {"role": "system", "content": "You are a Sanskrit-Hindi translator. Return only the Hindi translation."},
            {"role": "user", "content": prompt},
        ],
    )

    # ``content`` may be None; treat that the same as an empty reply.
    result = (response.choices[0].message.content or "").strip()
    logger.info("Sanskrit→Hindi translation complete (%d chars → %d chars)", len(text), len(result))

    # Fall back to the original transcript rather than returning an empty string.
    return result if result else text
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Legacy helpers (kept for backward compatibility) | |
| # ───────────────────────────────────────────────────────────────────────────── | |
# Per-letter Latin -> Devanagari lookup used by the legacy rule-based
# transliteration. Keys are lowercase ASCII letters; values are one or more
# Devanagari code points.
# NOTE(review): the values for a, c, e, g, i, j, k, o, q, u, x and z could not
# be recovered byte-for-byte from the mis-encoded source; the conventional
# letters are used here — confirm against the upstream file.
LATIN_TO_DEVANAGARI = {
    "a": "अ", "b": "ब", "c": "क", "d": "द", "e": "ए", "f": "फ", "g": "ग",
    "h": "ह", "i": "इ", "j": "ज", "k": "क", "l": "ल", "m": "म", "n": "न",
    "o": "ओ", "p": "प", "q": "क", "r": "र", "s": "स", "t": "त", "u": "उ",
    "v": "व", "w": "व", "x": "क्स", "y": "य", "z": "ज",
}
def _to_devanagari(text: str) -> str:
    """Transliterate Latin letters and ASCII digits to Devanagari (legacy).

    Each character is lowercased and looked up in ``LATIN_TO_DEVANAGARI``;
    characters without an entry are passed through unchanged. The joined
    result is then run through the ``DEVANAGARI_DIGITS`` translation table.
    """
    pieces = [LATIN_TO_DEVANAGARI.get(ch.lower(), ch) for ch in text]
    joined = "".join(pieces)
    return joined.translate(DEVANAGARI_DIGITS)
def build_sanskrit_details(name: str) -> str:
    """Build the "नाम: <name>" details line using rule-based transliteration.

    Args:
        name: The name to transliterate with ``_to_devanagari``.

    Returns:
        The label "नाम: " followed by the transliterated name.
    """
    name_sa = _to_devanagari(name)
    return f"नाम: {name_sa}"
def build_hindi_details(name: str) -> str:
    """Build the "नाम: <name>" details line from English input (uses LLM).

    Args:
        name: The English name, translated via
            ``convert_api_fields_to_hindi``.

    Returns:
        The label "नाम: " followed by the translated name.
    """
    fields = convert_api_fields_to_hindi(name)
    return f"नाम: {fields['name_hindi']}"
def normalize_text(text: str, language: str = "en") -> str:
    """Strip characters that are irrelevant for the given language.

    Args:
        text: The string to normalize.
        language: ``"en"`` lowercases the text and keeps only ``a-z``,
            ``0-9`` and whitespace; ``"sa"`` keeps only code points
            U+0900..U+097F and whitespace. Any other value leaves the text
            untouched.

    Returns:
        The normalized string.
    """
    if language == "sa":
        # Keep only Devanagari
        return re.sub(r'[^\u0900-\u097F\s]', '', text)

    if language != "en":
        return text

    lowered = text.lower()
    return re.sub(r'[^a-z0-9\s]', '', lowered)