audio-service / app /utils /text_utils.py
uncertainrods's picture
confidence_score
c956723
"""
Text utilities for the Puja Verification Service.
Provides LLM-powered translation of:
- English names / DOB / gotra → Hindi (via Groq)
- Sanskrit transcript text → Hindi (via Groq)
Falls back to a lightweight rule-based transliteration when the LLM is
unavailable or for purely numeric strings (dates).
"""
import re
import json
import logging
from groq import Groq
from app.config import GROQ_API_KEY
logger = logging.getLogger(__name__)
# ── Groq client (shared with llm_matching_service) ──────────────────────────
# Module-level slot for the Groq client; stays None until the first
# _get_groq_client() call fills it in.
_groq_client: Groq | None = None


def _get_groq_client() -> Groq:
    """
    Return the module-level Groq client, constructing it on first use.

    The instance is stored in ``_groq_client`` and the same object is
    handed back on every later call.

    Raises:
        RuntimeError: if ``GROQ_API_KEY`` is empty when the client has to
            be constructed.
    """
    global _groq_client
    # Fast path: a client was already built by an earlier call.
    if _groq_client is not None:
        return _groq_client
    if not GROQ_API_KEY:
        raise RuntimeError("GROQ_API_KEY is not set")
    _groq_client = Groq(api_key=GROQ_API_KEY)
    return _groq_client
# ── Devanagari digit table (kept for fast digit conversion) ──────────────────
# Translation table mapping the ASCII digits 0-9 to the Devanagari digits
# U+0966..U+096F, for use with str.translate(). str.maketrans() requires both
# strings to have the same length, so the target must be exactly ten code
# points (a mis-decoded, multi-character-per-digit string raises ValueError
# at import time).
DEVANAGARI_DIGITS = str.maketrans("0123456789", "०१२३४५६७८९")
# ─────────────────────────────────────────────────────────────────────────────
# LLM-based English → Hindi translation
# ─────────────────────────────────────────────────────────────────────────────
def _translate_english_to_hindi(text: str) -> str:
    """
    Translate an English string (name, date-of-birth, or gotra) into Hindi
    using the Groq LLM.

    The prompt asks the model to transliterate proper nouns phonetically
    into Devanagari and to render dates with Devanagari numerals.

    Args:
        text: English input. Falsy or whitespace-only input is returned
            unchanged without calling the LLM.

    Returns:
        The model's reply with surrounding whitespace and one pair of
        matching wrapping quotes removed, or the original ``text`` when the
        cleaned reply is empty.

    Raises:
        RuntimeError: from ``_get_groq_client`` when no API key is set.
        Errors raised by the Groq request itself are not caught here.
    """
    if not text or not text.strip():
        return text

    # The worked examples must be real Devanagari: they are sent verbatim to
    # the model and show it the expected output script.
    prompt = (
        "You are a professional English-to-Hindi translator. "
        "Translate the following text into Hindi (Devanagari script). "
        "Rules:\n"
        "- For proper nouns (person names, gotra names), transliterate them "
        " phonetically into Devanagari (e.g., 'Rahul Sharma' → 'राहुल शर्मा').\n"
        "- For dates, convert to Hindi format with Devanagari numerals "
        " (e.g., '15 January 1995' → '१५ जनवरी १९९५', '15/01/1995' → '१५/०१/१९९५').\n"
        "- Return ONLY the translated Hindi text, nothing else.\n\n"
        f"Text: {text}"
    )

    client = _get_groq_client()
    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        temperature=0,  # deterministic output for the same input
        max_tokens=256,
        messages=[
            {"role": "system", "content": "You are a translator. Return only the Hindi translation, no explanation."},
            {"role": "user", "content": prompt},
        ],
    )
    result = (response.choices[0].message.content or "").strip()

    # The model sometimes wraps its answer in quotes. The prompt examples use
    # single quotes, so accept either ASCII quote style and drop one matching
    # pair, then trim whitespace that was inside the quotes.
    if result and result[0] == result[-1] and result[0] in ('"', "'"):
        result = result[1:-1].strip()

    # An empty reply falls back to the untranslated input.
    return result if result else text
def convert_api_fields_to_hindi(name: str) -> dict:
    """
    Convert raw API input fields (English) to Hindi using LLM translation.

    Args:
        name: Name value from the API request, expected in English.

    Returns:
        A dict with the single key ``name_hindi`` holding the result of
        ``_translate_english_to_hindi(name)``.
    """
    name_hindi = _translate_english_to_hindi(name)
    # Log input and output side by side so bad transliterations can be traced.
    logger.info("Translated fields → name: %s → %s", name, name_hindi)
    return {"name_hindi": name_hindi}
# ─────────────────────────────────────────────────────────────────────────────
# LLM-based Sanskrit → Hindi translation
# ─────────────────────────────────────────────────────────────────────────────
def sanskrit_to_hindi(text: str) -> str:
    """
    Translate a Sanskrit transcript into fluent Hindi using the Groq LLM.

    This replaces the old 7-word dictionary approach and produces a
    meaningful Hindi translation that the downstream LLM matcher can
    search for name / DOB / gotra occurrences.

    Args:
        text: Sanskrit transcript. Falsy or whitespace-only input is
            returned unchanged without calling the LLM.

    Returns:
        The model's reply with surrounding whitespace stripped, or the
        original ``text`` when the reply is empty.

    Raises:
        RuntimeError: from ``_get_groq_client`` when no API key is set.
        Errors raised by the Groq request itself are not caught here.
    """
    if not text or not text.strip():
        return text

    prompt = (
        "You are an expert Sanskrit-to-Hindi translator. "
        "Translate the following Sanskrit text into clear, natural Hindi. "
        "Rules:\n"
        "- Keep all proper nouns (person names, gotra names, deity names) exactly as-is in Devanagari.\n"
        "- Keep dates and numbers exactly as-is.\n"
        "- Translate the rest into simple, fluent Hindi.\n"
        "- Return ONLY the translated Hindi text, no explanation or commentary.\n\n"
        f"Sanskrit text:\n{text}"
    )

    client = _get_groq_client()
    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        temperature=0,  # deterministic output for the same input
        max_tokens=2048,
        messages=[
            {"role": "system", "content": "You are a Sanskrit-Hindi translator. Return only the Hindi translation."},
            {"role": "user", "content": prompt},
        ],
    )
    choice = response.choices[0]

    # OpenAI-compatible chat APIs report finish_reason == "length" when the
    # reply was cut off at max_tokens. Surface that, because a truncated
    # translation may be missing the part the matcher needs. getattr keeps
    # this a no-op if the field is absent.
    if getattr(choice, "finish_reason", None) == "length":
        logger.warning(
            "Sanskrit→Hindi translation hit the max_tokens limit; output may be truncated (%d input chars)",
            len(text),
        )

    result = (choice.message.content or "").strip()
    logger.info("Sanskrit→Hindi translation complete (%d chars → %d chars)", len(text), len(result))
    # An empty reply falls back to the untranslated transcript.
    return result if result else text
# ─────────────────────────────────────────────────────────────────────────────
# Legacy helpers (kept for backward compatibility)
# ─────────────────────────────────────────────────────────────────────────────
# Per-letter Latin -> Devanagari lookup for the legacy rule-based
# transliterator. Keys are lowercase ASCII letters. Every value is a single
# Devanagari letter except "x", which maps to the conjunct क्स (ka + virama
# + sa). Several Latin letters share a target (c/k/q -> क, j/z -> ज,
# v/w -> व).
LATIN_TO_DEVANAGARI = {
    "a": "अ", "b": "ब", "c": "क", "d": "द", "e": "ए", "f": "फ", "g": "ग",
    "h": "ह", "i": "इ", "j": "ज", "k": "क", "l": "ल", "m": "म", "n": "न",
    "o": "ओ", "p": "प", "q": "क", "r": "र", "s": "स", "t": "त", "u": "उ",
    "v": "व", "w": "व", "x": "क्स", "y": "य", "z": "ज",
}
def _to_devanagari(text: str) -> str:
    """
    Legacy rule-based transliteration, applied one character at a time.

    Each character is lowercased and looked up in ``LATIN_TO_DEVANAGARI``;
    a character with no entry is kept exactly as it appeared. The joined
    result is then run through ``DEVANAGARI_DIGITS`` with ``str.translate``.
    """
    # Fall back to the original (not lowercased) character on a miss.
    pieces = (LATIN_TO_DEVANAGARI.get(ch.lower(), ch) for ch in text)
    return "".join(pieces).translate(DEVANAGARI_DIGITS)
def build_sanskrit_details(name: str) -> str:
    """
    Build the "name" details line using the rule-based transliterator
    (``_to_devanagari``); no LLM call is made.

    Args:
        name: Name to transliterate character by character.

    Returns:
        The Devanagari label ``नाम: `` followed by the transliterated name.
    """
    name_sa = _to_devanagari(name)
    return f"नाम: {name_sa}"
def build_hindi_details(name: str) -> str:
    """
    Build Hindi-formatted details from English input (uses LLM).

    Args:
        name: Name passed on to ``convert_api_fields_to_hindi``.

    Returns:
        The Devanagari label ``नाम: `` followed by the ``name_hindi`` value
        returned by ``convert_api_fields_to_hindi``.
    """
    fields = convert_api_fields_to_hindi(name)
    return f"नाम: {fields['name_hindi']}"
def normalize_text(text: str, language: str = "en") -> str:
    """
    Strip characters that are not relevant for the given language.

    - ``"en"``: lowercase the text, then drop everything except ``a-z``,
      ``0-9`` and whitespace.
    - ``"sa"``: drop everything outside the Devanagari block
      (U+0900..U+097F) and whitespace.
    - any other language code: return the text unchanged.
    """
    if language == "sa":
        # Keep only Devanagari code points and whitespace.
        return re.sub(r'[^\u0900-\u097F\s]', '', text)
    if language != "en":
        return text
    lowered = text.lower()
    return re.sub(r'[^a-z0-9\s]', '', lowered)