Spaces:

uncertainrods
/

audio-service

Running

App Files Files Community

audio-service / app /utils /text_utils.py

uncertainrods

confidence_score

c956723 2 days ago

raw

history blame contribute delete

7.52 kB

	"""
	Text utilities for the Puja Verification Service.

	Provides LLM-powered translation of:
	- English names / DOB / gotra → Hindi (via Groq)
	- Sanskrit transcript text → Hindi (via Groq)

	Falls back to a lightweight rule-based transliteration when the LLM is
	unavailable or for purely numeric strings (dates).
	"""

	import re
	import json
	import logging

	from groq import Groq
	from app.config import GROQ_API_KEY

	logger = logging.getLogger(__name__)

	# ── Groq client (shared with llm_matching_service) ──────────────────────────
	# Module-level cache for the Groq client. It stays None until
	# _get_groq_client() creates the client on first use.
	_groq_client: Groq | None = None


	def _get_groq_client() -> Groq:
	    """
	    Return the module-wide Groq client, building it on the first call.

	    The client is stored in the module global ``_groq_client`` so later
	    calls reuse the same instance.

	    Raises:
	        RuntimeError: If no client has been created yet and
	            ``GROQ_API_KEY`` is empty or unset.
	    """
	    global _groq_client

	    # Fast path: a client was already created by an earlier call.
	    if _groq_client is not None:
	        return _groq_client

	    if not GROQ_API_KEY:
	        raise RuntimeError("GROQ_API_KEY is not set")

	    _groq_client = Groq(api_key=GROQ_API_KEY)
	    return _groq_client


	# ── Devanagari digit table (kept for fast digit conversion) ──────────────────
	# Translation table for str.translate(): maps the code point of each ASCII
	# digit to the code point of the Devanagari digit at the same position.
	DEVANAGARI_DIGITS = {
	    ord(ascii_digit): ord(devanagari_digit)
	    for ascii_digit, devanagari_digit in zip("0123456789", "०१२३४५६७८९")
	}


	# ─────────────────────────────────────────────────────────────────────────────
	# LLM-based English → Hindi translation
	# ─────────────────────────────────────────────────────────────────────────────

	def _translate_english_to_hindi(text: str) -> str:
	    """
	    Translate an English string (name, date-of-birth, or gotra) into Hindi.

	    Purely numeric input (digits plus separators, no letters) is converted
	    locally by replacing ASCII digits with Devanagari digits, so it never
	    reaches the LLM. Any other input is sent to the Groq chat model; for
	    proper nouns the prompt asks for a phonetic transliteration.

	    Args:
	        text: The English text to translate.

	    Returns:
	        The Hindi text. Empty or whitespace-only input is returned
	        unchanged, and the original ``text`` is returned when the model
	        produces no usable content.

	    Raises:
	        RuntimeError: Propagated from ``_get_groq_client`` when the LLM
	            path is taken and the API key is not configured.
	    """
	    if not text or not text.strip():
	        return text

	    # Numeric-only strings such as "15/01/1995" need nothing beyond a digit
	    # swap, so handle them deterministically without a network call.
	    stripped = text.strip()
	    has_digit = any(ch.isdigit() for ch in stripped)
	    has_letter = any(ch.isalpha() for ch in stripped)
	    if has_digit and not has_letter:
	        return stripped.translate(DEVANAGARI_DIGITS)

	    prompt = (
	        "You are a professional English-to-Hindi translator. "
	        "Translate the following text into Hindi (Devanagari script). "
	        "Rules:\n"
	        "- For proper nouns (person names, gotra names), transliterate them "
	        " phonetically into Devanagari (e.g., 'Rahul Sharma' → 'राहुल शर्मा').\n"
	        "- For dates, convert to Hindi format with Devanagari numerals "
	        " (e.g., '15 January 1995' → '१५ जनवरी १९९५', '15/01/1995' → '१५/०१/१९९५').\n"
	        "- Return ONLY the translated Hindi text, nothing else.\n\n"
	        f"Text: {text}"
	    )

	    client = _get_groq_client()
	    response = client.chat.completions.create(
	        model="llama-3.1-8b-instant",
	        temperature=0,
	        max_tokens=256,
	        messages=[
	            {"role": "system", "content": "You are a translator. Return only the Hindi translation, no explanation."},
	            {"role": "user", "content": prompt},
	        ],
	    )

	    # Treat a response without choices like empty content instead of
	    # raising IndexError; the caller then gets the original text back.
	    choices = response.choices
	    content = choices[0].message.content if choices else None
	    result = (content or "").strip()

	    # Strip one pair of surrounding quotes the LLM sometimes adds. The prompt
	    # examples use single quotes, so handle those and curly quotes as well.
	    for opening, closing in (('"', '"'), ("'", "'"), ("“", "”"), ("‘", "’")):
	        if result.startswith(opening) and result.endswith(closing):
	            result = result[1:-1].strip()
	            break

	    return result if result else text


	def convert_api_fields_to_hindi(name: str) -> dict:
	    """
	    Translate the raw (English) API input fields into Hindi.

	    Args:
	        name: The name as received from the API.

	    Returns:
	        A dict with the single key ``name_hindi`` holding the result of
	        ``_translate_english_to_hindi(name)``.
	    """
	    translated_name = _translate_english_to_hindi(name)
	    logger.info("Translated fields → name: %s → %s", name, translated_name)
	    return {"name_hindi": translated_name}


	# ─────────────────────────────────────────────────────────────────────────────
	# LLM-based Sanskrit → Hindi translation
	# ─────────────────────────────────────────────────────────────────────────────

	def sanskrit_to_hindi(text: str) -> str:
	    """
	    Translate a Sanskrit transcript into Hindi through the Groq chat model.

	    The prompt instructs the model to leave proper nouns, dates and numbers
	    untouched and to return only the translation.

	    Args:
	        text: The Sanskrit text to translate.

	    Returns:
	        The model output with surrounding whitespace removed. Empty or
	        whitespace-only input is returned unchanged, and the original
	        ``text`` is returned when the model output is empty.
	    """
	    # Nothing to translate: hand the input straight back.
	    if not (text and text.strip()):
	        return text

	    instructions = (
	        "You are an expert Sanskrit-to-Hindi translator. "
	        "Translate the following Sanskrit text into clear, natural Hindi. "
	        "Rules:\n"
	        "- Keep all proper nouns (person names, gotra names, deity names) exactly as-is in Devanagari.\n"
	        "- Keep dates and numbers exactly as-is.\n"
	        "- Translate the rest into simple, fluent Hindi.\n"
	        "- Return ONLY the translated Hindi text, no explanation or commentary.\n\n"
	        f"Sanskrit text:\n{text}"
	    )
	    chat_messages = [
	        {"role": "system", "content": "You are a Sanskrit-Hindi translator. Return only the Hindi translation."},
	        {"role": "user", "content": instructions},
	    ]

	    completion = _get_groq_client().chat.completions.create(
	        model="llama-3.1-8b-instant",
	        temperature=0,
	        max_tokens=2048,
	        messages=chat_messages,
	    )

	    hindi = (completion.choices[0].message.content or "").strip()
	    logger.info("Sanskrit→Hindi translation complete (%d chars → %d chars)", len(text), len(hindi))

	    # An empty model reply falls back to the untranslated input.
	    return hindi or text


	# ─────────────────────────────────────────────────────────────────────────────
	# Legacy helpers (kept for backward compatibility)
	# ─────────────────────────────────────────────────────────────────────────────

	# One Devanagari replacement per lowercase Latin letter, listed in a-z order.
	# Several letters share a target (c/k/q → क, j/z → ज, v/w → व).
	LATIN_TO_DEVANAGARI = dict(
	    zip(
	        "abcdefghijklmnopqrstuvwxyz",
	        (
	            "अ", "ब", "क", "द", "ए", "फ", "ग",  # a-g
	            "ह", "इ", "ज", "क", "ल", "म", "न",  # h-n
	            "ओ", "प", "क", "र", "स", "त", "उ",  # o-u
	            "व", "व", "क्स", "य", "ज",  # v-z
	        ),
	    )
	)


	def _to_devanagari(text: str) -> str:
	    """
	    Rule-based Latin→Devanagari character transliteration (legacy).

	    Each character is lower-cased and looked up in ``LATIN_TO_DEVANAGARI``;
	    characters without an entry are kept as they are. ASCII digits in the
	    joined result are then replaced through ``DEVANAGARI_DIGITS``.
	    """
	    transliterated = "".join(
	        LATIN_TO_DEVANAGARI.get(symbol.lower(), symbol) for symbol in text
	    )
	    return transliterated.translate(DEVANAGARI_DIGITS)


	def build_sanskrit_details(name: str) -> str:
	    """
	    Build the details line from the rule-based transliteration of ``name``.

	    Returns the label ``नाम: `` followed by ``_to_devanagari(name)``; no
	    LLM call is involved.
	    """
	    return "नाम: " + _to_devanagari(name)


	def build_hindi_details(name: str) -> str:
	    """
	    Build the Hindi details line for an English ``name`` (uses the LLM).

	    Returns the label ``नाम: `` followed by the ``name_hindi`` value
	    produced by ``convert_api_fields_to_hindi``.
	    """
	    name_hindi = convert_api_fields_to_hindi(name)["name_hindi"]
	    return f"नाम: {name_hindi}"


	def normalize_text(text: str, language: str = "en") -> str:
	    """
	    Strip characters that are not relevant for the given language.

	    Args:
	        text: The text to normalise.
	        language: ``"en"`` lower-cases the text and keeps only ``a-z``,
	            ``0-9`` and whitespace; ``"sa"`` keeps only characters in the
	            Devanagari block (U+0900–U+097F) and whitespace. Any other
	            value leaves the text untouched.

	    Returns:
	        The normalised text.
	    """
	    if language == "sa":
	        # Keep only Devanagari
	        return re.sub(r'[^\u0900-\u097F\s]', '', text)
	    if language != "en":
	        return text
	    return re.sub(r'[^a-z0-9\s]', '', text.lower())