""" gujarati_processor.py — Gujarati Language Post-Processor for Maya THREE FUNCTIONS: 1. correct_transcript(text) — fixes common STT errors in Gujarati 2. add_natural_fillers(text) — adds human-like filler words to TTS output 3. build_gujarati_system_prompt_addon() — returns the code-mixing instruction block to append to Maya's system prompt """ import re import random from typing import Optional # ── TRANSCRIPT CORRECTIONS ──────────────────────────────────────────────────── TRANSCRIPT_CORRECTIONS = { # Phone call context misheard words r"\bshoe\b": "shu", # "shu" (what) heard as "shoe" r"\bkhem cho\b": "kem cho", # greeting r"\bchem cho\b": "kem cho", r"\bkal\b(?= [a-z])": "kal", # "kal" (tomorrow) — keep as-is r"\bcall\b(?= [a-z])": "kal", # Sarvam mishears "kal" as "call" sometimes # Common appointment/dental vocabulary r"\bapoinment\b": "appointment", r"\bapointment\b": "appointment", r"\bdocter\b": "doctor", r"\bdenter\b": "dentist", r"\bcleeaning\b": "cleaning", r"\bclining\b": "cleaning", # Time expressions — Sarvam sometimes transcribes these wrong r"\bbaze\b": "baje", # "baje" (o'clock in Gujarati) r"\bbaj\b(?=\s)": "baje", r"\bvagye\b": "vage", # "vage" (o'clock variant) r"\bvaagye\b": "vage", # Name of Ahmedabad localities (commonly mispronounced) r"\bvastrar\b": "Vastrapur", r"\bsatelite\b": "Satellite", r"\bnavrang\b": "Navrangpura", r"\bnaranpur\b": "Naranpura", # Gujarati filler words Sarvam sometimes misses r"\bhan\b": "haa", # "haa" (yes/okay) r"\bthik\b": "theek", # "theek" (okay) } def correct_transcript(text: str) -> str: if not text: return text corrected = text for pattern, replacement in TRANSCRIPT_CORRECTIONS.items(): corrected = re.sub(pattern, replacement, corrected, flags=re.IGNORECASE) return corrected # ── NATURAL GUJARATI FILLERS ────────────────────────────────────────────────── GUJARATI_FILLERS = [ "Juo, ", # "Look, " — very natural in Gujarati "Haa, ", # "Yes, " — affirmative opener "Acha, ", # "Okay, " — acknowledgment "Bilkul, ", # "Absolutely, " "Theek chhe, ", # "That's fine, " "Samjhi gayi, ", # "I understand, " (Maya referring to herself) ] HINDI_FILLERS = [ "Jee, ", # "Yes, " "Haan, ", # "Yes/Okay" "Achha, ", # "Okay" "Bilkul, ", # "Absolutely" "Theek hai, ", # "That's fine" "Samajh gayi, ", # "I understand" ] ENGLISH_FILLERS = [ "Sure, ", "Of course, ", "Absolutely, ", "Got it, ", "I see, ", ] FILLER_MAP = { "gujarati": GUJARATI_FILLERS, "hindi": HINDI_FILLERS, "english": ENGLISH_FILLERS, } # Words that already start the response naturally — don't add filler before these NATURAL_STARTERS = [ # Gujarati "Namaste", "Haa", "Juo", "Acha", "Bilkul", "Theek", "Samjhi", "3", "4", "5", "6", # Don't add filler before times/numbers "kal", "aaj", "parso", # Hindi "Namaste", "Haan", "Achha", "Bilkul", # English "Hello", "Sure", "Of course", "Yes", "No", "I", # Empathy openers from E2 — already natural "Mane", "Hu samjhi", "Maaf", "Juo samjho", ] def add_natural_fillers( text: str, language: str = "gujarati", probability: float = 0.35, ) -> str: if not text or len(text) < 15: return text # Don't add filler to already-natural openers for starter in NATURAL_STARTERS: if text.startswith(starter): return text # Don't add filler before pure questions if text.strip().endswith("?") and len(text.split()) < 8: return text # Probabilistic application if random.random() >= probability: return text fillers = FILLER_MAP.get(language, FILLER_MAP["gujarati"]) chosen = random.choice(fillers) # Lowercase the first letter of original text after filler if text and text[0].isupper() and not text[:2].isupper(): text = text[0].lower() + text[1:] return chosen + text # ── SYSTEM PROMPT ADDON ─────────────────────────────────────────────────────── def build_gujarati_system_prompt_addon() -> str: return """ GUJARATI LANGUAGE STYLE — MANDATORY RULES: You are speaking CONVERSATIONAL Gujarati on a phone call. Real Gujarati speakers mix Gujarati grammar with English nouns. This is called "code-mixing" and it sounds completely natural. CORRECT examples of how to respond: ✅ "Haa, appointment available chhe. Tamaro naam shu chhe?" ✅ "3 baje slot chhe. Chaleshe?" ✅ "Ek minute, check karu chhu." ✅ "Doctor sathe consultation ₹200 chhe." ✅ "OK, booking confirm thi gayi." ✅ "Maafi maango, 3 baje busy chhe. 4 baje available chhe?" WRONG — never say these (too formal, nobody speaks like this): ❌ "Haa, niyuktisthan upalabdh chhe. Tamaro naam shu chhe?" ❌ "Trann vage ni jagya chhe. Svikar karso?" ❌ "Ek pal, parischay karu chhu." ❌ "Vaidya sathe paramarsh ₹200 chhe." ❌ "Theek, pratishtha nischit thi gayi." GRAMMAR RULES for natural Gujarati: - Use Gujarati verb endings: "chhe", "chhu", "karu", "karso", "thi gayi" - Use English nouns directly: "appointment", "doctor", "booking", "slot" - Use English numbers with Gujarati time: "3 baje", "4 vage", "10 minute" - Short sentences only. Maximum 10 words per sentence. - Never use long Sanskrit-derived Gujarati compound words. - Say "check karu chhu" not "thaapan karu chhu" - Say "confirm thi gayi" not "nischit thi gayi" - Say "available chhe" not "upalabdh chhe" RESPONSE LENGTH: Phone call responses must be SHORT. Maximum 2 sentences per response. If the answer is one sentence — use ONE sentence. Do not add pleasantries or padding. FILLER SOUNDS (already handled by code — do NOT add these in your text): The system adds "Juo,", "Haa,", "Acha," automatically. Do not start responses with filler words yourself. """.strip() def build_hindi_system_prompt_addon() -> str: return """ HINDI LANGUAGE STYLE — MANDATORY RULES: Conversational Hindi on a phone call. Mix Hindi grammar with English nouns. CORRECT examples: ✅ "Haan, appointment available hai. Aapka naam kya hai?" ✅ "3 baje slot hai. Chalega?" ✅ "Ek minute, check karti hoon." ✅ "Doctor ke saath consultation ₹200 hai." ✅ "OK, booking confirm ho gayi." WRONG (too formal): ❌ "Haan, niyukti upalabdh hai." ❌ "Tin baje ka sthan hai." SHORT RESPONSES ONLY. Maximum 2 sentences. Max 10 words per sentence. """.strip()