Spaces:
Paused
Paused
"""
gujarati_processor.py — Gujarati Language Post-Processor for Maya
FOUR FUNCTIONS:
1. correct_transcript(text) → fixes common STT errors in Gujarati
2. add_natural_fillers(text) → adds human-like filler words to TTS output
3. build_gujarati_system_prompt_addon() → returns the code-mixing
   instruction block to append to Maya's system prompt
4. build_hindi_system_prompt_addon() → returns the equivalent
   instruction block for Hindi
"""
| import re | |
| import random | |
| from typing import Optional | |
# -- TRANSCRIPT CORRECTIONS ---------------------------------------------------
# Maps a regex (matched case-insensitively by correct_transcript) to the text
# that replaces every match. Entries are applied in the order written here.
TRANSCRIPT_CORRECTIONS = {
    # --- Words misheard in a phone-call context ---
    # "shu" (what) is heard as "shoe".
    r"\bshoe\b": "shu",
    # Greeting variants.
    r"\bkhem cho\b": "kem cho",
    r"\bchem cho\b": "kem cho",
    # "kal" (tomorrow) followed by a space and a letter is kept as "kal";
    # in practice this only normalises its letter case.
    r"\bkal\b(?= [a-z])": "kal",
    # Sarvam sometimes mishears "kal" as "call"; rewritten when followed by
    # a space and a letter.
    r"\bcall\b(?= [a-z])": "kal",

    # --- Appointment / dental vocabulary ---
    r"\bapoinment\b": "appointment",
    r"\bapointment\b": "appointment",
    r"\bdocter\b": "doctor",
    r"\bdenter\b": "dentist",
    r"\bcleeaning\b": "cleaning",
    r"\bclining\b": "cleaning",

    # --- Time expressions Sarvam sometimes transcribes wrongly ---
    # "baje" means o'clock in Gujarati.
    r"\bbaze\b": "baje",
    r"\bbaj\b(?=\s)": "baje",
    # "vage" is an o'clock variant.
    r"\bvagye\b": "vage",
    r"\bvaagye\b": "vage",

    # --- Ahmedabad locality names (commonly mispronounced) ---
    r"\bvastrar\b": "Vastrapur",
    r"\bsatelite\b": "Satellite",
    r"\bnavrang\b": "Navrangpura",
    r"\bnaranpur\b": "Naranpura",

    # --- Gujarati filler words Sarvam sometimes misses ---
    # "haa" = yes/okay, "theek" = okay.
    r"\bhan\b": "haa",
    r"\bthik\b": "theek",
}
def correct_transcript(text: str) -> str:
    """Apply every known speech-to-text correction to a transcript.

    Each regex in ``TRANSCRIPT_CORRECTIONS`` is substituted in turn,
    case-insensitively and in the mapping's own order. Every pass runs on
    the output of the previous one.

    Args:
        text: Raw transcript. A falsy value (empty string, ``None``) is
            returned unchanged.

    Returns:
        The transcript with all matching patterns replaced.
    """
    if text:
        for regex, fixed in TRANSCRIPT_CORRECTIONS.items():
            text = re.compile(regex, re.IGNORECASE).sub(fixed, text)
    return text
# -- NATURAL GUJARATI FILLERS -------------------------------------------------
# Each filler already carries its trailing ", " so it can be prepended
# directly to a response.

# Gujarati openers, in order: "Look," (very natural in Gujarati) / "Yes,"
# (affirmative opener) / "Okay," (acknowledgment) / "Absolutely," /
# "That's fine," / "I understand," (Maya referring to herself).
GUJARATI_FILLERS = [
    "Juo, ",
    "Haa, ",
    "Acha, ",
    "Bilkul, ",
    "Theek chhe, ",
    "Samjhi gayi, ",
]

# Hindi openers, in order: "Yes," / "Yes/Okay," / "Okay," / "Absolutely," /
# "That's fine," / "I understand,".
HINDI_FILLERS = [
    "Jee, ",
    "Haan, ",
    "Achha, ",
    "Bilkul, ",
    "Theek hai, ",
    "Samajh gayi, ",
]

# English openers.
ENGLISH_FILLERS = [
    "Sure, ",
    "Of course, ",
    "Absolutely, ",
    "Got it, ",
    "I see, ",
]

# Language name -> list of fillers for that language.
FILLER_MAP = {
    "gujarati": GUJARATI_FILLERS,
    "hindi": HINDI_FILLERS,
    "english": ENGLISH_FILLERS,
}
# Openers that already start a response naturally; add_natural_fillers()
# leaves text beginning with one of these untouched. Every filler the system
# can add itself is represented here, so a reply that already opens with one
# never gets a second filler stacked in front of it.
NATURAL_STARTERS = [
    # Gujarati
    "Namaste", "Haa", "Juo", "Acha", "Bilkul", "Theek", "Samjhi",
    # Don't add a filler before times/numbers
    "3", "4", "5", "6",
    # Day words
    "kal", "aaj", "parso",
    # Hindi ("Namaste" and "Bilkul" are already listed under Gujarati)
    "Jee", "Haan", "Achha", "Samajh",
    # English
    "Hello", "Sure", "Of course", "Absolutely", "Got it", "Yes", "No", "I",
    # Empathy openers from E2 - already natural
    "Mane", "Hu samjhi", "Maaf", "Juo samjho",
]
def _starts_with_natural_opener(text: str) -> bool:
    """Return True if *text* opens with a number or a NATURAL_STARTERS entry."""
    # Any leading digit counts as a time/number, not only 3-6.
    if text[:1].isdigit():
        return True
    folded = text.casefold()
    for starter in NATURAL_STARTERS:
        prefix = starter.casefold()
        if not folded.startswith(prefix):
            continue
        # Require a word boundary after the opener so that "I" does not
        # swallow "It is ..." and "No" does not swallow "Now ...".
        # An empty string (end of text) is not alphanumeric, so it matches.
        following = folded[len(prefix):len(prefix) + 1]
        if not following.isalnum():
            return True
    return False


def add_natural_fillers(
    text: str,
    language: str = "gujarati",
    probability: float = 0.35,
) -> str:
    """Randomly prepend a human-sounding filler word to a response.

    The text is returned unchanged when it is empty or shorter than 15
    characters, when it already opens with a natural starter or a number,
    when it is a short question (fewer than 8 words ending in "?"), or when
    the random draw is not below ``probability``.

    Args:
        text: Response text to decorate.
        language: Key into ``FILLER_MAP``; matched case-insensitively.
            Unknown or non-string values fall back to Gujarati fillers.
        probability: Chance of adding a filler; a filler is added when
            ``random.random() < probability``.

    Returns:
        Either the original ``text``, or a filler followed by the text with
        leading whitespace removed and its first letter lowercased.
    """
    if not text:
        return text

    # Leading whitespace would defeat both the opener check and the
    # capitalisation handling below, so all checks run on the stripped body.
    body = text.lstrip()
    if len(body) < 15:
        return text

    # Don't add a filler to an already-natural opener.
    if _starts_with_natural_opener(body):
        return text

    # Don't add a filler before a short, pure question.
    if body.rstrip().endswith("?") and len(body.split()) < 8:
        return text

    # Probabilistic application.
    if random.random() >= probability:
        return text

    # Normalise the key so "Hindi" / " hindi " select the Hindi fillers
    # instead of silently falling back to Gujarati.
    key = language.strip().lower() if isinstance(language, str) else ""
    fillers = FILLER_MAP.get(key) or FILLER_MAP["gujarati"]
    chosen = random.choice(fillers)

    # Lowercase the first letter now that it is mid-sentence, unless the
    # word looks like an acronym (second character also uppercase, "OK").
    # Testing the second character directly also handles "A slot ...",
    # where text[:2].isupper() would wrongly report an acronym.
    if body[0].isupper() and not body[1].isupper():
        body = body[0].lower() + body[1:]
    return chosen + body
# -- SYSTEM PROMPT ADDON ------------------------------------------------------
def build_gujarati_system_prompt_addon() -> str:
    """Return the Gujarati code-mixing instruction block for the system prompt.

    The block tells the model to speak conversational Gujarati mixed with
    English nouns, gives correct and wrong example replies, lists grammar
    rules, limits response length, and tells it not to add filler words.

    Returns:
        The instruction text with surrounding whitespace stripped.
    """
    return """
GUJARATI LANGUAGE STYLE — MANDATORY RULES:
You are speaking CONVERSATIONAL Gujarati on a phone call.
Real Gujarati speakers mix Gujarati grammar with English nouns.
This is called "code-mixing" and it sounds completely natural.
CORRECT examples of how to respond:
✓ "Haa, appointment available chhe. Tamaro naam shu chhe?"
✓ "3 baje slot chhe. Chaleshe?"
✓ "Ek minute, check karu chhu."
✓ "Doctor sathe consultation ₹200 chhe."
✓ "OK, booking confirm thi gayi."
✓ "Maafi maango, 3 baje busy chhe. 4 baje available chhe?"
WRONG — never say these (too formal, nobody speaks like this):
✗ "Haa, niyuktisthan upalabdh chhe. Tamaro naam shu chhe?"
✗ "Trann vage ni jagya chhe. Svikar karso?"
✗ "Ek pal, parischay karu chhu."
✗ "Vaidya sathe paramarsh ₹200 chhe."
✗ "Theek, pratishtha nischit thi gayi."
GRAMMAR RULES for natural Gujarati:
- Use Gujarati verb endings: "chhe", "chhu", "karu", "karso", "thi gayi"
- Use English nouns directly: "appointment", "doctor", "booking", "slot"
- Use English numbers with Gujarati time: "3 baje", "4 vage", "10 minute"
- Short sentences only. Maximum 10 words per sentence.
- Never use long Sanskrit-derived Gujarati compound words.
- Say "check karu chhu" not "thaapan karu chhu"
- Say "confirm thi gayi" not "nischit thi gayi"
- Say "available chhe" not "upalabdh chhe"
RESPONSE LENGTH:
Phone call responses must be SHORT.
Maximum 2 sentences per response.
If the answer is one sentence — use ONE sentence.
Do not add pleasantries or padding.
FILLER SOUNDS (already handled by code — do NOT add these in your text):
The system adds "Juo,", "Haa,", "Acha," automatically.
Do not start responses with filler words yourself.
""".strip()
def build_hindi_system_prompt_addon() -> str:
    """Return the Hindi code-mixing instruction block for the system prompt.

    The block tells the model to speak conversational Hindi mixed with
    English nouns, gives correct and too-formal example replies, and limits
    response length.

    Returns:
        The instruction text with surrounding whitespace stripped.
    """
    return """
HINDI LANGUAGE STYLE — MANDATORY RULES:
Conversational Hindi on a phone call. Mix Hindi grammar with English nouns.
CORRECT examples:
✓ "Haan, appointment available hai. Aapka naam kya hai?"
✓ "3 baje slot hai. Chalega?"
✓ "Ek minute, check karti hoon."
✓ "Doctor ke saath consultation ₹200 hai."
✓ "OK, booking confirm ho gayi."
WRONG (too formal):
✗ "Haan, niyukti upalabdh hai."
✗ "Tin baje ka sthan hai."
SHORT RESPONSES ONLY. Maximum 2 sentences. Max 10 words per sentence.
""".strip()