File size: 7,111 Bytes
ebf9701
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
"""
gujarati_processor.py β€” Gujarati Language Post-Processor for Maya

THREE FUNCTIONS:
  1. correct_transcript(text) β€” fixes common STT errors in Gujarati
  2. add_natural_fillers(text) β€” adds human-like filler words to TTS output
  3. build_gujarati_system_prompt_addon() β€” returns the code-mixing
     instruction block to append to Maya's system prompt
"""

import re
import random
from typing import Optional


# ── TRANSCRIPT CORRECTIONS ────────────────────────────────────────────────────

# Regex pattern -> replacement text. correct_transcript() applies the rules in
# insertion order with re.IGNORECASE, so the order of the entries matters.
TRANSCRIPT_CORRECTIONS = {
    # --- Words misheard in a phone-call context ---
    # "shu" (what) comes back as "shoe"
    r"\bshoe\b": "shu",
    # greeting variants
    r"\bkhem cho\b": "kem cho",
    r"\bchem cho\b": "kem cho",
    # "kal" (tomorrow) followed by another word: spelling kept, case normalised
    r"\bkal\b(?= [a-z])": "kal",
    # "kal" is sometimes transcribed as "call" (original author's note on Sarvam)
    r"\bcall\b(?= [a-z])": "kal",

    # --- Appointment / dental vocabulary ---
    r"\bapoinment\b": "appointment",
    r"\bapointment\b": "appointment",
    r"\bdocter\b": "doctor",
    r"\bdenter\b": "dentist",
    r"\bcleeaning\b": "cleaning",
    r"\bclining\b": "cleaning",

    # --- Time expressions ---
    # "baje" (o'clock in Gujarati)
    r"\bbaze\b": "baje",
    r"\bbaj\b(?=\s)": "baje",
    # "vage" (o'clock variant)
    r"\bvagye\b": "vage",
    r"\bvaagye\b": "vage",

    # --- Ahmedabad locality names ---
    r"\bvastrar\b": "Vastrapur",
    r"\bsatelite\b": "Satellite",
    r"\bnavrang\b": "Navrangpura",
    r"\bnaranpur\b": "Naranpura",

    # --- Gujarati filler words ---
    # "haa" (yes/okay)
    r"\bhan\b": "haa",
    # "theek" (okay)
    r"\bthik\b": "theek",
}


def correct_transcript(text: str) -> str:
    """Apply every TRANSCRIPT_CORRECTIONS rule to ``text`` and return the result.

    Rules run in the table's insertion order; each one is a case-insensitive
    ``re.sub`` over the output of the previous rule. Falsy input (an empty
    string or ``None``) is handed back unchanged.
    """
    if text:
        for regex, fixed in TRANSCRIPT_CORRECTIONS.items():
            text = re.sub(regex, fixed, text, flags=re.IGNORECASE)
    return text


# ── NATURAL GUJARATI FILLERS ──────────────────────────────────────────────────

GUJARATI_FILLERS = [
    "Juo, ",           # "Look, " — very natural in Gujarati
    "Haa, ",           # "Yes, " — affirmative opener
    "Acha, ",          # "Okay, " — acknowledgment
    "Bilkul, ",        # "Absolutely, "
    "Theek chhe, ",    # "That's fine, "
    "Samjhi gayi, ",   # "I understand, " (Maya referring to herself)
]

HINDI_FILLERS = [
    "Jee, ",           # "Yes, "
    "Haan, ",          # "Yes/Okay"
    "Achha, ",         # "Okay"
    "Bilkul, ",        # "Absolutely"
    "Theek hai, ",     # "That's fine"
    "Samajh gayi, ",   # "I understand"
]

ENGLISH_FILLERS = [
    "Sure, ",
    "Of course, ",
    "Absolutely, ",
    "Got it, ",
    "I see, ",
]

# Language name (lowercase) -> filler list used by add_natural_fillers().
FILLER_MAP = {
    "gujarati": GUJARATI_FILLERS,
    "hindi":    HINDI_FILLERS,
    "english":  ENGLISH_FILLERS,
}

# Words that already start the response naturally — don't add filler before these.
# NOTE(review): add_natural_fillers() matches these as case-sensitive PREFIXES,
# so "I" also matches "Insurance ..." and "No" matches "Now ...". That is kept
# as-is because some entries appear to rely on it ("Maaf" -> "Maafi maango").
NATURAL_STARTERS = [
    # Gujarati
    "Namaste", "Haa", "Juo", "Acha", "Bilkul", "Theek", "Samjhi",
    "3", "4", "5", "6",   # Don't add filler before times/numbers
    "kal", "aaj", "parso",
    # Hindi
    "Namaste", "Haan", "Achha", "Bilkul",
    # English
    "Hello", "Sure", "Of course", "Yes", "No", "I",
    # Empathy openers from E2 — already natural
    "Mane", "Hu samjhi", "Maaf", "Juo samjho",
]


def add_natural_fillers(
    text:     str,
    language: str = "gujarati",
    probability: float = 0.35,
) -> str:
    """Randomly prepend a conversational filler ("Juo, ", "Sure, ", ...) to ``text``.

    The text is returned unchanged when any of these holds:
      * it is empty or shorter than 15 characters;
      * ignoring leading whitespace, it already begins with one of
        NATURAL_STARTERS;
      * it is a short question (ends with "?" and has fewer than 8 words);
      * the random draw is not below ``probability``.

    Otherwise a filler is picked at random from the list for ``language``,
    leading whitespace is dropped, and the first letter of the text is
    lowercased unless the text opens with an all-caps token such as "OK".

    Args:
        text: Response text that will be spoken.
        language: Key into FILLER_MAP. String values are matched
            case-insensitively with surrounding whitespace ignored; unknown
            values fall back to the Gujarati fillers.
        probability: Chance in [0, 1] that a filler is added once the text
            has passed the checks above.

    Returns:
        Either ``text`` itself or ``filler + text`` as described.
    """
    if not text or len(text) < 15:
        return text

    # Work on a copy without leading whitespace so that a response starting
    # with a space or newline is judged the same as one without it.
    body = text.lstrip()
    if not body:
        return text

    # Don't add filler to already-natural openers
    if body.startswith(tuple(NATURAL_STARTERS)):
        return text

    # Don't add filler before short, pure questions
    if body.rstrip().endswith("?") and len(body.split()) < 8:
        return text

    # Probabilistic application
    if random.random() >= probability:
        return text

    # Normalise the language key so "Hindi" / " english " select the right
    # list instead of silently falling back to Gujarati.
    key = language.strip().lower() if isinstance(language, str) else language
    fillers = FILLER_MAP.get(key, FILLER_MAP["gujarati"])
    chosen = random.choice(fillers)

    # Lowercase the first letter after the filler, but leave all-caps openers
    # like "OK" untouched.
    if body[0].isupper() and not body[:2].isupper():
        body = body[0].lower() + body[1:]

    return chosen + body


# ── SYSTEM PROMPT ADDON ───────────────────────────────────────────────────────

def build_gujarati_system_prompt_addon() -> str:
    """Return the Gujarati code-mixing instruction block for the system prompt.

    The text is a fixed literal with surrounding whitespace stripped, so the
    caller must supply its own separator when appending it to another prompt.
    The literal holds real Unicode characters (em dash, check mark, rupee
    sign) instead of the mis-decoded byte sequences found here previously.
    """
    return """

GUJARATI LANGUAGE STYLE — MANDATORY RULES:

You are speaking CONVERSATIONAL Gujarati on a phone call.
Real Gujarati speakers mix Gujarati grammar with English nouns.
This is called "code-mixing" and it sounds completely natural.

CORRECT examples of how to respond:
  ✅ "Haa, appointment available chhe. Tamaro naam shu chhe?"
  ✅ "3 baje slot chhe. Chaleshe?"
  ✅ "Ek minute, check karu chhu."
  ✅ "Doctor sathe consultation ₹200 chhe."
  ✅ "OK, booking confirm thi gayi."
  ✅ "Maafi maango, 3 baje busy chhe. 4 baje available chhe?"

WRONG — never say these (too formal, nobody speaks like this):
  ❌ "Haa, niyuktisthan upalabdh chhe. Tamaro naam shu chhe?"
  ❌ "Trann vage ni jagya chhe. Svikar karso?"
  ❌ "Ek pal, parischay karu chhu."
  ❌ "Vaidya sathe paramarsh ₹200 chhe."
  ❌ "Theek, pratishtha nischit thi gayi."

GRAMMAR RULES for natural Gujarati:
  - Use Gujarati verb endings: "chhe", "chhu", "karu", "karso", "thi gayi"
  - Use English nouns directly: "appointment", "doctor", "booking", "slot"
  - Use English numbers with Gujarati time: "3 baje", "4 vage", "10 minute"
  - Short sentences only. Maximum 10 words per sentence.
  - Never use long Sanskrit-derived Gujarati compound words.
  - Say "check karu chhu" not "thaapan karu chhu"
  - Say "confirm thi gayi" not "nischit thi gayi"
  - Say "available chhe" not "upalabdh chhe"

RESPONSE LENGTH:
  Phone call responses must be SHORT.
  Maximum 2 sentences per response.
  If the answer is one sentence — use ONE sentence.
  Do not add pleasantries or padding.

FILLER SOUNDS (already handled by code — do NOT add these in your text):
  The system adds "Juo,", "Haa,", "Acha," automatically.
  Do not start responses with filler words yourself.
""".strip()


def build_hindi_system_prompt_addon() -> str:
    """Return the Hindi code-mixing instruction block for the system prompt.

    The text is a fixed literal with surrounding whitespace stripped, so the
    caller must supply its own separator when appending it to another prompt.
    The literal holds real Unicode characters (em dash, check mark, rupee
    sign) instead of the mis-decoded byte sequences found here previously.
    """
    return """

HINDI LANGUAGE STYLE — MANDATORY RULES:

Conversational Hindi on a phone call. Mix Hindi grammar with English nouns.

CORRECT examples:
  ✅ "Haan, appointment available hai. Aapka naam kya hai?"
  ✅ "3 baje slot hai. Chalega?"
  ✅ "Ek minute, check karti hoon."
  ✅ "Doctor ke saath consultation ₹200 hai."
  ✅ "OK, booking confirm ho gayi."

WRONG (too formal):
  ❌ "Haan, niyukti upalabdh hai."
  ❌ "Tin baje ka sthan hai."

SHORT RESPONSES ONLY. Maximum 2 sentences. Max 10 words per sentence.
""".strip()