Spaces:

mohanbot799s
/

civicconnect-ai-engine

Sleeping

App Files Files Community

MOHAN799S committed on Mar 12

Commit

3ba902d

1 Parent(s): a349f20

Fix: convert Kannada→Telugu script before storing (Whisper quirk)

Browse files

Files changed (1) hide show

multi_modal/audio_to_text.py +38 -1

multi_modal/audio_to_text.py CHANGED Viewed

@@ -39,6 +39,7 @@ _VALID_SCRIPTS = {
     "LATIN",        # English
     "DEVANAGARI",   # Hindi
     "TELUGU",       # Telugu
     "COMMON",       # punctuation, digits, spaces
 }
@@ -54,11 +55,45 @@ _ALLOWED_LANGUAGES = {"en", "te", "hi"}
 # This prevents Telugu audio from being accepted as Hindi.
 _LANG_EXPECTED_SCRIPT = {
     "en": {"LATIN"},
-    "te": {"TELUGU"},
     "hi": {"DEVANAGARI"},
 }
 # ── Load Whisper ONCE at import time ──────────────────────────────────────────
 if _AUDIO_BACKEND == "local":
     print(f"🔄 Loading Whisper '{MODEL_ID}' on {_DEVICE}…")
@@ -272,6 +307,7 @@ def _transcribe_local(audio_file) -> str:
                             print(f"[audio_to_text] script mismatch: forced {lang} but got {dominant} — trying next")
                             continue
                     print(f"[audio_to_text] OK lang={lang} | "
                           f"{len(text)} chars: {text[:100]}")
                     return text
@@ -425,6 +461,7 @@ def _transcribe_via_hf_api(audio_file) -> str:
                 text = ""
             if _is_valid_transcription(text):
                 print(f"[audio_to_text] HF API OK: {len(text)} chars: {text[:100]}")
                 return text
             else:

     "LATIN",        # English
     "DEVANAGARI",   # Hindi
     "TELUGU",       # Telugu
+    "KANNADA",      # Whisper sometimes outputs Kannada for Telugu audio
     "COMMON",       # punctuation, digits, spaces
 }
 # This prevents Telugu audio from being accepted as Hindi.
 _LANG_EXPECTED_SCRIPT = {
     "en": {"LATIN"},
+    "te": {"TELUGU", "KANNADA"},  # Whisper may use Kannada script for Telugu
     "hi": {"DEVANAGARI"},
 }
+# ─────────────────────────────────────────────────────────────────────────────
+# KANNADA → TELUGU SCRIPT FIX
+# Whisper sometimes outputs Telugu audio in Kannada script (very similar glyphs).
+# We convert Kannada codepoints → Telugu so stored text is always Telugu script.
+# ─────────────────────────────────────────────────────────────────────────────
# Kannada → Telugu character map. Keys MUST be code points from the Kannada
# block (U+0C80–U+0CFF); values are their Telugu counterparts.
# Several glyphs look almost identical across the two scripts, so the entries
# that are easy to mistype are spelled with \N{...} names instead of literals.
_KANNADA_TO_TELUGU = {
    # Independent vowels
    "ಅ": "అ", "ಆ": "ఆ", "ಇ": "ఇ", "ಈ": "ఈ", "ಉ": "ఉ", "ಊ": "ఊ", "ಋ": "ఋ",
    "ಎ": "ఎ", "ಏ": "ఏ", "ಐ": "ఐ", "ಒ": "ఒ", "ಓ": "ఓ", "ಔ": "ఔ",
    "\N{KANNADA LETTER VOCALIC RR}": "\N{TELUGU LETTER VOCALIC RR}",
    # Dependent vowel signs
    "ಾ": "ా", "ಿ": "ి", "ು": "ు", "ೃ": "ృ",
    # The long II / UU signs previously had Telugu characters as keys, so
    # they were identity mappings and Kannada ೀ / ೂ were never converted.
    "\N{KANNADA VOWEL SIGN II}": "\N{TELUGU VOWEL SIGN II}",
    "\N{KANNADA VOWEL SIGN UU}": "\N{TELUGU VOWEL SIGN UU}",
    "\N{KANNADA VOWEL SIGN VOCALIC RR}": "\N{TELUGU VOWEL SIGN VOCALIC RR}",
    "ೆ": "ె", "ೇ": "ే", "ೈ": "ై", "ೊ": "ొ", "ೋ": "ో", "ೌ": "ౌ",
    # Anusvara, visarga, nukta
    "ಂ": "ం", "ಃ": "ః", "಼": "఼",
    # Consonants
    "ಕ": "క", "ಖ": "ఖ", "ಗ": "గ", "ಘ": "ఘ", "ಙ": "ఙ",
    "ಚ": "చ", "ಛ": "ఛ", "ಜ": "జ", "ಝ": "ఝ", "ಞ": "ఞ",
    "ಟ": "ట", "ಠ": "ఠ", "ಡ": "డ", "ಢ": "ఢ", "ಣ": "ణ",
    "ತ": "త", "ಥ": "థ", "ದ": "ద", "ಧ": "ధ", "ನ": "న",
    "ಪ": "ప", "ಫ": "ఫ", "ಬ": "బ", "ಭ": "భ", "ಮ": "మ",
    "ಯ": "య", "ರ": "ర", "ಲ": "ల", "ವ": "వ", "ಶ": "శ",
    "ಷ": "ష", "ಸ": "స", "ಹ": "హ", "ಳ": "ళ",
    "\N{KANNADA LETTER RRA}": "\N{TELUGU LETTER RRA}",
    # Virama (halant)
    "್": "్",
    # Digits
    "೦": "౦", "೧": "౧", "೨": "౨", "೩": "౩", "೪": "౪",
    "೫": "౫", "೬": "౬", "೭": "౭", "೮": "౮", "೯": "౯",
}

# Precomputed translation table so conversion is a single C-level pass.
_KANNADA_TO_TELUGU_TABLE = str.maketrans(_KANNADA_TO_TELUGU)

# Inclusive bounds of the Unicode Kannada block.
_KANNADA_BLOCK_FIRST = "\u0c80"
_KANNADA_BLOCK_LAST = "\u0cff"


def fix_script(text: str) -> str:
    """Convert Kannada-script characters in *text* to Telugu script.

    Whisper sometimes transcribes Telugu audio using Kannada code points.
    Every Kannada character with an entry in ``_KANNADA_TO_TELUGU`` is
    replaced by its Telugu counterpart; all other characters (Latin, Telugu,
    digits, punctuation, unmapped Kannada signs) pass through unchanged.

    Args:
        text: Transcription text, possibly containing Kannada characters.

    Returns:
        The converted string, or *text* itself when it is empty or contains
        no character from the Kannada block.
    """
    if not text:
        return text

    # Cheap range test instead of a unicodedata.name() lookup per character.
    has_kannada = any(
        _KANNADA_BLOCK_FIRST <= ch <= _KANNADA_BLOCK_LAST for ch in text
    )
    if not has_kannada:
        return text

    converted = text.translate(_KANNADA_TO_TELUGU_TABLE)
    print(f"[audio_to_text] Kannada→Telugu fix: {text[:40]!r} → {converted[:40]!r}")
    return converted
 # ── Load Whisper ONCE at import time ──────────────────────────────────────────
 if _AUDIO_BACKEND == "local":
     print(f"🔄 Loading Whisper '{MODEL_ID}' on {_DEVICE}…")
                             print(f"[audio_to_text] script mismatch: forced {lang} but got {dominant} — trying next")
                             continue
+                    text = fix_script(text)  # Kannada→Telugu if needed
                     print(f"[audio_to_text] OK lang={lang} | "
                           f"{len(text)} chars: {text[:100]}")
                     return text
                 text = ""
             if _is_valid_transcription(text):
+                text = fix_script(text)  # Kannada→Telugu if Whisper used wrong script
                 print(f"[audio_to_text] HF API OK: {len(text)} chars: {text[:100]}")
                 return text
             else: