MOHAN799S committed on
Commit
3ba902d
·
1 Parent(s): a349f20

Fix: convert Kannada→Telugu script before storing (Whisper quirk)

Browse files
Files changed (1) hide show
  1. multi_modal/audio_to_text.py +38 -1
multi_modal/audio_to_text.py CHANGED
@@ -39,6 +39,7 @@ _VALID_SCRIPTS = {
39
  "LATIN", # English
40
  "DEVANAGARI", # Hindi
41
  "TELUGU", # Telugu
 
42
  "COMMON", # punctuation, digits, spaces
43
  }
44
 
@@ -54,11 +55,45 @@ _ALLOWED_LANGUAGES = {"en", "te", "hi"}
54
  # This prevents Telugu audio from being accepted as Hindi.
55
  _LANG_EXPECTED_SCRIPT = {
56
  "en": {"LATIN"},
57
- "te": {"TELUGU"},
58
  "hi": {"DEVANAGARI"},
59
  }
60
 
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  # ── Load Whisper ONCE at import time ──────────────────────────────────────────
63
  if _AUDIO_BACKEND == "local":
64
  print(f"🔄 Loading Whisper '{MODEL_ID}' on {_DEVICE}…")
@@ -272,6 +307,7 @@ def _transcribe_local(audio_file) -> str:
272
  print(f"[audio_to_text] script mismatch: forced {lang} but got {dominant} — trying next")
273
  continue
274
 
 
275
  print(f"[audio_to_text] OK lang={lang} | "
276
  f"{len(text)} chars: {text[:100]}")
277
  return text
@@ -425,6 +461,7 @@ def _transcribe_via_hf_api(audio_file) -> str:
425
  text = ""
426
 
427
  if _is_valid_transcription(text):
 
428
  print(f"[audio_to_text] HF API OK: {len(text)} chars: {text[:100]}")
429
  return text
430
  else:
 
39
  "LATIN", # English
40
  "DEVANAGARI", # Hindi
41
  "TELUGU", # Telugu
42
+ "KANNADA", # Whisper sometimes outputs Kannada for Telugu audio
43
  "COMMON", # punctuation, digits, spaces
44
  }
45
 
 
55
  # This prevents Telugu audio from being accepted as Hindi.
56
  _LANG_EXPECTED_SCRIPT = {
57
  "en": {"LATIN"},
58
+ "te": {"TELUGU", "KANNADA"}, # Whisper may use Kannada script for Telugu
59
  "hi": {"DEVANAGARI"},
60
  }
61
 
62
 
63
+
64
+
65
+ # ─────────────────────────────────────────────────────────────────────────────
66
+ # KANNADA → TELUGU SCRIPT FIX
67
+ # Whisper sometimes outputs Telugu audio in Kannada script (very similar glyphs).
68
+ # We convert Kannada codepoints → Telugu so stored text is always Telugu script.
69
+ # ─────────────────────────────────────────────────────────────────────────────
70
+ _KANNADA_TO_TELUGU = {
71
+ "ಅ":"అ","ಆ":"ఆ","ಇ":"ఇ","ಈ":"ఈ","ಉ":"ఉ","ಊ":"ఊ","ಋ":"ఋ",
72
+ "ಎ":"ఎ","ಏ":"ఏ","ಐ":"ఐ","ಒ":"ఒ","ಓ":"ఓ","ಔ":"ఔ",
73
+ "ಾ":"ా","ಿ":"ి","ీ":"ీ","ು":"ు","ూ":"ూ","ೃ":"ృ",
74
+ "ೆ":"ె","ೇ":"ే","ೈ":"ై","ೊ":"ొ","ೋ":"ో","ೌ":"ౌ",
75
+ "ಂ":"ం","ಃ":"ః","಼":"఼",
76
+ "ಕ":"క","ಖ":"ఖ","ಗ":"గ","ಘ":"ఘ","ಙ":"ఙ",
77
+ "ಚ":"చ","ಛ":"ఛ","ಜ":"జ","ಝ":"ఝ","ಞ":"ఞ",
78
+ "ಟ":"ట","ಠ":"ఠ","ಡ":"డ","ಢ":"ఢ","ಣ":"ణ",
79
+ "ತ":"త","ಥ":"థ","ದ":"ద","ಧ":"ధ","ನ":"న",
80
+ "ಪ":"ప","ಫ":"ఫ","ಬ":"బ","ಭ":"భ","ಮ":"మ",
81
+ "ಯ":"య","ರ":"ర","ಲ":"ల","ವ":"వ","ಶ":"శ",
82
+ "ಷ":"ష","ಸ":"స","ಹ":"హ","ಳ":"ళ",
83
+ "್":"్",
84
+ "೦":"౦","೧":"౧","೨":"౨","೩":"౩","೪":"౪",
85
+ "೫":"౫","೬":"౬","೭":"౭","೮":"౮","೯":"౯",
86
+ }
87
+
88
+ def fix_script(text: str) -> str:
89
+ """Convert Kannada script → Telugu if Whisper used wrong script for Telugu audio."""
90
+ import unicodedata
91
+ if any(unicodedata.name(ch, "").startswith("KANNADA") for ch in text if ch.strip()):
92
+ converted = "".join(_KANNADA_TO_TELUGU.get(ch, ch) for ch in text)
93
+ print(f"[audio_to_text] Kannada→Telugu fix: {text[:40]!r} → {converted[:40]!r}")
94
+ return converted
95
+ return text
96
+
97
  # ── Load Whisper ONCE at import time ──────────────────────────────────────────
98
  if _AUDIO_BACKEND == "local":
99
  print(f"🔄 Loading Whisper '{MODEL_ID}' on {_DEVICE}…")
 
307
  print(f"[audio_to_text] script mismatch: forced {lang} but got {dominant} — trying next")
308
  continue
309
 
310
+ text = fix_script(text) # Kannada→Telugu if needed
311
  print(f"[audio_to_text] OK lang={lang} | "
312
  f"{len(text)} chars: {text[:100]}")
313
  return text
 
461
  text = ""
462
 
463
  if _is_valid_transcription(text):
464
+ text = fix_script(text) # Kannada→Telugu if Whisper used wrong script
465
  print(f"[audio_to_text] HF API OK: {len(text)} chars: {text[:100]}")
466
  return text
467
  else: