# =========================================================
# multi_modal/audio_to_text.py
#
# Converts an uploaded audio file to text using Whisper.
#
# Supports: WAV, MP3, OGG, FLAC, M4A, WEBM (mobile browsers)
# Languages: Telugu / Hindi / English (forced, no random scripts)
#
# FIXES vs previous version:
# 1. Hallucination detection — Georgian/Chinese/Arabic output
# (ვვვვ... etc.) is detected and discarded, returns ""
#  2. Language forcing — tries EN → TE → HI in order instead
#     of pure auto-detect, which picks random scripts
# 3. Valid script check — only accepts Latin, Telugu,
# Devanagari output. Anything else = hallucination.
# 4. 500 error fix — empty/invalid transcription now safely
# returns "" instead of passing garbage to BERT classifier
# =========================================================
import os
import tempfile
import unicodedata
import torch
import numpy as np
from transformers import pipeline
# ── Environment ────────────────────────────────────────────────────────────────
_AUDIO_BACKEND = os.environ.get("AUDIO_BACKEND", "local") # "local" | "hf_api"
_HF_TOKEN = os.environ.get("HF_TOKEN", "")
# ── Model selection ────────────────────────────────────────────────────────────
MODEL_ID = os.environ.get("WHISPER_MODEL", "openai/whisper-small")
_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
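# e.g. to swap checkpoints at deploy time without code changes (sketch;
# "openai/whisper-medium" is just an illustrative checkpoint id):
#   export WHISPER_MODEL=openai/whisper-medium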
# ── Valid Unicode scripts for EN / HI / TE ────────────────────────────────────
# Whisper hallucinates Georgian (ვ), Chinese (的), Arabic (ال) on bad audio.
# Only these script prefixes (from unicodedata.name) are accepted as real output.
_VALID_SCRIPTS = {
"LATIN", # English
"DEVANAGARI", # Hindi
"TELUGU", # Telugu
"KANNADA", # Whisper sometimes outputs Kannada for Telugu audio
"COMMON", # punctuation, digits, spaces
}
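# Illustrative REPL check of the prefix rule used below (the first word of
# unicodedata.name() is treated as the script):
#   unicodedata.name("అ")  -> "TELUGU LETTER A"            -> "TELUGU"     (accepted)
#   unicodedata.name("अ")  -> "DEVANAGARI LETTER A"        -> "DEVANAGARI" (accepted)
#   unicodedata.name("ვ")  -> "GEORGIAN LETTER VIN"        -> "GEORGIAN"   (rejected)
#   unicodedata.name("的") -> "CJK UNIFIED IDEOGRAPH-7684" -> "CJK"        (rejected)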
# Languages tried in order.
# EN first — fastest for English audio (most common).
# Only these 3 are permitted — no other language accepted.
_LANGUAGE_ORDER = ["en", "te", "hi"]
_ALLOWED_LANGUAGES = {"en", "te", "hi"}
# Expected dominant script per forced language.
# If we force "te" but get back Devanagari-heavy text, it is wrong.
# If we force "hi" but get back Telugu-heavy text, it is wrong.
# This prevents Telugu audio from being accepted as Hindi.
_LANG_EXPECTED_SCRIPT = {
"en": {"LATIN"},
"te": {"TELUGU", "KANNADA"}, # Whisper may use Kannada script for Telugu
"hi": {"DEVANAGARI"},
}
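# e.g. forcing "te" but getting back Devanagari text such as "सड़क पर गड्ढा है"
# means dominant script DEVANAGARI is not in {"TELUGU", "KANNADA"}, so the
# attempt is rejected and the next language is tried.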
# ─────────────────────────────────────────────────────────────────────────────
# KANNADA → TELUGU SCRIPT FIX
# Whisper sometimes outputs Telugu audio in Kannada script (very similar glyphs).
# We convert Kannada codepoints → Telugu so stored text is always Telugu script.
# ─────────────────────────────────────────────────────────────────────────────
_KANNADA_TO_TELUGU = {
"ಅ":"అ","ಆ":"ఆ","ಇ":"ఇ","ಈ":"ఈ","ಉ":"ఉ","ಊ":"ఊ","ಋ":"ఋ",
"ಎ":"ఎ","ಏ":"ఏ","ಐ":"ఐ","ಒ":"ఒ","ಓ":"ఓ","ಔ":"ఔ",
"ಾ":"ా","ಿ":"ి","ీ":"ీ","ು":"ు","ూ":"ూ","ೃ":"ృ",
"ೆ":"ె","ೇ":"ే","ೈ":"ై","ೊ":"ొ","ೋ":"ో","ೌ":"ౌ",
"ಂ":"ం","ಃ":"ః","಼":"఼",
"ಕ":"క","ಖ":"ఖ","ಗ":"గ","ಘ":"ఘ","ಙ":"ఙ",
"ಚ":"చ","ಛ":"ఛ","ಜ":"జ","ಝ":"ఝ","ಞ":"ఞ",
"ಟ":"ట","ಠ":"ఠ","ಡ":"డ","ಢ":"ఢ","ಣ":"ణ",
"ತ":"త","ಥ":"థ","ದ":"ద","ಧ":"ధ","ನ":"న",
"ಪ":"ప","ಫ":"ఫ","ಬ":"బ","ಭ":"భ","ಮ":"మ",
"ಯ":"య","ರ":"ర","ಲ":"ల","ವ":"వ","ಶ":"శ",
"ಷ":"ష","ಸ":"స","ಹ":"హ","ಳ":"ళ",
"್":"్",
"೦":"౦","೧":"౧","೨":"౨","೩":"౩","೪":"౪",
"೫":"౫","೬":"౬","೭":"౭","೮":"౮","೯":"౯",
}
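# e.g. (sketch, mapped character-by-character through the table above):
#   "ನಮಸ್ಕಾರ" (Kannada script)  ->  "నమస్కార" (Telugu script)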
def fix_script(text: str) -> str:
    """Convert Kannada script → Telugu if Whisper used the wrong script for Telugu audio."""
    if any(unicodedata.name(ch, "").startswith("KANNADA") for ch in text if ch.strip()):
        converted = "".join(_KANNADA_TO_TELUGU.get(ch, ch) for ch in text)
        print(f"[audio_to_text] Kannada→Telugu fix: {text[:40]!r} → {converted[:40]!r}")
        return converted
    return text
# ── Load Whisper ONCE at import time ──────────────────────────────────────────
if _AUDIO_BACKEND == "local":
print(f"🔄 Loading Whisper '{MODEL_ID}' on {_DEVICE}…")
_ASR_PIPELINE = pipeline(
task = "automatic-speech-recognition",
model = MODEL_ID,
device = _DEVICE,
)
print(f"✅ Whisper '{MODEL_ID}' loaded.")
else:
_ASR_PIPELINE = None
print(f"ℹ️ Whisper skipped — using HF API backend.")
# ─────────────────────────────────────────────────────────────────────────────
# HALLUCINATION DETECTION
# ─────────────────────────────────────────────────────────────────────────────
def _is_valid_transcription(text: str) -> bool:
"""
Returns True only if the transcription looks like real speech.
Checks:
1. Script check -- must be mostly Latin / Devanagari / Telugu
2. Repetition check -- rejects looping hallucinations like
"apne apne apne apne..." where a word repeats 5+ times
"""
if not text or len(text.strip()) < 3:
return False
chars = [c for c in text if not c.isspace()]
if not chars:
return False
# Check 1: Script validation
valid_count = 0
for c in chars:
try:
char_name = unicodedata.name(c, "")
script = char_name.split()[0] if char_name else "UNKNOWN"
if script in _VALID_SCRIPTS:
valid_count += 1
except Exception:
pass
ratio = valid_count / len(chars)
if ratio < 0.60:
print(f"[audio_to_text] WARNING script hallucination "
f"(valid_ratio={ratio:.2f}) discarding: {text[:60]!r}")
return False
# Check 2: Repetition detection
# "apne apne apne apne apne apne..." = Whisper looping hallucination
words = text.strip().split()
if len(words) >= 6:
# Max consecutive repeated word
max_repeat = 1
cur_repeat = 1
for i in range(1, len(words)):
if words[i].lower() == words[i - 1].lower():
cur_repeat += 1
max_repeat = max(max_repeat, cur_repeat)
else:
cur_repeat = 1
if max_repeat >= 5:
print(f"[audio_to_text] WARNING repetition hallucination "
f"(word repeats {max_repeat}x) discarding: {text[:60]!r}")
return False
# Low vocabulary diversity = looping hallucination
# "I love you. I love you..." = 3 unique / 15 words = 0.20 unique ratio
    # Real speech almost always has more variety — threshold: <0.15 for texts over 15 words
unique_ratio = len(set(w.lower() for w in words)) / len(words)
if unique_ratio < 0.15 and len(words) > 15:
print(f"[audio_to_text] WARNING low-diversity hallucination "
f"(unique_ratio={unique_ratio:.2f}) discarding: {text[:60]!r}")
return False
# Check 3: Character-level repetition — catches "अग्वावावावाव..." patterns
# where substrings repeat at character level (not caught by word check)
if len(text) > 20:
        # Take a 4-char n-gram from position 8 and count how often it recurs
probe = text[8:12]
rep_count = text.count(probe)
if rep_count > len(text) // 8: # appears more than once per 8 chars = looping
print(f"[audio_to_text] WARNING char-level repetition "
f"(probe {probe!r} repeats {rep_count}x) discarding: {text[:60]!r}")
return False
return True
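# Illustrative expectations for the validator above (untested sketch):
#   _is_valid_transcription("There is a pothole on Main Road")   -> True
#   _is_valid_transcription("ვვვვ ვვვვ ვვვვ")                    -> False  (script check)
#   _is_valid_transcription("apne apne apne apne apne apne ho")  -> False  (5+ repeats)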
# ─────────────────────────────────────────────────────────────────────────────
# PUBLIC API
# ─────────────────────────────────────────────────────────────────────────────
def transcribe_audio(audio_file) -> str:
"""
Transcribe an uploaded audio file to text.
Parameters
----------
audio_file : werkzeug.datastructures.FileStorage
File from Flask request.files["audio"].
Accepts WAV, MP3, OGG, FLAC, M4A, WEBM.
Returns
-------
str
Transcribed text in EN / HI / TE.
Returns "" on failure or hallucination — never raises.
"""
if _AUDIO_BACKEND == "hf_api" and _HF_TOKEN:
return _transcribe_via_hf_api(audio_file)
return _transcribe_local(audio_file)
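# Hedged usage sketch (hypothetical Flask route; the endpoint path is
# illustrative, only request.files["audio"] comes from the docstring above):
#
#   from flask import Flask, request, jsonify
#   app = Flask(__name__)
#
#   @app.route("/api/transcribe", methods=["POST"])
#   def transcribe_endpoint():
#       text = transcribe_audio(request.files["audio"])  # "" on failure, never raises
#       return jsonify({"text": text})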
# ─────────────────────────────────────────────────────────────────────────────
# LOCAL PATH
# ─────────────────────────────────────────────────────────────────────────────
def _transcribe_local(audio_file) -> str:
try:
audio_bytes = audio_file.read()
if not audio_bytes:
print("[audio_to_text] ⚠️ Empty audio file.")
return ""
suffix = _get_suffix(audio_file)
# Write to temp file — pydub needs a file path on disk
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
tmp.write(audio_bytes)
tmp_path = tmp.name
try:
audio_array, sample_rate = _load_audio(tmp_path, suffix)
finally:
try:
os.unlink(tmp_path)
except OSError:
pass
if audio_array is None:
print("[audio_to_text] ❌ Could not decode audio — is ffmpeg installed?")
return ""
# ── Audio quality diagnostics ──────────────────────────────────────────
duration_sec = len(audio_array) / 16_000
rms = float(np.sqrt(np.mean(audio_array ** 2)))
peak = float(np.max(np.abs(audio_array)))
print(f"[audio_to_text] 🔍 duration={duration_sec:.1f}s | rms={rms:.4f} | peak={peak:.4f}")
# Reject silent or extremely quiet audio — Whisper hallucinates on silence
if rms < 0.001:
print("[audio_to_text] ❌ Audio is silent (rms<0.001) — nothing to transcribe")
return ""
if duration_sec < 0.5:
print(f"[audio_to_text] ❌ Audio too short ({duration_sec:.2f}s) — minimum 0.5s")
return ""
# ── Try EN → TE → HI — never pure auto-detect ─────────────────────────
# language=None causes Whisper to hallucinate Georgian/Chinese on bad audio.
# Forcing each language and validating the output is far more reliable.
#
# IMPORTANT: the pipeline mutates the input dict internally on the first
# call, so subsequent calls receive a broken dict. Fix: rebuild it fresh
# for every language attempt using a copy of the original numpy array.
audio_array_copy = audio_array.copy()
for lang in _LANGUAGE_ORDER:
try:
# Fresh dict every iteration — never reuse across pipeline calls
audio_input = {"raw": audio_array_copy.copy(), "sampling_rate": 16_000}
result = _ASR_PIPELINE(
audio_input,
generate_kwargs={
"language": lang,
"task": "transcribe",
# temperature and compression_ratio_threshold cause a
# 'logprobs' bug in some transformers versions — removed.
# Hallucination is handled by our own validator instead.
},
return_timestamps=False,
)
text = result.get("text", "").strip()
if not text:
print(f"[audio_to_text] ↩️ lang={lang} -> empty, trying next")
continue
            # Strict language whitelist — only EN / HI / TE accepted.
            # Whisper sometimes returns text in a completely different language
            # even when forced (e.g. forced TE comes back in Khmer). With
            # return_timestamps=False the pipeline exposes no reliable
            # detected-language field here, so the dominant-script check below
            # is what actually enforces the whitelist.
if _is_valid_transcription(text):
# Extra check: does the output script match the forced language?
# Whisper-small often outputs Hindi (Devanagari) when forced to TE.
# Reject if dominant script does not match expected script for lang.
expected_scripts = _LANG_EXPECTED_SCRIPT.get(lang, None)
if expected_scripts and lang != "en":
chars = [c for c in text if not c.isspace()]
script_counts = {}
for c in chars:
try:
sc = unicodedata.name(c, "").split()[0]
script_counts[sc] = script_counts.get(sc, 0) + 1
except Exception:
pass
dominant = max(script_counts, key=script_counts.get) if script_counts else "UNKNOWN"
if dominant not in expected_scripts and dominant not in ("COMMON", "LATIN"):
print(f"[audio_to_text] script mismatch: forced {lang} but got {dominant} — trying next")
continue
text = fix_script(text) # Kannada→Telugu if needed
print(f"[audio_to_text] OK lang={lang} | "
f"{len(text)} chars: {text[:100]}")
return text
else:
print(f"[audio_to_text] lang={lang} hallucinated — trying next")
continue
except Exception as e:
print(f"[audio_to_text] ❌ lang={lang} error: {e}")
continue
print("[audio_to_text] ❌ All language attempts failed — returning empty")
return ""
except Exception as e:
print(f"[audio_to_text] ❌ Transcription failed: {e}")
return ""
def _load_audio(file_path: str, suffix: str):
"""
Load audio file as float32 numpy array at 16 kHz mono.
Strategy:
1. pydub — handles MP3, OGG, WEBM, M4A, WAV, FLAC (needs ffmpeg)
2. soundfile fallback — WAV and FLAC only (no ffmpeg needed)
Returns (audio_array, 16000) or (None, None) on failure.
"""
# ── pydub (primary) ────────────────────────────────────────────────────────
try:
from pydub import AudioSegment
fmt = suffix.lstrip(".").lower()
fmt_map = {"m4a": "mp4", "webm": "webm", "ogg": "ogg"}
fmt = fmt_map.get(fmt, fmt)
audio_seg = AudioSegment.from_file(file_path, format=fmt)
audio_seg = audio_seg.set_channels(1).set_frame_rate(16_000)
samples = np.array(audio_seg.get_array_of_samples(), dtype=np.float32)
# Normalize based on actual sample width — pydub can return int16 OR int32
# depending on source format. Always normalize to float32 [-1.0, 1.0]
sample_width = audio_seg.sample_width # bytes per sample: 1=8bit, 2=16bit, 4=32bit
max_val = float(2 ** (8 * sample_width - 1))
samples = samples / max_val
# Safety clamp — should already be in range but guard against edge cases
samples = np.clip(samples, -1.0, 1.0)
print(f"[audio_to_text] pydub decoded: sample_width={sample_width}B "
f"max_val={max_val:.0f} post_rms={float(np.sqrt(np.mean(samples**2))):.4f}")
return samples, 16_000
except ImportError:
print("[audio_to_text] pydub not installed — falling back to soundfile")
print(" pip install pydub + install ffmpeg")
except Exception as e:
print(f"[audio_to_text] pydub failed ({e}) — trying soundfile")
# ── soundfile (fallback — WAV/FLAC only) ───────────────────────────────────
try:
import soundfile as sf
audio_array, sample_rate = sf.read(file_path, dtype="float32")
if audio_array.ndim > 1:
audio_array = audio_array.mean(axis=1)
if sample_rate != 16_000:
audio_array = _resample(audio_array, sample_rate, 16_000)
return audio_array, 16_000
except Exception as e:
print(f"[audio_to_text] soundfile failed: {e}")
return None, None
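# Worked normalization example for the common 16-bit case handled above:
#   sample_width = 2  ->  max_val = 2**15 = 32768.0
#   int16  32767  ->   32767 / 32768 =  0.99997  (approx.)
#   int16 -32768  ->  -32768 / 32768 = -1.0      (clamp is a no-op here)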
# ─────────────────────────────────────────────────────────────────────────────
# HF API PATH — production / HF Spaces
# ─────────────────────────────────────────────────────────────────────────────
def _transcribe_via_hf_api(audio_file) -> str:
"""
Production path — HuggingFace Inference API (whisper-large-v3 on HF GPU).
Set AUDIO_BACKEND=hf_api and HF_TOKEN=hf_xxx in HF Space Secrets.
Why large-v3 via API instead of loading locally:
- large-v3 = 3GB — too large to load on free HF Spaces
- HF API runs it on GPU — faster than local CPU anyway (~15-30s vs 3min)
- Free tier: 1000 requests/day — enough for a civic portal
large-v3 auto-detect is accurate enough for EN/TE/HI — no need for
the 3-attempt language loop used in local path.
"""
import requests
try:
audio_bytes = audio_file.read()
if not audio_bytes:
return ""
print(f"[audio_to_text] HF API: sending {len(audio_bytes)} bytes to whisper-large-v3...")
# Detect audio format from magic bytes for correct Content-Type
if audio_bytes[:4] == b'OggS':
content_type = "audio/ogg"
elif audio_bytes[:4] == b'RIFF':
content_type = "audio/wav"
elif audio_bytes[:3] == b'ID3' or audio_bytes[:2] == b'\xff\xfb':
content_type = "audio/mpeg"
elif audio_bytes[:4] == b'fLaC':
content_type = "audio/flac"
else:
content_type = "audio/webm"
print(f"[audio_to_text] detected content_type={content_type}")
        def _post():
            # Single POST to the HF router — reused for the cold-start retry below.
            return requests.post(
                "https://router.huggingface.co/hf-inference/models/openai/whisper-large-v3",
                headers={
                    "Authorization": f"Bearer {_HF_TOKEN}",
                    "Content-Type": content_type,
                },
                data=audio_bytes,
                timeout=120,  # HF free tier can queue up to 60s before processing
            )

        # First attempt: auto-detect language (large-v3 is accurate enough)
        res = _post()
        # Handle model loading (HF cold start)
        if res.status_code == 503:
            import time
            print("[audio_to_text] HF API: model loading — waiting 20s...")
            time.sleep(20)
            res = _post()
if res.ok:
data = res.json()
# HF API returns {"text": "..."} or [{"generated_text": "..."}]
if isinstance(data, dict):
text = data.get("text", "").strip()
elif isinstance(data, list) and data:
text = data[0].get("generated_text", "").strip()
else:
text = ""
if _is_valid_transcription(text):
text = fix_script(text) # Kannada→Telugu if Whisper used wrong script
print(f"[audio_to_text] HF API OK: {len(text)} chars: {text[:100]}")
return text
else:
print(f"[audio_to_text] HF API hallucination discarded: {text[:60]!r}")
return ""
else:
print(f"[audio_to_text] HF API error {res.status_code}: {res.text[:300]}")
return ""
except requests.exceptions.Timeout:
print("[audio_to_text] HF API timeout — model may be overloaded")
return ""
except Exception as e:
print(f"[audio_to_text] HF API exception: {e}")
return ""
# ─────────────────────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────────────────────
def _get_suffix(audio_file) -> str:
"""Determine file extension from FileStorage. Defaults to .webm."""
filename = getattr(audio_file, "filename", "") or ""
mime = getattr(audio_file, "mimetype", "") or ""
_MIME_TO_EXT = {
"audio/wav": ".wav", "audio/x-wav": ".wav", "audio/wave": ".wav",
"audio/mpeg": ".mp3", "audio/mp3": ".mp3",
"audio/ogg": ".ogg",
"audio/flac": ".flac", "audio/x-flac": ".flac",
"audio/mp4": ".m4a", "audio/x-m4a": ".m4a",
"audio/webm": ".webm", "video/webm": ".webm",
}
if "." in filename:
return "." + filename.rsplit(".", 1)[-1].lower()
# Default to .webm — Chrome/Edge MediaRecorder always sends webm
return _MIME_TO_EXT.get(mime.lower(), ".webm")
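# Illustrative expectations (sketch):
#   filename="report.MP3", mimetype="audio/mpeg"  ->  ".mp3"   (extension wins)
#   filename="blob",       mimetype="audio/ogg"   ->  ".ogg"   (MIME fallback)
#   filename="",           mimetype=""            ->  ".webm"  (default)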
def _resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
"""Resample audio array from orig_sr to target_sr."""
try:
from scipy.signal import resample_poly
from math import gcd
g = gcd(orig_sr, target_sr)
return resample_poly(audio, target_sr // g, orig_sr // g).astype(np.float32)
except ImportError:
target_length = int(len(audio) * target_sr / orig_sr)
return np.interp(
np.linspace(0, len(audio) - 1, target_length),
np.arange(len(audio)),
audio,
).astype(np.float32)
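# Worked example of the polyphase ratio above: 44.1 kHz -> 16 kHz gives
# g = gcd(44100, 16000) = 100, i.e. resample_poly(audio, 160, 441).

if __name__ == "__main__":
    # Minimal local smoke test (sketch; the sample path and shim class are
    # illustrative, not part of the production API). Wraps a file on disk in
    # a FileStorage-like object so transcribe_audio can run without Flask.
    import sys

    class _FakeUpload:
        """Duck-typed stand-in for werkzeug FileStorage (read/filename/mimetype)."""
        def __init__(self, path: str):
            self.filename = path
            self.mimetype = ""
            with open(path, "rb") as fh:
                self._data = fh.read()

        def read(self) -> bytes:
            return self._data

    path = sys.argv[1] if len(sys.argv) > 1 else "sample.wav"
    print(transcribe_audio(_FakeUpload(path)))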