Spaces:
Sleeping
Sleeping
| # ========================================================= | |
| # multi_modal/audio_to_text.py | |
| # | |
| # Converts an uploaded audio file to text using Whisper. | |
| # | |
| # Supports: WAV, MP3, OGG, FLAC, M4A, WEBM (mobile browsers) | |
| # Languages: Telugu / Hindi / English (forced, no random scripts) | |
| # | |
| # FIXES vs previous version: | |
| # 1. Hallucination detection — Georgian/Chinese/Arabic output | |
| # (ვვვვ... etc.) is detected and discarded, returns "" | |
| # 2. Language forcing — tries TE → HI → EN in order instead | |
| # of pure auto-detect which picks random scripts | |
| # 3. Valid script check — only accepts Latin, Telugu, | |
| # Devanagari output. Anything else = hallucination. | |
| # 4. 500 error fix — empty/invalid transcription now safely | |
| # returns "" instead of passing garbage to BERT classifier | |
| # ========================================================= | |
| import os | |
| import tempfile | |
| import unicodedata | |
| import torch | |
| import numpy as np | |
| from transformers import pipeline | |
# ── Environment ──────────────────────────────────────────────────────────────
# AUDIO_BACKEND selects the transcription path:
#   "local"  — run Whisper in-process (default)
#   "hf_api" — call the HuggingFace Inference API (also requires HF_TOKEN)
_AUDIO_BACKEND = os.environ.get("AUDIO_BACKEND", "local")  # "local" | "hf_api"
_HF_TOKEN = os.environ.get("HF_TOKEN", "")
# ── Model selection ──────────────────────────────────────────────────────────
# WHISPER_MODEL can override the checkpoint (e.g. "openai/whisper-medium").
MODEL_ID = os.environ.get("WHISPER_MODEL", "openai/whisper-small")
_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# ── Valid Unicode scripts for EN / HI / TE ───────────────────────────────────
# Whisper hallucinates Georgian (ვ), Chinese (的), Arabic (ال) on bad audio.
# A character is accepted when the FIRST WORD of its unicodedata.name()
# matches one of these prefixes.
# NOTE(review): "COMMON" is never a unicodedata.name() first word — digits are
# named "DIGIT ...", punctuation "FULL STOP" etc. — so that entry appears to
# be inert; confirm intent before relying on it for punctuation/digits.
_VALID_SCRIPTS = {
    "LATIN",       # English
    "DEVANAGARI",  # Hindi
    "TELUGU",      # Telugu
    "KANNADA",     # Whisper sometimes outputs Kannada for Telugu audio
    "COMMON",      # punctuation, digits, spaces
}
# Languages tried in order by the local path.
# EN first — fastest for English audio (most common).
# Only these 3 are permitted — no other language accepted.
_LANGUAGE_ORDER = ["en", "te", "hi"]
_ALLOWED_LANGUAGES = {"en", "te", "hi"}
# Expected dominant script per forced language.
# If we force "te" but get back Devanagari-heavy text, it is wrong.
# If we force "hi" but get back Telugu-heavy text, it is wrong.
# This prevents Telugu audio from being accepted as Hindi.
_LANG_EXPECTED_SCRIPT = {
    "en": {"LATIN"},
    "te": {"TELUGU", "KANNADA"},  # Whisper may use Kannada script for Telugu
    "hi": {"DEVANAGARI"},
}
# ─────────────────────────────────────────────────────────────────────────────
# KANNADA → TELUGU SCRIPT FIX
# Whisper sometimes outputs Telugu audio in Kannada script (very similar glyphs).
# We convert Kannada codepoints → Telugu so stored text is always Telugu script.
# ─────────────────────────────────────────────────────────────────────────────
_KANNADA_TO_TELUGU = {
    "ಅ":"అ","ಆ":"ఆ","ಇ":"ఇ","ಈ":"ఈ","ಉ":"ఉ","ಊ":"ఊ","ಋ":"ఋ",
    "ಎ":"ఎ","ಏ":"ఏ","ಐ":"ఐ","ಒ":"ఒ","ಓ":"ఓ","ಔ":"ఔ",
    "ಾ":"ా","ಿ":"ి","ೀ":"ీ","ು":"ు","ೂ":"ూ","ೃ":"ృ",
    "ೆ":"ె","ೇ":"ే","ೈ":"ై","ೊ":"ొ","ೋ":"ో","ೌ":"ౌ",
    "ಂ":"ం","ಃ":"ః","಼":"఼",
    "ಕ":"క","ಖ":"ఖ","ಗ":"గ","ಘ":"ఘ","ಙ":"ఙ",
    "ಚ":"చ","ಛ":"ఛ","ಜ":"జ","ಝ":"ఝ","ಞ":"ఞ",
    "ಟ":"ట","ಠ":"ఠ","ಡ":"డ","ಢ":"ఢ","ಣ":"ణ",
    "ತ":"త","ಥ":"థ","ದ":"ద","ಧ":"ధ","ನ":"న",
    "ಪ":"ప","ಫ":"ఫ","ಬ":"బ","ಭ":"భ","ಮ":"మ",
    "ಯ":"య","ರ":"ర","ಲ":"ల","ವ":"వ","ಶ":"శ",
    "ಷ":"ష","ಸ":"స","ಹ":"హ","ಳ":"ళ",
    "್":"్",
    "೦":"౦","೧":"౧","೨":"౨","೩":"౩","೪":"౪",
    "೫":"౫","೬":"౬","೭":"౭","೮":"౮","೯":"౯",
}
def fix_script(text: str) -> str:
    """Convert Kannada script → Telugu if Whisper used the wrong script.

    Parameters
    ----------
    text : str
        Transcription text, possibly containing Kannada codepoints.

    Returns
    -------
    str
        The input unchanged when no Kannada is present; otherwise the same
        text with every Kannada codepoint remapped to its Telugu equivalent
        (codepoints outside the table pass through untouched).
    """
    # FIX: dropped the redundant function-local `import unicodedata` — the
    # module is already imported at file top and the local import shadowed it.
    # Fast check: any non-whitespace char named "KANNADA ..." triggers a remap.
    if any(unicodedata.name(ch, "").startswith("KANNADA") for ch in text if ch.strip()):
        converted = "".join(_KANNADA_TO_TELUGU.get(ch, ch) for ch in text)
        print(f"[audio_to_text] Kannada→Telugu fix: {text[:40]!r} → {converted[:40]!r}")
        return converted
    return text
# ── Load Whisper ONCE at import time ─────────────────────────────────────────
# Loading at module import (not per request) keeps request latency low; the
# pipeline object is shared by every _transcribe_local() call.
if _AUDIO_BACKEND == "local":
    print(f"🔄 Loading Whisper '{MODEL_ID}' on {_DEVICE}…")
    _ASR_PIPELINE = pipeline(
        task   = "automatic-speech-recognition",
        model  = MODEL_ID,
        device = _DEVICE,
    )
    print(f"✅ Whisper '{MODEL_ID}' loaded.")
else:
    # HF API backend: no local model is loaded; _transcribe_via_hf_api() is
    # used instead. _transcribe_local() must not be reached in this mode.
    _ASR_PIPELINE = None
    print(f"ℹ️ Whisper skipped — using HF API backend.")
| # ───────────────────────────────────────────────────────────────────────────── | |
| # HALLUCINATION DETECTION | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| def _is_valid_transcription(text: str) -> bool: | |
| """ | |
| Returns True only if the transcription looks like real speech. | |
| Checks: | |
| 1. Script check -- must be mostly Latin / Devanagari / Telugu | |
| 2. Repetition check -- rejects looping hallucinations like | |
| "apne apne apne apne..." where a word repeats 5+ times | |
| """ | |
| if not text or len(text.strip()) < 3: | |
| return False | |
| chars = [c for c in text if not c.isspace()] | |
| if not chars: | |
| return False | |
| # Check 1: Script validation | |
| valid_count = 0 | |
| for c in chars: | |
| try: | |
| char_name = unicodedata.name(c, "") | |
| script = char_name.split()[0] if char_name else "UNKNOWN" | |
| if script in _VALID_SCRIPTS: | |
| valid_count += 1 | |
| except Exception: | |
| pass | |
| ratio = valid_count / len(chars) | |
| if ratio < 0.60: | |
| print(f"[audio_to_text] WARNING script hallucination " | |
| f"(valid_ratio={ratio:.2f}) discarding: {text[:60]!r}") | |
| return False | |
| # Check 2: Repetition detection | |
| # "apne apne apne apne apne apne..." = Whisper looping hallucination | |
| words = text.strip().split() | |
| if len(words) >= 6: | |
| # Max consecutive repeated word | |
| max_repeat = 1 | |
| cur_repeat = 1 | |
| for i in range(1, len(words)): | |
| if words[i].lower() == words[i - 1].lower(): | |
| cur_repeat += 1 | |
| max_repeat = max(max_repeat, cur_repeat) | |
| else: | |
| cur_repeat = 1 | |
| if max_repeat >= 5: | |
| print(f"[audio_to_text] WARNING repetition hallucination " | |
| f"(word repeats {max_repeat}x) discarding: {text[:60]!r}") | |
| return False | |
| # Low vocabulary diversity = looping hallucination | |
| # "I love you. I love you..." = 3 unique / 15 words = 0.20 unique ratio | |
| # Real speech always has more variety — threshold: <0.15 for longer texts | |
| unique_ratio = len(set(w.lower() for w in words)) / len(words) | |
| if unique_ratio < 0.15 and len(words) > 15: | |
| print(f"[audio_to_text] WARNING low-diversity hallucination " | |
| f"(unique_ratio={unique_ratio:.2f}) discarding: {text[:60]!r}") | |
| return False | |
| # Check 3: Character-level repetition — catches "अग्वावावावाव..." patterns | |
| # where substrings repeat at character level (not caught by word check) | |
| if len(text) > 20: | |
| # Take a 4-char ngram from position 10 and count how many times it appears | |
| probe = text[8:12] | |
| rep_count = text.count(probe) | |
| if rep_count > len(text) // 8: # appears more than once per 8 chars = looping | |
| print(f"[audio_to_text] WARNING char-level repetition " | |
| f"(probe {probe!r} repeats {rep_count}x) discarding: {text[:60]!r}") | |
| return False | |
| return True | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # PUBLIC API | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def transcribe_audio(audio_file) -> str:
    """
    Transcribe an uploaded audio file to text.

    Parameters
    ----------
    audio_file : werkzeug.datastructures.FileStorage
        File from Flask request.files["audio"].
        Accepts WAV, MP3, OGG, FLAC, M4A, WEBM.

    Returns
    -------
    str
        Transcribed text in EN / HI / TE.
        Returns "" on failure or hallucination — never raises.
    """
    # The HF API path is taken only when both the backend flag and a token
    # are present; everything else falls back to the local Whisper pipeline.
    use_api = _AUDIO_BACKEND == "hf_api" and bool(_HF_TOKEN)
    handler = _transcribe_via_hf_api if use_api else _transcribe_local
    return handler(audio_file)
| # ───────────────────────────────────────────────────────────────────────────── | |
| # LOCAL PATH | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def _transcribe_local(audio_file) -> str:
    """
    Local Whisper path: decode the upload, run audio-quality gates, then try
    each allowed language in order until one yields a valid transcription.

    Parameters
    ----------
    audio_file : werkzeug.datastructures.FileStorage
        Uploaded audio file (WAV, MP3, OGG, FLAC, M4A, WEBM).

    Returns
    -------
    str
        Transcribed text, or "" on any failure — never raises.
    """
    try:
        # FIX: guard against _ASR_PIPELINE being None — this happens when
        # AUDIO_BACKEND=hf_api but HF_TOKEN is unset, so transcribe_audio()
        # falls through to the local path without a loaded model.
        if _ASR_PIPELINE is None:
            print("[audio_to_text] ❌ No local Whisper pipeline loaded — returning empty")
            return ""
        audio_bytes = audio_file.read()
        if not audio_bytes:
            print("[audio_to_text] ⚠️ Empty audio file.")
            return ""
        suffix = _get_suffix(audio_file)
        # Write to temp file — pydub needs a file path on disk
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp.write(audio_bytes)
            tmp_path = tmp.name
        try:
            audio_array, _ = _load_audio(tmp_path, suffix)
        finally:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
        if audio_array is None:
            print("[audio_to_text] ❌ Could not decode audio — is ffmpeg installed?")
            return ""
        # ── Audio quality diagnostics ────────────────────────────────────────
        duration_sec = len(audio_array) / 16_000
        rms = float(np.sqrt(np.mean(audio_array ** 2)))
        peak = float(np.max(np.abs(audio_array)))
        print(f"[audio_to_text] 🔍 duration={duration_sec:.1f}s | rms={rms:.4f} | peak={peak:.4f}")
        # Reject silent or extremely quiet audio — Whisper hallucinates on silence
        if rms < 0.001:
            print("[audio_to_text] ❌ Audio is silent (rms<0.001) — nothing to transcribe")
            return ""
        if duration_sec < 0.5:
            print(f"[audio_to_text] ❌ Audio too short ({duration_sec:.2f}s) — minimum 0.5s")
            return ""
        # ── Try EN → TE → HI — never pure auto-detect ───────────────────────
        # language=None causes Whisper to hallucinate Georgian/Chinese on bad
        # audio. Forcing each language and validating the output is far more
        # reliable.
        #
        # IMPORTANT: the pipeline mutates the input dict internally, so a
        # fresh dict with a fresh copy of the samples is built for every
        # language attempt. (FIX: the array was previously copied twice per
        # attempt — one copy per iteration suffices. Also removed the dead
        # `detected_lang` local, which was computed and never used.)
        for lang in _LANGUAGE_ORDER:
            try:
                audio_input = {"raw": audio_array.copy(), "sampling_rate": 16_000}
                result = _ASR_PIPELINE(
                    audio_input,
                    generate_kwargs={
                        "language": lang,
                        "task": "transcribe",
                        # temperature and compression_ratio_threshold cause a
                        # 'logprobs' bug in some transformers versions — removed.
                        # Hallucination is handled by our own validator instead.
                    },
                    return_timestamps=False,
                )
                text = result.get("text", "").strip()
                if not text:
                    print(f"[audio_to_text] ↩️ lang={lang} -> empty, trying next")
                    continue
                if not _is_valid_transcription(text):
                    print(f"[audio_to_text] lang={lang} hallucinated — trying next")
                    continue
                # Extra check: does the output script match the forced language?
                # Whisper-small often outputs Hindi (Devanagari) when forced to TE.
                # Reject if dominant script does not match expected script for lang.
                expected_scripts = _LANG_EXPECTED_SCRIPT.get(lang)
                if expected_scripts and lang != "en":
                    script_counts = {}
                    for c in text:
                        if c.isspace():
                            continue
                        name = unicodedata.name(c, "")
                        if not name:
                            continue  # unnamed codepoints don't vote
                        sc = name.split()[0]
                        script_counts[sc] = script_counts.get(sc, 0) + 1
                    dominant = max(script_counts, key=script_counts.get) if script_counts else "UNKNOWN"
                    if dominant not in expected_scripts and dominant not in ("COMMON", "LATIN"):
                        print(f"[audio_to_text] script mismatch: forced {lang} but got {dominant} — trying next")
                        continue
                text = fix_script(text)  # Kannada→Telugu if needed
                print(f"[audio_to_text] OK lang={lang} | "
                      f"{len(text)} chars: {text[:100]}")
                return text
            except Exception as e:
                print(f"[audio_to_text] ❌ lang={lang} error: {e}")
                continue
        print("[audio_to_text] ❌ All language attempts failed — returning empty")
        return ""
    except Exception as e:
        print(f"[audio_to_text] ❌ Transcription failed: {e}")
        return ""
def _load_audio(file_path: str, suffix: str):
    """
    Load audio file as float32 numpy array at 16 kHz mono.

    Strategy:
      1. pydub — handles MP3, OGG, WEBM, M4A, WAV, FLAC (needs ffmpeg)
      2. soundfile fallback — WAV and FLAC only (no ffmpeg needed)

    Parameters
    ----------
    file_path : str
        Path of the temp file written by the caller.
    suffix : str
        File extension including the dot (e.g. ".webm"); selects the decode
        format passed to pydub.

    Returns (audio_array, 16000) or (None, None) on failure.
    """
    # ── pydub (primary) ──────────────────────────────────────────────────────
    try:
        from pydub import AudioSegment
        fmt = suffix.lstrip(".").lower()
        # ffmpeg knows ".m4a" content by its container name "mp4".
        fmt_map = {"m4a": "mp4", "webm": "webm", "ogg": "ogg"}
        fmt = fmt_map.get(fmt, fmt)
        audio_seg = AudioSegment.from_file(file_path, format=fmt)
        # Downmix to mono and resample to Whisper's expected 16 kHz.
        audio_seg = audio_seg.set_channels(1).set_frame_rate(16_000)
        samples = np.array(audio_seg.get_array_of_samples(), dtype=np.float32)
        # Normalize based on actual sample width — pydub can return int16 OR int32
        # depending on source format. Always normalize to float32 [-1.0, 1.0]
        sample_width = audio_seg.sample_width  # bytes per sample: 1=8bit, 2=16bit, 4=32bit
        max_val = float(2 ** (8 * sample_width - 1))
        samples = samples / max_val
        # Safety clamp — should already be in range but guard against edge cases
        samples = np.clip(samples, -1.0, 1.0)
        print(f"[audio_to_text] pydub decoded: sample_width={sample_width}B "
              f"max_val={max_val:.0f} post_rms={float(np.sqrt(np.mean(samples**2))):.4f}")
        return samples, 16_000
    except ImportError:
        print("[audio_to_text] pydub not installed — falling back to soundfile")
        print(" pip install pydub + install ffmpeg")
    except Exception as e:
        # Decode failure (e.g. ffmpeg missing/unsupported codec) — fall through.
        print(f"[audio_to_text] pydub failed ({e}) — trying soundfile")
    # ── soundfile (fallback — WAV/FLAC only) ─────────────────────────────────
    try:
        import soundfile as sf
        audio_array, sample_rate = sf.read(file_path, dtype="float32")
        if audio_array.ndim > 1:
            # Multi-channel: average channels down to mono.
            audio_array = audio_array.mean(axis=1)
        if sample_rate != 16_000:
            audio_array = _resample(audio_array, sample_rate, 16_000)
        return audio_array, 16_000
    except Exception as e:
        print(f"[audio_to_text] soundfile failed: {e}")
        return None, None
| # ───────────────────────────────────────────────────────────────────────────── | |
| # HF API PATH — production / HF Spaces | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def _transcribe_via_hf_api(audio_file) -> str:
    """
    Production path — HuggingFace Inference API (whisper-large-v3 on HF GPU).
    Set AUDIO_BACKEND=hf_api and HF_TOKEN=hf_xxx in HF Space Secrets.

    Why large-v3 via API instead of loading locally:
      - large-v3 = 3GB — too large to load on free HF Spaces
      - HF API runs it on GPU — faster than local CPU anyway (~15-30s vs 3min)
      - Free tier: 1000 requests/day — enough for a civic portal

    large-v3 auto-detect is accurate enough for EN/TE/HI — no need for
    the 3-attempt language loop used in local path.

    Returns "" on any failure — never raises.
    """
    import requests
    try:
        audio_bytes = audio_file.read()
        if not audio_bytes:
            return ""
        print(f"[audio_to_text] HF API: sending {len(audio_bytes)} bytes to whisper-large-v3...")
        # Detect audio format from magic bytes for correct Content-Type
        if audio_bytes[:4] == b'OggS':
            content_type = "audio/ogg"
        elif audio_bytes[:4] == b'RIFF':
            content_type = "audio/wav"
        elif audio_bytes[:3] == b'ID3' or audio_bytes[:2] == b'\xff\xfb':
            content_type = "audio/mpeg"
        elif audio_bytes[:4] == b'fLaC':
            content_type = "audio/flac"
        else:
            content_type = "audio/webm"  # Chrome/Edge MediaRecorder default
        print(f"[audio_to_text] detected content_type={content_type}")

        # FIX: the POST was duplicated verbatim for the cold-start retry —
        # deduplicated into one helper so URL/headers/timeout stay in sync.
        def _post():
            return requests.post(
                "https://router.huggingface.co/hf-inference/models/openai/whisper-large-v3",
                headers={
                    "Authorization": f"Bearer {_HF_TOKEN}",
                    "Content-Type": content_type,
                },
                data=audio_bytes,
                timeout=120,  # HF free tier can queue up to 60s before processing
            )

        res = _post()
        # Handle model loading (HF cold start): wait once, then retry.
        if res.status_code == 503:
            import time
            print("[audio_to_text] HF API: model loading — waiting 20s...")
            time.sleep(20)
            res = _post()
        if not res.ok:
            print(f"[audio_to_text] HF API error {res.status_code}: {res.text[:300]}")
            return ""
        data = res.json()
        # HF API returns {"text": "..."} or [{"generated_text": "..."}]
        if isinstance(data, dict):
            text = data.get("text", "").strip()
        elif isinstance(data, list) and data:
            text = data[0].get("generated_text", "").strip()
        else:
            text = ""
        if not _is_valid_transcription(text):
            print(f"[audio_to_text] HF API hallucination discarded: {text[:60]!r}")
            return ""
        text = fix_script(text)  # Kannada→Telugu if Whisper used wrong script
        print(f"[audio_to_text] HF API OK: {len(text)} chars: {text[:100]}")
        return text
    except requests.exceptions.Timeout:
        print("[audio_to_text] HF API timeout — model may be overloaded")
        return ""
    except Exception as e:
        print(f"[audio_to_text] HF API exception: {e}")
        return ""
| # ───────────────────────────────────────────────────────────────────────────── | |
| # HELPERS | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| def _get_suffix(audio_file) -> str: | |
| """Determine file extension from FileStorage. Defaults to .webm.""" | |
| filename = getattr(audio_file, "filename", "") or "" | |
| mime = getattr(audio_file, "mimetype", "") or "" | |
| _MIME_TO_EXT = { | |
| "audio/wav": ".wav", "audio/x-wav": ".wav", "audio/wave": ".wav", | |
| "audio/mpeg": ".mp3", "audio/mp3": ".mp3", | |
| "audio/ogg": ".ogg", | |
| "audio/flac": ".flac", "audio/x-flac": ".flac", | |
| "audio/mp4": ".m4a", "audio/x-m4a": ".m4a", | |
| "audio/webm": ".webm", "video/webm": ".webm", | |
| } | |
| if "." in filename: | |
| return "." + filename.rsplit(".", 1)[-1].lower() | |
| # Default to .webm — Chrome/Edge MediaRecorder always sends webm | |
| return _MIME_TO_EXT.get(mime.lower(), ".webm") | |
| def _resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray: | |
| """Resample audio array from orig_sr to target_sr.""" | |
| try: | |
| from scipy.signal import resample_poly | |
| from math import gcd | |
| g = gcd(orig_sr, target_sr) | |
| return resample_poly(audio, target_sr // g, orig_sr // g).astype(np.float32) | |
| except ImportError: | |
| target_length = int(len(audio) * target_sr / orig_sr) | |
| return np.interp( | |
| np.linspace(0, len(audio) - 1, target_length), | |
| np.arange(len(audio)), | |
| audio, | |
| ).astype(np.float32) |