# =========================================================
# multi_modal/audio_to_text.py
#
# Converts an uploaded audio file to text using Whisper.
#
# Supports: WAV, MP3, OGG, FLAC, M4A, WEBM (mobile browsers)
# Languages: Telugu / Hindi / English (forced, no random scripts)
#
# FIXES vs previous version:
# 1. Hallucination detection — Georgian/Chinese/Arabic output
# (ვვვვ... etc.) is detected and discarded, returns ""
#  2. Language forcing — tries EN → TE → HI in order instead
#     of pure auto-detect, which picks random scripts
# 3. Valid script check — only accepts Latin, Telugu,
# Devanagari output. Anything else = hallucination.
# 4. 500 error fix — empty/invalid transcription now safely
# returns "" instead of passing garbage to BERT classifier
# =========================================================
import os
import tempfile
import unicodedata
import torch
import numpy as np
from transformers import pipeline
# ── Environment ────────────────────────────────────────────────────────────────
_AUDIO_BACKEND = os.environ.get("AUDIO_BACKEND", "local") # "local" | "hf_api"
_HF_TOKEN = os.environ.get("HF_TOKEN", "")
# ── Model selection ────────────────────────────────────────────────────────────
MODEL_ID = os.environ.get("WHISPER_MODEL", "openai/whisper-small")
_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
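# e.g. to swap checkpoints at deploy time without code changes (sketch;
# "openai/whisper-medium" is just an illustrative checkpoint id):
#   export WHISPER_MODEL=openai/whisper-medium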
# ── Valid Unicode scripts for EN / HI / TE ────────────────────────────────────
# Whisper hallucinates Georgian (ვ), Chinese (的), Arabic (ال) on bad audio.
# Only these script prefixes (from unicodedata.name) are accepted as real output.
_VALID_SCRIPTS = {
"LATIN", # English
"DEVANAGARI", # Hindi
"TELUGU", # Telugu
"KANNADA", # Whisper sometimes outputs Kannada for Telugu audio
"COMMON", # punctuation, digits, spaces
}
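# Illustrative REPL check of the prefix rule used below (the first word of
# unicodedata.name() is treated as the script):
#   unicodedata.name("అ")  -> "TELUGU LETTER A"            -> "TELUGU"     (accepted)
#   unicodedata.name("अ")  -> "DEVANAGARI LETTER A"        -> "DEVANAGARI" (accepted)
#   unicodedata.name("ვ")  -> "GEORGIAN LETTER VIN"        -> "GEORGIAN"   (rejected)
#   unicodedata.name("的") -> "CJK UNIFIED IDEOGRAPH-7684" -> "CJK"        (rejected)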
# Languages tried in order.
# EN first — fastest for English audio (most common).
# Only these 3 are permitted — no other language accepted.
_LANGUAGE_ORDER = ["en", "te", "hi"]
_ALLOWED_LANGUAGES = {"en", "te", "hi"}
# Expected dominant script per forced language.
# If we force "te" but get back Devanagari-heavy text, it is wrong.
# If we force "hi" but get back Telugu-heavy text, it is wrong.
# This prevents Telugu audio from being accepted as Hindi.
_LANG_EXPECTED_SCRIPT = {
"en": {"LATIN"},
"te": {"TELUGU", "KANNADA"}, # Whisper may use Kannada script for Telugu
"hi": {"DEVANAGARI"},
}
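# e.g. forcing "te" but getting back Devanagari text such as "सड़क पर गड्ढा है"
# means dominant script DEVANAGARI is not in {"TELUGU", "KANNADA"}, so the
# attempt is rejected and the next language is tried.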
# ─────────────────────────────────────────────────────────────────────────────
# KANNADA → TELUGU SCRIPT FIX
# Whisper sometimes outputs Telugu audio in Kannada script (very similar glyphs).
# We convert Kannada codepoints → Telugu so stored text is always Telugu script.
# ─────────────────────────────────────────────────────────────────────────────
_KANNADA_TO_TELUGU = {
"ಅ":"అ","ಆ":"ఆ","ಇ":"ఇ","ಈ":"ఈ","ಉ":"ఉ","ಊ":"ఊ","ಋ":"ఋ",
"ಎ":"ఎ","ಏ":"ఏ","ಐ":"ఐ","ಒ":"ఒ","ಓ":"ఓ","ಔ":"ఔ",
"ಾ":"ా","ಿ":"ి","ీ":"ీ","ು":"ు","ూ":"ూ","ೃ":"ృ",
"ೆ":"ె","ೇ":"ే","ೈ":"ై","ೊ":"ొ","ೋ":"ో","ೌ":"ౌ",
"ಂ":"ం","ಃ":"ః","಼":"఼",
"ಕ":"క","ಖ":"ఖ","ಗ":"గ","ಘ":"ఘ","ಙ":"ఙ",
"ಚ":"చ","ಛ":"ఛ","ಜ":"జ","ಝ":"ఝ","ಞ":"ఞ",
"ಟ":"ట","ಠ":"ఠ","ಡ":"డ","ಢ":"ఢ","ಣ":"ణ",
"ತ":"త","ಥ":"థ","ದ":"ద","ಧ":"ధ","ನ":"న",
"ಪ":"ప","ಫ":"ఫ","ಬ":"బ","ಭ":"భ","ಮ":"మ",
"ಯ":"య","ರ":"ర","ಲ":"ల","ವ":"వ","ಶ":"శ",
"ಷ":"ష","ಸ":"స","ಹ":"హ","ಳ":"ళ",
"್":"్",
"೦":"౦","೧":"౧","೨":"౨","೩":"౩","೪":"౪",
"೫":"౫","೬":"౬","೭":"౭","೮":"౮","೯":"౯",
}
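# e.g. (sketch, mapped character-by-character through the table above):
#   "ನಮಸ್ಕಾರ" (Kannada script)  ->  "నమస్కార" (Telugu script)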
def fix_script(text: str) -> str:
    """Convert Kannada script → Telugu if Whisper used the wrong script for Telugu audio."""
    if any(unicodedata.name(ch, "").startswith("KANNADA") for ch in text if ch.strip()):
        converted = "".join(_KANNADA_TO_TELUGU.get(ch, ch) for ch in text)
        print(f"[audio_to_text] Kannada→Telugu fix: {text[:40]!r} → {converted[:40]!r}")
        return converted
    return text
# ── Load Whisper ONCE at import time ──────────────────────────────────────────
if _AUDIO_BACKEND == "local":
print(f"🔄 Loading Whisper '{MODEL_ID}' on {_DEVICE}…")
_ASR_PIPELINE = pipeline(
task = "automatic-speech-recognition",
model = MODEL_ID,
device = _DEVICE,
)
print(f"✅ Whisper '{MODEL_ID}' loaded.")
else:
_ASR_PIPELINE = None
print(f"ℹ️ Whisper skipped — using HF API backend.")
# ─────────────────────────────────────────────────────────────────────────────
# HALLUCINATION DETECTION
# ─────────────────────────────────────────────────────────────────────────────
def _is_valid_transcription(text: str) -> bool:
"""
Returns True only if the transcription looks like real speech.
Checks:
1. Script check -- must be mostly Latin / Devanagari / Telugu
2. Repetition check -- rejects looping hallucinations like
"apne apne apne apne..." where a word repeats 5+ times
"""
if not text or len(text.strip()) < 3:
return False
chars = [c for c in text if not c.isspace()]
if not chars:
return False
# Check 1: Script validation
valid_count = 0
for c in chars:
try:
char_name = unicodedata.name(c, "")
script = char_name.split()[0] if char_name else "UNKNOWN"
if script in _VALID_SCRIPTS:
valid_count += 1
except Exception:
pass
ratio = valid_count / len(chars)
if ratio < 0.60:
print(f"[audio_to_text] WARNING script hallucination "
f"(valid_ratio={ratio:.2f}) discarding: {text[:60]!r}")
return False
# Check 2: Repetition detection
# "apne apne apne apne apne apne..." = Whisper looping hallucination
words = text.strip().split()
if len(words) >= 6:
# Max consecutive repeated word
max_repeat = 1
cur_repeat = 1
for i in range(1, len(words)):
if words[i].lower() == words[i - 1].lower():
cur_repeat += 1
max_repeat = max(max_repeat, cur_repeat)
else:
cur_repeat = 1
if max_repeat >= 5:
print(f"[audio_to_text] WARNING repetition hallucination "
f"(word repeats {max_repeat}x) discarding: {text[:60]!r}")
return False
# Low vocabulary diversity = looping hallucination
# "I love you. I love you..." = 3 unique / 15 words = 0.20 unique ratio
    # Real speech almost always has more variety — threshold: <0.15 for texts over 15 words
unique_ratio = len(set(w.lower() for w in words)) / len(words)
if unique_ratio < 0.15 and len(words) > 15:
print(f"[audio_to_text] WARNING low-diversity hallucination "
f"(unique_ratio={unique_ratio:.2f}) discarding: {text[:60]!r}")
return False
# Check 3: Character-level repetition — catches "अग्वावावावाव..." patterns
# where substrings repeat at character level (not caught by word check)
if len(text) > 20:
        # Take a 4-char n-gram from position 8 and count how often it recurs
probe = text[8:12]
rep_count = text.count(probe)
if rep_count > len(text) // 8: # appears more than once per 8 chars = looping
print(f"[audio_to_text] WARNING char-level repetition "
f"(probe {probe!r} repeats {rep_count}x) discarding: {text[:60]!r}")
return False
return True
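# Illustrative expectations for the validator above (untested sketch):
#   _is_valid_transcription("There is a pothole on Main Road")   -> True
#   _is_valid_transcription("ვვვვ ვვვვ ვვვვ")                    -> False  (script check)
#   _is_valid_transcription("apne apne apne apne apne apne ho")  -> False  (5+ repeats)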
# ─────────────────────────────────────────────────────────────────────────────
# PUBLIC API
# ─────────────────────────────────────────────────────────────────────────────
def transcribe_audio(audio_file) -> str:
"""
Transcribe an uploaded audio file to text.
Parameters
----------
audio_file : werkzeug.datastructures.FileStorage
File from Flask request.files["audio"].
Accepts WAV, MP3, OGG, FLAC, M4A, WEBM.
Returns
-------
str
Transcribed text in EN / HI / TE.
Returns "" on failure or hallucination — never raises.
"""
if _AUDIO_BACKEND == "hf_api" and _HF_TOKEN:
return _transcribe_via_hf_api(audio_file)
return _transcribe_local(audio_file)
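# Hedged usage sketch (hypothetical Flask route; the endpoint path is
# illustrative, only request.files["audio"] comes from the docstring above):
#
#   from flask import Flask, request, jsonify
#   app = Flask(__name__)
#
#   @app.route("/api/transcribe", methods=["POST"])
#   def transcribe_endpoint():
#       text = transcribe_audio(request.files["audio"])  # "" on failure, never raises
#       return jsonify({"text": text})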
# ─────────────────────────────────────────────────────────────────────────────
# LOCAL PATH
# ─────────────────────────────────────────────────────────────────────────────
def _transcribe_local(audio_file) -> str:
try:
audio_bytes = audio_file.read()
if not audio_bytes:
print("[audio_to_text] ⚠️ Empty audio file.")
return ""
suffix = _get_suffix(audio_file)
# Write to temp file — pydub needs a file path on disk
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
tmp.write(audio_bytes)
tmp_path = tmp.name
try:
audio_array, sample_rate = _load_audio(tmp_path, suffix)
finally:
try:
os.unlink(tmp_path)
except OSError:
pass
if audio_array is None:
print("[audio_to_text] ❌ Could not decode audio — is ffmpeg installed?")
return ""
# ── Audio quality diagnostics ──────────────────────────────────────────
duration_sec = len(audio_array) / 16_000
rms = float(np.sqrt(np.mean(audio_array ** 2)))
peak = float(np.max(np.abs(audio_array)))
print(f"[audio_to_text] 🔍 duration={duration_sec:.1f}s | rms={rms:.4f} | peak={peak:.4f}")
# Reject silent or extremely quiet audio — Whisper hallucinates on silence
if rms < 0.001:
print("[audio_to_text] ❌ Audio is silent (rms<0.001) — nothing to transcribe")
return ""
if duration_sec < 0.5:
print(f"[audio_to_text] ❌ Audio too short ({duration_sec:.2f}s) — minimum 0.5s")
return ""
# ── Try EN → TE → HI — never pure auto-detect ─────────────────────────
# language=None causes Whisper to hallucinate Georgian/Chinese on bad audio.
# Forcing each language and validating the output is far more reliable.
#
# IMPORTANT: the pipeline mutates the input dict internally on the first
# call, so subsequent calls receive a broken dict. Fix: rebuild it fresh
# for every language attempt using a copy of the original numpy array.
audio_array_copy = audio_array.copy()
for lang in _LANGUAGE_ORDER:
try:
# Fresh dict every iteration — never reuse across pipeline calls
audio_input = {"raw": audio_array_copy.copy(), "sampling_rate": 16_000}
result = _ASR_PIPELINE(
audio_input,
generate_kwargs={
"language": lang,
"task": "transcribe",
# temperature and compression_ratio_threshold cause a
# 'logprobs' bug in some transformers versions — removed.
# Hallucination is handled by our own validator instead.
},
return_timestamps=False,
)
text = result.get("text", "").strip()
if not text:
print(f"[audio_to_text] ↩️ lang={lang} -> empty, trying next")
continue
            # Strict language whitelist — only EN / HI / TE accepted.
            # Whisper sometimes returns text in a completely different language
            # even when forced (e.g. forced TE comes back in Khmer). With
            # return_timestamps=False the pipeline exposes no reliable
            # detected-language field here, so the dominant-script check below
            # is what actually enforces the whitelist.
if _is_valid_transcription(text):
# Extra check: does the output script match the forced language?
# Whisper-small often outputs Hindi (Devanagari) when forced to TE.
# Reject if dominant script does not match expected script for lang.
expected_scripts = _LANG_EXPECTED_SCRIPT.get(lang, None)
if expected_scripts and lang != "en":
chars = [c for c in text if not c.isspace()]
script_counts = {}
for c in chars:
try:
sc = unicodedata.name(c, "").split()[0]
script_counts[sc] = script_counts.get(sc, 0) + 1
except Exception:
pass
dominant = max(script_counts, key=script_counts.get) if script_counts else "UNKNOWN"
if dominant not in expected_scripts and dominant not in ("COMMON", "LATIN"):
print(f"[audio_to_text] script mismatch: forced {lang} but got {dominant} — trying next")
continue
text = fix_script(text) # Kannada→Telugu if needed
print(f"[audio_to_text] OK lang={lang} | "
f"{len(text)} chars: {text[:100]}")
return text
else:
print(f"[audio_to_text] lang={lang} hallucinated — trying next")
continue
except Exception as e:
print(f"[audio_to_text] ❌ lang={lang} error: {e}")
continue
print("[audio_to_text] ❌ All language attempts failed — returning empty")
return ""
except Exception as e:
print(f"[audio_to_text] ❌ Transcription failed: {e}")
return ""
def _load_audio(file_path: str, suffix: str):
"""
Load audio file as float32 numpy array at 16 kHz mono.
Strategy:
1. pydub — handles MP3, OGG, WEBM, M4A, WAV, FLAC (needs ffmpeg)
2. soundfile fallback — WAV and FLAC only (no ffmpeg needed)
Returns (audio_array, 16000) or (None, None) on failure.
"""
# ── pydub (primary) ────────────────────────────────────────────────────────
try:
from pydub import AudioSegment
fmt = suffix.lstrip(".").lower()
fmt_map = {"m4a": "mp4", "webm": "webm", "ogg": "ogg"}
fmt = fmt_map.get(fmt, fmt)
audio_seg = AudioSegment.from_file(file_path, format=fmt)
audio_seg = audio_seg.set_channels(1).set_frame_rate(16_000)
samples = np.array(audio_seg.get_array_of_samples(), dtype=np.float32)
# Normalize based on actual sample width — pydub can return int16 OR int32
# depending on source format. Always normalize to float32 [-1.0, 1.0]
sample_width = audio_seg.sample_width # bytes per sample: 1=8bit, 2=16bit, 4=32bit
max_val = float(2 ** (8 * sample_width - 1))
samples = samples / max_val
# Safety clamp — should already be in range but guard against edge cases
samples = np.clip(samples, -1.0, 1.0)
print(f"[audio_to_text] pydub decoded: sample_width={sample_width}B "
f"max_val={max_val:.0f} post_rms={float(np.sqrt(np.mean(samples**2))):.4f}")
return samples, 16_000
except ImportError:
print("[audio_to_text] pydub not installed — falling back to soundfile")
print(" pip install pydub + install ffmpeg")
except Exception as e:
print(f"[audio_to_text] pydub failed ({e}) — trying soundfile")
# ── soundfile (fallback — WAV/FLAC only) ───────────────────────────────────
try:
import soundfile as sf
audio_array, sample_rate = sf.read(file_path, dtype="float32")
if audio_array.ndim > 1:
audio_array = audio_array.mean(axis=1)
if sample_rate != 16_000:
audio_array = _resample(audio_array, sample_rate, 16_000)
return audio_array, 16_000
except Exception as e:
print(f"[audio_to_text] soundfile failed: {e}")
return None, None
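# Worked normalization example for the common 16-bit case handled above:
#   sample_width = 2  ->  max_val = 2**15 = 32768.0
#   int16  32767  ->   32767 / 32768 =  0.99997  (approx.)
#   int16 -32768  ->  -32768 / 32768 = -1.0      (clamp is a no-op here)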
# ─────────────────────────────────────────────────────────────────────────────
# HF API PATH — production / HF Spaces
# ─────────────────────────────────────────────────────────────────────────────
def _transcribe_via_hf_api(audio_file) -> str:
"""
Production path — HuggingFace Inference API (whisper-large-v3 on HF GPU).
Set AUDIO_BACKEND=hf_api and HF_TOKEN=hf_xxx in HF Space Secrets.
Why large-v3 via API instead of loading locally:
- large-v3 = 3GB — too large to load on free HF Spaces
- HF API runs it on GPU — faster than local CPU anyway (~15-30s vs 3min)
- Free tier: 1000 requests/day — enough for a civic portal
large-v3 auto-detect is accurate enough for EN/TE/HI — no need for
the 3-attempt language loop used in local path.
"""
import requests
try:
audio_bytes = audio_file.read()
if not audio_bytes:
return ""
print(f"[audio_to_text] HF API: sending {len(audio_bytes)} bytes to whisper-large-v3...")
# Detect audio format from magic bytes for correct Content-Type
if audio_bytes[:4] == b'OggS':
content_type = "audio/ogg"
elif audio_bytes[:4] == b'RIFF':
content_type = "audio/wav"
elif audio_bytes[:3] == b'ID3' or audio_bytes[:2] == b'\xff\xfb':
content_type = "audio/mpeg"
elif audio_bytes[:4] == b'fLaC':
content_type = "audio/flac"
else:
content_type = "audio/webm"
print(f"[audio_to_text] detected content_type={content_type}")
        def _post():
            # Single POST to the HF router — reused for the cold-start retry below.
            return requests.post(
                "https://router.huggingface.co/hf-inference/models/openai/whisper-large-v3",
                headers={
                    "Authorization": f"Bearer {_HF_TOKEN}",
                    "Content-Type": content_type,
                },
                data=audio_bytes,
                timeout=120,  # HF free tier can queue up to 60s before processing
            )

        # First attempt: auto-detect language (large-v3 is accurate enough)
        res = _post()
        # Handle model loading (HF cold start)
        if res.status_code == 503:
            import time
            print("[audio_to_text] HF API: model loading — waiting 20s...")
            time.sleep(20)
            res = _post()
if res.ok:
data = res.json()
# HF API returns {"text": "..."} or [{"generated_text": "..."}]
if isinstance(data, dict):
text = data.get("text", "").strip()
elif isinstance(data, list) and data:
text = data[0].get("generated_text", "").strip()
else:
text = ""
if _is_valid_transcription(text):
text = fix_script(text) # Kannada→Telugu if Whisper used wrong script
print(f"[audio_to_text] HF API OK: {len(text)} chars: {text[:100]}")
return text
else:
print(f"[audio_to_text] HF API hallucination discarded: {text[:60]!r}")
return ""
else:
print(f"[audio_to_text] HF API error {res.status_code}: {res.text[:300]}")
return ""
except requests.exceptions.Timeout:
print("[audio_to_text] HF API timeout — model may be overloaded")
return ""
except Exception as e:
print(f"[audio_to_text] HF API exception: {e}")
return ""
# ─────────────────────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────────────────────
def _get_suffix(audio_file) -> str:
"""Determine file extension from FileStorage. Defaults to .webm."""
filename = getattr(audio_file, "filename", "") or ""
mime = getattr(audio_file, "mimetype", "") or ""
_MIME_TO_EXT = {
"audio/wav": ".wav", "audio/x-wav": ".wav", "audio/wave": ".wav",
"audio/mpeg": ".mp3", "audio/mp3": ".mp3",
"audio/ogg": ".ogg",
"audio/flac": ".flac", "audio/x-flac": ".flac",
"audio/mp4": ".m4a", "audio/x-m4a": ".m4a",
"audio/webm": ".webm", "video/webm": ".webm",
}
if "." in filename:
return "." + filename.rsplit(".", 1)[-1].lower()
# Default to .webm — Chrome/Edge MediaRecorder always sends webm
return _MIME_TO_EXT.get(mime.lower(), ".webm")
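# Illustrative expectations (sketch):
#   filename="report.MP3", mimetype="audio/mpeg"  ->  ".mp3"   (extension wins)
#   filename="blob",       mimetype="audio/ogg"   ->  ".ogg"   (MIME fallback)
#   filename="",           mimetype=""            ->  ".webm"  (default)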
def _resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
"""Resample audio array from orig_sr to target_sr."""
try:
from scipy.signal import resample_poly
from math import gcd
g = gcd(orig_sr, target_sr)
return resample_poly(audio, target_sr // g, orig_sr // g).astype(np.float32)
except ImportError:
target_length = int(len(audio) * target_sr / orig_sr)
return np.interp(
np.linspace(0, len(audio) - 1, target_length),
np.arange(len(audio)),
audio,
).astype(np.float32)
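# Worked example of the polyphase ratio above: 44.1 kHz -> 16 kHz gives
# g = gcd(44100, 16000) = 100, i.e. resample_poly(audio, 160, 441).

if __name__ == "__main__":
    # Minimal local smoke test (sketch; the sample path and shim class are
    # illustrative, not part of the production API). Wraps a file on disk in
    # a FileStorage-like object so transcribe_audio can run without Flask.
    import sys

    class _FakeUpload:
        """Duck-typed stand-in for werkzeug FileStorage (read/filename/mimetype)."""
        def __init__(self, path: str):
            self.filename = path
            self.mimetype = ""
            with open(path, "rb") as fh:
                self._data = fh.read()

        def read(self) -> bytes:
            return self._data

    path = sys.argv[1] if len(sys.argv) > 1 else "sample.wav"
    print(transcribe_audio(_FakeUpload(path)))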