Spaces:

Shaankar39
/

vaani-cavp-engine

Build error

App Files Files Community

vaani-cavp-engine / modules /ai_classification.py

Shaankar39

init: Vaani CAVP engine (CPU, accuracy-first — Whisper large-v3, spaCy trf)

7d5f092 28 days ago

raw

history blame contribute delete

7.06 kB

	"""AI CLASSIFICATION LAYER
	Wav2Vec 2.0 -> Phoneme identification
	SpeechBrain -> Emotion + Accent classification
	langdetect -> Language identification
	"""

	from __future__ import annotations

	import logging
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import Any

	import numpy as np

	logger = logging.getLogger(__name__)

	# Lazy-loaded singletons
	_wav2vec_model: Any = None
	_wav2vec_processor: Any = None
	_emotion_classifier: Any = None


	# ---------------------------------------------------------------------------
	# Wav2Vec 2.0: Phoneme-level identification
	# ---------------------------------------------------------------------------

	@dataclass
	class PhonemeSpan:
	phoneme: str
	start_ms: int
	end_ms: int
	confidence: float


	@dataclass
	class Wav2VecResult:
	phonemes: list[PhonemeSpan]
	raw_transcript: str
	model_name: str


	def _load_wav2vec() -> tuple[Any, Any]:
	global _wav2vec_model, _wav2vec_processor
	if _wav2vec_model is None:
	from config import TORCH_DEVICE
	from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
	model_id = "facebook/wav2vec2-base-960h"
	logger.info("Loading Wav2Vec2 model: %s on %s", model_id, TORCH_DEVICE)
	_wav2vec_processor = Wav2Vec2Processor.from_pretrained(model_id)
	_wav2vec_model = Wav2Vec2ForCTC.from_pretrained(model_id).to(TORCH_DEVICE)
	_wav2vec_model.eval()
	return _wav2vec_model, _wav2vec_processor


	def classify_phonemes(audio_path: str \| Path) -> Wav2VecResult \| None:
	"""Identify phonemes from audio using Wav2Vec 2.0."""
	try:
	import torch
	import librosa
	from config import TORCH_DEVICE

	model, processor = _load_wav2vec()
	y, sr = librosa.load(str(audio_path), sr=16000)

	inputs = processor(y, sampling_rate=16000, return_tensors="pt", padding=True)
	inputs = {k: v.to(TORCH_DEVICE) for k, v in inputs.items()}
	with torch.no_grad():
	logits = model(**inputs).logits

	probs = torch.softmax(logits, dim=-1)
	predicted_ids = torch.argmax(logits, dim=-1)
	transcript = processor.batch_decode(predicted_ids)[0]

	# Extract phoneme-level spans
	ids = predicted_ids[0].tolist()
	prob_vals = probs[0].max(dim=-1).values.tolist()
	vocab = processor.tokenizer.get_vocab()
	id_to_char = {v: k for k, v in vocab.items()}

	ms_per_frame = (len(y) / sr * 1000) / len(ids) if ids else 0
	phonemes: list[PhonemeSpan] = []
	prev_id = -1
	for i, (pid, conf) in enumerate(zip(ids, prob_vals)):
	if pid == prev_id or pid == processor.tokenizer.pad_token_id:
	prev_id = pid
	continue
	char = id_to_char.get(pid, "?")
	if char == "\|":
	char = " "
	phonemes.append(PhonemeSpan(
	phoneme=char,
	start_ms=int(i * ms_per_frame),
	end_ms=int((i + 1) * ms_per_frame),
	confidence=round(conf, 4),
	))
	prev_id = pid

	return Wav2VecResult(
	phonemes=phonemes,
	raw_transcript=transcript,
	model_name="facebook/wav2vec2-base-960h",
	)
	except ImportError:
	logger.warning("transformers/torch not installed, skipping Wav2Vec")
	return None
	except Exception as exc:
	logger.warning("Wav2Vec classification failed: %s", exc)
	return None


	# ---------------------------------------------------------------------------
	# SpeechBrain: Emotion + Accent classification
	# ---------------------------------------------------------------------------

	@dataclass
	class EmotionResult:
	label: str
	scores: dict[str, float]
	model_name: str


	@dataclass
	class AccentResult:
	accent: str
	confidence: float
	top_accents: dict[str, float]


	@dataclass
	class SpeechBrainResult:
	emotion: EmotionResult \| None
	accent: AccentResult \| None


	def classify_speechbrain(audio_path: str \| Path) -> SpeechBrainResult:
	"""Classify emotion and accent using SpeechBrain."""
	emotion: EmotionResult \| None = None
	accent: AccentResult \| None = None

	# Emotion recognition
	try:
	from config import TORCH_DEVICE
	from speechbrain.inference.interfaces import foreign_class
	emotion_model = foreign_class(
	source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
	pymodule_file="custom_interface.py",
	classname="CustomEncoderWav2vec2Classifier",
	savedir="/tmp/speechbrain_emotion",
	run_opts={"device": TORCH_DEVICE},
	)
	out_prob, score, index, label = emotion_model.classify_file(str(audio_path))
	probs = out_prob.squeeze().tolist()
	labels = ["neutral", "happy", "sad", "angry"]
	scores = {l: round(float(p), 4) for l, p in zip(labels, probs)} if len(probs) == len(labels) else {}
	emotion = EmotionResult(
	label=label[0] if isinstance(label, list) else str(label),
	scores=scores,
	model_name="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
	)
	except Exception as exc:
	logger.warning("SpeechBrain emotion failed: %s", exc)

	# Accent classification
	try:
	from config import TORCH_DEVICE
	from speechbrain.inference.classifiers import EncoderClassifier
	accent_model = EncoderClassifier.from_hparams(
	source="speechbrain/lang-id-commonlanguage_ecapa",
	savedir="/tmp/speechbrain_accent",
	run_opts={"device": TORCH_DEVICE},
	)
	out_prob, score, index, label = accent_model.classify_file(str(audio_path))
	accent = AccentResult(
	accent=label[0] if isinstance(label, list) else str(label),
	confidence=round(float(score.squeeze()), 4),
	top_accents={},
	)
	except Exception as exc:
	logger.warning("SpeechBrain accent failed: %s", exc)

	return SpeechBrainResult(emotion=emotion, accent=accent)


	# ---------------------------------------------------------------------------
	# langdetect: Language identification from text
	# ---------------------------------------------------------------------------

	@dataclass
	class LanguageDetection:
	language: str
	confidence: float
	all_languages: dict[str, float]


	def detect_language(text: str) -> LanguageDetection:
	"""Detect language from text using langdetect."""
	try:
	from langdetect import detect_langs
	results = detect_langs(text)
	top = results[0]
	all_langs = {str(r.lang): round(float(r.prob), 4) for r in results}
	return LanguageDetection(
	language=str(top.lang),
	confidence=round(float(top.prob), 4),
	all_languages=all_langs,
	)
	except Exception as exc:
	logger.warning("Language detection failed: %s", exc)
	return LanguageDetection(language="unknown", confidence=0.0, all_languages={})