Spaces:

Shaankar39
/

vaani-cavp-engine

Build error

File size: 7,062 Bytes

7d5f092

"""AI CLASSIFICATION LAYER
Wav2Vec 2.0   -> Phoneme identification
SpeechBrain   -> Emotion + Accent classification
langdetect    -> Language identification
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import numpy as np

# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)

# Lazy-loaded singletons. Each slot starts as None so that importing this
# module does not load any model; _load_wav2vec() fills the two Wav2Vec slots
# on its first call and returns the same objects on later calls.
_wav2vec_model: Any = None
_wav2vec_processor: Any = None
# NOTE(review): presumably meant to hold the SpeechBrain emotion model in the
# same lazy fashion -- confirm against classify_speechbrain().
_emotion_classifier: Any = None


# ---------------------------------------------------------------------------
# Wav2Vec 2.0: Phoneme-level identification
# ---------------------------------------------------------------------------

@dataclass
class PhonemeSpan:
    """One decoded token produced by classify_phonemes(), with its timing."""

    # Decoded vocabulary token; the word delimiter "|" is stored as a space
    # and an id missing from the vocabulary is stored as "?".
    phoneme: str
    # Offset of the start of the span, in milliseconds from the start of audio.
    start_ms: int
    # Offset of the end of the span, in milliseconds from the start of audio.
    end_ms: int
    # Softmax probability of the predicted token, rounded to 4 decimals.
    confidence: float


@dataclass
class Wav2VecResult:
    """Output of classify_phonemes() for a single audio file."""

    # Token spans in the order they occur in the audio.
    phonemes: list[PhonemeSpan]
    # Text returned by the processor's batch_decode() for the greedy prediction.
    raw_transcript: str
    # Identifier of the checkpoint that produced this result.
    model_name: str


def _load_wav2vec() -> tuple[Any, Any]:
    """Return the shared ``(model, processor)`` pair, loading it on first use.

    The first call imports ``transformers``, fetches the
    ``facebook/wav2vec2-base-960h`` checkpoint, moves the model to
    ``config.TORCH_DEVICE`` and switches it to eval mode. The pair is stored in
    the module-level singletons, so later calls return immediately.
    """
    global _wav2vec_model, _wav2vec_processor

    # Fast path: the model slot is only filled once loading has succeeded.
    if _wav2vec_model is not None:
        return _wav2vec_model, _wav2vec_processor

    # Heavy imports are deferred so importing this module stays cheap.
    from config import TORCH_DEVICE
    from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

    checkpoint = "facebook/wav2vec2-base-960h"
    logger.info("Loading Wav2Vec2 model: %s on %s", checkpoint, TORCH_DEVICE)
    _wav2vec_processor = Wav2Vec2Processor.from_pretrained(checkpoint)
    _wav2vec_model = Wav2Vec2ForCTC.from_pretrained(checkpoint).to(TORCH_DEVICE)
    _wav2vec_model.eval()
    return _wav2vec_model, _wav2vec_processor


def classify_phonemes(audio_path: str | Path) -> Wav2VecResult | None:
    """Decode audio with Wav2Vec 2.0 and return time-aligned token spans.

    The file is loaded at 16 kHz, run through the CTC model, and decoded
    greedily (arg-max per frame). Consecutive frames predicting the same token
    are merged into one span and blank (pad) frames are dropped.

    NOTE(review): the code treats vocabulary entries as characters (``"|"`` is
    mapped to a space), so the "phonemes" are presumably letters rather than
    linguistic phonemes -- confirm against the checkpoint's vocabulary.

    Args:
        audio_path: Path to an audio file readable by ``librosa.load``.

    Returns:
        A ``Wav2VecResult`` with the spans and the raw transcript, or ``None``
        when a dependency is missing or any step fails. This function never
        raises; failures are logged as warnings.
    """
    try:
        import torch
        import librosa
        from config import TORCH_DEVICE

        model, processor = _load_wav2vec()
        samples, sample_rate = librosa.load(str(audio_path), sr=16000)

        inputs = processor(samples, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = {name: tensor.to(TORCH_DEVICE) for name, tensor in inputs.items()}
        with torch.no_grad():
            logits = model(**inputs).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        transcript = processor.batch_decode(predicted_ids)[0]

        # Per-frame winning token id and its softmax probability (batch item 0).
        frame_ids = predicted_ids[0].tolist()
        frame_confs = torch.softmax(logits, dim=-1)[0].max(dim=-1).values.tolist()
        id_to_char = {idx: token for token, idx in processor.tokenizer.get_vocab().items()}

        # Frame duration is derived from the audio length so that the last
        # frame ends at the end of the clip.
        ms_per_frame = (len(samples) / sample_rate * 1000) / len(frame_ids) if frame_ids else 0

        phonemes = _collapse_ctc_frames(
            frame_ids,
            frame_confs,
            id_to_char,
            processor.tokenizer.pad_token_id,
            ms_per_frame,
        )

        return Wav2VecResult(
            phonemes=phonemes,
            raw_transcript=transcript,
            model_name="facebook/wav2vec2-base-960h",
        )
    except ImportError as exc:
        # Any of torch / librosa / config / transformers may be the missing
        # piece, so report the actual import that failed.
        logger.warning("Wav2Vec dependency missing (%s), skipping Wav2Vec", exc)
        return None
    except Exception as exc:
        logger.warning("Wav2Vec classification failed: %s", exc)
        return None


def _collapse_ctc_frames(
    frame_ids: list[int],
    frame_confs: list[float],
    id_to_char: dict[int, str],
    blank_id: int | None,
    ms_per_frame: float,
) -> list[PhonemeSpan]:
    """Merge runs of identical per-frame token ids into timed spans.

    Each maximal run of the same id becomes one span that starts at the run's
    first frame and ends after its last frame. Runs of ``blank_id`` produce no
    span but still separate repeated tokens. The confidence of a span is the
    probability at the run's first frame, rounded to 4 decimals.
    """
    spans: list[PhonemeSpan] = []
    total = min(len(frame_ids), len(frame_confs))
    frame = 0
    while frame < total:
        token_id = frame_ids[frame]

        # Advance to the first frame that predicts a different token.
        run_end = frame + 1
        while run_end < total and frame_ids[run_end] == token_id:
            run_end += 1

        if token_id != blank_id:
            char = id_to_char.get(token_id, "?")
            spans.append(PhonemeSpan(
                phoneme=" " if char == "|" else char,
                start_ms=int(frame * ms_per_frame),
                end_ms=int(run_end * ms_per_frame),
                confidence=round(frame_confs[frame], 4),
            ))
        frame = run_end
    return spans


# ---------------------------------------------------------------------------
# SpeechBrain: Emotion + Accent classification
# ---------------------------------------------------------------------------

@dataclass
class EmotionResult:
    """Emotion prediction produced by classify_speechbrain()."""

    # Winning label as returned by the emotion model's classify_file().
    label: str
    # Emotion name -> probability rounded to 4 decimals; empty when the model
    # output does not have exactly one value per expected label.
    scores: dict[str, float]
    # Identifier of the model that produced this result.
    model_name: str


@dataclass
class AccentResult:
    """Accent prediction produced by classify_speechbrain()."""

    # Winning label as returned by the classifier's classify_file().
    # NOTE(review): the model used is named "lang-id-commonlanguage_ecapa", so
    # this is presumably a language label rather than an accent -- confirm.
    accent: str
    # Classifier score for the winning label, rounded to 4 decimals.
    confidence: float
    # Label -> score for runner-up candidates; classify_speechbrain() currently
    # always passes an empty dict here.
    top_accents: dict[str, float]


@dataclass
class SpeechBrainResult:
    """Combined output of classify_speechbrain()."""

    # Emotion prediction, or None when the emotion stage failed.
    emotion: EmotionResult | None
    # Accent prediction, or None when the accent stage failed.
    accent: AccentResult | None


# Lazily-created SpeechBrain classifier used for the accent stage; filled by
# _load_accent_classifier() on first use.
_accent_classifier: Any = None


def _load_emotion_classifier() -> Any:
    """Return the shared SpeechBrain emotion model, building it on first use."""
    global _emotion_classifier
    if _emotion_classifier is None:
        from config import TORCH_DEVICE
        from speechbrain.inference.interfaces import foreign_class
        _emotion_classifier = foreign_class(
            source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
            pymodule_file="custom_interface.py",
            classname="CustomEncoderWav2vec2Classifier",
            savedir="/tmp/speechbrain_emotion",
            run_opts={"device": TORCH_DEVICE},
        )
    return _emotion_classifier


def _load_accent_classifier() -> Any:
    """Return the shared SpeechBrain accent model, building it on first use."""
    global _accent_classifier
    if _accent_classifier is None:
        from config import TORCH_DEVICE
        from speechbrain.inference.classifiers import EncoderClassifier
        _accent_classifier = EncoderClassifier.from_hparams(
            source="speechbrain/lang-id-commonlanguage_ecapa",
            savedir="/tmp/speechbrain_accent",
            run_opts={"device": TORCH_DEVICE},
        )
    return _accent_classifier


def classify_speechbrain(audio_path: str | Path) -> SpeechBrainResult:
    """Classify emotion and accent of an audio file using SpeechBrain.

    The two stages run independently: a failure in one (missing dependency,
    download error, bad audio) is logged as a warning and leaves that field as
    ``None`` without affecting the other. Models are loaded once and reused
    across calls, mirroring ``_load_wav2vec``; a failed load is not cached, so
    the next call retries.

    Args:
        audio_path: Path to the audio file passed to each model's
            ``classify_file``.

    Returns:
        A ``SpeechBrainResult`` whose ``emotion`` and ``accent`` are each either
        a populated result or ``None``. This function never raises.
    """
    emotion: EmotionResult | None = None
    accent: AccentResult | None = None

    # Emotion recognition
    try:
        emotion_model = _load_emotion_classifier()
        out_prob, _score, _index, label = emotion_model.classify_file(str(audio_path))
        probs = out_prob.squeeze().tolist()
        # NOTE(review): this order is assumed to match the model's output
        # indices -- verify against the checkpoint's label encoder.
        labels = ["neutral", "happy", "sad", "angry"]
        if len(probs) == len(labels):
            scores = {name: round(float(prob), 4) for name, prob in zip(labels, probs)}
        else:
            # Unexpected output size: report no per-label scores rather than
            # pairing probabilities with the wrong names.
            scores = {}
        emotion = EmotionResult(
            label=label[0] if isinstance(label, list) else str(label),
            scores=scores,
            model_name="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
        )
    except Exception as exc:
        logger.warning("SpeechBrain emotion failed: %s", exc)

    # Accent classification
    try:
        accent_model = _load_accent_classifier()
        _out_prob, score, _index, label = accent_model.classify_file(str(audio_path))
        accent = AccentResult(
            accent=label[0] if isinstance(label, list) else str(label),
            confidence=round(float(score.squeeze()), 4),
            top_accents={},
        )
    except Exception as exc:
        logger.warning("SpeechBrain accent failed: %s", exc)

    return SpeechBrainResult(emotion=emotion, accent=accent)


# ---------------------------------------------------------------------------
# langdetect: Language identification from text
# ---------------------------------------------------------------------------

@dataclass
class LanguageDetection:
    """Result of detect_language() for one piece of text."""

    # Language code of the top candidate, or "unknown" when detection failed.
    language: str
    # Probability of the top candidate rounded to 4 decimals; 0.0 on failure.
    confidence: float
    # Every candidate's language code -> rounded probability; empty on failure.
    all_languages: dict[str, float]


def detect_language(text: str) -> LanguageDetection:
    """Detect the language of ``text`` using langdetect.

    Args:
        text: The text to classify.

    Returns:
        A ``LanguageDetection`` holding the most probable language, its
        probability, and all candidates. When the text is empty or
        whitespace-only, or detection fails for any reason, the result is
        ``language="unknown"`` with ``confidence=0.0`` and no candidates. This
        function never raises.
    """
    try:
        # Nothing to detect: skip the library call, which would raise on
        # feature-less input, and avoid a warning per empty string.
        if not text or not text.strip():
            logger.debug("Language detection skipped: empty text")
            return LanguageDetection(language="unknown", confidence=0.0, all_languages={})

        from langdetect import DetectorFactory, detect_langs

        # langdetect samples randomly, so short or ambiguous text can yield a
        # different answer on each run. Pin the seed for reproducible results,
        # but leave it alone if the application already configured one.
        if DetectorFactory.seed is None:
            DetectorFactory.seed = 0

        results = detect_langs(text)
        top = results[0]
        all_langs = {str(r.lang): round(float(r.prob), 4) for r in results}
        return LanguageDetection(
            language=str(top.lang),
            confidence=round(float(top.prob), 4),
            all_languages=all_langs,
        )
    except Exception as exc:
        logger.warning("Language detection failed: %s", exc)
        return LanguageDetection(language="unknown", confidence=0.0, all_languages={})