# Sentinelai_api / audio /audio_module.py
# Utkarshres32's picture
# Deploy Sentinelai API backend
# 2758540
"""
audio/asr.py - Automatic Speech Recognition using openai/whisper-small
audio/classifier.py - Audio classification using wav2vec2
audio/tts.py - Text-to-Speech using microsoft/speecht5_tts
"""
import time
import torch
import numpy as np
from typing import Dict, Optional, Union
from loguru import logger
from config import settings, DEVICE
# ═══════════════════════════════════════════════════════════════════════════
# ASR - Automatic Speech Recognition
# ═══════════════════════════════════════════════════════════════════════════
class WhisperASR:
    """Speech-to-text wrapper around the ``openai-whisper`` package.

    The model is loaded once in ``__init__``. If audio support is disabled
    in settings, or loading fails for any reason, the instance is still
    created but marked not ready; transcription calls then return an error
    payload instead of raising.
    """

    @staticmethod
    def _whisper_model_name(model_id: str) -> str:
        """Reduce a hub-style id to the bare name passed to ``load_model``.

        ``"openai/whisper-small"`` becomes ``"small"``: the part before the
        last ``/`` is dropped, then a leading ``whisper-`` is removed. Names
        that carry neither (for example ``"base"``) pass through unchanged.
        """
        name = model_id.split("/")[-1]  # "openai/whisper-small" → "whisper-small"
        prefix = "whisper-"
        if name.startswith(prefix):
            name = name[len(prefix):]  # "whisper-small" → "small"
        return name

    def __init__(self):
        # Set up front so ``_ready`` is defined on every exit path.
        self._ready = False
        if not settings.ENABLE_AUDIO:
            logger.info("Audio module disabled. Set ENABLE_AUDIO=true to activate.")
            return

        logger.info(f"Loading Whisper ASR model: {settings.WHISPER_MODEL}")
        try:
            # Imported lazily so the module can be loaded without the package.
            import whisper

            model_name = self._whisper_model_name(settings.WHISPER_MODEL)
            self.model = whisper.load_model(model_name, device=str(DEVICE))
        except ImportError:
            logger.warning("openai-whisper not installed. ASR unavailable.")
        except Exception as e:
            # Any other load failure degrades to "not ready" rather than
            # aborting construction, like the other audio components here.
            logger.warning(f"WhisperASR init failed: {e}")
        else:
            self._ready = True
            logger.info("✅ WhisperASR ready.")

    def transcribe(self, audio_path: str, language: Optional[str] = None) -> Dict:
        """Transcribe the audio file at ``audio_path``.

        Args:
            audio_path: Path handed to the model's ``transcribe`` call.
            language: Optional language hint; forwarded only when truthy.

        Returns:
            ``{"text", "language", "segments", "latency_ms"}`` on success,
            or ``{"text": "", "error": ...}`` when the model is not loaded.
        """
        if not self._ready:
            return {"text": "", "error": "ASR not available"}

        t0 = time.perf_counter()
        opts = {}
        if language:
            opts["language"] = language
        result = self.model.transcribe(audio_path, **opts)
        latency_ms = (time.perf_counter() - t0) * 1000

        return {
            "text": result["text"],
            "language": result.get("language", "unknown"),
            "segments": result.get("segments", []),
            "latency_ms": round(latency_ms, 2),
        }

    def transcribe_bytes(self, audio_bytes: bytes, sample_rate: int = 16000) -> Dict:
        """Transcribe raw samples held in memory.

        ``audio_bytes`` is read as a flat buffer of float32 samples, written
        to a temporary ``.wav`` file at ``sample_rate`` and passed to
        :meth:`transcribe`. The temporary file is always removed.

        Args:
            audio_bytes: Raw float32 sample data.
            sample_rate: Sample rate used when writing the temporary file.

        Returns:
            The same payload as :meth:`transcribe`; an error payload when
            the model is not loaded or the buffer is empty.

        Raises:
            ValueError: From ``np.frombuffer`` when the buffer length is not
                a whole number of float32 values.
        """
        # Bail out before importing soundfile or touching the filesystem.
        if not self._ready:
            return {"text": "", "error": "ASR not available"}
        if not audio_bytes:
            return {"text": "", "error": "Empty audio buffer"}

        import os
        import tempfile

        import soundfile as sf

        # Decode first so a malformed buffer never leaves a temp file behind.
        audio_array = np.frombuffer(audio_bytes, dtype=np.float32)

        # delete=False: the handle is closed here and the path is reopened
        # by soundfile and by the model; cleanup happens in ``finally``.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            tmp_path = f.name
        try:
            sf.write(tmp_path, audio_array, sample_rate)
            return self.transcribe(tmp_path)
        finally:
            os.unlink(tmp_path)
# ═══════════════════════════════════════════════════════════════════════════
# Audio Classifier
# ═══════════════════════════════════════════════════════════════════════════
class AudioClassifier:
    """Audio classification through a ``transformers`` pipeline.

    The checkpoint is taken from ``settings.AUDIO_CLASS_MODEL``, so the label
    set depends on that model. When audio support is disabled or the pipeline
    cannot be built, the instance is marked not ready and :meth:`classify`
    returns an error payload.
    """

    # Not referenced inside this class. NOTE(review): these look like
    # speech-command labels rather than audio-event labels — confirm against
    # the configured checkpoint and any external users of this attribute.
    KEYWORDS = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"]

    @staticmethod
    def _pipeline_device(device) -> int:
        """Convert a torch-style device into the integer index for ``pipeline``.

        ``"cuda"`` maps to 0 and ``"cuda:N"`` to N; anything else maps to -1,
        which selects the CPU.
        """
        spec = str(device)
        if not spec.startswith("cuda"):
            return -1
        _, _, index = spec.partition(":")
        return int(index) if index.isdigit() else 0

    def __init__(self):
        # Set up front so ``_ready`` is defined on every exit path.
        self._ready = False
        if not settings.ENABLE_AUDIO:
            return

        logger.info(f"Loading audio classifier: {settings.AUDIO_CLASS_MODEL}")
        try:
            # Imported lazily so the module can be loaded without transformers.
            from transformers import pipeline

            self.classifier = pipeline(
                "audio-classification",
                model=settings.AUDIO_CLASS_MODEL,
                device=self._pipeline_device(DEVICE),
            )
        except Exception as e:
            logger.warning(f"AudioClassifier init failed: {e}")
        else:
            self._ready = True
            logger.info("✅ AudioClassifier ready.")

    def classify(self, audio_path: str, top_k: int = 5) -> Dict:
        """Classify the audio file at ``audio_path``.

        Args:
            audio_path: Path handed to the pipeline.
            top_k: Forwarded to the pipeline as its ``top_k`` argument.

        Returns:
            ``{"classes": [{"label", "score"}, ...], "latency_ms"}`` with
            scores rounded to four decimals, or
            ``{"classes": [], "error": ...}`` when the pipeline is not loaded.
        """
        if not self._ready:
            return {"classes": [], "error": "Audio classifier not available"}

        t0 = time.perf_counter()
        results = self.classifier(audio_path, top_k=top_k)
        latency_ms = (time.perf_counter() - t0) * 1000

        return {
            "classes": [{"label": r["label"], "score": round(r["score"], 4)} for r in results],
            "latency_ms": round(latency_ms, 2),
        }
# ═══════════════════════════════════════════════════════════════════════════
# TTS - Text to Speech
# ═══════════════════════════════════════════════════════════════════════════
class SpeechSynthesizer:
    """Text-to-speech using SpeechT5 classes from ``transformers``.

    ``__init__`` loads the processor and acoustic model from
    ``settings.TTS_MODEL``, a HiFi-GAN vocoder, and one fixed speaker
    embedding. When audio support is disabled or any of that fails, the
    instance is marked not ready and :meth:`synthesize` returns ``None``.
    """

    # Loading parameters, kept as class attributes so a subclass can override
    # them without editing ``__init__``.
    VOCODER_MODEL = "microsoft/speecht5_hifigan"
    SPEAKER_DATASET = "Matthijs/cmu-arctic-xvectors"
    SPEAKER_INDEX = 7306  # row of the validation split used as the voice

    def __init__(self):
        # Set up front so ``_ready`` is defined on every exit path.
        self._ready = False
        if not settings.ENABLE_AUDIO:
            return

        logger.info(f"Loading TTS model: {settings.TTS_MODEL}")
        try:
            # Imported lazily so the module can be loaded without these packages.
            from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
            from datasets import load_dataset

            self.processor = SpeechT5Processor.from_pretrained(settings.TTS_MODEL)
            self.model = SpeechT5ForTextToSpeech.from_pretrained(settings.TTS_MODEL).to(DEVICE)
            self.vocoder = SpeechT5HifiGan.from_pretrained(self.VOCODER_MODEL).to(DEVICE)

            # One x-vector, given a leading batch dimension, on the model's device.
            ds = load_dataset(self.SPEAKER_DATASET, split="validation")
            xvector = ds[self.SPEAKER_INDEX]["xvector"]
            self.speaker_embeddings = torch.tensor(xvector).unsqueeze(0).to(DEVICE)
        except Exception as e:
            logger.warning(f"TTS init failed: {e}")
        else:
            self._ready = True
            logger.info("✅ SpeechSynthesizer ready.")

    def synthesize(self, text: str) -> Optional[np.ndarray]:
        """Synthesize ``text`` to audio.

        Args:
            text: Text to speak.

        Returns:
            The generated waveform as a NumPy array (float32), or ``None``
            when the synthesizer is not loaded or ``text`` is empty or
            whitespace-only.
        """
        if not self._ready:
            return None
        # Nothing to speak: skip the model rather than decode an empty prompt.
        if not text or not text.strip():
            return None

        inputs = self.processor(text=text, return_tensors="pt").to(DEVICE)
        with torch.inference_mode():
            speech = self.model.generate_speech(
                inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder
            )
        return speech.cpu().numpy()