File size: 6,656 Bytes
2758540
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
audio/asr.py - Automatic Speech Recognition using openai/whisper-small
audio/classifier.py - Audio classification using wav2vec2
audio/tts.py - Text-to-Speech using microsoft/speecht5_tts
"""
import time
import torch
import numpy as np
from typing import Dict, Optional, Union
from loguru import logger
from config import settings, DEVICE


# ═══════════════════════════════════════════════════════════════════════════
# ASR - Automatic Speech Recognition
# ═══════════════════════════════════════════════════════════════════════════

class WhisperASR:
    """Transcribes audio with an OpenAI Whisper checkpoint.

    The checkpoint is chosen by ``settings.WHISPER_MODEL`` and loaded through
    the ``whisper`` package. If audio is disabled, the package is missing, or
    the model cannot be loaded, the instance is still constructed but marked
    not ready; transcription calls then return an error payload instead of
    raising.
    """

    @staticmethod
    def _resolve_model_name(model_id: str) -> str:
        """Convert a hub-style id to the short name ``whisper.load_model`` takes.

        "openai/whisper-small" -> "small"; a name that is already short
        ("base") is returned unchanged.
        """
        name = model_id.split("/")[-1]
        prefix = "whisper-"
        if name.startswith(prefix):
            name = name[len(prefix):]
        return name

    def __init__(self):
        # Start as "not ready" so every early exit below leaves a valid flag.
        self._ready = False
        if not settings.ENABLE_AUDIO:
            logger.info("Audio module disabled. Set ENABLE_AUDIO=true to activate.")
            return
        logger.info(f"Loading Whisper ASR model: {settings.WHISPER_MODEL}")
        try:
            # Imported lazily so the module can be imported without openai-whisper.
            import whisper
            model_name = self._resolve_model_name(settings.WHISPER_MODEL)  # "openai/whisper-small" -> "small"
            self.model = whisper.load_model(model_name, device=str(DEVICE))
        except ImportError:
            logger.warning("openai-whisper not installed. ASR unavailable.")
        except Exception as e:
            # Unknown model name, download failure, device error, ...: degrade
            # gracefully instead of crashing the caller's start-up.
            logger.warning(f"WhisperASR init failed: {e}")
        else:
            self._ready = True
            logger.info("βœ… WhisperASR ready.")

    def transcribe(self, audio_path: str, language: Optional[str] = None) -> Dict:
        """Transcribe the audio file at ``audio_path``.

        Args:
            audio_path: Path handed straight to the Whisper model.
            language: Optional language hint; when falsy it is not passed to
                the model at all.

        Returns:
            ``{"text": "", "error": "ASR not available"}`` when the model is not
            loaded; otherwise a dict with ``text``, ``language``, ``segments``
            and ``latency_ms`` (wall-clock time of the model call, rounded to
            two decimals).
        """
        if not self._ready:
            return {"text": "", "error": "ASR not available"}
        t0 = time.perf_counter()
        opts = {}
        if language:
            opts["language"] = language
        result = self.model.transcribe(audio_path, **opts)
        latency_ms = (time.perf_counter() - t0) * 1000
        return {
            "text": result["text"],
            "language": result.get("language", "unknown"),
            "segments": result.get("segments", []),
            "latency_ms": round(latency_ms, 2),
        }

    def transcribe_bytes(self, audio_bytes: bytes, sample_rate: int = 16000) -> Dict:
        """Transcribe raw samples supplied as bytes.

        ``audio_bytes`` is reinterpreted as a flat float32 buffer, written to a
        temporary WAV file at ``sample_rate`` and passed to ``transcribe``.
        The temporary file is always removed afterwards.

        Returns:
            The same payload as ``transcribe``.
        """
        if not self._ready:
            # Skip the soundfile import and temp-file round trip when the
            # result is already known.
            return {"text": "", "error": "ASR not available"}

        import os
        import tempfile
        import soundfile as sf

        # delete=False: only the path is needed here; the file is reopened by
        # soundfile and removed explicitly in the finally clause.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            tmp_path = f.name
        try:
            audio_array = np.frombuffer(audio_bytes, dtype=np.float32)
            sf.write(tmp_path, audio_array, sample_rate)
            return self.transcribe(tmp_path)
        finally:
            os.unlink(tmp_path)


# ═══════════════════════════════════════════════════════════════════════════
# Audio Classifier
# ═══════════════════════════════════════════════════════════════════════════

class AudioClassifier:
    """Classifies audio clips with a Hugging Face ``audio-classification`` pipeline.

    The checkpoint is taken from ``settings.AUDIO_CLASS_MODEL``; the labels
    returned are whatever that checkpoint defines. If audio is disabled or the
    pipeline cannot be built, the instance is marked not ready and
    ``classify`` returns an error payload.

    NOTE(review): an earlier description mentioned event labels (gunshot,
    scream), while ``KEYWORDS`` lists speech-command words — confirm which
    kind of checkpoint ``AUDIO_CLASS_MODEL`` actually points to.
    """

    # Speech-command vocabulary. Not referenced inside this class; kept as a
    # public attribute for any external users.
    KEYWORDS = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"]

    @staticmethod
    def _pipeline_device(device) -> int:
        """Translate a device spec into the integer index passed to ``pipeline``.

        "cuda" -> 0, "cuda:N" -> N, anything else -> -1 (CPU). ``device`` may
        be any object whose ``str()`` has that form.
        """
        kind, _, index = str(device).partition(":")
        if kind != "cuda":
            return -1
        # A bare "cuda" (or an unparsable suffix) means the first GPU.
        return int(index) if index.isdigit() else 0

    def __init__(self):
        # Start as "not ready" so every early exit below leaves a valid flag.
        self._ready = False
        if not settings.ENABLE_AUDIO:
            return
        logger.info(f"Loading audio classifier: {settings.AUDIO_CLASS_MODEL}")
        try:
            # Imported lazily so the module can be imported without transformers.
            from transformers import pipeline
            self.classifier = pipeline(
                "audio-classification",
                model=settings.AUDIO_CLASS_MODEL,
                device=self._pipeline_device(DEVICE),
            )
            self._ready = True
            logger.info("βœ… AudioClassifier ready.")
        except Exception as e:
            logger.warning(f"AudioClassifier init failed: {e}")
            self._ready = False

    def classify(self, audio_path: str, top_k: int = 5) -> Dict:
        """Classify the audio file at ``audio_path``.

        Args:
            audio_path: Path handed straight to the pipeline.
            top_k: Forwarded to the pipeline as ``top_k``.

        Returns:
            ``{"classes": [], "error": "Audio classifier not available"}`` when
            the pipeline is not loaded; otherwise ``classes`` (a list of
            ``{"label", "score"}`` dicts with scores rounded to four decimals)
            and ``latency_ms`` (wall-clock time of the pipeline call).
        """
        if not self._ready:
            return {"classes": [], "error": "Audio classifier not available"}
        t0 = time.perf_counter()
        results = self.classifier(audio_path, top_k=top_k)
        latency_ms = (time.perf_counter() - t0) * 1000
        return {
            "classes": [{"label": r["label"], "score": round(r["score"], 4)} for r in results],
            "latency_ms": round(latency_ms, 2),
        }


# ═══════════════════════════════════════════════════════════════════════════
# TTS - Text to Speech
# ═══════════════════════════════════════════════════════════════════════════

class SpeechSynthesizer:
    """Generates speech from text with a SpeechT5 checkpoint.

    The acoustic model and processor come from ``settings.TTS_MODEL``; the
    vocoder and the speaker embedding are selected by the class attributes
    below, which a subclass may override to change the voice. If audio is
    disabled or any component fails to load, the instance is marked not ready
    and ``synthesize`` returns ``None``.
    """

    # HiFi-GAN vocoder checkpoint used to turn model output into a waveform.
    VOCODER_MODEL = "microsoft/speecht5_hifigan"
    # Dataset, split and row that supply the speaker x-vector.
    SPEAKER_DATASET = "Matthijs/cmu-arctic-xvectors"
    SPEAKER_SPLIT = "validation"
    SPEAKER_INDEX = 7306

    def __init__(self):
        # Start as "not ready" so every early exit below leaves a valid flag.
        self._ready = False
        if not settings.ENABLE_AUDIO:
            return
        logger.info(f"Loading TTS model: {settings.TTS_MODEL}")
        try:
            # Imported lazily so the module can be imported without these packages.
            from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
            from datasets import load_dataset
            self.processor = SpeechT5Processor.from_pretrained(settings.TTS_MODEL)
            self.model = SpeechT5ForTextToSpeech.from_pretrained(settings.TTS_MODEL).to(DEVICE)
            self.vocoder = SpeechT5HifiGan.from_pretrained(self.VOCODER_MODEL).to(DEVICE)
            # Load speaker embeddings; unsqueeze adds the leading batch axis.
            ds = load_dataset(self.SPEAKER_DATASET, split=self.SPEAKER_SPLIT)
            self.speaker_embeddings = (
                torch.tensor(ds[self.SPEAKER_INDEX]["xvector"]).unsqueeze(0).to(DEVICE)
            )
            self._ready = True
            logger.info("βœ… SpeechSynthesizer ready.")
        except Exception as e:
            logger.warning(f"TTS init failed: {e}")
            self._ready = False

    def synthesize(self, text: str) -> Optional[np.ndarray]:
        """Synthesize ``text`` to audio.

        Returns:
            The generated waveform as a NumPy array, or ``None`` when the
            synthesizer is not ready or ``text`` is empty / whitespace-only.
        """
        if not self._ready:
            return None
        if not text or not text.strip():
            # Nothing to say: skip the model call rather than synthesizing
            # audio from an empty prompt.
            return None
        inputs = self.processor(text=text, return_tensors="pt").to(DEVICE)
        with torch.inference_mode():
            speech = self.model.generate_speech(
                inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder
            )
        return speech.cpu().numpy()