| """ |
| Non-verbal event detection ensemble models. |
| |
| Detects: <laugh>, <chuckle>, <sigh>, <breath>, <cough>, <gasp>, etc. |
| """ |
|
|
| import numpy as np |
| import torch |
| from typing import Dict, Any, List, Optional |
| import logging |
|
|
| from ..base import BaseModel, BaseEnsemble |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
class SenseVoiceEventDetector(BaseModel):
    """SenseVoice for event detection (integrated with emotion model).

    ``load`` attempts to fetch the checkpoint through ``transformers``.
    ``predict`` does not run inference yet: it always reports no events,
    whether or not the checkpoint loaded.
    """

    # Label-to-tag table kept for the decoding step that is not wired in yet.
    # NOTE(review): presumably the keys are SenseVoice's event label names;
    # nothing in this class consumes the table so far -- confirm before use.
    EVENT_MAPPING = {
        "laughter": "<laugh>",
        "crying": "<cry>",
        "coughing": "<cough>",
        "sneezing": "<sneeze>",
        "applause": "<applause>",
    }

    def __init__(self, model_name: str = "FunAudioLLM/SenseVoiceSmall", weight: float = 0.5, device: str = 'cpu'):
        """
        Configure the detector without loading any weights.

        Args:
            model_name: Checkpoint identifier handed to ``from_pretrained``.
            weight: Ensemble weight, forwarded to ``BaseModel``.
            device: Device string, forwarded to ``BaseModel``.
        """
        super().__init__(name="sensevoice_events", weight=weight, device=device)
        self.model_name = model_name

    def load(self):
        """Load SenseVoice model.

        Best effort: any failure (missing ``transformers``, download or
        checkpoint error) is logged as a warning and leaves ``is_loaded``
        False instead of propagating.
        """
        try:
            # Imported lazily so the module stays importable without transformers.
            from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

            logger.info(f"Loading {self.model_name} for event detection...")

            self.processor = AutoProcessor.from_pretrained(self.model_name)
            self.model = AutoModelForSpeechSeq2Seq.from_pretrained(self.model_name)
            self.model.to(self.device)
            self.model.eval()

            self.is_loaded = True
            logger.info("✅ SenseVoice event detector loaded")

        except Exception as e:
            logger.warning(f"SenseVoice not available: {e}")
            self.is_loaded = False

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Detect events using SenseVoice.

        Args:
            audio: Audio samples (currently unused).
            sample_rate: Sample rate of ``audio`` (currently unused).

        Returns:
            ``{"events": [], "confidence": {}}`` -- always empty for now.
        """
        if not self.is_loaded:
            return {"events": [], "confidence": {}}

        # TODO: run the loaded model on `audio` and translate its labels
        # through EVENT_MAPPING. Until then a loaded detector reports nothing.
        return {"events": [], "confidence": {}}
|
|
|
|
class LibrosaEventDetector(BaseModel):
    """Heuristic event detector built on librosa frame-level features.

    No trained model is involved: RMS energy, zero-crossing rate and spectral
    centroid are compared against fixed thresholds to flag ``<laugh>``,
    ``<breath>``, ``<sigh>`` and ``<cough>``.
    """

    def __init__(self, weight: float = 0.3, device: str = 'cpu'):
        """Register the detector under the name ``librosa_events``."""
        super().__init__(name="librosa_events", weight=weight, device=device)

    def load(self):
        """Import librosa and mark the detector as ready.

        Raises:
            ImportError: If librosa is not installed (logged, then re-raised).
        """
        try:
            import librosa
        except ImportError:
            logger.error("Librosa not installed: pip install librosa")
            raise
        else:
            self.librosa = librosa
            self.is_loaded = True
            logger.info("✅ Librosa event detector ready")

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Flag events by thresholding librosa features of ``audio``.

        Args:
            audio: Audio samples; resampled to 16 kHz when ``sample_rate``
                differs from 16000.
            sample_rate: Sample rate of ``audio``.

        Returns:
            Dict with ``"events"`` (tags in detection order) and
            ``"confidence"`` (tag -> score). Both are empty when the detector
            is not loaded or when feature extraction raises.
        """
        if not self.is_loaded:
            return {"events": [], "confidence": {}}

        try:
            lib = self.librosa
            # Each tag is scored at most once, so the dict's insertion order
            # is the detection order; the event list is derived from it.
            scores = {}

            if sample_rate != 16000:
                audio = lib.resample(audio, orig_sr=sample_rate, target_sr=16000)
                sample_rate = 16000

            frame_rms = lib.feature.rms(y=audio)[0]
            frame_zcr = lib.feature.zero_crossing_rate(audio)[0]
            frame_centroid = lib.feature.spectral_centroid(y=audio, sr=sample_rate)[0]

            # Laugh: strongly varying energy together with a high crossing rate.
            rms_spread = np.std(frame_rms)
            mean_zcr = np.mean(frame_zcr)
            if rms_spread > 0.15 and mean_zcr > 0.1:
                scores["<laugh>"] = min(float(rms_spread * 5), 0.95)

            # Breath: quiet signal whose frames mostly have a centroid under
            # 500, lasting longer than 0.3 s.
            dark_fraction = np.mean(frame_centroid < 500)
            mean_rms = np.mean(frame_rms)
            if mean_rms < 0.02 and dark_fraction > 0.6 and len(audio) > sample_rate * 0.3:
                scores["<breath>"] = float(dark_fraction * 0.8)

            # Sigh: needs more than 10 frames and both a frame-to-frame drop
            # and a rise in RMS larger than 0.05.
            if len(frame_rms) > 10:
                step = np.diff(frame_rms)
                if (step < -0.05).any() and (step > 0.05).any():
                    scores["<sigh>"] = 0.6

            # Cough: very high peak energy on top of large energy variation.
            peak_rms = np.max(frame_rms)
            if peak_rms > 0.8 and rms_spread > 0.2:
                scores["<cough>"] = float(min(peak_rms, 0.9))

            return {"events": list(scores), "confidence": scores}

        except Exception as e:
            logger.error(f"Librosa event detection error: {e}")
            return {"events": [], "confidence": {}}
|
|
|
|
class CNNLSTMEventDetector(BaseModel):
    """Placeholder for a CNN+LSTM non-verbal event detector.

    Nothing is implemented yet: ``load`` never marks the model as loaded and
    ``predict`` always returns an empty result.
    """

    def __init__(self, model_path: Optional[str] = None, weight: float = 0.2, device: str = 'cpu'):
        """Store ``model_path`` and register under the name ``cnn_lstm_events``."""
        super().__init__(name="cnn_lstm_events", weight=weight, device=device)
        self.model_path = model_path

    def load(self):
        """Log why the model is unavailable and leave ``is_loaded`` False."""
        if self.model_path is None:
            logger.warning("CNN-LSTM model path not provided. Using placeholder.")
        else:
            try:
                logger.warning("CNN-LSTM model not implemented yet")
            except Exception as e:
                logger.error(f"Failed to load CNN-LSTM: {e}")
        # Every path above ends the same way: the model is not usable.
        self.is_loaded = False

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Return an empty detection result; no inference exists yet."""
        no_events = {"events": [], "confidence": {}}
        if not self.is_loaded:
            return no_events

        # A loaded model has no inference path either, so the answer is the same.
        return no_events
|
|
|
|
class EventEnsemble(BaseEnsemble):
    """Ensemble for non-verbal event detection.

    Merges the per-model results returned by ``predict_all`` into a single
    event list with weight-normalized confidence scores.
    """

    def __init__(self, device: str = 'cpu'):
        """
        Initialize event detection ensemble.

        Args:
            device: 'cpu' or 'cuda'
        """
        self.device = device

        # CNNLSTMEventDetector is not registered: it has no implementation yet.
        models = [
            LibrosaEventDetector(weight=0.5, device=device),
            SenseVoiceEventDetector(weight=0.5, device=device),
        ]

        super().__init__(models=models, voting_strategy='weighted')

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """
        Detect events using ensemble.

        Args:
            audio: Audio array
            sample_rate: Sample rate

        Returns:
            Dictionary with:
              - "events": distinct event tags, in first-seen order
              - "confidence": tag -> summed (confidence * model weight),
                divided by the total weight of all returned predictions
              - "counts": tag -> number of times the tag was reported
              - "detections": the raw per-model predictions
        """
        predictions = self.predict_all(audio, sample_rate)

        if not predictions:
            # Same keys as the normal result so callers can index uniformly.
            return {
                "events": [],
                "confidence": {},
                "counts": {},
                "detections": [],
            }

        event_confidence: Dict[str, float] = {}
        event_counts: Dict[str, int] = {}

        for pred in predictions:
            model_confidence = pred.get("confidence", {})
            # NOTE(review): "model_weight" is presumably attached by
            # predict_all; predictions without it count with weight 1.0.
            model_weight = pred.get("model_weight", 1.0)

            for event in pred.get("events", []):
                event_counts[event] = event_counts.get(event, 0) + 1

                # An event reported without a score contributes 0.5.
                conf = model_confidence.get(event, 0.5) * model_weight
                event_confidence[event] = event_confidence.get(event, 0.0) + conf

        # Normalize by the weight of every prediction, including models that
        # did not report the event. Skip when the total is zero: dividing
        # would raise ZeroDivisionError.
        total_weight = sum(p.get("model_weight", 1.0) for p in predictions)
        if total_weight:
            for event in event_confidence:
                event_confidence[event] = event_confidence[event] / total_weight

        return {
            # Dict keys keep insertion order, giving a deterministic,
            # first-seen ordering (a set's order varies between runs).
            "events": list(event_counts),
            "confidence": event_confidence,
            "counts": event_counts,
            "detections": predictions,
        }
|
|