# Source: GitHub commit ffbf816 by marcosremar —
# "Add FULL mode support + Event detection ensemble"
"""
Non-verbal event detection ensemble models.
Detects: <laugh>, <chuckle>, <sigh>, <breath>, <cough>, <gasp>, etc.
"""
import numpy as np
import torch
from typing import Dict, Any, List, Optional
import logging
from ..base import BaseModel, BaseEnsemble
logger = logging.getLogger(__name__)
class SenseVoiceEventDetector(BaseModel):
    """SenseVoice-based non-verbal event detector.

    Reuses the SenseVoice model (shared with the emotion pipeline) to
    detect audio events such as laughter or crying and map them onto
    Orpheus-style inline tags.

    NOTE(review): currently a placeholder — the model is loaded but its
    output is not yet decoded into events, so ``predict`` always returns
    an empty detection set.
    """

    # SenseVoice can label: bgm, applause, laughter, crying, coughing,
    # sneezing. Map those labels onto Orpheus tags. Hoisted to a class
    # constant (was an unused local rebuilt on every predict() call).
    EVENT_MAPPING = {
        "laughter": "<laugh>",
        "crying": "<cry>",
        "coughing": "<cough>",
        "sneezing": "<sneeze>",
        "applause": "<applause>",
    }

    def __init__(self, model_name: str = "FunAudioLLM/SenseVoiceSmall", weight: float = 0.5, device: str = 'cpu'):
        super().__init__(name="sensevoice_events", weight=weight, device=device)
        self.model_name = model_name

    def load(self):
        """Load the SenseVoice model; degrade gracefully if unavailable."""
        try:
            from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
            logger.info(f"Loading {self.model_name} for event detection...")
            self.processor = AutoProcessor.from_pretrained(self.model_name)
            self.model = AutoModelForSpeechSeq2Seq.from_pretrained(self.model_name)
            self.model.to(self.device)
            self.model.eval()
            self.is_loaded = True
            logger.info(f"✅ SenseVoice event detector loaded")
        except Exception as e:
            # Optional dependency / download failure: keep the ensemble
            # usable with the remaining detectors.
            logger.warning(f"SenseVoice not available: {e}")
            self.is_loaded = False

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Detect events using SenseVoice.

        Args:
            audio: Mono audio samples.
            sample_rate: Sample rate of ``audio`` in Hz.

        Returns:
            Dict with "events" (list of Orpheus tags) and "confidence"
            (tag -> score). Empty when the model is not loaded.
        """
        if not self.is_loaded:
            return {"events": [], "confidence": {}}
        try:
            # TODO: run self.model on the audio and translate its labels
            # via EVENT_MAPPING; nothing is detected until then.
            detected: List[str] = []
            return {
                "events": detected,
                "confidence": {}
            }
        except Exception as e:
            logger.error(f"SenseVoice event detection error: {e}")
            return {"events": [], "confidence": {}}
class LibrosaEventDetector(BaseModel):
    """Heuristic event detector built on librosa audio features.

    Applies fixed thresholds to RMS energy, zero-crossing rate, and
    spectral centroid to flag laughter, breath, sigh, and cough.
    """

    def __init__(self, weight: float = 0.3, device: str = 'cpu'):
        super().__init__(name="librosa_events", weight=weight, device=device)

    def load(self):
        """Import librosa; there is no checkpoint to load."""
        try:
            import librosa
        except ImportError:
            logger.error("Librosa not installed: pip install librosa")
            raise
        self.librosa = librosa
        self.is_loaded = True
        logger.info("✅ Librosa event detector ready")

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Detect events via rule-based feature thresholds.

        Args:
            audio: Mono audio samples.
            sample_rate: Sample rate of ``audio`` in Hz.

        Returns:
            Dict with "events" (list of Orpheus tags, in detection
            order) and "confidence" (tag -> heuristic score in [0, 1]).
        """
        if not self.is_loaded:
            return {"events": [], "confidence": {}}
        try:
            detected: List[str] = []
            scores: Dict[str, float] = {}

            # Normalize everything to 16 kHz before feature extraction.
            if sample_rate != 16000:
                audio = self.librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
                sample_rate = 16000

            rms = self.librosa.feature.rms(y=audio)[0]
            zcr = self.librosa.feature.zero_crossing_rate(audio)[0]
            centroid = self.librosa.feature.spectral_centroid(y=audio, sr=sample_rate)[0]

            # 1) Laughter: strongly fluctuating energy + noisy waveform.
            energy_std = np.std(rms)
            mean_zcr = np.mean(zcr)
            if energy_std > 0.15 and mean_zcr > 0.1:
                detected.append("<laugh>")
                scores["<laugh>"] = min(float(energy_std * 5), 0.95)

            # 2) Breath: quiet, mostly low-frequency, at least ~0.3 s long.
            low_ratio = np.mean(centroid < 500)
            mean_rms = np.mean(rms)
            if mean_rms < 0.02 and low_ratio > 0.6 and len(audio) > sample_rate * 0.3:
                detected.append("<breath>")
                scores["<breath>"] = float(low_ratio * 0.8)

            # 3) Sigh: an energy dip followed by a recovery.
            if len(rms) > 10:
                deltas = np.diff(rms)
                if np.any(deltas < -0.05) and np.any(deltas > 0.05):
                    detected.append("<sigh>")
                    scores["<sigh>"] = 0.6

            # 4) Cough: a sharp, loud burst of energy.
            peak_rms = np.max(rms)
            if peak_rms > 0.8 and energy_std > 0.2:
                detected.append("<cough>")
                scores["<cough>"] = float(min(peak_rms, 0.9))

            return {
                "events": detected,
                "confidence": scores
            }
        except Exception as e:
            logger.error(f"Librosa event detection error: {e}")
            return {"events": [], "confidence": {}}
class CNNLSTMEventDetector(BaseModel):
    """CNN+LSTM non-verbal event detector (checkpoint not yet trained).

    Placeholder: until a trained model is available, ``load`` never
    succeeds and ``predict`` returns no detections.
    """

    def __init__(self, model_path: Optional[str] = None, weight: float = 0.2, device: str = 'cpu'):
        super().__init__(name="cnn_lstm_events", weight=weight, device=device)
        self.model_path = model_path

    def load(self):
        """Attempt to load the CNN-LSTM checkpoint (currently a no-op)."""
        # Without a checkpoint path there is nothing to load.
        if self.model_path is None:
            logger.warning("CNN-LSTM model path not provided. Using placeholder.")
            self.is_loaded = False
            return
        try:
            # TODO: load the real CNN-LSTM model here — a custom network
            # trained on NonVerbalSpeech-38K or a similar corpus.
            logger.warning("CNN-LSTM model not implemented yet")
            self.is_loaded = False
        except Exception as e:
            logger.error(f"Failed to load CNN-LSTM: {e}")
            self.is_loaded = False

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Return an empty detection set until a trained model exists."""
        if not self.is_loaded:
            return {"events": [], "confidence": {}}
        # TODO: Implement CNN-LSTM prediction
        return {"events": [], "confidence": {}}
class EventEnsemble(BaseEnsemble):
    """Ensemble for non-verbal event detection.

    Combines the individual detectors' outputs by taking the UNION of
    detected events and weight-averaging their confidence scores.
    (The previous in-code comment claimed a majority vote; the counts
    field exposes per-event agreement so callers can apply their own
    threshold.)
    """

    def __init__(self, device: str = 'cpu'):
        """
        Initialize event detection ensemble.

        Args:
            device: 'cpu' or 'cuda'
        """
        self.device = device
        # Initialize models
        models = [
            LibrosaEventDetector(weight=0.5, device=device),    # Most reliable currently
            SenseVoiceEventDetector(weight=0.5, device=device), # If available
            # CNNLSTMEventDetector(weight=0.3, device=device),  # TODO: Add when trained
        ]
        super().__init__(models=models, voting_strategy='weighted')

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """
        Detect events using the ensemble.

        Args:
            audio: Audio array
            sample_rate: Sample rate

        Returns:
            Dict with:
              events: union of event tags reported by any model
              confidence: tag -> confidence normalized by total model weight
              counts: tag -> number of models that reported the tag
              detections: raw per-model prediction dicts
        """
        predictions = self.predict_all(audio, sample_rate)
        if not predictions:
            return {
                "events": [],
                "confidence": {},
                "detections": []
            }

        # Aggregate events across models.
        event_confidence: Dict[str, float] = {}
        event_counts: Dict[str, int] = {}
        for pred in predictions:
            model_confidence = pred.get("confidence", {})
            model_weight = pred.get("model_weight", 1.0)
            for event in pred.get("events", []):
                event_counts[event] = event_counts.get(event, 0) + 1
                # Weighted confidence; default 0.5 when a model reports
                # an event without an explicit score.
                conf = model_confidence.get(event, 0.5) * model_weight
                event_confidence[event] = event_confidence.get(event, 0.0) + conf

        # Union of everything any model detected (insertion-ordered).
        unique_events = list(event_counts)

        # Normalize confidence by the total model weight; guard against an
        # all-zero-weight configuration to avoid division by zero.
        total_weight = sum(p.get("model_weight", 1.0) for p in predictions) or 1.0
        for event in event_confidence:
            event_confidence[event] = event_confidence[event] / total_weight

        return {
            "events": unique_events,
            "confidence": event_confidence,
            "counts": event_counts,
            "detections": predictions
        }