# Source: GitHub commit ffbf816 by marcosremar —
# "Add FULL mode support + Event detection ensemble"
"""
Non-verbal event detection ensemble models.
Detects: <laugh>, <chuckle>, <sigh>, <breath>, <cough>, <gasp>, etc.
"""
import numpy as np
import torch
from typing import Dict, Any, List, Optional
import logging
from ..base import BaseModel, BaseEnsemble
logger = logging.getLogger(__name__)
class SenseVoiceEventDetector(BaseModel):
    """SenseVoice-based non-verbal event detector.

    Reuses the SenseVoice model (shared with the emotion pipeline) to
    detect audio events such as laughter or crying and map them onto
    Orpheus-style inline tags.

    NOTE(review): currently a placeholder — the model is loaded but its
    output is not yet decoded into events, so ``predict`` always returns
    an empty detection set.
    """

    # SenseVoice can label: bgm, applause, laughter, crying, coughing,
    # sneezing. Map those labels onto Orpheus tags. Hoisted to a class
    # constant (was an unused local rebuilt on every predict() call).
    EVENT_MAPPING = {
        "laughter": "<laugh>",
        "crying": "<cry>",
        "coughing": "<cough>",
        "sneezing": "<sneeze>",
        "applause": "<applause>",
    }

    def __init__(self, model_name: str = "FunAudioLLM/SenseVoiceSmall", weight: float = 0.5, device: str = 'cpu'):
        super().__init__(name="sensevoice_events", weight=weight, device=device)
        self.model_name = model_name

    def load(self):
        """Load the SenseVoice model; degrade gracefully if unavailable."""
        try:
            from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
            logger.info(f"Loading {self.model_name} for event detection...")
            self.processor = AutoProcessor.from_pretrained(self.model_name)
            self.model = AutoModelForSpeechSeq2Seq.from_pretrained(self.model_name)
            self.model.to(self.device)
            self.model.eval()
            self.is_loaded = True
            logger.info(f"✅ SenseVoice event detector loaded")
        except Exception as e:
            # Optional dependency / download failure: keep the ensemble
            # usable with the remaining detectors.
            logger.warning(f"SenseVoice not available: {e}")
            self.is_loaded = False

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Detect events using SenseVoice.

        Args:
            audio: Mono audio samples.
            sample_rate: Sample rate of ``audio`` in Hz.

        Returns:
            Dict with "events" (list of Orpheus tags) and "confidence"
            (tag -> score). Empty when the model is not loaded.
        """
        if not self.is_loaded:
            return {"events": [], "confidence": {}}
        try:
            # TODO: run self.model on the audio and translate its labels
            # via EVENT_MAPPING; nothing is detected until then.
            detected: List[str] = []
            return {
                "events": detected,
                "confidence": {}
            }
        except Exception as e:
            logger.error(f"SenseVoice event detection error: {e}")
            return {"events": [], "confidence": {}}
class LibrosaEventDetector(BaseModel):
    """Heuristic event detector built on librosa audio features.

    Applies fixed thresholds to RMS energy, zero-crossing rate, and
    spectral centroid to flag laughter, breath, sigh, and cough.
    """

    def __init__(self, weight: float = 0.3, device: str = 'cpu'):
        super().__init__(name="librosa_events", weight=weight, device=device)

    def load(self):
        """Import librosa; there is no checkpoint to load."""
        try:
            import librosa
        except ImportError:
            logger.error("Librosa not installed: pip install librosa")
            raise
        self.librosa = librosa
        self.is_loaded = True
        logger.info("✅ Librosa event detector ready")

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Detect events via rule-based feature thresholds.

        Args:
            audio: Mono audio samples.
            sample_rate: Sample rate of ``audio`` in Hz.

        Returns:
            Dict with "events" (list of Orpheus tags, in detection
            order) and "confidence" (tag -> heuristic score in [0, 1]).
        """
        if not self.is_loaded:
            return {"events": [], "confidence": {}}
        try:
            detected: List[str] = []
            scores: Dict[str, float] = {}

            # Normalize everything to 16 kHz before feature extraction.
            if sample_rate != 16000:
                audio = self.librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
                sample_rate = 16000

            rms = self.librosa.feature.rms(y=audio)[0]
            zcr = self.librosa.feature.zero_crossing_rate(audio)[0]
            centroid = self.librosa.feature.spectral_centroid(y=audio, sr=sample_rate)[0]

            # 1) Laughter: strongly fluctuating energy + noisy waveform.
            energy_std = np.std(rms)
            mean_zcr = np.mean(zcr)
            if energy_std > 0.15 and mean_zcr > 0.1:
                detected.append("<laugh>")
                scores["<laugh>"] = min(float(energy_std * 5), 0.95)

            # 2) Breath: quiet, mostly low-frequency, at least ~0.3 s long.
            low_ratio = np.mean(centroid < 500)
            mean_rms = np.mean(rms)
            if mean_rms < 0.02 and low_ratio > 0.6 and len(audio) > sample_rate * 0.3:
                detected.append("<breath>")
                scores["<breath>"] = float(low_ratio * 0.8)

            # 3) Sigh: an energy dip followed by a recovery.
            if len(rms) > 10:
                deltas = np.diff(rms)
                if np.any(deltas < -0.05) and np.any(deltas > 0.05):
                    detected.append("<sigh>")
                    scores["<sigh>"] = 0.6

            # 4) Cough: a sharp, loud burst of energy.
            peak_rms = np.max(rms)
            if peak_rms > 0.8 and energy_std > 0.2:
                detected.append("<cough>")
                scores["<cough>"] = float(min(peak_rms, 0.9))

            return {
                "events": detected,
                "confidence": scores
            }
        except Exception as e:
            logger.error(f"Librosa event detection error: {e}")
            return {"events": [], "confidence": {}}
class CNNLSTMEventDetector(BaseModel):
    """CNN+LSTM non-verbal event detector (checkpoint not yet trained).

    Placeholder: until a trained model is available, ``load`` never
    succeeds and ``predict`` returns no detections.
    """

    def __init__(self, model_path: Optional[str] = None, weight: float = 0.2, device: str = 'cpu'):
        super().__init__(name="cnn_lstm_events", weight=weight, device=device)
        self.model_path = model_path

    def load(self):
        """Attempt to load the CNN-LSTM checkpoint (currently a no-op)."""
        # Without a checkpoint path there is nothing to load.
        if self.model_path is None:
            logger.warning("CNN-LSTM model path not provided. Using placeholder.")
            self.is_loaded = False
            return
        try:
            # TODO: load the real CNN-LSTM model here — a custom network
            # trained on NonVerbalSpeech-38K or a similar corpus.
            logger.warning("CNN-LSTM model not implemented yet")
            self.is_loaded = False
        except Exception as e:
            logger.error(f"Failed to load CNN-LSTM: {e}")
            self.is_loaded = False

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Return an empty detection set until a trained model exists."""
        if not self.is_loaded:
            return {"events": [], "confidence": {}}
        # TODO: Implement CNN-LSTM prediction
        return {"events": [], "confidence": {}}
class EventEnsemble(BaseEnsemble):
    """Ensemble for non-verbal event detection.

    Combines the individual detectors' outputs by taking the UNION of
    detected events and weight-averaging their confidence scores.
    (The previous in-code comment claimed a majority vote; the counts
    field exposes per-event agreement so callers can apply their own
    threshold.)
    """

    def __init__(self, device: str = 'cpu'):
        """
        Initialize event detection ensemble.

        Args:
            device: 'cpu' or 'cuda'
        """
        self.device = device
        # Initialize models
        models = [
            LibrosaEventDetector(weight=0.5, device=device),    # Most reliable currently
            SenseVoiceEventDetector(weight=0.5, device=device), # If available
            # CNNLSTMEventDetector(weight=0.3, device=device),  # TODO: Add when trained
        ]
        super().__init__(models=models, voting_strategy='weighted')

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """
        Detect events using the ensemble.

        Args:
            audio: Audio array
            sample_rate: Sample rate

        Returns:
            Dict with:
              events: union of event tags reported by any model
              confidence: tag -> confidence normalized by total model weight
              counts: tag -> number of models that reported the tag
              detections: raw per-model prediction dicts
        """
        predictions = self.predict_all(audio, sample_rate)
        if not predictions:
            return {
                "events": [],
                "confidence": {},
                "detections": []
            }

        # Aggregate events across models.
        event_confidence: Dict[str, float] = {}
        event_counts: Dict[str, int] = {}
        for pred in predictions:
            model_confidence = pred.get("confidence", {})
            model_weight = pred.get("model_weight", 1.0)
            for event in pred.get("events", []):
                event_counts[event] = event_counts.get(event, 0) + 1
                # Weighted confidence; default 0.5 when a model reports
                # an event without an explicit score.
                conf = model_confidence.get(event, 0.5) * model_weight
                event_confidence[event] = event_confidence.get(event, 0.0) + conf

        # Union of everything any model detected (insertion-ordered).
        unique_events = list(event_counts)

        # Normalize confidence by the total model weight; guard against an
        # all-zero-weight configuration to avoid division by zero.
        total_weight = sum(p.get("model_weight", 1.0) for p in predictions) or 1.0
        for event in event_confidence:
            event_confidence[event] = event_confidence[event] / total_weight

        return {
            "events": unique_events,
            "confidence": event_confidence,
            "counts": event_counts,
            "detections": predictions
        }