Spaces:
Running
Running
| """ | |
| Whisper Detector | |
| Detects low-volume background voices (whispers) that may indicate someone | |
| is being prompted or helped during a test. | |
| """ | |
from dataclasses import dataclass, field
from typing import List, Optional, Tuple

import librosa
import numpy as np
@dataclass
class WhisperInstance:
    """A detected whisper event.

    Without the @dataclass decorator these annotation-only fields produce
    no __init__, so the keyword construction used in WhisperDetector.detect
    (WhisperInstance(start=..., end=..., confidence=...)) would raise
    TypeError at runtime.
    """
    start: float       # start time of the whisper, in seconds
    end: float         # end time of the whisper, in seconds
    confidence: float  # detector confidence in [0.0, 1.0]
@dataclass
class WhisperResult:
    """Result of whisper detection.

    The @dataclass decorator is required: without it the keyword
    construction in WhisperDetector.detect fails, and
    field(default_factory=list) would leave `instances` bound to a raw
    dataclasses.Field object instead of an empty list.
    """
    detected: bool  # True when at least one whisper instance was found
    # All detected whisper events; defaults to an empty list.
    instances: List[WhisperInstance] = field(default_factory=list)

    def count(self) -> int:
        """Return the number of detected whisper instances."""
        return len(self.instances)
class WhisperDetector:
    """
    Detects whispers/low background voices in audio.

    Whispers have distinct characteristics:
    - Lower amplitude than normal speech
    - More high-frequency content (less voiced, more fricative)
    - Often occur during pauses in main speaker's speech
    """

    def __init__(self,
                 energy_threshold: float = 0.02,
                 min_duration: float = 0.3,
                 max_amplitude_ratio: float = 0.3):
        """
        Args:
            energy_threshold: Minimum energy (as a fraction of the main
                speech amplitude) to consider a frame a potential whisper.
            min_duration: Minimum duration in seconds for a whisper.
            max_amplitude_ratio: Max ratio vs main speech (whispers are quieter)
        """
        self.energy_threshold = energy_threshold
        self.min_duration = min_duration
        self.max_amplitude_ratio = max_amplitude_ratio

    def detect(self, waveform: np.ndarray, sample_rate: int,
               main_speaker_segments: Optional[List[dict]] = None) -> "WhisperResult":
        """
        Detect whispers in audio.

        Args:
            waveform: Audio waveform as numpy array (mono, or multi-channel
                with shape (channels, samples) — channels are averaged).
            sample_rate: Sample rate of audio
            main_speaker_segments: Segments where main speaker is talking
                                   (whispers are checked outside these)

        Returns:
            WhisperResult with detected whisper instances
        """
        # Ensure mono: average across channels.
        if len(waveform.shape) > 1:
            waveform = waveform.mean(axis=0)

        # 95th-percentile amplitude approximates the main speaker's level
        # while ignoring rare clicks/peaks.
        main_amplitude = np.percentile(np.abs(waveform), 95)

        # Frame-based analysis
        frame_length = int(0.025 * sample_rate)  # 25ms frames
        hop_length = int(0.010 * sample_rate)    # 10ms hop

        # Energy per frame (RMS).
        energy = librosa.feature.rms(y=waveform, frame_length=frame_length,
                                     hop_length=hop_length)[0]

        # Spectral centroid (whispers have higher centroid).
        spectral_centroid = librosa.feature.spectral_centroid(
            y=waveform, sr=sample_rate,
            n_fft=frame_length, hop_length=hop_length
        )[0]

        # Zero crossing rate (whispers have higher ZCR).
        zcr = librosa.feature.zero_crossing_rate(
            y=waveform, frame_length=frame_length, hop_length=hop_length
        )[0]

        # Normalize features relative to main-speaker level / Nyquist.
        energy_norm = energy / (main_amplitude + 1e-10)
        centroid_norm = spectral_centroid / (sample_rate / 2)

        # Whisper candidates combine:
        # - Low energy (but above the silence floor)
        # - High spectral centroid (breathy)
        # - High zero crossing rate
        whisper_frames = (
            (energy > self.energy_threshold * main_amplitude) &
            (energy_norm < self.max_amplitude_ratio) &
            (centroid_norm > 0.15) &
            (zcr > 0.1)
        )

        # Map frame indices to timestamps.
        frame_times = librosa.frames_to_time(
            np.arange(len(energy)), sr=sample_rate, hop_length=hop_length
        )

        # Group consecutive whisper frames into (start, end) segments.
        instances: List[WhisperInstance] = []
        in_whisper = False
        start_time = 0.0
        for i, is_whisper in enumerate(whisper_frames):
            time = frame_times[i] if i < len(frame_times) else frame_times[-1]
            if is_whisper and not in_whisper:
                start_time = time
                in_whisper = True
            elif not is_whisper and in_whisper:
                self._maybe_append_instance(
                    instances, waveform, sample_rate,
                    start_time, time, main_amplitude, main_speaker_segments
                )
                in_whisper = False

        # Handle case where audio ends during a whisper.
        if in_whisper:
            end_time = frame_times[-1] if len(frame_times) > 0 else 0
            self._maybe_append_instance(
                instances, waveform, sample_rate,
                start_time, end_time, main_amplitude, main_speaker_segments
            )

        return WhisperResult(
            detected=len(instances) > 0,
            instances=instances
        )

    def _maybe_append_instance(self, instances: List["WhisperInstance"],
                               waveform: np.ndarray, sample_rate: int,
                               start: float, end: float,
                               main_amplitude: float,
                               segments: Optional[List[dict]]) -> None:
        """Validate a candidate whisper segment and append it to `instances`.

        A candidate is accepted only when it is long enough, does not
        overlap the main speaker, and scores confidence > 0.5.
        (Shared by the in-loop flush and the end-of-audio flush.)
        """
        if end - start < self.min_duration:
            return
        if self._overlaps_main_speaker(start, end, segments):
            return
        confidence = self._calculate_confidence(
            waveform, sample_rate, start, end, main_amplitude
        )
        if confidence > 0.5:
            instances.append(WhisperInstance(
                start=round(start, 2),
                end=round(end, 2),
                confidence=round(confidence, 2)
            ))

    def _overlaps_main_speaker(self, start: float, end: float,
                               segments: Optional[List[dict]]) -> bool:
        """Check if time range overlaps with main speaker segments.

        Segments are dicts with 'start'/'end' keys (seconds); a missing
        key defaults to 0. Returns False when segments is None or empty.
        """
        if not segments:
            return False
        for seg in segments:
            seg_start = seg.get('start', 0)
            seg_end = seg.get('end', 0)
            # Half-open interval overlap test.
            if start < seg_end and end > seg_start:
                return True
        return False

    def _calculate_confidence(self, waveform: np.ndarray, sample_rate: int,
                              start: float, end: float,
                              main_amplitude: float) -> float:
        """Calculate confidence in [0, 1] that this segment is a whisper.

        Confidence is driven by the segment's 95th-percentile amplitude
        relative to the main speech amplitude: the quieter the segment,
        the more whisper-like. Ratios >= 0.5 score 0.
        """
        start_sample = int(start * sample_rate)
        end_sample = int(end * sample_rate)
        if end_sample > len(waveform):
            end_sample = len(waveform)
        if start_sample >= end_sample:
            # Empty or inverted range — cannot be a whisper.
            return 0.0

        segment = waveform[start_sample:end_sample]
        seg_amplitude = np.percentile(np.abs(segment), 95)
        amplitude_ratio = seg_amplitude / (main_amplitude + 1e-10)

        # Lower ratio = more likely whisper.
        if amplitude_ratio > 0.5:
            return 0.0

        # Linear scale: ratio 0.0 -> 1.0 confidence, ratio 0.5 -> 0.0.
        confidence = 1.0 - (amplitude_ratio / 0.5)
        return min(1.0, max(0.0, confidence))