# sop-audio-analyzer / src / fraud_detection / whisper_detector.py
"""
Whisper Detector
Detects low-volume background voices (whispers) that may indicate someone
is being prompted or helped during a test.
"""
from dataclasses import dataclass, field
from typing import List, Optional, Tuple

import librosa
import numpy as np
@dataclass
class WhisperInstance:
    """A detected whisper event.

    Times are in seconds from the start of the audio; the detector rounds
    all three fields to 2 decimal places before constructing an instance.
    """
    start: float  # segment start time (seconds)
    end: float  # segment end time (seconds)
    confidence: float  # whisper confidence, clamped to [0.0, 1.0]
@dataclass
class WhisperResult:
    """Result of whisper detection."""
    detected: bool  # True when at least one whisper instance was found
    # Detected whisper events; empty when nothing was detected.
    instances: List[WhisperInstance] = field(default_factory=list)

    @property
    def count(self) -> int:
        """Number of detected whisper instances."""
        return len(self.instances)
class WhisperDetector:
    """
    Detects whispers/low background voices in audio.

    Whispers have distinct characteristics:
    - Lower amplitude than normal speech
    - More high-frequency content (less voiced, more fricative)
    - Often occur during pauses in main speaker's speech
    """

    def __init__(self,
                 energy_threshold: float = 0.02,
                 min_duration: float = 0.3,
                 max_amplitude_ratio: float = 0.3):
        """
        Args:
            energy_threshold: Minimum energy, as a fraction of the main
                speech amplitude, for a frame to count as a potential
                whisper (filters out silence).
            min_duration: Minimum duration in seconds for a whisper.
            max_amplitude_ratio: Max ratio vs main speech (whispers are quieter).
        """
        self.energy_threshold = energy_threshold
        self.min_duration = min_duration
        self.max_amplitude_ratio = max_amplitude_ratio

    def detect(self, waveform: np.ndarray, sample_rate: int,
               main_speaker_segments: Optional[List[dict]] = None) -> "WhisperResult":
        """
        Detect whispers in audio.

        Args:
            waveform: Audio waveform as numpy array (mono or multi-channel;
                multi-channel input is downmixed by averaging).
            sample_rate: Sample rate of audio in Hz.
            main_speaker_segments: Optional list of dicts with 'start'/'end'
                keys (seconds) where the main speaker is talking; whisper
                candidates overlapping these are discarded.

        Returns:
            WhisperResult with detected whisper instances.
        """
        # Downmix to mono so the frame features are one-dimensional.
        if len(waveform.shape) > 1:
            waveform = waveform.mean(axis=0)

        # Empty audio: nothing to analyze (librosa would raise otherwise).
        if waveform.size == 0:
            return WhisperResult(detected=False)

        # 95th-percentile amplitude approximates the main speaker's level
        # while being robust to isolated peaks/clipping.
        main_amplitude = np.percentile(np.abs(waveform), 95)

        # Frame-based analysis: 25 ms windows, 10 ms hop.
        frame_length = int(0.025 * sample_rate)
        hop_length = int(0.010 * sample_rate)

        # Energy per frame.
        energy = librosa.feature.rms(y=waveform, frame_length=frame_length,
                                     hop_length=hop_length)[0]

        # Spectral centroid (whispers are breathy -> higher centroid).
        spectral_centroid = librosa.feature.spectral_centroid(
            y=waveform, sr=sample_rate,
            n_fft=frame_length, hop_length=hop_length
        )[0]

        # Zero crossing rate (whispers are unvoiced -> higher ZCR).
        zcr = librosa.feature.zero_crossing_rate(
            y=waveform, frame_length=frame_length, hop_length=hop_length
        )[0]

        # Normalize: energy relative to main speech, centroid vs Nyquist.
        energy_norm = energy / (main_amplitude + 1e-10)
        centroid_norm = spectral_centroid / (sample_rate / 2)

        # Whisper candidates: audible but quiet, breathy, noisy frames.
        whisper_frames = (
            (energy > self.energy_threshold * main_amplitude) &
            (energy_norm < self.max_amplitude_ratio) &
            (centroid_norm > 0.15) &
            (zcr > 0.1)
        )

        # Map frame indices to timestamps.
        frame_times = librosa.frames_to_time(
            np.arange(len(energy)), sr=sample_rate, hop_length=hop_length
        )

        # Group consecutive whisper frames into candidate segments and
        # validate each one as it closes.
        instances = []
        in_whisper = False
        start_time = 0.0
        for i, is_whisper in enumerate(whisper_frames):
            time = frame_times[i] if i < len(frame_times) else frame_times[-1]
            if is_whisper and not in_whisper:
                start_time = time
                in_whisper = True
            elif not is_whisper and in_whisper:
                instance = self._finalize_instance(
                    waveform, sample_rate, start_time, time,
                    main_amplitude, main_speaker_segments
                )
                if instance is not None:
                    instances.append(instance)
                in_whisper = False

        # Audio may end while still inside a whisper segment.
        if in_whisper and len(frame_times) > 0:
            instance = self._finalize_instance(
                waveform, sample_rate, start_time, frame_times[-1],
                main_amplitude, main_speaker_segments
            )
            if instance is not None:
                instances.append(instance)

        return WhisperResult(
            detected=len(instances) > 0,
            instances=instances
        )

    def _finalize_instance(self, waveform: np.ndarray, sample_rate: int,
                           start: float, end: float, main_amplitude: float,
                           segments: Optional[List[dict]]) -> Optional["WhisperInstance"]:
        """
        Validate one candidate segment; return a WhisperInstance or None.

        A candidate survives only if it is long enough, does not overlap
        the main speaker, and its whisper confidence exceeds 0.5.
        """
        if end - start < self.min_duration:
            return None
        if self._overlaps_main_speaker(start, end, segments):
            return None
        confidence = self._calculate_confidence(
            waveform, sample_rate, start, end, main_amplitude
        )
        if confidence <= 0.5:
            return None
        return WhisperInstance(
            start=round(start, 2),
            end=round(end, 2),
            confidence=round(confidence, 2)
        )

    def _overlaps_main_speaker(self, start: float, end: float,
                               segments: Optional[List[dict]]) -> bool:
        """Return True if [start, end] overlaps any main-speaker segment."""
        if not segments:
            return False
        for seg in segments:
            seg_start = seg.get('start', 0)
            seg_end = seg.get('end', 0)
            # Standard open-interval overlap test.
            if start < seg_end and end > seg_start:
                return True
        return False

    def _calculate_confidence(self, waveform: np.ndarray, sample_rate: int,
                              start: float, end: float,
                              main_amplitude: float) -> float:
        """
        Calculate confidence in [0, 1] that the segment is a whisper.

        Based purely on how quiet the segment is relative to the main
        speech amplitude: the quieter the segment, the higher the score.
        """
        start_sample = int(start * sample_rate)
        end_sample = min(int(end * sample_rate), len(waveform))
        if start_sample >= end_sample:
            return 0.0
        segment = waveform[start_sample:end_sample]

        # 95th-percentile amplitude of the segment vs the main speech level.
        seg_amplitude = np.percentile(np.abs(segment), 95)
        amplitude_ratio = seg_amplitude / (main_amplitude + 1e-10)

        # At or above half the main amplitude it is not a whisper at all.
        if amplitude_ratio > 0.5:
            return 0.0

        # Linear scale: ratio 0 -> confidence 1.0, ratio 0.5 -> 0.0.
        confidence = 1.0 - (amplitude_ratio / 0.5)
        return min(1.0, max(0.0, confidence))