Spaces:
Running
Running
| """ | |
| Whisper Detector | |
| Detects low-volume background voices (whispers) that may indicate someone | |
| is being prompted or helped during a test. | |
| """ | |
from dataclasses import dataclass, field
from typing import List, Optional, Tuple

import librosa
import numpy as np
@dataclass
class WhisperInstance:
    """A detected whisper event.

    Without the @dataclass decorator these annotation-only fields produce
    no __init__, so the keyword construction used in WhisperDetector.detect
    (WhisperInstance(start=..., end=..., confidence=...)) would raise
    TypeError at runtime.
    """
    start: float       # start time of the whisper, in seconds
    end: float         # end time of the whisper, in seconds
    confidence: float  # detector confidence in [0.0, 1.0]
@dataclass
class WhisperResult:
    """Result of whisper detection.

    The @dataclass decorator is required: without it the keyword
    construction in WhisperDetector.detect fails, and
    field(default_factory=list) would leave `instances` bound to a raw
    dataclasses.Field object instead of an empty list.
    """
    detected: bool  # True when at least one whisper instance was found
    # All detected whisper events; defaults to an empty list.
    instances: List[WhisperInstance] = field(default_factory=list)

    def count(self) -> int:
        """Return the number of detected whisper instances."""
        return len(self.instances)
class WhisperDetector:
    """
    Detects whispers/low background voices in audio.

    Whispers have distinct characteristics:
    - Lower amplitude than normal speech
    - More high-frequency content (less voiced, more fricative)
    - Often occur during pauses in main speaker's speech
    """

    def __init__(self,
                 energy_threshold: float = 0.02,
                 min_duration: float = 0.3,
                 max_amplitude_ratio: float = 0.3):
        """
        Args:
            energy_threshold: Minimum energy (as a fraction of the main
                speech amplitude) to consider a frame a potential whisper.
            min_duration: Minimum duration in seconds for a whisper.
            max_amplitude_ratio: Max ratio vs main speech (whispers are quieter)
        """
        self.energy_threshold = energy_threshold
        self.min_duration = min_duration
        self.max_amplitude_ratio = max_amplitude_ratio

    def detect(self, waveform: np.ndarray, sample_rate: int,
               main_speaker_segments: Optional[List[dict]] = None) -> "WhisperResult":
        """
        Detect whispers in audio.

        Args:
            waveform: Audio waveform as numpy array (mono, or multi-channel
                with shape (channels, samples) — channels are averaged).
            sample_rate: Sample rate of audio
            main_speaker_segments: Segments where main speaker is talking
                                   (whispers are checked outside these)

        Returns:
            WhisperResult with detected whisper instances
        """
        # Ensure mono: average across channels.
        if len(waveform.shape) > 1:
            waveform = waveform.mean(axis=0)

        # 95th-percentile amplitude approximates the main speaker's level
        # while ignoring rare clicks/peaks.
        main_amplitude = np.percentile(np.abs(waveform), 95)

        # Frame-based analysis
        frame_length = int(0.025 * sample_rate)  # 25ms frames
        hop_length = int(0.010 * sample_rate)    # 10ms hop

        # Energy per frame (RMS).
        energy = librosa.feature.rms(y=waveform, frame_length=frame_length,
                                     hop_length=hop_length)[0]

        # Spectral centroid (whispers have higher centroid).
        spectral_centroid = librosa.feature.spectral_centroid(
            y=waveform, sr=sample_rate,
            n_fft=frame_length, hop_length=hop_length
        )[0]

        # Zero crossing rate (whispers have higher ZCR).
        zcr = librosa.feature.zero_crossing_rate(
            y=waveform, frame_length=frame_length, hop_length=hop_length
        )[0]

        # Normalize features relative to main-speaker level / Nyquist.
        energy_norm = energy / (main_amplitude + 1e-10)
        centroid_norm = spectral_centroid / (sample_rate / 2)

        # Whisper candidates combine:
        # - Low energy (but above the silence floor)
        # - High spectral centroid (breathy)
        # - High zero crossing rate
        whisper_frames = (
            (energy > self.energy_threshold * main_amplitude) &
            (energy_norm < self.max_amplitude_ratio) &
            (centroid_norm > 0.15) &
            (zcr > 0.1)
        )

        # Map frame indices to timestamps.
        frame_times = librosa.frames_to_time(
            np.arange(len(energy)), sr=sample_rate, hop_length=hop_length
        )

        # Group consecutive whisper frames into (start, end) segments.
        instances: List[WhisperInstance] = []
        in_whisper = False
        start_time = 0.0
        for i, is_whisper in enumerate(whisper_frames):
            time = frame_times[i] if i < len(frame_times) else frame_times[-1]
            if is_whisper and not in_whisper:
                start_time = time
                in_whisper = True
            elif not is_whisper and in_whisper:
                self._maybe_append_instance(
                    instances, waveform, sample_rate,
                    start_time, time, main_amplitude, main_speaker_segments
                )
                in_whisper = False

        # Handle case where audio ends during a whisper.
        if in_whisper:
            end_time = frame_times[-1] if len(frame_times) > 0 else 0
            self._maybe_append_instance(
                instances, waveform, sample_rate,
                start_time, end_time, main_amplitude, main_speaker_segments
            )

        return WhisperResult(
            detected=len(instances) > 0,
            instances=instances
        )

    def _maybe_append_instance(self, instances: List["WhisperInstance"],
                               waveform: np.ndarray, sample_rate: int,
                               start: float, end: float,
                               main_amplitude: float,
                               segments: Optional[List[dict]]) -> None:
        """Validate a candidate whisper segment and append it to `instances`.

        A candidate is accepted only when it is long enough, does not
        overlap the main speaker, and scores confidence > 0.5.
        (Shared by the in-loop flush and the end-of-audio flush.)
        """
        if end - start < self.min_duration:
            return
        if self._overlaps_main_speaker(start, end, segments):
            return
        confidence = self._calculate_confidence(
            waveform, sample_rate, start, end, main_amplitude
        )
        if confidence > 0.5:
            instances.append(WhisperInstance(
                start=round(start, 2),
                end=round(end, 2),
                confidence=round(confidence, 2)
            ))

    def _overlaps_main_speaker(self, start: float, end: float,
                               segments: Optional[List[dict]]) -> bool:
        """Check if time range overlaps with main speaker segments.

        Segments are dicts with 'start'/'end' keys (seconds); a missing
        key defaults to 0. Returns False when segments is None or empty.
        """
        if not segments:
            return False
        for seg in segments:
            seg_start = seg.get('start', 0)
            seg_end = seg.get('end', 0)
            # Half-open interval overlap test.
            if start < seg_end and end > seg_start:
                return True
        return False

    def _calculate_confidence(self, waveform: np.ndarray, sample_rate: int,
                              start: float, end: float,
                              main_amplitude: float) -> float:
        """Calculate confidence in [0, 1] that this segment is a whisper.

        Confidence is driven by the segment's 95th-percentile amplitude
        relative to the main speech amplitude: the quieter the segment,
        the more whisper-like. Ratios >= 0.5 score 0.
        """
        start_sample = int(start * sample_rate)
        end_sample = int(end * sample_rate)
        if end_sample > len(waveform):
            end_sample = len(waveform)
        if start_sample >= end_sample:
            # Empty or inverted range — cannot be a whisper.
            return 0.0

        segment = waveform[start_sample:end_sample]
        seg_amplitude = np.percentile(np.abs(segment), 95)
        amplitude_ratio = seg_amplitude / (main_amplitude + 1e-10)

        # Lower ratio = more likely whisper.
        if amplitude_ratio > 0.5:
            return 0.0

        # Linear scale: ratio 0.0 -> 1.0 confidence, ratio 0.5 -> 0.0.
        confidence = 1.0 - (amplitude_ratio / 0.5)
        return min(1.0, max(0.0, confidence))