Spaces:
Running
Running
| """ | |
| Suspicious Pause Detector | |
| Detects abnormally long silences that may indicate the speaker is looking up | |
| answers or receiving help during a test. | |
| """ | |
| import numpy as np | |
| from dataclasses import dataclass, field | |
| from typing import List, Optional | |
@dataclass
class SuspiciousPause:
    """A detected suspicious pause.

    Declared as a dataclass so it can be built with keyword arguments
    (``SuspiciousPause(start=..., end=..., duration=..., context=...)``),
    which is how the detector in this module constructs it.
    """

    start: float  # Where the silence begins (same time unit as the segments)
    end: float  # Where the silence ends
    duration: float  # Length of the silence (end - start, as computed by the detector)
    context: str = ""  # What happened before/after
@dataclass
class PauseResult:
    """Result of suspicious pause detection.

    Declared as a dataclass: without the decorator the ``field(...)`` default
    below is never turned into a per-instance list and the class cannot be
    constructed with keyword arguments.
    """

    detected: bool  # True when at least one suspicious pause was found
    # Forward reference keeps this class importable on its own.
    pauses: List["SuspiciousPause"] = field(default_factory=list)
    total_suspicious_time: float = 0.0  # Sum of the durations of all pauses
    longest_pause: float = 0.0  # Duration of the longest pause (0.0 when none)

    # NOTE(review): kept as a plain method, as written in this file; confirm
    # whether callers expect ``result.count`` to be a property instead.
    def count(self) -> int:
        """Return the number of suspicious pauses stored in ``pauses``."""
        return len(self.pauses)
class SuspiciousPauseDetector:
    """
    Detects suspicious long pauses in speech.

    In natural conversation, pauses are typically:
    - Short (< 2 seconds) for thinking
    - Medium (2-4 seconds) for complex thoughts

    Suspicious pauses (> 5 seconds) may indicate:
    - Looking up answers
    - Receiving external help
    - Reading from a source
    """

    def __init__(self,
                 min_suspicious_duration: float = 5.0,
                 warning_duration: float = 3.0,
                 max_natural_pause: float = 2.0):
        """
        Args:
            min_suspicious_duration: Minimum pause duration to flag as suspicious
            warning_duration: Duration to flag as a warning (not fully suspicious).
                Stored on the instance; no method of this class reads it yet.
            max_natural_pause: Maximum duration for a natural pause
        """
        self.min_suspicious_duration = min_suspicious_duration
        self.warning_duration = warning_duration
        self.max_natural_pause = max_natural_pause

    @staticmethod
    def _merge_segments(speech_segments: List[dict]) -> List[tuple]:
        """Return sorted, non-overlapping (start, end) spans covering all speech."""
        spans = sorted(
            (seg.get('start', 0), seg.get('end', 0)) for seg in speech_segments
        )
        merged = []
        for start, end in spans:
            # A missing or inverted end must not drag the span backwards.
            end = max(end, start)
            if merged and start <= merged[-1][1]:
                # Overlapping or touching: extend the running span if needed.
                if end > merged[-1][1]:
                    merged[-1] = (merged[-1][0], end)
            else:
                merged.append((start, end))
        return merged

    def detect(self, speech_segments: List[dict],
               total_duration: float,
               transcription_segments: Optional[List[dict]] = None) -> "PauseResult":
        """
        Detect suspicious pauses between speech segments.

        Overlapping or nested segments are merged first, so a silence is only
        reported where no segment covers the audio at all.

        Args:
            speech_segments: List of {'start': float, 'end': float} for speech
            total_duration: Total audio duration in seconds
            transcription_segments: Optional transcription with timestamps for context

        Returns:
            PauseResult with detected suspicious pauses
        """
        if not speech_segments:
            return PauseResult(detected=False)

        spans = self._merge_segments(speech_segments)

        # Every silent stretch as (start, end, position): the lead-in before
        # the first speech, the gaps between spans, and the tail after the last.
        silences = [(0, spans[0][0], "start")]
        silences.extend(
            (prev[1], cur[0], "middle") for prev, cur in zip(spans, spans[1:])
        )
        silences.append((spans[-1][1], total_duration, "end"))

        suspicious_pauses = []
        for gap_start, gap_end, position in silences:
            gap = gap_end - gap_start
            if gap < self.min_suspicious_duration:
                continue
            suspicious_pauses.append(SuspiciousPause(
                start=round(gap_start, 2),
                end=round(gap_end, 2),
                duration=round(gap, 2),
                context=self._get_context(
                    gap_start, gap_end, transcription_segments, position
                ),
            ))

        # Calculate summary statistics
        total_suspicious_time = sum(p.duration for p in suspicious_pauses)
        longest_pause = max((p.duration for p in suspicious_pauses), default=0)

        return PauseResult(
            detected=len(suspicious_pauses) > 0,
            pauses=suspicious_pauses,
            total_suspicious_time=round(total_suspicious_time, 2),
            longest_pause=round(longest_pause, 2),
        )

    def detect_from_vad(self, vad_result: dict, total_duration: float) -> "PauseResult":
        """
        Detect suspicious pauses using VAD output.

        Args:
            vad_result: VAD result with 'segments' list
            total_duration: Total audio duration

        Returns:
            PauseResult with detected suspicious pauses
        """
        segments = vad_result.get('segments', [])
        return self.detect(segments, total_duration)

    def _get_context(self, start: float, end: float,
                     transcription_segments: Optional[List[dict]],
                     position: str) -> str:
        """
        Get context about what happened before/after the pause.

        Args:
            start: Start time of the pause
            end: End time of the pause
            transcription_segments: Optional list of dicts read via the
                'start', 'end' and 'text' keys
            position: "start", "end" or "middle"; selects the fallback message

        Returns:
            A short description quoting up to 50 characters of the text
            around the pause, or a generic message when none is found.
        """
        if not transcription_segments:
            if position == "start":
                return "Long silence at audio start"
            elif position == "end":
                return "Long silence at audio end"
            else:
                return "Long silence mid-conversation"

        # Find text before and after the pause
        text_before = ""
        text_after = ""
        for seg in transcription_segments:
            seg_end = seg.get('end', 0)
            seg_start = seg.get('start', 0)
            # Tolerate an explicit None text value.
            seg_text = (seg.get('text') or '').strip()

            # Text ending just before pause (1.0 early to 0.5 late)
            if start - 1.0 <= seg_end <= start + 0.5:
                text_before = seg_text[-50:]

            # Text starting just after pause (0.5 early to 1.0 late)
            if end - 0.5 <= seg_start <= end + 1.0:
                text_after = seg_text[:50]

        if text_before and text_after:
            return f"After: '{text_before}...' | Before: '...{text_after}'"
        elif text_before:
            return f"After: '{text_before}...'"
        elif text_after:
            return f"Before: '...{text_after}'"
        else:
            return f"Silence at {position} of audio"

    def analyze_pause_pattern(self, speech_segments: List[dict],
                              total_duration: float) -> dict:
        """
        Analyze the overall pause pattern in the audio.

        Overlapping segments are merged before measuring, and gaps of 0.1 or
        less are ignored. Only gaps between speech spans are counted; silence
        before the first or after the last span affects 'speech_ratio' only.

        Args:
            speech_segments: List of {'start': float, 'end': float} for speech
            total_duration: Total audio duration

        Returns:
            Dict that always contains 'avg_pause', 'max_pause', 'pause_count',
            'speech_ratio', 'natural_pauses', 'warning_pauses' and
            'suspicious_pauses'.
        """
        stats = {
            'avg_pause': 0,
            'max_pause': 0,
            'pause_count': 0,
            'speech_ratio': 0,
            'natural_pauses': 0,
            'warning_pauses': 0,
            'suspicious_pauses': 0,
        }
        if not speech_segments:
            return stats

        spans = self._merge_segments(speech_segments)

        # Speech ratio comes from actual speech time, whatever the pause count.
        speech_time = sum(end - start for start, end in spans)
        if total_duration > 0:
            stats['speech_ratio'] = round(speech_time / total_duration, 2)

        gaps = (cur[0] - prev[1] for prev, cur in zip(spans, spans[1:]))
        pauses = [gap for gap in gaps if gap > 0.1]  # Ignore very small gaps
        if not pauses:
            return stats

        stats.update({
            'avg_pause': round(float(np.mean(pauses)), 2),
            'max_pause': round(max(pauses), 2),
            'pause_count': len(pauses),
            'natural_pauses': sum(1 for p in pauses if p <= self.max_natural_pause),
            'warning_pauses': sum(
                1 for p in pauses
                if self.max_natural_pause < p < self.min_suspicious_duration
            ),
            'suspicious_pauses': sum(
                1 for p in pauses if p >= self.min_suspicious_duration
            ),
        })
        return stats