File size: 7,846 Bytes
ebba35f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
"""
Reading Pattern Analyzer
Detects if someone is reading prepared answers vs speaking naturally.

Key indicators of reading:
- Consistent speech rate (no natural variation)
- Lack of filler words ("um", "uh", "like", "you know")
- Regular pause patterns
- Monotonic rhythm
"""

import re
from dataclasses import dataclass, field
from typing import List, Optional

import numpy as np


# Common English filler words and phrases. Their per-minute rate helps
# separate natural speech (more fillers) from reading aloud (fewer).
FILLER_WORDS = [
    # hesitation sounds
    'um', 'uh', 'uhm', 'umm', 'er', 'ah',
    # discourse markers / hedges
    'like', 'you know', 'basically', 'actually', 'so', 'well',
    'i mean', 'kind of', 'sort of', 'right', 'okay',
]


@dataclass
class ReadingPatternResult:
    """Result of reading pattern analysis.

    Produced by ReadingPatternAnalyzer.analyze(); bundles the overall
    verdict with the individual metrics that led to it.
    """
    is_reading: bool  # True when confidence reached the analyzer's threshold
    confidence: float  # 0.0 to 1.0; mean of the per-indicator scores
    indicators: List[str] = field(default_factory=list)  # human-readable reasons
    speech_rate_cv: float = 0.0  # Coefficient of variation of windowed speech rate
    filler_word_rate: float = 0.0  # Fillers per minute
    pause_regularity: float = 0.0  # How regular inter-word pauses are (0..1)


class ReadingPatternAnalyzer:
    """
    Analyzes speech patterns to detect if someone is reading.

    Uses transcription with timestamps to analyze:
    - Speech rate variation (natural speech speeds up and slows down)
    - Filler word frequency (reading suppresses "um"/"uh")
    - Pause patterns (reading produces unusually regular pauses)
    - Self-corrections (natural speech contains restarts and repeats)
    """

    def __init__(self,
                 min_speech_rate_cv: float = 0.15,
                 min_filler_rate: float = 2.0,
                 reading_threshold: float = 0.6):
        """
        Args:
            min_speech_rate_cv: Minimum coefficient of variation of the
                windowed speech rate expected from natural speech.
            min_filler_rate: Minimum filler words per minute expected
                from natural speech.
            reading_threshold: Confidence at or above which the sample is
                flagged as reading.
        """
        self.min_speech_rate_cv = min_speech_rate_cv
        self.min_filler_rate = min_filler_rate
        self.reading_threshold = reading_threshold

    def analyze(self, transcription: str, word_timestamps: List[dict],
                duration_seconds: float) -> "ReadingPatternResult":
        """
        Analyze transcription for reading patterns.

        Args:
            transcription: Full transcription text
            word_timestamps: List of {'word': str, 'start': float, 'end': float}
            duration_seconds: Total audio duration in seconds

        Returns:
            ReadingPatternResult with the verdict, a confidence in [0, 1],
            and the individual metric values that produced it.
        """
        # Too little data to say anything meaningful.
        if not word_timestamps or len(word_timestamps) < 10:
            return ReadingPatternResult(
                is_reading=False,
                confidence=0.0,
                indicators=["Insufficient data for analysis"]
            )

        indicators = []
        scores = []

        # 1. Speech rate variation: a low CV means a suspiciously constant pace.
        speech_rate_cv = self._analyze_speech_rate(word_timestamps)
        if speech_rate_cv < self.min_speech_rate_cv:
            indicators.append(f"Constant speech rate (CV={speech_rate_cv:.2f})")
            scores.append(0.8)
        else:
            scores.append(0.2)

        # 2. Filler word frequency: few fillers suggests a prepared script.
        filler_rate = self._analyze_filler_words(transcription, duration_seconds)
        if filler_rate < self.min_filler_rate:
            indicators.append(f"Few filler words ({filler_rate:.1f}/min)")
            scores.append(0.7)
        else:
            scores.append(0.2)

        # 3. Pause regularity: metronomic pauses are typical of reading.
        pause_regularity = self._analyze_pause_patterns(word_timestamps)
        if pause_regularity > 0.7:
            indicators.append(f"Regular pause pattern ({pause_regularity:.0%})")
            scores.append(0.6)
        else:
            scores.append(0.2)

        # 4. Self-corrections are a strong marker of spontaneous speech.
        if not self._has_self_corrections(transcription):
            indicators.append("No self-corrections detected")
            scores.append(0.5)
        else:
            scores.append(0.1)

        # Overall confidence is the mean of the per-indicator scores.
        # Cast numpy results to plain floats so the dataclass holds builtin
        # floats, not np.float64 (which surprises JSON serializers etc.).
        confidence = float(np.mean(scores))
        is_reading = confidence >= self.reading_threshold

        return ReadingPatternResult(
            is_reading=is_reading,
            confidence=round(confidence, 2),
            indicators=indicators,
            speech_rate_cv=round(float(speech_rate_cv), 3),
            filler_word_rate=round(float(filler_rate), 2),
            pause_regularity=round(float(pause_regularity), 2)
        )

    def _analyze_speech_rate(self, word_timestamps: List[dict]) -> float:
        """
        Return the coefficient of variation (std / mean) of the speech rate
        measured in sliding windows. Natural speech has a variable rate;
        reading is more constant. Returns 0.0 when there is too little data.
        """
        if len(word_timestamps) < 5:
            return 0.0

        window_size = 3.0  # window length in seconds
        hop = 1.0          # window step in seconds

        rates = []
        max_time = word_timestamps[-1].get('end', 0)

        # Words-per-second in each window; a word counts only if it lies
        # entirely inside the window.
        for start in np.arange(0, max_time - window_size, hop):
            end = start + window_size
            words_in_window = [
                w for w in word_timestamps
                if w.get('start', 0) >= start and w.get('end', 0) <= end
            ]
            if words_in_window:
                rates.append(len(words_in_window) / window_size)

        if len(rates) < 3:
            return 0.0

        mean_rate = np.mean(rates)
        if mean_rate == 0:
            return 0.0

        return float(np.std(rates) / mean_rate)

    def _analyze_filler_words(self, transcription: str,
                               duration_seconds: float) -> float:
        """
        Count filler words per minute.
        Natural speech has more fillers, reading has fewer.
        Returns 0.0 for clips shorter than ~6 seconds.
        """
        text_lower = transcription.lower()

        # NOTE: `re` is now imported at module level; previously it was
        # re-imported on every iteration of this loop.
        filler_count = 0
        for filler in FILLER_WORDS:
            # Word-boundary match so e.g. 'so' does not fire inside 'sort'.
            pattern = r'\b' + re.escape(filler) + r'\b'
            filler_count += len(re.findall(pattern, text_lower))

        minutes = duration_seconds / 60.0
        if minutes < 0.1:
            # A rate over a tiny denominator would be meaningless noise.
            return 0.0

        return filler_count / minutes

    def _analyze_pause_patterns(self, word_timestamps: List[dict]) -> float:
        """
        Return a pause-regularity score in (0, 1], computed as 1 / (1 + CV)
        of the inter-word gap durations. Reading tends to produce more
        regular pauses. Returns 0.0 when there is too little data.
        """
        if len(word_timestamps) < 5:
            return 0.0

        # Gaps between consecutive words; ignore near-zero articulation gaps.
        gaps = []
        for i in range(1, len(word_timestamps)):
            prev_end = word_timestamps[i - 1].get('end', 0)
            curr_start = word_timestamps[i].get('start', 0)
            gap = curr_start - prev_end
            if gap > 0.05:
                gaps.append(gap)

        if len(gaps) < 3:
            return 0.0

        mean_gap = np.mean(gaps)
        if mean_gap == 0:
            return 0.0

        cv = np.std(gaps) / mean_gap
        return float(1.0 / (1.0 + cv))  # higher = more regular

    def _has_self_corrections(self, transcription: str) -> bool:
        """
        Check for self-corrections, which indicate natural speech.
        E.g., "I went to the... I mean, I was going to the store"

        Markers are matched on word boundaries, so 'wait' no longer fires
        inside 'waiting' (the old substring check did). Also detects
        immediate word repetitions (stammering) for words > 2 chars.
        """
        correction_markers = [
            'i mean', 'sorry', 'no wait', 'actually', 'let me',
            'what i meant', 'no no', 'sorry i', 'wait'
        ]

        text_lower = transcription.lower()
        for marker in correction_markers:
            # Word-boundary search avoids substring false positives
            # ('wait' in 'waiting', 'sorry i' in 'sorry it').
            if re.search(r'\b' + re.escape(marker) + r'\b', text_lower):
                return True

        # Repeated adjacent words suggest a stammer or restart.
        words = text_lower.split()
        for i in range(1, len(words)):
            if words[i] == words[i - 1] and len(words[i]) > 2:
                return True

        return False