File size: 7,846 Bytes
ebba35f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
"""
Reading Pattern Analyzer
Detects if someone is reading prepared answers vs speaking naturally.

Key indicators of reading:
- Consistent speech rate (no natural variation)
- Lack of filler words ("um", "uh", "like", "you know")
- Regular pause patterns
- Monotonic rhythm
"""

import re
from dataclasses import dataclass, field
from typing import List, Optional

import numpy as np


# Common English filler words and phrases. Their per-minute rate helps
# separate natural speech (more fillers) from reading aloud (fewer).
FILLER_WORDS = [
    # hesitation sounds
    'um', 'uh', 'uhm', 'umm', 'er', 'ah',
    # discourse markers / hedges
    'like', 'you know', 'basically', 'actually', 'so', 'well',
    'i mean', 'kind of', 'sort of', 'right', 'okay',
]


@dataclass
class ReadingPatternResult:
    """Result of reading pattern analysis.

    Produced by ReadingPatternAnalyzer.analyze(); bundles the overall
    verdict with the individual metrics that led to it.
    """
    is_reading: bool  # True when confidence reached the analyzer's threshold
    confidence: float  # 0.0 to 1.0; mean of the per-indicator scores
    indicators: List[str] = field(default_factory=list)  # human-readable reasons
    speech_rate_cv: float = 0.0  # Coefficient of variation of windowed speech rate
    filler_word_rate: float = 0.0  # Fillers per minute
    pause_regularity: float = 0.0  # How regular inter-word pauses are (0..1)


class ReadingPatternAnalyzer:
    """
    Analyzes speech patterns to detect if someone is reading.

    Uses transcription with timestamps to analyze:
    - Speech rate variation (natural speech speeds up and slows down)
    - Filler word frequency (reading suppresses "um"/"uh")
    - Pause patterns (reading produces unusually regular pauses)
    - Self-corrections (natural speech contains restarts and repeats)
    """

    def __init__(self,
                 min_speech_rate_cv: float = 0.15,
                 min_filler_rate: float = 2.0,
                 reading_threshold: float = 0.6):
        """
        Args:
            min_speech_rate_cv: Minimum coefficient of variation of the
                windowed speech rate expected from natural speech.
            min_filler_rate: Minimum filler words per minute expected
                from natural speech.
            reading_threshold: Confidence at or above which the sample is
                flagged as reading.
        """
        self.min_speech_rate_cv = min_speech_rate_cv
        self.min_filler_rate = min_filler_rate
        self.reading_threshold = reading_threshold

    def analyze(self, transcription: str, word_timestamps: List[dict],
                duration_seconds: float) -> "ReadingPatternResult":
        """
        Analyze transcription for reading patterns.

        Args:
            transcription: Full transcription text
            word_timestamps: List of {'word': str, 'start': float, 'end': float}
            duration_seconds: Total audio duration in seconds

        Returns:
            ReadingPatternResult with the verdict, a confidence in [0, 1],
            and the individual metric values that produced it.
        """
        # Too little data to say anything meaningful.
        if not word_timestamps or len(word_timestamps) < 10:
            return ReadingPatternResult(
                is_reading=False,
                confidence=0.0,
                indicators=["Insufficient data for analysis"]
            )

        indicators = []
        scores = []

        # 1. Speech rate variation: a low CV means a suspiciously constant pace.
        speech_rate_cv = self._analyze_speech_rate(word_timestamps)
        if speech_rate_cv < self.min_speech_rate_cv:
            indicators.append(f"Constant speech rate (CV={speech_rate_cv:.2f})")
            scores.append(0.8)
        else:
            scores.append(0.2)

        # 2. Filler word frequency: few fillers suggests a prepared script.
        filler_rate = self._analyze_filler_words(transcription, duration_seconds)
        if filler_rate < self.min_filler_rate:
            indicators.append(f"Few filler words ({filler_rate:.1f}/min)")
            scores.append(0.7)
        else:
            scores.append(0.2)

        # 3. Pause regularity: metronomic pauses are typical of reading.
        pause_regularity = self._analyze_pause_patterns(word_timestamps)
        if pause_regularity > 0.7:
            indicators.append(f"Regular pause pattern ({pause_regularity:.0%})")
            scores.append(0.6)
        else:
            scores.append(0.2)

        # 4. Self-corrections are a strong marker of spontaneous speech.
        if not self._has_self_corrections(transcription):
            indicators.append("No self-corrections detected")
            scores.append(0.5)
        else:
            scores.append(0.1)

        # Overall confidence is the mean of the per-indicator scores.
        # Cast numpy results to plain floats so the dataclass holds builtin
        # floats, not np.float64 (which surprises JSON serializers etc.).
        confidence = float(np.mean(scores))
        is_reading = confidence >= self.reading_threshold

        return ReadingPatternResult(
            is_reading=is_reading,
            confidence=round(confidence, 2),
            indicators=indicators,
            speech_rate_cv=round(float(speech_rate_cv), 3),
            filler_word_rate=round(float(filler_rate), 2),
            pause_regularity=round(float(pause_regularity), 2)
        )

    def _analyze_speech_rate(self, word_timestamps: List[dict]) -> float:
        """
        Return the coefficient of variation (std / mean) of the speech rate
        measured in sliding windows. Natural speech has a variable rate;
        reading is more constant. Returns 0.0 when there is too little data.
        """
        if len(word_timestamps) < 5:
            return 0.0

        window_size = 3.0  # window length in seconds
        hop = 1.0          # window step in seconds

        rates = []
        max_time = word_timestamps[-1].get('end', 0)

        # Words-per-second in each window; a word counts only if it lies
        # entirely inside the window.
        for start in np.arange(0, max_time - window_size, hop):
            end = start + window_size
            words_in_window = [
                w for w in word_timestamps
                if w.get('start', 0) >= start and w.get('end', 0) <= end
            ]
            if words_in_window:
                rates.append(len(words_in_window) / window_size)

        if len(rates) < 3:
            return 0.0

        mean_rate = np.mean(rates)
        if mean_rate == 0:
            return 0.0

        return float(np.std(rates) / mean_rate)

    def _analyze_filler_words(self, transcription: str,
                               duration_seconds: float) -> float:
        """
        Count filler words per minute.
        Natural speech has more fillers, reading has fewer.
        Returns 0.0 for clips shorter than ~6 seconds.
        """
        text_lower = transcription.lower()

        # NOTE: `re` is now imported at module level; previously it was
        # re-imported on every iteration of this loop.
        filler_count = 0
        for filler in FILLER_WORDS:
            # Word-boundary match so e.g. 'so' does not fire inside 'sort'.
            pattern = r'\b' + re.escape(filler) + r'\b'
            filler_count += len(re.findall(pattern, text_lower))

        minutes = duration_seconds / 60.0
        if minutes < 0.1:
            # A rate over a tiny denominator would be meaningless noise.
            return 0.0

        return filler_count / minutes

    def _analyze_pause_patterns(self, word_timestamps: List[dict]) -> float:
        """
        Return a pause-regularity score in (0, 1], computed as 1 / (1 + CV)
        of the inter-word gap durations. Reading tends to produce more
        regular pauses. Returns 0.0 when there is too little data.
        """
        if len(word_timestamps) < 5:
            return 0.0

        # Gaps between consecutive words; ignore near-zero articulation gaps.
        gaps = []
        for i in range(1, len(word_timestamps)):
            prev_end = word_timestamps[i - 1].get('end', 0)
            curr_start = word_timestamps[i].get('start', 0)
            gap = curr_start - prev_end
            if gap > 0.05:
                gaps.append(gap)

        if len(gaps) < 3:
            return 0.0

        mean_gap = np.mean(gaps)
        if mean_gap == 0:
            return 0.0

        cv = np.std(gaps) / mean_gap
        return float(1.0 / (1.0 + cv))  # higher = more regular

    def _has_self_corrections(self, transcription: str) -> bool:
        """
        Check for self-corrections, which indicate natural speech.
        E.g., "I went to the... I mean, I was going to the store"

        Markers are matched on word boundaries, so 'wait' no longer fires
        inside 'waiting' (the old substring check did). Also detects
        immediate word repetitions (stammering) for words > 2 chars.
        """
        correction_markers = [
            'i mean', 'sorry', 'no wait', 'actually', 'let me',
            'what i meant', 'no no', 'sorry i', 'wait'
        ]

        text_lower = transcription.lower()
        for marker in correction_markers:
            # Word-boundary search avoids substring false positives
            # ('wait' in 'waiting', 'sorry i' in 'sorry it').
            if re.search(r'\b' + re.escape(marker) + r'\b', text_lower):
                return True

        # Repeated adjacent words suggest a stammer or restart.
        words = text_lower.split()
        for i in range(1, len(words)):
            if words[i] == words[i - 1] and len(words[i]) > 2:
                return True

        return False