Ranam Hamoud committed on
Commit
521317f
·
1 Parent(s): 4c2ceb8

Update app, classifier, and speech recognizer; rename model file and update examples

Browse files
app.py CHANGED
@@ -117,14 +117,6 @@ def analyze_audio_file(audio_file):
117
  else:
118
  speech_patterns += "Normal pacing |\n"
119
 
120
- speech_patterns += f"| **Non-alpha chars/sec** | {kf['nonalpha_per_sec']:.2f} | "
121
- if kf['nonalpha_per_sec'] > 2.5:
122
- speech_patterns += "High (disfluent) |\n"
123
- elif kf['nonalpha_per_sec'] < 1.5:
124
- speech_patterns += "Low (fluent) |\n"
125
- else:
126
- speech_patterns += "Moderate |\n"
127
-
128
  speech_patterns += f"| **Filler Rate** | {kf['filler_rate']*100:.1f}% | "
129
  if kf['filler_rate'] > 0.05:
130
  speech_patterns += "High (spontaneous) |\n"
@@ -141,11 +133,51 @@ def analyze_audio_file(audio_file):
141
  else:
142
  speech_patterns += "Few |\n"
143
 
144
- speech_patterns += f"| **Alpha Ratio** | {kf['alpha_ratio']:.2f} | "
145
- if kf['alpha_ratio'] > 0.85:
146
- speech_patterns += "Clean text |\n"
 
 
 
 
 
 
 
 
 
 
147
  else:
148
- speech_patterns += "With artifacts |\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  speech_patterns += "\n"
151
 
@@ -217,9 +249,7 @@ def analyze_audio_file(audio_file):
217
  return (error_msg, "", "", "", "", "")
218
 
219
 
220
- def create_interface():
221
- """Create and configure Gradio interface."""
222
-
223
  custom_css = """
224
  @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600;700&display=swap');
225
 
@@ -343,25 +373,6 @@ def create_interface():
343
  size="lg"
344
  )
345
 
346
- # Add example audio files
347
- gr.HTML("""
348
- <div style='margin-top: 20px; margin-bottom: 10px;'>
349
- <h4 style='margin: 0 0 8px 0; font-size: 14px; font-weight: 600; color: #111827;'>Try these examples:</h4>
350
- </div>
351
- """)
352
-
353
- examples_dir = os.path.join(os.path.dirname(__file__), "examples")
354
- gr.Examples(
355
- examples=[
356
- [os.path.join(examples_dir, "read1.ogg")],
357
- [os.path.join(examples_dir, "spontaneous1.ogg")]
358
- ],
359
- inputs=[audio_input],
360
- label="",
361
- examples_per_page=2,
362
- cache_examples=False
363
- )
364
-
365
  gr.HTML("""
366
  <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-top: 20px;'>
367
  <h4 style='margin: 0 0 12px 0; font-size: 14px; font-weight: 600; color: #111827;'>Requirements</h4>
@@ -402,7 +413,33 @@ def create_interface():
402
  with gr.Tab("AI Detection"):
403
  ai_output = gr.Markdown()
404
 
405
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
 
407
  def show_loading():
408
  loading_html = """
 
117
  else:
118
  speech_patterns += "Normal pacing |\n"
119
 
 
 
 
 
 
 
 
 
120
  speech_patterns += f"| **Filler Rate** | {kf['filler_rate']*100:.1f}% | "
121
  if kf['filler_rate'] > 0.05:
122
  speech_patterns += "High (spontaneous) |\n"
 
133
  else:
134
  speech_patterns += "Few |\n"
135
 
136
+ speech_patterns += "\n---\n\n"
137
+ speech_patterns += "#### Reading Style Indicators\n\n"
138
+
139
+ speech_patterns += "| Feature | Value | Interpretation |\n"
140
+ speech_patterns += "|---------|-------|----------------|\n"
141
+
142
+ # Pause regularity
143
+ pause_reg = kf.get('pause_regularity', 0.5)
144
+ speech_patterns += f"| **Pause Regularity** | {pause_reg:.2f} | "
145
+ if pause_reg > 0.7:
146
+ speech_patterns += "Very regular (read) |\n"
147
+ elif pause_reg > 0.4:
148
+ speech_patterns += "Moderate |\n"
149
  else:
150
+ speech_patterns += "Irregular (spontaneous) |\n"
151
+
152
+ # Speech rate variability
153
+ rate_var = kf.get('speech_rate_variability', 0.0)
154
+ speech_patterns += f"| **Rate Variability** | {rate_var:.2f} | "
155
+ if rate_var > 0.6:
156
+ speech_patterns += "High (spontaneous) |\n"
157
+ elif rate_var > 0.3:
158
+ speech_patterns += "Moderate |\n"
159
+ else:
160
+ speech_patterns += "Steady pace (read) |\n"
161
+
162
+ # Sentence variance
163
+ sent_var = kf.get('sentence_length_variance', 0.0)
164
+ speech_patterns += f"| **Sentence Variance** | {sent_var:.2f} | "
165
+ if sent_var > 0.5:
166
+ speech_patterns += "Variable (spontaneous) |\n"
167
+ elif sent_var > 0.25:
168
+ speech_patterns += "Moderate |\n"
169
+ else:
170
+ speech_patterns += "Uniform (read) |\n"
171
+
172
+ # Self-corrections
173
+ corrections = kf.get('self_correction_count', 0)
174
+ speech_patterns += f"| **Self-Corrections** | {corrections} | "
175
+ if corrections > 2:
176
+ speech_patterns += "Multiple (spontaneous) |\n"
177
+ elif corrections > 0:
178
+ speech_patterns += "Few |\n"
179
+ else:
180
+ speech_patterns += "None (scripted) |\n"
181
 
182
  speech_patterns += "\n"
183
 
 
249
  return (error_msg, "", "", "", "", "")
250
 
251
 
252
+ def create_interface():
 
 
253
  custom_css = """
254
  @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600;700&display=swap');
255
 
 
373
  size="lg"
374
  )
375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  gr.HTML("""
377
  <div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-top: 20px;'>
378
  <h4 style='margin: 0 0 12px 0; font-size: 14px; font-weight: 600; color: #111827;'>Requirements</h4>
 
413
  with gr.Tab("AI Detection"):
414
  ai_output = gr.Markdown()
415
 
416
+
417
+ # Add example audio files with caching
418
+ gr.HTML("""
419
+ <div style='margin-top: 20px; margin-bottom: 10px;'>
420
+ <h4 style='margin: 0 0 8px 0; font-size: 14px; font-weight: 600; color: #111827;'>Try these examples:</h4>
421
+ </div>
422
+ """)
423
+
424
+ examples_dir = os.path.join(os.path.dirname(__file__), "examples")
425
+ gr.Examples(
426
+ examples=[
427
+ [os.path.join(examples_dir, "read1.ogg")],
428
+ [os.path.join(examples_dir, "spontaneous1.ogg")]
429
+ ],
430
+ inputs=[audio_input],
431
+ outputs=[
432
+ overall_output,
433
+ acoustic_output,
434
+ transcription_output,
435
+ speech_output,
436
+ ai_output,
437
+ ],
438
+ fn=analyze_audio_file,
439
+ label="",
440
+ examples_per_page=2,
441
+ cache_examples=True
442
+ )
443
 
444
  def show_loading():
445
  loading_html = """
audio_classifier.py CHANGED
@@ -78,7 +78,7 @@ class SpeechStyleCNN(nn.Module):
78
 
79
  class AudioClassifier:
80
  AVAILABLE_MODELS = {
81
- '3s_window': 'spectrogram_cnn_3s_window (1).pth',
82
  # '4s_window': 'spectrogram_cnn_4s_window.pth',
83
  # '4s_488x488': 'spectrogram_cnn_4s_window_488_x_488.pth'
84
  }
@@ -100,7 +100,7 @@ class AudioClassifier:
100
 
101
  if model_path is None:
102
  import os
103
- model_path = os.path.join(os.path.dirname(__file__), 'spectrogram_cnn_3s_window (1).pth')
104
 
105
  try:
106
  print(f"Attempting to load model from: {model_path}")
@@ -120,7 +120,6 @@ class AudioClassifier:
120
  self.hop_length = 512
121
 
122
  def extract_mel_spectrogram(self, audio_path: str, window_size: float = 3.0) -> np.ndarray:
123
- """Extract mel spectrogram from audio, using windowing if audio is longer than window_size."""
124
  audio, sr = librosa.load(audio_path, sr=self.sample_rate)
125
 
126
  # If audio is longer than window_size, take multiple windows and average
@@ -215,18 +214,8 @@ class AudioClassifier:
215
  return features
216
 
217
  def _compute_prosody_scores(self, features: Dict[str, float]) -> Dict:
218
- """
219
- Optimized prosody scoring based on feature analysis:
220
- - spectral_centroid_std: 80% accuracy (threshold ~1017, read >= threshold)
221
- - zcr_mean: 75% accuracy (threshold ~0.11, read >= threshold)
222
- - energy_mean: 70% accuracy (threshold ~0.06, read < threshold)
223
- - pitch_range: 75% accuracy (threshold ~3837, read < threshold)
224
- """
225
  individual_scores = {}
226
-
227
- # 1. Spectral centroid std - MOST discriminative (separation: 1.11)
228
- # Read: 1087 avg, Spontaneous: 1017 avg
229
- # Threshold: ~1050, read >= threshold
230
  sc_std = features['spectral_centroid_std']
231
  if sc_std >= 1100:
232
  spectral_score = 0.9 # Strongly indicates read
@@ -245,9 +234,6 @@ class AudioClassifier:
245
  'interpretation': 'high variability (read)' if spectral_score > 0.6 else 'low variability (spontaneous)' if spectral_score < 0.4 else 'moderate'
246
  }
247
 
248
- # 2. ZCR mean - Second most discriminative (separation: 0.81)
249
- # Read: 0.12 avg, Spontaneous: 0.10 avg
250
- # Threshold: ~0.11, read >= threshold
251
  zcr = features['zcr_mean']
252
  if zcr >= 0.13:
253
  zcr_score = 0.9 # Strongly indicates read
@@ -303,10 +289,10 @@ class AudioClassifier:
303
 
304
  # Optimized weights based on feature separation scores
305
  weights = {
306
- 'spectral_variability': 0.40, # Best discriminator (1.11 separation)
307
- 'zcr_mean': 0.30, # Second best (0.81 separation)
308
- 'energy_level': 0.20, # Third (0.69 separation)
309
- 'tempo': 0.10 # Weakest (0.22 separation)
310
  }
311
 
312
  overall_score = (
@@ -316,7 +302,6 @@ class AudioClassifier:
316
  tempo_score * weights['tempo']
317
  )
318
 
319
- # More decisive thresholds
320
  if overall_score > 0.60:
321
  classification = 'read'
322
  confidence = 0.5 + (overall_score - 0.5) * 0.8
@@ -324,7 +309,6 @@ class AudioClassifier:
324
  classification = 'spontaneous'
325
  confidence = 0.5 + (0.5 - overall_score) * 0.8
326
  else:
327
- # Borderline - slight lean based on score
328
  classification = 'read' if overall_score >= 0.5 else 'spontaneous'
329
  confidence = 0.5 + abs(overall_score - 0.5) * 0.6
330
 
@@ -346,7 +330,6 @@ class AudioClassifier:
346
  predicted_class = torch.argmax(probabilities, dim=1).item()
347
  cnn_confidence = probabilities[0, predicted_class].item()
348
 
349
- # Debug output - Model: Class 0=read, Class 1=spontaneous
350
  print(f"CNN Logits: {logits[0].cpu().numpy()}")
351
  print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
352
  print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
@@ -362,13 +345,11 @@ class AudioClassifier:
362
  print(f"CNN classification: {cnn_class_name}")
363
  print(f"Prosody classification: {prosody_classification} (conf={prosody_confidence:.2f})")
364
 
365
- # Weighted combination: Prosody is more reliable (60% acc) than CNN (50% acc)
366
- # Convert classifications to scores: read=1, spontaneous=0
367
  cnn_score = 1.0 if cnn_class_name == 'read' else 0.0
368
  prosody_score = 1.0 if prosody_classification == 'read' else 0.0
369
 
370
- # Weight prosody more heavily (0.6) than CNN (0.4)
371
- # Also factor in confidence
372
  weighted_score = (
373
  cnn_score * cnn_confidence * 0.4 +
374
  prosody_score * prosody_confidence * 0.6
 
78
 
79
  class AudioClassifier:
80
  AVAILABLE_MODELS = {
81
+ '3s_window': 'spectrogram_cnn_3s_window.pth',
82
  # '4s_window': 'spectrogram_cnn_4s_window.pth',
83
  # '4s_488x488': 'spectrogram_cnn_4s_window_488_x_488.pth'
84
  }
 
100
 
101
  if model_path is None:
102
  import os
103
+ model_path = os.path.join(os.path.dirname(__file__), 'spectrogram_cnn_3s_window.pth')
104
 
105
  try:
106
  print(f"Attempting to load model from: {model_path}")
 
120
  self.hop_length = 512
121
 
122
  def extract_mel_spectrogram(self, audio_path: str, window_size: float = 3.0) -> np.ndarray:
 
123
  audio, sr = librosa.load(audio_path, sr=self.sample_rate)
124
 
125
  # If audio is longer than window_size, take multiple windows and average
 
214
  return features
215
 
216
  def _compute_prosody_scores(self, features: Dict[str, float]) -> Dict:
 
 
 
 
 
 
 
217
  individual_scores = {}
218
+
 
 
 
219
  sc_std = features['spectral_centroid_std']
220
  if sc_std >= 1100:
221
  spectral_score = 0.9 # Strongly indicates read
 
234
  'interpretation': 'high variability (read)' if spectral_score > 0.6 else 'low variability (spontaneous)' if spectral_score < 0.4 else 'moderate'
235
  }
236
 
 
 
 
237
  zcr = features['zcr_mean']
238
  if zcr >= 0.13:
239
  zcr_score = 0.9 # Strongly indicates read
 
289
 
290
  # Optimized weights based on feature separation scores
291
  weights = {
292
+ 'spectral_variability': 0.40,
293
+ 'zcr_mean': 0.30,
294
+ 'energy_level': 0.20,
295
+ 'tempo': 0.10
296
  }
297
 
298
  overall_score = (
 
302
  tempo_score * weights['tempo']
303
  )
304
 
 
305
  if overall_score > 0.60:
306
  classification = 'read'
307
  confidence = 0.5 + (overall_score - 0.5) * 0.8
 
309
  classification = 'spontaneous'
310
  confidence = 0.5 + (0.5 - overall_score) * 0.8
311
  else:
 
312
  classification = 'read' if overall_score >= 0.5 else 'spontaneous'
313
  confidence = 0.5 + abs(overall_score - 0.5) * 0.6
314
 
 
330
  predicted_class = torch.argmax(probabilities, dim=1).item()
331
  cnn_confidence = probabilities[0, predicted_class].item()
332
 
 
333
  print(f"CNN Logits: {logits[0].cpu().numpy()}")
334
  print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
335
  print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
 
345
  print(f"CNN classification: {cnn_class_name}")
346
  print(f"Prosody classification: {prosody_classification} (conf={prosody_confidence:.2f})")
347
 
348
+
 
349
  cnn_score = 1.0 if cnn_class_name == 'read' else 0.0
350
  prosody_score = 1.0 if prosody_classification == 'read' else 0.0
351
 
352
+
 
353
  weighted_score = (
354
  cnn_score * cnn_confidence * 0.4 +
355
  prosody_score * prosody_confidence * 0.6
examples/read1.ogg CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a13e626fdda037d19f32574aa244b3c5d5d8cee9a29777bdc9aa2923ff1035d2
3
- size 67654
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c8e969d50e75835caf2a52f33c19accdb1cdfa1e069501bad0fc2fe470ea761
3
+ size 157216
examples/read4.wav DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9dbcda832552f5051a60de21bc10dd1166cfb7039077e4108d6a8e239148ec3
3
- size 898430
 
 
 
 
examples/spontaneous1.ogg CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6bece27541be3a7e5b09132eec245c141d26b5114a44ba3124b8678914b67345
3
- size 102198
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69b8aeffd1e7a02ed90bcff98d202cd7a97cc57cd1d16a4cdbd4aac2e770b6db
3
+ size 323869
spectrogram_cnn_3s_window (1).pth → spectrogram_cnn_3s_window.pth RENAMED
File without changes
speech_recognizer.py CHANGED
@@ -42,7 +42,9 @@ class SpeechRecognizer:
42
  analysis = self._analyze_transcription(transcription, segments)
43
 
44
  duration = analysis['duration'] if analysis['duration'] > 0 else 1.0
45
- kopparapu_features = self._extract_kopparapu_features(transcription, duration)
 
 
46
  kopparapu_score = self._calculate_kopparapu_score(kopparapu_features)
47
 
48
  return {
@@ -140,10 +142,13 @@ class SpeechRecognizer:
140
  'pause_variability': float(np.std(pauses)) if len(pauses) > 1 else 0.0
141
  }
142
 
143
- def _extract_kopparapu_features(self, text: str, duration_sec: float) -> Dict:
 
 
 
144
  """
145
- Extract Kopparapu-like linguistic features from transcription.
146
- Based on: https://arxiv.org/pdf/2306.08012
147
  """
148
  text = text.strip()
149
  if len(text) == 0:
@@ -153,7 +158,11 @@ class SpeechRecognizer:
153
  'words_per_sec': 0.0,
154
  'nonalpha_per_sec': 0.0,
155
  'repetition_count': 0,
156
- 'filler_rate': 0.0
 
 
 
 
157
  }
158
 
159
  total_chars = len(text)
@@ -170,8 +179,10 @@ class SpeechRecognizer:
170
  words_per_sec = num_words / duration_sec
171
  nonalpha_per_sec = nonalpha_chars / duration_sec
172
 
 
173
  char_reps = len(re.findall(r'(.)\1{2,}', text))
174
 
 
175
  words_list = text.lower().split()
176
  word_reps = 0
177
  for i in range(len(words_list) - 1):
@@ -180,6 +191,7 @@ class SpeechRecognizer:
180
 
181
  repetition_count = char_reps + word_reps
182
 
 
183
  lower = text.lower()
184
  filler_patterns = [
185
  r'\bum\b', r'\buh\b', r'\buhm\b', r'\ber\b', r'\bah\b',
@@ -193,29 +205,154 @@ class SpeechRecognizer:
193
  filler_count += len(re.findall(pattern, lower))
194
  filler_rate = filler_count / num_words
195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  return {
197
  'alpha_ratio': float(alpha_ratio),
198
  'chars_per_word': float(chars_per_word),
199
  'words_per_sec': float(words_per_sec),
200
  'nonalpha_per_sec': float(nonalpha_per_sec),
201
  'repetition_count': int(repetition_count),
202
- 'filler_rate': float(filler_rate)
 
 
 
 
203
  }
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  def _logistic(self, x: float, a: float, b: float) -> float:
 
206
  return 1.0 / (1.0 + np.exp(-(x - a) / b))
207
 
208
  def _calculate_kopparapu_score(self, features: Dict) -> float:
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  f1 = features['chars_per_word']
210
- L1 = self._logistic(f1, a=5.0, b=1.5)
211
 
 
212
  f2 = features['words_per_sec']
213
- L2 = self._logistic(f2, a=2.0, b=0.7)
214
-
215
- f3_raw = features['nonalpha_per_sec'] + 10.0 * features['filler_rate']
216
- L3 = self._logistic(-f3_raw, a=0.0, b=1.0)
217
-
218
- score = 0.4 * L1 + 0.4 * L2 + 0.2 * L3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
  return float(score)
221
 
 
42
  analysis = self._analyze_transcription(transcription, segments)
43
 
44
  duration = analysis['duration'] if analysis['duration'] > 0 else 1.0
45
+ kopparapu_features = self._extract_kopparapu_features(
46
+ transcription, duration, segments, analysis['pause_patterns']
47
+ )
48
  kopparapu_score = self._calculate_kopparapu_score(kopparapu_features)
49
 
50
  return {
 
142
  'pause_variability': float(np.std(pauses)) if len(pauses) > 1 else 0.0
143
  }
144
 
145
+ def _extract_kopparapu_features(
146
+ self, text: str, duration_sec: float,
147
+ segments: List[Dict] = None, pause_patterns: Dict = None
148
+ ) -> Dict:
149
  """
150
+ Extract enhanced Kopparapu-like linguistic features for read speech detection.
151
+ Based on: https://arxiv.org/pdf/2306.08012 with extensions.
152
  """
153
  text = text.strip()
154
  if len(text) == 0:
 
158
  'words_per_sec': 0.0,
159
  'nonalpha_per_sec': 0.0,
160
  'repetition_count': 0,
161
+ 'filler_rate': 0.0,
162
+ 'pause_regularity': 0.5,
163
+ 'speech_rate_variability': 0.0,
164
+ 'sentence_length_variance': 0.0,
165
+ 'self_correction_count': 0
166
  }
167
 
168
  total_chars = len(text)
 
179
  words_per_sec = num_words / duration_sec
180
  nonalpha_per_sec = nonalpha_chars / duration_sec
181
 
182
+ # Character repetitions (e.g., "sooo", "ummmm")
183
  char_reps = len(re.findall(r'(.)\1{2,}', text))
184
 
185
+ # Word repetitions (e.g., "I I think", "the the")
186
  words_list = text.lower().split()
187
  word_reps = 0
188
  for i in range(len(words_list) - 1):
 
191
 
192
  repetition_count = char_reps + word_reps
193
 
194
+ # Filler words detection
195
  lower = text.lower()
196
  filler_patterns = [
197
  r'\bum\b', r'\buh\b', r'\buhm\b', r'\ber\b', r'\bah\b',
 
205
  filler_count += len(re.findall(pattern, lower))
206
  filler_rate = filler_count / num_words
207
 
208
+ # NEW: Pause regularity - read speech has regular pauses at punctuation
209
+ # Low variability = regular pauses = likely read
210
+ pause_regularity = 0.5 # neutral default
211
+ if pause_patterns and pause_patterns.get('num_pauses', 0) > 2:
212
+ pause_var = pause_patterns.get('pause_variability', 0.5)
213
+ # Normalize: low variability (< 0.2) -> high regularity (close to 1)
214
+ # High variability (> 0.6) -> low regularity (close to 0)
215
+ pause_regularity = max(0.0, min(1.0, 1.0 - (pause_var / 0.6)))
216
+
217
+ # NEW: Speech rate variability across segments
218
+ # Read speech has consistent pacing; spontaneous varies with thinking
219
+ speech_rate_variability = self._compute_rate_variability(segments) if segments else 0.0
220
+
221
+ # NEW: Sentence length variance - read text has more uniform structure
222
+ sentence_length_variance = self._compute_sentence_variance(text)
223
+
224
+ # NEW: Self-corrections and false starts (spontaneous speech markers)
225
+ self_correction_patterns = [
226
+ r'\bwait\b', r'\bsorry\b', r'\bno\s*,?\s*I\b',
227
+ r'\bactually\s*,?\s*no\b', r'\blet me\b', r'\bwhat I meant\b',
228
+ r'\bI meant\b', r'\bhold on\b', r'\bwhat was I\b', r'\bor rather\b'
229
+ ]
230
+ self_correction_count = 0
231
+ for pattern in self_correction_patterns:
232
+ self_correction_count += len(re.findall(pattern, lower))
233
+
234
  return {
235
  'alpha_ratio': float(alpha_ratio),
236
  'chars_per_word': float(chars_per_word),
237
  'words_per_sec': float(words_per_sec),
238
  'nonalpha_per_sec': float(nonalpha_per_sec),
239
  'repetition_count': int(repetition_count),
240
+ 'filler_rate': float(filler_rate),
241
+ 'pause_regularity': float(pause_regularity),
242
+ 'speech_rate_variability': float(speech_rate_variability),
243
+ 'sentence_length_variance': float(sentence_length_variance),
244
+ 'self_correction_count': int(self_correction_count)
245
  }
246
 
247
+ def _compute_rate_variability(self, segments: List[Dict]) -> float:
248
+ """
249
+ Compute speech rate variability across segments.
250
+ Read speech has consistent rate; spontaneous varies with thinking.
251
+ Returns 0-1 where higher = more variable = more spontaneous.
252
+ """
253
+ if not segments or len(segments) < 3:
254
+ return 0.0
255
+
256
+ segment_rates = []
257
+ for seg in segments:
258
+ duration = seg.get('end', 0) - seg.get('start', 0)
259
+ if duration > 0.3: # Only consider segments > 300ms
260
+ words_in_seg = len(seg.get('text', '').split())
261
+ rate = words_in_seg / duration
262
+ if rate > 0:
263
+ segment_rates.append(rate)
264
+
265
+ if len(segment_rates) < 3:
266
+ return 0.0
267
+
268
+ mean_rate = np.mean(segment_rates)
269
+ std_rate = np.std(segment_rates)
270
+
271
+ # Coefficient of variation normalized to 0-1
272
+ cv = std_rate / mean_rate if mean_rate > 0 else 0
273
+ return float(min(1.0, cv / 0.5)) # CV of 0.5+ maps to 1.0
274
+
275
+ def _compute_sentence_variance(self, text: str) -> float:
276
+ """
277
+ Compute variance in sentence lengths.
278
+ Read/scripted text tends to have more uniform sentence structure.
279
+ Returns 0-1 where higher = more variance = more spontaneous.
280
+ """
281
+ # Split into sentences
282
+ sentences = re.split(r'[.!?]+', text)
283
+ sentences = [s.strip() for s in sentences if s.strip()]
284
+
285
+ if len(sentences) < 2:
286
+ return 0.0
287
+
288
+ lengths = [len(s.split()) for s in sentences]
289
+ mean_len = np.mean(lengths)
290
+ std_len = np.std(lengths)
291
+
292
+ # Coefficient of variation normalized
293
+ cv = std_len / mean_len if mean_len > 0 else 0
294
+ return float(min(1.0, cv / 0.6)) # CV of 0.6+ maps to 1.0
295
+
296
  def _logistic(self, x: float, a: float, b: float) -> float:
297
+ """Sigmoid function centered at 'a' with steepness 'b'."""
298
  return 1.0 / (1.0 + np.exp(-(x - a) / b))
299
 
300
  def _calculate_kopparapu_score(self, features: Dict) -> float:
301
+ """
302
+ Calculate enhanced Kopparapu score for read vs spontaneous classification.
303
+ Score closer to 1 = more likely READ, closer to 0 = more likely SPONTANEOUS.
304
+
305
+ Key signals for READ speech:
306
+ - Higher chars_per_word (formal vocabulary)
307
+ - Faster, steadier words_per_sec
308
+ - Lower filler rate and disfluencies
309
+ - Regular pause patterns (pause_regularity high)
310
+ - Low speech rate variability
311
+ - Uniform sentence lengths
312
+ """
313
+ # L1: Vocabulary complexity - higher chars/word = more formal = read
314
  f1 = features['chars_per_word']
315
+ L1 = self._logistic(f1, a=4.8, b=1.2)
316
 
317
+ # L2: Speaking rate - faster, steadier = read
318
  f2 = features['words_per_sec']
319
+ L2 = self._logistic(f2, a=2.2, b=0.6)
320
+
321
+ # L3: Disfluency signal (inverted) - less disfluency = more read
322
+ # Combines filler rate, nonalpha, and repetitions
323
+ disfluency = (
324
+ features['nonalpha_per_sec'] +
325
+ 8.0 * features['filler_rate'] +
326
+ 0.5 * features['repetition_count']
327
+ )
328
+ L3 = self._logistic(-disfluency, a=0.0, b=0.8)
329
+
330
+ # L4: Pause regularity - regular pauses = read (already 0-1)
331
+ L4 = features.get('pause_regularity', 0.5)
332
+
333
+ # L5: Rate variability (inverted) - low variability = read
334
+ rate_var = features.get('speech_rate_variability', 0.0)
335
+ L5 = 1.0 - rate_var
336
+
337
+ # L6: Sentence variance (inverted) - uniform sentences = read
338
+ sent_var = features.get('sentence_length_variance', 0.0)
339
+ L6 = 1.0 - sent_var
340
+
341
+ # L7: Self-corrections (inverted) - more corrections = spontaneous
342
+ corrections = features.get('self_correction_count', 0)
343
+ L7 = self._logistic(-corrections, a=0.0, b=1.5)
344
+
345
+ # Weighted combination optimized for read detection
346
+ # Higher weights on pause regularity and rate consistency (key read markers)
347
+ score = (
348
+ 0.15 * L1 + # Vocabulary complexity
349
+ 0.15 * L2 + # Speaking rate
350
+ 0.15 * L3 + # Disfluency (filler/repetition)
351
+ 0.20 * L4 + # Pause regularity (strong read signal)
352
+ 0.15 * L5 + # Rate variability
353
+ 0.10 * L6 + # Sentence uniformity
354
+ 0.10 * L7 # Self-corrections
355
+ )
356
 
357
  return float(score)
358