Spaces:
Sleeping
Sleeping
Ranam Hamoud
committed on
Commit
·
521317f
1
Parent(s):
4c2ceb8
Update app, classifier, and speech recognizer; rename model file and update examples
Browse files- app.py +72 -35
- audio_classifier.py +9 -28
- examples/read1.ogg +2 -2
- examples/read4.wav +0 -3
- examples/spontaneous1.ogg +2 -2
- spectrogram_cnn_3s_window (1).pth → spectrogram_cnn_3s_window.pth +0 -0
- speech_recognizer.py +150 -13
app.py
CHANGED
|
@@ -117,14 +117,6 @@ def analyze_audio_file(audio_file):
|
|
| 117 |
else:
|
| 118 |
speech_patterns += "Normal pacing |\n"
|
| 119 |
|
| 120 |
-
speech_patterns += f"| **Non-alpha chars/sec** | {kf['nonalpha_per_sec']:.2f} | "
|
| 121 |
-
if kf['nonalpha_per_sec'] > 2.5:
|
| 122 |
-
speech_patterns += "High (disfluent) |\n"
|
| 123 |
-
elif kf['nonalpha_per_sec'] < 1.5:
|
| 124 |
-
speech_patterns += "Low (fluent) |\n"
|
| 125 |
-
else:
|
| 126 |
-
speech_patterns += "Moderate |\n"
|
| 127 |
-
|
| 128 |
speech_patterns += f"| **Filler Rate** | {kf['filler_rate']*100:.1f}% | "
|
| 129 |
if kf['filler_rate'] > 0.05:
|
| 130 |
speech_patterns += "High (spontaneous) |\n"
|
|
@@ -141,11 +133,51 @@ def analyze_audio_file(audio_file):
|
|
| 141 |
else:
|
| 142 |
speech_patterns += "Few |\n"
|
| 143 |
|
| 144 |
-
speech_patterns +=
|
| 145 |
-
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
else:
|
| 148 |
-
speech_patterns += "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
speech_patterns += "\n"
|
| 151 |
|
|
@@ -217,9 +249,7 @@ def analyze_audio_file(audio_file):
|
|
| 217 |
return (error_msg, "", "", "", "", "")
|
| 218 |
|
| 219 |
|
| 220 |
-
def create_interface():
|
| 221 |
-
"""Create and configure Gradio interface."""
|
| 222 |
-
|
| 223 |
custom_css = """
|
| 224 |
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600;700&display=swap');
|
| 225 |
|
|
@@ -343,25 +373,6 @@ def create_interface():
|
|
| 343 |
size="lg"
|
| 344 |
)
|
| 345 |
|
| 346 |
-
# Add example audio files
|
| 347 |
-
gr.HTML("""
|
| 348 |
-
<div style='margin-top: 20px; margin-bottom: 10px;'>
|
| 349 |
-
<h4 style='margin: 0 0 8px 0; font-size: 14px; font-weight: 600; color: #111827;'>Try these examples:</h4>
|
| 350 |
-
</div>
|
| 351 |
-
""")
|
| 352 |
-
|
| 353 |
-
examples_dir = os.path.join(os.path.dirname(__file__), "examples")
|
| 354 |
-
gr.Examples(
|
| 355 |
-
examples=[
|
| 356 |
-
[os.path.join(examples_dir, "read1.ogg")],
|
| 357 |
-
[os.path.join(examples_dir, "spontaneous1.ogg")]
|
| 358 |
-
],
|
| 359 |
-
inputs=[audio_input],
|
| 360 |
-
label="",
|
| 361 |
-
examples_per_page=2,
|
| 362 |
-
cache_examples=False
|
| 363 |
-
)
|
| 364 |
-
|
| 365 |
gr.HTML("""
|
| 366 |
<div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-top: 20px;'>
|
| 367 |
<h4 style='margin: 0 0 12px 0; font-size: 14px; font-weight: 600; color: #111827;'>Requirements</h4>
|
|
@@ -402,7 +413,33 @@ def create_interface():
|
|
| 402 |
with gr.Tab("AI Detection"):
|
| 403 |
ai_output = gr.Markdown()
|
| 404 |
|
| 405 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
|
| 407 |
def show_loading():
|
| 408 |
loading_html = """
|
|
|
|
| 117 |
else:
|
| 118 |
speech_patterns += "Normal pacing |\n"
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
speech_patterns += f"| **Filler Rate** | {kf['filler_rate']*100:.1f}% | "
|
| 121 |
if kf['filler_rate'] > 0.05:
|
| 122 |
speech_patterns += "High (spontaneous) |\n"
|
|
|
|
| 133 |
else:
|
| 134 |
speech_patterns += "Few |\n"
|
| 135 |
|
| 136 |
+
speech_patterns += "\n---\n\n"
|
| 137 |
+
speech_patterns += "#### Reading Style Indicators\n\n"
|
| 138 |
+
|
| 139 |
+
speech_patterns += "| Feature | Value | Interpretation |\n"
|
| 140 |
+
speech_patterns += "|---------|-------|----------------|\n"
|
| 141 |
+
|
| 142 |
+
# Pause regularity
|
| 143 |
+
pause_reg = kf.get('pause_regularity', 0.5)
|
| 144 |
+
speech_patterns += f"| **Pause Regularity** | {pause_reg:.2f} | "
|
| 145 |
+
if pause_reg > 0.7:
|
| 146 |
+
speech_patterns += "Very regular (read) |\n"
|
| 147 |
+
elif pause_reg > 0.4:
|
| 148 |
+
speech_patterns += "Moderate |\n"
|
| 149 |
else:
|
| 150 |
+
speech_patterns += "Irregular (spontaneous) |\n"
|
| 151 |
+
|
| 152 |
+
# Speech rate variability
|
| 153 |
+
rate_var = kf.get('speech_rate_variability', 0.0)
|
| 154 |
+
speech_patterns += f"| **Rate Variability** | {rate_var:.2f} | "
|
| 155 |
+
if rate_var > 0.6:
|
| 156 |
+
speech_patterns += "High (spontaneous) |\n"
|
| 157 |
+
elif rate_var > 0.3:
|
| 158 |
+
speech_patterns += "Moderate |\n"
|
| 159 |
+
else:
|
| 160 |
+
speech_patterns += "Steady pace (read) |\n"
|
| 161 |
+
|
| 162 |
+
# Sentence variance
|
| 163 |
+
sent_var = kf.get('sentence_length_variance', 0.0)
|
| 164 |
+
speech_patterns += f"| **Sentence Variance** | {sent_var:.2f} | "
|
| 165 |
+
if sent_var > 0.5:
|
| 166 |
+
speech_patterns += "Variable (spontaneous) |\n"
|
| 167 |
+
elif sent_var > 0.25:
|
| 168 |
+
speech_patterns += "Moderate |\n"
|
| 169 |
+
else:
|
| 170 |
+
speech_patterns += "Uniform (read) |\n"
|
| 171 |
+
|
| 172 |
+
# Self-corrections
|
| 173 |
+
corrections = kf.get('self_correction_count', 0)
|
| 174 |
+
speech_patterns += f"| **Self-Corrections** | {corrections} | "
|
| 175 |
+
if corrections > 2:
|
| 176 |
+
speech_patterns += "Multiple (spontaneous) |\n"
|
| 177 |
+
elif corrections > 0:
|
| 178 |
+
speech_patterns += "Few |\n"
|
| 179 |
+
else:
|
| 180 |
+
speech_patterns += "None (scripted) |\n"
|
| 181 |
|
| 182 |
speech_patterns += "\n"
|
| 183 |
|
|
|
|
| 249 |
return (error_msg, "", "", "", "", "")
|
| 250 |
|
| 251 |
|
| 252 |
+
def create_interface():
|
|
|
|
|
|
|
| 253 |
custom_css = """
|
| 254 |
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@300;400;500;600;700&display=swap');
|
| 255 |
|
|
|
|
| 373 |
size="lg"
|
| 374 |
)
|
| 375 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
gr.HTML("""
|
| 377 |
<div style='background: white; border: 1px solid #e5e7eb; padding: 20px; border-radius: 16px; margin-top: 20px;'>
|
| 378 |
<h4 style='margin: 0 0 12px 0; font-size: 14px; font-weight: 600; color: #111827;'>Requirements</h4>
|
|
|
|
| 413 |
with gr.Tab("AI Detection"):
|
| 414 |
ai_output = gr.Markdown()
|
| 415 |
|
| 416 |
+
|
| 417 |
+
# Add example audio files with caching
|
| 418 |
+
gr.HTML("""
|
| 419 |
+
<div style='margin-top: 20px; margin-bottom: 10px;'>
|
| 420 |
+
<h4 style='margin: 0 0 8px 0; font-size: 14px; font-weight: 600; color: #111827;'>Try these examples:</h4>
|
| 421 |
+
</div>
|
| 422 |
+
""")
|
| 423 |
+
|
| 424 |
+
examples_dir = os.path.join(os.path.dirname(__file__), "examples")
|
| 425 |
+
gr.Examples(
|
| 426 |
+
examples=[
|
| 427 |
+
[os.path.join(examples_dir, "read1.ogg")],
|
| 428 |
+
[os.path.join(examples_dir, "spontaneous1.ogg")]
|
| 429 |
+
],
|
| 430 |
+
inputs=[audio_input],
|
| 431 |
+
outputs=[
|
| 432 |
+
overall_output,
|
| 433 |
+
acoustic_output,
|
| 434 |
+
transcription_output,
|
| 435 |
+
speech_output,
|
| 436 |
+
ai_output,
|
| 437 |
+
],
|
| 438 |
+
fn=analyze_audio_file,
|
| 439 |
+
label="",
|
| 440 |
+
examples_per_page=2,
|
| 441 |
+
cache_examples=True
|
| 442 |
+
)
|
| 443 |
|
| 444 |
def show_loading():
|
| 445 |
loading_html = """
|
audio_classifier.py
CHANGED
|
@@ -78,7 +78,7 @@ class SpeechStyleCNN(nn.Module):
|
|
| 78 |
|
| 79 |
class AudioClassifier:
|
| 80 |
AVAILABLE_MODELS = {
|
| 81 |
-
'3s_window': 'spectrogram_cnn_3s_window
|
| 82 |
# '4s_window': 'spectrogram_cnn_4s_window.pth',
|
| 83 |
# '4s_488x488': 'spectrogram_cnn_4s_window_488_x_488.pth'
|
| 84 |
}
|
|
@@ -100,7 +100,7 @@ class AudioClassifier:
|
|
| 100 |
|
| 101 |
if model_path is None:
|
| 102 |
import os
|
| 103 |
-
model_path = os.path.join(os.path.dirname(__file__), 'spectrogram_cnn_3s_window
|
| 104 |
|
| 105 |
try:
|
| 106 |
print(f"Attempting to load model from: {model_path}")
|
|
@@ -120,7 +120,6 @@ class AudioClassifier:
|
|
| 120 |
self.hop_length = 512
|
| 121 |
|
| 122 |
def extract_mel_spectrogram(self, audio_path: str, window_size: float = 3.0) -> np.ndarray:
|
| 123 |
-
"""Extract mel spectrogram from audio, using windowing if audio is longer than window_size."""
|
| 124 |
audio, sr = librosa.load(audio_path, sr=self.sample_rate)
|
| 125 |
|
| 126 |
# If audio is longer than window_size, take multiple windows and average
|
|
@@ -215,18 +214,8 @@ class AudioClassifier:
|
|
| 215 |
return features
|
| 216 |
|
| 217 |
def _compute_prosody_scores(self, features: Dict[str, float]) -> Dict:
|
| 218 |
-
"""
|
| 219 |
-
Optimized prosody scoring based on feature analysis:
|
| 220 |
-
- spectral_centroid_std: 80% accuracy (threshold ~1017, read >= threshold)
|
| 221 |
-
- zcr_mean: 75% accuracy (threshold ~0.11, read >= threshold)
|
| 222 |
-
- energy_mean: 70% accuracy (threshold ~0.06, read < threshold)
|
| 223 |
-
- pitch_range: 75% accuracy (threshold ~3837, read < threshold)
|
| 224 |
-
"""
|
| 225 |
individual_scores = {}
|
| 226 |
-
|
| 227 |
-
# 1. Spectral centroid std - MOST discriminative (separation: 1.11)
|
| 228 |
-
# Read: 1087 avg, Spontaneous: 1017 avg
|
| 229 |
-
# Threshold: ~1050, read >= threshold
|
| 230 |
sc_std = features['spectral_centroid_std']
|
| 231 |
if sc_std >= 1100:
|
| 232 |
spectral_score = 0.9 # Strongly indicates read
|
|
@@ -245,9 +234,6 @@ class AudioClassifier:
|
|
| 245 |
'interpretation': 'high variability (read)' if spectral_score > 0.6 else 'low variability (spontaneous)' if spectral_score < 0.4 else 'moderate'
|
| 246 |
}
|
| 247 |
|
| 248 |
-
# 2. ZCR mean - Second most discriminative (separation: 0.81)
|
| 249 |
-
# Read: 0.12 avg, Spontaneous: 0.10 avg
|
| 250 |
-
# Threshold: ~0.11, read >= threshold
|
| 251 |
zcr = features['zcr_mean']
|
| 252 |
if zcr >= 0.13:
|
| 253 |
zcr_score = 0.9 # Strongly indicates read
|
|
@@ -303,10 +289,10 @@ class AudioClassifier:
|
|
| 303 |
|
| 304 |
# Optimized weights based on feature separation scores
|
| 305 |
weights = {
|
| 306 |
-
'spectral_variability': 0.40,
|
| 307 |
-
'zcr_mean': 0.30,
|
| 308 |
-
'energy_level': 0.20,
|
| 309 |
-
'tempo': 0.10
|
| 310 |
}
|
| 311 |
|
| 312 |
overall_score = (
|
|
@@ -316,7 +302,6 @@ class AudioClassifier:
|
|
| 316 |
tempo_score * weights['tempo']
|
| 317 |
)
|
| 318 |
|
| 319 |
-
# More decisive thresholds
|
| 320 |
if overall_score > 0.60:
|
| 321 |
classification = 'read'
|
| 322 |
confidence = 0.5 + (overall_score - 0.5) * 0.8
|
|
@@ -324,7 +309,6 @@ class AudioClassifier:
|
|
| 324 |
classification = 'spontaneous'
|
| 325 |
confidence = 0.5 + (0.5 - overall_score) * 0.8
|
| 326 |
else:
|
| 327 |
-
# Borderline - slight lean based on score
|
| 328 |
classification = 'read' if overall_score >= 0.5 else 'spontaneous'
|
| 329 |
confidence = 0.5 + abs(overall_score - 0.5) * 0.6
|
| 330 |
|
|
@@ -346,7 +330,6 @@ class AudioClassifier:
|
|
| 346 |
predicted_class = torch.argmax(probabilities, dim=1).item()
|
| 347 |
cnn_confidence = probabilities[0, predicted_class].item()
|
| 348 |
|
| 349 |
-
# Debug output - Model: Class 0=read, Class 1=spontaneous
|
| 350 |
print(f"CNN Logits: {logits[0].cpu().numpy()}")
|
| 351 |
print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
|
| 352 |
print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
|
|
@@ -362,13 +345,11 @@ class AudioClassifier:
|
|
| 362 |
print(f"CNN classification: {cnn_class_name}")
|
| 363 |
print(f"Prosody classification: {prosody_classification} (conf={prosody_confidence:.2f})")
|
| 364 |
|
| 365 |
-
|
| 366 |
-
# Convert classifications to scores: read=1, spontaneous=0
|
| 367 |
cnn_score = 1.0 if cnn_class_name == 'read' else 0.0
|
| 368 |
prosody_score = 1.0 if prosody_classification == 'read' else 0.0
|
| 369 |
|
| 370 |
-
|
| 371 |
-
# Also factor in confidence
|
| 372 |
weighted_score = (
|
| 373 |
cnn_score * cnn_confidence * 0.4 +
|
| 374 |
prosody_score * prosody_confidence * 0.6
|
|
|
|
| 78 |
|
| 79 |
class AudioClassifier:
|
| 80 |
AVAILABLE_MODELS = {
|
| 81 |
+
'3s_window': 'spectrogram_cnn_3s_window.pth',
|
| 82 |
# '4s_window': 'spectrogram_cnn_4s_window.pth',
|
| 83 |
# '4s_488x488': 'spectrogram_cnn_4s_window_488_x_488.pth'
|
| 84 |
}
|
|
|
|
| 100 |
|
| 101 |
if model_path is None:
|
| 102 |
import os
|
| 103 |
+
model_path = os.path.join(os.path.dirname(__file__), 'spectrogram_cnn_3s_window.pth')
|
| 104 |
|
| 105 |
try:
|
| 106 |
print(f"Attempting to load model from: {model_path}")
|
|
|
|
| 120 |
self.hop_length = 512
|
| 121 |
|
| 122 |
def extract_mel_spectrogram(self, audio_path: str, window_size: float = 3.0) -> np.ndarray:
|
|
|
|
| 123 |
audio, sr = librosa.load(audio_path, sr=self.sample_rate)
|
| 124 |
|
| 125 |
# If audio is longer than window_size, take multiple windows and average
|
|
|
|
| 214 |
return features
|
| 215 |
|
| 216 |
def _compute_prosody_scores(self, features: Dict[str, float]) -> Dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
individual_scores = {}
|
| 218 |
+
|
|
|
|
|
|
|
|
|
|
| 219 |
sc_std = features['spectral_centroid_std']
|
| 220 |
if sc_std >= 1100:
|
| 221 |
spectral_score = 0.9 # Strongly indicates read
|
|
|
|
| 234 |
'interpretation': 'high variability (read)' if spectral_score > 0.6 else 'low variability (spontaneous)' if spectral_score < 0.4 else 'moderate'
|
| 235 |
}
|
| 236 |
|
|
|
|
|
|
|
|
|
|
| 237 |
zcr = features['zcr_mean']
|
| 238 |
if zcr >= 0.13:
|
| 239 |
zcr_score = 0.9 # Strongly indicates read
|
|
|
|
| 289 |
|
| 290 |
# Optimized weights based on feature separation scores
|
| 291 |
weights = {
|
| 292 |
+
'spectral_variability': 0.40,
|
| 293 |
+
'zcr_mean': 0.30,
|
| 294 |
+
'energy_level': 0.20,
|
| 295 |
+
'tempo': 0.10
|
| 296 |
}
|
| 297 |
|
| 298 |
overall_score = (
|
|
|
|
| 302 |
tempo_score * weights['tempo']
|
| 303 |
)
|
| 304 |
|
|
|
|
| 305 |
if overall_score > 0.60:
|
| 306 |
classification = 'read'
|
| 307 |
confidence = 0.5 + (overall_score - 0.5) * 0.8
|
|
|
|
| 309 |
classification = 'spontaneous'
|
| 310 |
confidence = 0.5 + (0.5 - overall_score) * 0.8
|
| 311 |
else:
|
|
|
|
| 312 |
classification = 'read' if overall_score >= 0.5 else 'spontaneous'
|
| 313 |
confidence = 0.5 + abs(overall_score - 0.5) * 0.6
|
| 314 |
|
|
|
|
| 330 |
predicted_class = torch.argmax(probabilities, dim=1).item()
|
| 331 |
cnn_confidence = probabilities[0, predicted_class].item()
|
| 332 |
|
|
|
|
| 333 |
print(f"CNN Logits: {logits[0].cpu().numpy()}")
|
| 334 |
print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
|
| 335 |
print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
|
|
|
|
| 345 |
print(f"CNN classification: {cnn_class_name}")
|
| 346 |
print(f"Prosody classification: {prosody_classification} (conf={prosody_confidence:.2f})")
|
| 347 |
|
| 348 |
+
|
|
|
|
| 349 |
cnn_score = 1.0 if cnn_class_name == 'read' else 0.0
|
| 350 |
prosody_score = 1.0 if prosody_classification == 'read' else 0.0
|
| 351 |
|
| 352 |
+
|
|
|
|
| 353 |
weighted_score = (
|
| 354 |
cnn_score * cnn_confidence * 0.4 +
|
| 355 |
prosody_score * prosody_confidence * 0.6
|
examples/read1.ogg
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c8e969d50e75835caf2a52f33c19accdb1cdfa1e069501bad0fc2fe470ea761
|
| 3 |
+
size 157216
|
examples/read4.wav
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c9dbcda832552f5051a60de21bc10dd1166cfb7039077e4108d6a8e239148ec3
|
| 3 |
-
size 898430
|
|
|
|
|
|
|
|
|
|
|
|
examples/spontaneous1.ogg
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:69b8aeffd1e7a02ed90bcff98d202cd7a97cc57cd1d16a4cdbd4aac2e770b6db
|
| 3 |
+
size 323869
|
spectrogram_cnn_3s_window (1).pth → spectrogram_cnn_3s_window.pth
RENAMED
|
File without changes
|
speech_recognizer.py
CHANGED
|
@@ -42,7 +42,9 @@ class SpeechRecognizer:
|
|
| 42 |
analysis = self._analyze_transcription(transcription, segments)
|
| 43 |
|
| 44 |
duration = analysis['duration'] if analysis['duration'] > 0 else 1.0
|
| 45 |
-
kopparapu_features = self._extract_kopparapu_features(
|
|
|
|
|
|
|
| 46 |
kopparapu_score = self._calculate_kopparapu_score(kopparapu_features)
|
| 47 |
|
| 48 |
return {
|
|
@@ -140,10 +142,13 @@ class SpeechRecognizer:
|
|
| 140 |
'pause_variability': float(np.std(pauses)) if len(pauses) > 1 else 0.0
|
| 141 |
}
|
| 142 |
|
| 143 |
-
def _extract_kopparapu_features(
|
|
|
|
|
|
|
|
|
|
| 144 |
"""
|
| 145 |
-
Extract Kopparapu-like linguistic features
|
| 146 |
-
Based on: https://arxiv.org/pdf/2306.08012
|
| 147 |
"""
|
| 148 |
text = text.strip()
|
| 149 |
if len(text) == 0:
|
|
@@ -153,7 +158,11 @@ class SpeechRecognizer:
|
|
| 153 |
'words_per_sec': 0.0,
|
| 154 |
'nonalpha_per_sec': 0.0,
|
| 155 |
'repetition_count': 0,
|
| 156 |
-
'filler_rate': 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
}
|
| 158 |
|
| 159 |
total_chars = len(text)
|
|
@@ -170,8 +179,10 @@ class SpeechRecognizer:
|
|
| 170 |
words_per_sec = num_words / duration_sec
|
| 171 |
nonalpha_per_sec = nonalpha_chars / duration_sec
|
| 172 |
|
|
|
|
| 173 |
char_reps = len(re.findall(r'(.)\1{2,}', text))
|
| 174 |
|
|
|
|
| 175 |
words_list = text.lower().split()
|
| 176 |
word_reps = 0
|
| 177 |
for i in range(len(words_list) - 1):
|
|
@@ -180,6 +191,7 @@ class SpeechRecognizer:
|
|
| 180 |
|
| 181 |
repetition_count = char_reps + word_reps
|
| 182 |
|
|
|
|
| 183 |
lower = text.lower()
|
| 184 |
filler_patterns = [
|
| 185 |
r'\bum\b', r'\buh\b', r'\buhm\b', r'\ber\b', r'\bah\b',
|
|
@@ -193,29 +205,154 @@ class SpeechRecognizer:
|
|
| 193 |
filler_count += len(re.findall(pattern, lower))
|
| 194 |
filler_rate = filler_count / num_words
|
| 195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
return {
|
| 197 |
'alpha_ratio': float(alpha_ratio),
|
| 198 |
'chars_per_word': float(chars_per_word),
|
| 199 |
'words_per_sec': float(words_per_sec),
|
| 200 |
'nonalpha_per_sec': float(nonalpha_per_sec),
|
| 201 |
'repetition_count': int(repetition_count),
|
| 202 |
-
'filler_rate': float(filler_rate)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
}
|
| 204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
def _logistic(self, x: float, a: float, b: float) -> float:
|
|
|
|
| 206 |
return 1.0 / (1.0 + np.exp(-(x - a) / b))
|
| 207 |
|
| 208 |
def _calculate_kopparapu_score(self, features: Dict) -> float:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
f1 = features['chars_per_word']
|
| 210 |
-
L1 = self._logistic(f1, a=
|
| 211 |
|
|
|
|
| 212 |
f2 = features['words_per_sec']
|
| 213 |
-
L2 = self._logistic(f2, a=2.
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
return float(score)
|
| 221 |
|
|
|
|
| 42 |
analysis = self._analyze_transcription(transcription, segments)
|
| 43 |
|
| 44 |
duration = analysis['duration'] if analysis['duration'] > 0 else 1.0
|
| 45 |
+
kopparapu_features = self._extract_kopparapu_features(
|
| 46 |
+
transcription, duration, segments, analysis['pause_patterns']
|
| 47 |
+
)
|
| 48 |
kopparapu_score = self._calculate_kopparapu_score(kopparapu_features)
|
| 49 |
|
| 50 |
return {
|
|
|
|
| 142 |
'pause_variability': float(np.std(pauses)) if len(pauses) > 1 else 0.0
|
| 143 |
}
|
| 144 |
|
| 145 |
+
def _extract_kopparapu_features(
|
| 146 |
+
self, text: str, duration_sec: float,
|
| 147 |
+
segments: List[Dict] = None, pause_patterns: Dict = None
|
| 148 |
+
) -> Dict:
|
| 149 |
"""
|
| 150 |
+
Extract enhanced Kopparapu-like linguistic features for read speech detection.
|
| 151 |
+
Based on: https://arxiv.org/pdf/2306.08012 with extensions.
|
| 152 |
"""
|
| 153 |
text = text.strip()
|
| 154 |
if len(text) == 0:
|
|
|
|
| 158 |
'words_per_sec': 0.0,
|
| 159 |
'nonalpha_per_sec': 0.0,
|
| 160 |
'repetition_count': 0,
|
| 161 |
+
'filler_rate': 0.0,
|
| 162 |
+
'pause_regularity': 0.5,
|
| 163 |
+
'speech_rate_variability': 0.0,
|
| 164 |
+
'sentence_length_variance': 0.0,
|
| 165 |
+
'self_correction_count': 0
|
| 166 |
}
|
| 167 |
|
| 168 |
total_chars = len(text)
|
|
|
|
| 179 |
words_per_sec = num_words / duration_sec
|
| 180 |
nonalpha_per_sec = nonalpha_chars / duration_sec
|
| 181 |
|
| 182 |
+
# Character repetitions (e.g., "sooo", "ummmm")
|
| 183 |
char_reps = len(re.findall(r'(.)\1{2,}', text))
|
| 184 |
|
| 185 |
+
# Word repetitions (e.g., "I I think", "the the")
|
| 186 |
words_list = text.lower().split()
|
| 187 |
word_reps = 0
|
| 188 |
for i in range(len(words_list) - 1):
|
|
|
|
| 191 |
|
| 192 |
repetition_count = char_reps + word_reps
|
| 193 |
|
| 194 |
+
# Filler words detection
|
| 195 |
lower = text.lower()
|
| 196 |
filler_patterns = [
|
| 197 |
r'\bum\b', r'\buh\b', r'\buhm\b', r'\ber\b', r'\bah\b',
|
|
|
|
| 205 |
filler_count += len(re.findall(pattern, lower))
|
| 206 |
filler_rate = filler_count / num_words
|
| 207 |
|
| 208 |
+
# NEW: Pause regularity - read speech has regular pauses at punctuation
|
| 209 |
+
# Low variability = regular pauses = likely read
|
| 210 |
+
pause_regularity = 0.5 # neutral default
|
| 211 |
+
if pause_patterns and pause_patterns.get('num_pauses', 0) > 2:
|
| 212 |
+
pause_var = pause_patterns.get('pause_variability', 0.5)
|
| 213 |
+
# Normalize: low variability (< 0.2) -> high regularity (close to 1)
|
| 214 |
+
# High variability (> 0.6) -> low regularity (close to 0)
|
| 215 |
+
pause_regularity = max(0.0, min(1.0, 1.0 - (pause_var / 0.6)))
|
| 216 |
+
|
| 217 |
+
# NEW: Speech rate variability across segments
|
| 218 |
+
# Read speech has consistent pacing; spontaneous varies with thinking
|
| 219 |
+
speech_rate_variability = self._compute_rate_variability(segments) if segments else 0.0
|
| 220 |
+
|
| 221 |
+
# NEW: Sentence length variance - read text has more uniform structure
|
| 222 |
+
sentence_length_variance = self._compute_sentence_variance(text)
|
| 223 |
+
|
| 224 |
+
# NEW: Self-corrections and false starts (spontaneous speech markers)
|
| 225 |
+
self_correction_patterns = [
|
| 226 |
+
r'\bwait\b', r'\bsorry\b', r'\bno\s*,?\s*I\b',
|
| 227 |
+
r'\bactually\s*,?\s*no\b', r'\blet me\b', r'\bwhat I meant\b',
|
| 228 |
+
r'\bI meant\b', r'\bhold on\b', r'\bwhat was I\b', r'\bor rather\b'
|
| 229 |
+
]
|
| 230 |
+
self_correction_count = 0
|
| 231 |
+
for pattern in self_correction_patterns:
|
| 232 |
+
self_correction_count += len(re.findall(pattern, lower))
|
| 233 |
+
|
| 234 |
return {
|
| 235 |
'alpha_ratio': float(alpha_ratio),
|
| 236 |
'chars_per_word': float(chars_per_word),
|
| 237 |
'words_per_sec': float(words_per_sec),
|
| 238 |
'nonalpha_per_sec': float(nonalpha_per_sec),
|
| 239 |
'repetition_count': int(repetition_count),
|
| 240 |
+
'filler_rate': float(filler_rate),
|
| 241 |
+
'pause_regularity': float(pause_regularity),
|
| 242 |
+
'speech_rate_variability': float(speech_rate_variability),
|
| 243 |
+
'sentence_length_variance': float(sentence_length_variance),
|
| 244 |
+
'self_correction_count': int(self_correction_count)
|
| 245 |
}
|
| 246 |
|
| 247 |
+
def _compute_rate_variability(self, segments: List[Dict]) -> float:
|
| 248 |
+
"""
|
| 249 |
+
Compute speech rate variability across segments.
|
| 250 |
+
Read speech has consistent rate; spontaneous varies with thinking.
|
| 251 |
+
Returns 0-1 where higher = more variable = more spontaneous.
|
| 252 |
+
"""
|
| 253 |
+
if not segments or len(segments) < 3:
|
| 254 |
+
return 0.0
|
| 255 |
+
|
| 256 |
+
segment_rates = []
|
| 257 |
+
for seg in segments:
|
| 258 |
+
duration = seg.get('end', 0) - seg.get('start', 0)
|
| 259 |
+
if duration > 0.3: # Only consider segments > 300ms
|
| 260 |
+
words_in_seg = len(seg.get('text', '').split())
|
| 261 |
+
rate = words_in_seg / duration
|
| 262 |
+
if rate > 0:
|
| 263 |
+
segment_rates.append(rate)
|
| 264 |
+
|
| 265 |
+
if len(segment_rates) < 3:
|
| 266 |
+
return 0.0
|
| 267 |
+
|
| 268 |
+
mean_rate = np.mean(segment_rates)
|
| 269 |
+
std_rate = np.std(segment_rates)
|
| 270 |
+
|
| 271 |
+
# Coefficient of variation normalized to 0-1
|
| 272 |
+
cv = std_rate / mean_rate if mean_rate > 0 else 0
|
| 273 |
+
return float(min(1.0, cv / 0.5)) # CV of 0.5+ maps to 1.0
|
| 274 |
+
|
| 275 |
+
def _compute_sentence_variance(self, text: str) -> float:
|
| 276 |
+
"""
|
| 277 |
+
Compute variance in sentence lengths.
|
| 278 |
+
Read/scripted text tends to have more uniform sentence structure.
|
| 279 |
+
Returns 0-1 where higher = more variance = more spontaneous.
|
| 280 |
+
"""
|
| 281 |
+
# Split into sentences
|
| 282 |
+
sentences = re.split(r'[.!?]+', text)
|
| 283 |
+
sentences = [s.strip() for s in sentences if s.strip()]
|
| 284 |
+
|
| 285 |
+
if len(sentences) < 2:
|
| 286 |
+
return 0.0
|
| 287 |
+
|
| 288 |
+
lengths = [len(s.split()) for s in sentences]
|
| 289 |
+
mean_len = np.mean(lengths)
|
| 290 |
+
std_len = np.std(lengths)
|
| 291 |
+
|
| 292 |
+
# Coefficient of variation normalized
|
| 293 |
+
cv = std_len / mean_len if mean_len > 0 else 0
|
| 294 |
+
return float(min(1.0, cv / 0.6)) # CV of 0.6+ maps to 1.0
|
| 295 |
+
|
| 296 |
def _logistic(self, x: float, a: float, b: float) -> float:
|
| 297 |
+
"""Sigmoid function centered at 'a' with steepness 'b'."""
|
| 298 |
return 1.0 / (1.0 + np.exp(-(x - a) / b))
|
| 299 |
|
| 300 |
def _calculate_kopparapu_score(self, features: Dict) -> float:
|
| 301 |
+
"""
|
| 302 |
+
Calculate enhanced Kopparapu score for read vs spontaneous classification.
|
| 303 |
+
Score closer to 1 = more likely READ, closer to 0 = more likely SPONTANEOUS.
|
| 304 |
+
|
| 305 |
+
Key signals for READ speech:
|
| 306 |
+
- Higher chars_per_word (formal vocabulary)
|
| 307 |
+
- Faster, steadier words_per_sec
|
| 308 |
+
- Lower filler rate and disfluencies
|
| 309 |
+
- Regular pause patterns (pause_regularity high)
|
| 310 |
+
- Low speech rate variability
|
| 311 |
+
- Uniform sentence lengths
|
| 312 |
+
"""
|
| 313 |
+
# L1: Vocabulary complexity - higher chars/word = more formal = read
|
| 314 |
f1 = features['chars_per_word']
|
| 315 |
+
L1 = self._logistic(f1, a=4.8, b=1.2)
|
| 316 |
|
| 317 |
+
# L2: Speaking rate - faster, steadier = read
|
| 318 |
f2 = features['words_per_sec']
|
| 319 |
+
L2 = self._logistic(f2, a=2.2, b=0.6)
|
| 320 |
+
|
| 321 |
+
# L3: Disfluency signal (inverted) - less disfluency = more read
|
| 322 |
+
# Combines filler rate, nonalpha, and repetitions
|
| 323 |
+
disfluency = (
|
| 324 |
+
features['nonalpha_per_sec'] +
|
| 325 |
+
8.0 * features['filler_rate'] +
|
| 326 |
+
0.5 * features['repetition_count']
|
| 327 |
+
)
|
| 328 |
+
L3 = self._logistic(-disfluency, a=0.0, b=0.8)
|
| 329 |
+
|
| 330 |
+
# L4: Pause regularity - regular pauses = read (already 0-1)
|
| 331 |
+
L4 = features.get('pause_regularity', 0.5)
|
| 332 |
+
|
| 333 |
+
# L5: Rate variability (inverted) - low variability = read
|
| 334 |
+
rate_var = features.get('speech_rate_variability', 0.0)
|
| 335 |
+
L5 = 1.0 - rate_var
|
| 336 |
+
|
| 337 |
+
# L6: Sentence variance (inverted) - uniform sentences = read
|
| 338 |
+
sent_var = features.get('sentence_length_variance', 0.0)
|
| 339 |
+
L6 = 1.0 - sent_var
|
| 340 |
+
|
| 341 |
+
# L7: Self-corrections (inverted) - more corrections = spontaneous
|
| 342 |
+
corrections = features.get('self_correction_count', 0)
|
| 343 |
+
L7 = self._logistic(-corrections, a=0.0, b=1.5)
|
| 344 |
+
|
| 345 |
+
# Weighted combination optimized for read detection
|
| 346 |
+
# Higher weights on pause regularity and rate consistency (key read markers)
|
| 347 |
+
score = (
|
| 348 |
+
0.15 * L1 + # Vocabulary complexity
|
| 349 |
+
0.15 * L2 + # Speaking rate
|
| 350 |
+
0.15 * L3 + # Disfluency (filler/repetition)
|
| 351 |
+
0.20 * L4 + # Pause regularity (strong read signal)
|
| 352 |
+
0.15 * L5 + # Rate variability
|
| 353 |
+
0.10 * L6 + # Sentence uniformity
|
| 354 |
+
0.10 * L7 # Self-corrections
|
| 355 |
+
)
|
| 356 |
|
| 357 |
return float(score)
|
| 358 |
|