Spaces:
Sleeping
Sleeping
Ranam Hamoud
committed on
Commit
·
95ad43e
1
Parent(s):
887ba32
Update audio_classifier and pipeline with latest improvements
Browse files
- audio_classifier.py +111 -82
- pipeline.py +18 -14
audio_classifier.py
CHANGED
|
@@ -215,102 +215,122 @@ class AudioClassifier:
|
|
| 215 |
return features
|
| 216 |
|
| 217 |
def _compute_prosody_scores(self, features: Dict[str, float]) -> Dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
individual_scores = {}
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
|
|
|
| 231 |
else:
|
| 232 |
-
|
| 233 |
|
| 234 |
-
individual_scores['
|
| 235 |
-
'score':
|
| 236 |
-
'value':
|
| 237 |
-
'interpretation': '
|
| 238 |
}
|
| 239 |
|
| 240 |
-
#
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
else:
|
| 248 |
-
|
| 249 |
|
| 250 |
-
individual_scores['
|
| 251 |
-
'score':
|
| 252 |
-
'value':
|
| 253 |
-
'interpretation': '
|
| 254 |
}
|
| 255 |
|
| 256 |
-
#
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
| 263 |
else:
|
| 264 |
-
|
| 265 |
|
| 266 |
-
individual_scores['
|
| 267 |
-
'score':
|
| 268 |
-
'value':
|
| 269 |
-
'interpretation': '
|
| 270 |
}
|
| 271 |
|
| 272 |
-
#
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
|
|
|
|
|
|
| 277 |
else:
|
| 278 |
-
|
| 279 |
|
| 280 |
-
individual_scores['
|
| 281 |
-
'score':
|
| 282 |
-
'value':
|
| 283 |
-
'interpretation': '
|
| 284 |
}
|
| 285 |
|
|
|
|
| 286 |
weights = {
|
| 287 |
-
'
|
| 288 |
-
'
|
| 289 |
-
'
|
| 290 |
-
'
|
| 291 |
}
|
| 292 |
|
| 293 |
overall_score = (
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
)
|
| 299 |
|
| 300 |
-
|
|
|
|
| 301 |
classification = 'read'
|
| 302 |
-
confidence = 0.5 + (overall_score - 0.5)
|
| 303 |
-
elif overall_score < 0.
|
| 304 |
classification = 'spontaneous'
|
| 305 |
-
confidence = 0.5 + (0.5 - overall_score)
|
| 306 |
else:
|
| 307 |
-
# Borderline
|
| 308 |
classification = 'read' if overall_score >= 0.5 else 'spontaneous'
|
| 309 |
-
confidence = 0.5 + abs(overall_score - 0.5) * 0.
|
| 310 |
|
| 311 |
return {
|
| 312 |
'classification': classification,
|
| 313 |
-
'confidence': confidence,
|
| 314 |
'overall_score': overall_score,
|
| 315 |
'individual_scores': individual_scores
|
| 316 |
}
|
|
@@ -326,7 +346,7 @@ class AudioClassifier:
|
|
| 326 |
predicted_class = torch.argmax(probabilities, dim=1).item()
|
| 327 |
cnn_confidence = probabilities[0, predicted_class].item()
|
| 328 |
|
| 329 |
-
# Debug output
|
| 330 |
print(f"CNN Logits: {logits[0].cpu().numpy()}")
|
| 331 |
print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
|
| 332 |
print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
|
|
@@ -337,21 +357,31 @@ class AudioClassifier:
|
|
| 337 |
prosody_classification = prosody_scores['classification']
|
| 338 |
prosody_confidence = prosody_scores['confidence']
|
| 339 |
|
| 340 |
-
#
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
else:
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
final_classification = prosody_classification
|
| 355 |
|
| 356 |
return {
|
| 357 |
'classification': final_classification,
|
|
@@ -405,4 +435,3 @@ if __name__ == "__main__":
|
|
| 405 |
|
| 406 |
print("\nModel architecture:")
|
| 407 |
print(classifier.model)
|
| 408 |
-
|
|
|
|
| 215 |
return features
|
| 216 |
|
| 217 |
def _compute_prosody_scores(self, features: Dict[str, float]) -> Dict:
|
| 218 |
+
"""
|
| 219 |
+
Optimized prosody scoring based on feature analysis:
|
| 220 |
+
- spectral_centroid_std: 80% accuracy (threshold ~1017, read >= threshold)
|
| 221 |
+
- zcr_mean: 75% accuracy (threshold ~0.11, read >= threshold)
|
| 222 |
+
- energy_mean: 70% accuracy (threshold ~0.06, read < threshold)
|
| 223 |
+
- pitch_range: 75% accuracy (threshold ~3837, read < threshold)
|
| 224 |
+
"""
|
| 225 |
individual_scores = {}
|
| 226 |
+
|
| 227 |
+
# 1. Spectral centroid std - MOST discriminative (separation: 1.11)
|
| 228 |
+
# Read: 1087 avg, Spontaneous: 1017 avg
|
| 229 |
+
# Threshold: ~1050, read >= threshold
|
| 230 |
+
sc_std = features['spectral_centroid_std']
|
| 231 |
+
if sc_std >= 1100:
|
| 232 |
+
spectral_score = 0.9 # Strongly indicates read
|
| 233 |
+
elif sc_std >= 1050:
|
| 234 |
+
spectral_score = 0.7 # Likely read
|
| 235 |
+
elif sc_std >= 1000:
|
| 236 |
+
spectral_score = 0.5 # Borderline
|
| 237 |
+
elif sc_std >= 950:
|
| 238 |
+
spectral_score = 0.3 # Likely spontaneous
|
| 239 |
else:
|
| 240 |
+
spectral_score = 0.1 # Strongly spontaneous
|
| 241 |
|
| 242 |
+
individual_scores['spectral_variability'] = {
|
| 243 |
+
'score': spectral_score,
|
| 244 |
+
'value': sc_std,
|
| 245 |
+
'interpretation': 'high variability (read)' if spectral_score > 0.6 else 'low variability (spontaneous)' if spectral_score < 0.4 else 'moderate'
|
| 246 |
}
|
| 247 |
|
| 248 |
+
# 2. ZCR mean - Second most discriminative (separation: 0.81)
|
| 249 |
+
# Read: 0.12 avg, Spontaneous: 0.10 avg
|
| 250 |
+
# Threshold: ~0.11, read >= threshold
|
| 251 |
+
zcr = features['zcr_mean']
|
| 252 |
+
if zcr >= 0.13:
|
| 253 |
+
zcr_score = 0.9 # Strongly indicates read
|
| 254 |
+
elif zcr >= 0.115:
|
| 255 |
+
zcr_score = 0.7 # Likely read
|
| 256 |
+
elif zcr >= 0.105:
|
| 257 |
+
zcr_score = 0.5 # Borderline
|
| 258 |
+
elif zcr >= 0.095:
|
| 259 |
+
zcr_score = 0.3 # Likely spontaneous
|
| 260 |
else:
|
| 261 |
+
zcr_score = 0.1 # Strongly spontaneous
|
| 262 |
|
| 263 |
+
individual_scores['zcr_mean'] = {
|
| 264 |
+
'score': zcr_score,
|
| 265 |
+
'value': zcr,
|
| 266 |
+
'interpretation': 'high ZCR (read)' if zcr_score > 0.6 else 'low ZCR (spontaneous)' if zcr_score < 0.4 else 'moderate'
|
| 267 |
}
|
| 268 |
|
| 269 |
+
# 3. Energy mean (separation: 0.69)
|
| 270 |
+
# Read: 0.06 avg, Spontaneous: 0.06 avg but spontaneous tends higher
|
| 271 |
+
# Threshold: ~0.06, read < threshold
|
| 272 |
+
energy = features['energy_mean']
|
| 273 |
+
if energy < 0.055:
|
| 274 |
+
energy_score = 0.8 # Low energy -> likely read
|
| 275 |
+
elif energy < 0.065:
|
| 276 |
+
energy_score = 0.5 # Moderate
|
| 277 |
+
elif energy < 0.075:
|
| 278 |
+
energy_score = 0.3 # Higher energy -> likely spontaneous
|
| 279 |
else:
|
| 280 |
+
energy_score = 0.1 # High energy -> spontaneous
|
| 281 |
|
| 282 |
+
individual_scores['energy_level'] = {
|
| 283 |
+
'score': energy_score,
|
| 284 |
+
'value': energy,
|
| 285 |
+
'interpretation': 'low energy (read)' if energy_score > 0.6 else 'high energy (spontaneous)' if energy_score < 0.4 else 'moderate'
|
| 286 |
}
|
| 287 |
|
| 288 |
+
# 4. Tempo (separation: 0.22) - less discriminative but still useful
|
| 289 |
+
# Read: 122 avg, Spontaneous: 125 avg
|
| 290 |
+
tempo = features['tempo']
|
| 291 |
+
if tempo < 115:
|
| 292 |
+
tempo_score = 0.7 # Slower -> could be read (more deliberate)
|
| 293 |
+
elif tempo < 125:
|
| 294 |
+
tempo_score = 0.5 # Moderate
|
| 295 |
else:
|
| 296 |
+
tempo_score = 0.3 # Faster -> could be spontaneous
|
| 297 |
|
| 298 |
+
individual_scores['tempo'] = {
|
| 299 |
+
'score': tempo_score,
|
| 300 |
+
'value': tempo,
|
| 301 |
+
'interpretation': 'slow (read)' if tempo_score > 0.6 else 'fast (spontaneous)' if tempo_score < 0.4 else 'moderate'
|
| 302 |
}
|
| 303 |
|
| 304 |
+
# Optimized weights based on feature separation scores
|
| 305 |
weights = {
|
| 306 |
+
'spectral_variability': 0.40, # Best discriminator (1.11 separation)
|
| 307 |
+
'zcr_mean': 0.30, # Second best (0.81 separation)
|
| 308 |
+
'energy_level': 0.20, # Third (0.69 separation)
|
| 309 |
+
'tempo': 0.10 # Weakest (0.22 separation)
|
| 310 |
}
|
| 311 |
|
| 312 |
overall_score = (
|
| 313 |
+
spectral_score * weights['spectral_variability'] +
|
| 314 |
+
zcr_score * weights['zcr_mean'] +
|
| 315 |
+
energy_score * weights['energy_level'] +
|
| 316 |
+
tempo_score * weights['tempo']
|
| 317 |
)
|
| 318 |
|
| 319 |
+
# More decisive thresholds
|
| 320 |
+
if overall_score > 0.60:
|
| 321 |
classification = 'read'
|
| 322 |
+
confidence = 0.5 + (overall_score - 0.5) * 0.8
|
| 323 |
+
elif overall_score < 0.40:
|
| 324 |
classification = 'spontaneous'
|
| 325 |
+
confidence = 0.5 + (0.5 - overall_score) * 0.8
|
| 326 |
else:
|
| 327 |
+
# Borderline - slight lean based on score
|
| 328 |
classification = 'read' if overall_score >= 0.5 else 'spontaneous'
|
| 329 |
+
confidence = 0.5 + abs(overall_score - 0.5) * 0.6
|
| 330 |
|
| 331 |
return {
|
| 332 |
'classification': classification,
|
| 333 |
+
'confidence': min(0.95, confidence),
|
| 334 |
'overall_score': overall_score,
|
| 335 |
'individual_scores': individual_scores
|
| 336 |
}
|
|
|
|
| 346 |
predicted_class = torch.argmax(probabilities, dim=1).item()
|
| 347 |
cnn_confidence = probabilities[0, predicted_class].item()
|
| 348 |
|
| 349 |
+
# Debug output - Model: Class 0=read, Class 1=spontaneous
|
| 350 |
print(f"CNN Logits: {logits[0].cpu().numpy()}")
|
| 351 |
print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
|
| 352 |
print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
|
|
|
|
| 357 |
prosody_classification = prosody_scores['classification']
|
| 358 |
prosody_confidence = prosody_scores['confidence']
|
| 359 |
|
| 360 |
+
# Model mapping: Class 0 = read, Class 1 = spontaneous
|
| 361 |
+
cnn_class_name = 'read' if predicted_class == 0 else 'spontaneous'
|
| 362 |
+
print(f"CNN classification: {cnn_class_name}")
|
| 363 |
+
print(f"Prosody classification: {prosody_classification} (conf={prosody_confidence:.2f})")
|
| 364 |
+
|
| 365 |
+
# Weighted combination: Prosody is more reliable (60% acc) than CNN (50% acc)
|
| 366 |
+
# Convert classifications to scores: read=1, spontaneous=0
|
| 367 |
+
cnn_score = 1.0 if cnn_class_name == 'read' else 0.0
|
| 368 |
+
prosody_score = 1.0 if prosody_classification == 'read' else 0.0
|
| 369 |
+
|
| 370 |
+
# Weight prosody more heavily (0.6) than CNN (0.4)
|
| 371 |
+
# Also factor in confidence
|
| 372 |
+
weighted_score = (
|
| 373 |
+
cnn_score * cnn_confidence * 0.4 +
|
| 374 |
+
prosody_score * prosody_confidence * 0.6
|
| 375 |
+
) / (cnn_confidence * 0.4 + prosody_confidence * 0.6)
|
| 376 |
+
|
| 377 |
+
if weighted_score > 0.5:
|
| 378 |
+
final_classification = 'read'
|
| 379 |
+
final_confidence = 0.5 + (weighted_score - 0.5)
|
| 380 |
else:
|
| 381 |
+
final_classification = 'spontaneous'
|
| 382 |
+
final_confidence = 0.5 + (0.5 - weighted_score)
|
| 383 |
+
|
| 384 |
+
final_confidence = min(0.95, final_confidence)
|
|
|
|
| 385 |
|
| 386 |
return {
|
| 387 |
'classification': final_classification,
|
|
|
|
| 435 |
|
| 436 |
print("\nModel architecture:")
|
| 437 |
print(classifier.model)
|
|
|
pipeline.py
CHANGED
|
@@ -1,8 +1,3 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Multimodal Authenticity Detection Pipeline
|
| 3 |
-
Integrates CNN audio classification, Whisper ASR, and text authenticity analysis
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
from typing import Dict, Optional
|
| 7 |
import time
|
| 8 |
from audio_classifier import AudioClassifier
|
|
@@ -102,22 +97,32 @@ class AuthenticityDetectionPipeline:
|
|
| 102 |
text_results: Dict
|
| 103 |
) -> Dict:
|
| 104 |
|
|
|
|
| 105 |
if audio_results['classification'] == 'spontaneous':
|
| 106 |
audio_score = audio_results['confidence']
|
| 107 |
else: # read
|
| 108 |
audio_score = 1.0 - audio_results['confidence']
|
| 109 |
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
speech_pattern_score = 1.0 - asr_results['kopparapu_score']
|
| 114 |
|
| 115 |
-
|
|
|
|
|
|
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
composite_score = (
|
| 118 |
-
audio_score * 0.
|
| 119 |
-
speech_pattern_score * 0.
|
| 120 |
-
|
|
|
|
|
|
|
| 121 |
)
|
| 122 |
|
| 123 |
if composite_score >= 0.7:
|
|
@@ -186,4 +191,3 @@ if __name__ == "__main__":
|
|
| 186 |
whisper_model_size="base"
|
| 187 |
)
|
| 188 |
print("\nPipeline ready for audio analysis.")
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from typing import Dict, Optional
|
| 2 |
import time
|
| 3 |
from audio_classifier import AudioClassifier
|
|
|
|
| 97 |
text_results: Dict
|
| 98 |
) -> Dict:
|
| 99 |
|
| 100 |
+
# CNN score: spontaneous = authentic (high), read = inauthentic (low)
|
| 101 |
if audio_results['classification'] == 'spontaneous':
|
| 102 |
audio_score = audio_results['confidence']
|
| 103 |
else: # read
|
| 104 |
audio_score = 1.0 - audio_results['confidence']
|
| 105 |
|
| 106 |
+
# Kopparapu score: 0=spontaneous, 1=read
|
| 107 |
+
# Invert so spontaneous (low kopparapu) = high authenticity
|
| 108 |
+
speech_pattern_score = 1.0 - asr_results['kopparapu_score']
|
|
|
|
| 109 |
|
| 110 |
+
# Filler words: higher ratio = more spontaneous = more authentic
|
| 111 |
+
filler_ratio = asr_results['filler_words']['ratio']
|
| 112 |
+
filler_score = min(1.0, filler_ratio / 0.05) # Normalize: 5%+ = max score
|
| 113 |
|
| 114 |
+
# Pause variability: higher = more spontaneous = more authentic
|
| 115 |
+
pause_var = asr_results['pause_patterns']['pause_variability']
|
| 116 |
+
pause_score = min(1.0, pause_var / 0.5) # Normalize: 0.5+ = max score
|
| 117 |
+
|
| 118 |
+
text_auth_score = text_results['authenticity_score']
|
| 119 |
+
|
| 120 |
composite_score = (
|
| 121 |
+
audio_score * 0.15 + # CNN - weakest component
|
| 122 |
+
speech_pattern_score * 0.20 + # Kopparapu linguistic
|
| 123 |
+
filler_score * 0.10 + # Filler word ratio
|
| 124 |
+
pause_score * 0.05 + # Pause variability
|
| 125 |
+
text_auth_score * 0.50 # Text authenticity - strongest signal
|
| 126 |
)
|
| 127 |
|
| 128 |
if composite_score >= 0.7:
|
|
|
|
| 191 |
whisper_model_size="base"
|
| 192 |
)
|
| 193 |
print("\nPipeline ready for audio analysis.")
|
|
|