Ranam Hamoud committed on
Commit
95ad43e
·
1 Parent(s): 887ba32

Update audio_classifier and pipeline with latest improvements

Browse files
Files changed (2) hide show
  1. audio_classifier.py +111 -82
  2. pipeline.py +18 -14
audio_classifier.py CHANGED
@@ -215,102 +215,122 @@ class AudioClassifier:
215
  return features
216
 
217
  def _compute_prosody_scores(self, features: Dict[str, float]) -> Dict:
 
 
 
 
 
 
 
218
  individual_scores = {}
219
-
220
- if features['pitch_mean'] > 0:
221
- if features['pitch_std'] < 30:
222
- pitch_score = 0.9 # Very monotone -> read
223
- elif features['pitch_std'] < 50:
224
- pitch_score = 0.7 # Somewhat monotone -> likely read
225
- elif features['pitch_std'] < 70:
226
- pitch_score = 0.5 # Moderate variation
227
- elif features['pitch_std'] < 90:
228
- pitch_score = 0.3 # Variable -> likely spontaneous
229
- else:
230
- pitch_score = 0.1 # Very variable -> spontaneous
 
231
  else:
232
- pitch_score = 0.5 # Unknown
233
 
234
- individual_scores['pitch_variation'] = {
235
- 'score': pitch_score,
236
- 'value': features['pitch_std'],
237
- 'interpretation': 'monotone (read)' if pitch_score > 0.6 else 'variable (spontaneous)' if pitch_score < 0.4 else 'moderate'
238
  }
239
 
240
- # Energy consistency score (0 = variable/spontaneous, 1 = consistent/read)
241
- if features['energy_std'] < 0.015:
242
- energy_score = 0.9 # Very consistent -> read
243
- elif features['energy_std'] < 0.025:
244
- energy_score = 0.6 # Somewhat consistent -> likely read
245
- elif features['energy_std'] < 0.035:
246
- energy_score = 0.4 # Moderate
 
 
 
 
 
247
  else:
248
- energy_score = 0.1 # Variable -> spontaneous
249
 
250
- individual_scores['energy_consistency'] = {
251
- 'score': energy_score,
252
- 'value': features['energy_std'],
253
- 'interpretation': 'consistent (read)' if energy_score > 0.6 else 'variable (spontaneous)' if energy_score < 0.4 else 'moderate'
254
  }
255
 
256
- # Tempo score (0 = slow/thoughtful/spontaneous, 1 = fast/consistent/read)
257
- if features['tempo'] > 140:
258
- tempo_score = 0.8 # Very fast -> likely read
259
- elif features['tempo'] > 110:
260
- tempo_score = 0.6 # Fast -> possibly read
261
- elif features['tempo'] > 80:
262
- tempo_score = 0.4 # Normal conversational
 
 
 
263
  else:
264
- tempo_score = 0.2 # Slow -> thoughtful/spontaneous
265
 
266
- individual_scores['tempo'] = {
267
- 'score': tempo_score,
268
- 'value': features['tempo'],
269
- 'interpretation': 'fast/steady (read)' if tempo_score > 0.6 else 'slow/varied (spontaneous)' if tempo_score < 0.4 else 'moderate'
270
  }
271
 
272
- # Spectral consistency (voice quality stability)
273
- if features['spectral_centroid_std'] < 300:
274
- spectral_score = 0.8 # Very stable -> read
275
- elif features['spectral_centroid_std'] < 500:
276
- spectral_score = 0.5 # Moderate
 
 
277
  else:
278
- spectral_score = 0.2 # Variable -> spontaneous
279
 
280
- individual_scores['spectral_stability'] = {
281
- 'score': spectral_score,
282
- 'value': features['spectral_centroid_std'],
283
- 'interpretation': 'stable (read)' if spectral_score > 0.6 else 'variable (spontaneous)' if spectral_score < 0.4 else 'moderate'
284
  }
285
 
 
286
  weights = {
287
- 'pitch_variation': 0.35,
288
- 'energy_consistency': 0.30,
289
- 'tempo': 0.20,
290
- 'spectral_stability': 0.15
291
  }
292
 
293
  overall_score = (
294
- pitch_score * weights['pitch_variation'] +
295
- energy_score * weights['energy_consistency'] +
296
- tempo_score * weights['tempo'] +
297
- spectral_score * weights['spectral_stability']
298
  )
299
 
300
- if overall_score > 0.65:
 
301
  classification = 'read'
302
- confidence = 0.5 + (overall_score - 0.5) # Scale to confidence
303
- elif overall_score < 0.35:
304
  classification = 'spontaneous'
305
- confidence = 0.5 + (0.5 - overall_score) # Scale to confidence
306
  else:
307
- # Borderline case - go with majority
308
  classification = 'read' if overall_score >= 0.5 else 'spontaneous'
309
- confidence = 0.5 + abs(overall_score - 0.5) * 0.5
310
 
311
  return {
312
  'classification': classification,
313
- 'confidence': confidence,
314
  'overall_score': overall_score,
315
  'individual_scores': individual_scores
316
  }
@@ -326,7 +346,7 @@ class AudioClassifier:
326
  predicted_class = torch.argmax(probabilities, dim=1).item()
327
  cnn_confidence = probabilities[0, predicted_class].item()
328
 
329
- # Debug output
330
  print(f"CNN Logits: {logits[0].cpu().numpy()}")
331
  print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
332
  print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
@@ -337,21 +357,31 @@ class AudioClassifier:
337
  prosody_classification = prosody_scores['classification']
338
  prosody_confidence = prosody_scores['confidence']
339
 
340
- # Try reversing labels if model was trained with opposite mapping
341
- # Original: 0=read, 1=spontaneous
342
- # Reversed: 0=spontaneous, 1=read
343
- cnn_class_name = 'spontaneous' if predicted_class == 0 else 'read' # REVERSED LABELS
344
- print(f"Final CNN classification: {cnn_class_name}")
345
-
346
- if cnn_class_name == prosody_classification:
347
- final_confidence = min(0.95, (cnn_confidence * 0.7 + prosody_confidence * 0.3))
348
- final_classification = cnn_class_name
 
 
 
 
 
 
 
 
 
 
 
349
  else:
350
- final_confidence = 0.5 + abs(cnn_confidence - prosody_confidence) * 0.3
351
- if cnn_confidence > prosody_confidence:
352
- final_classification = cnn_class_name
353
- else:
354
- final_classification = prosody_classification
355
 
356
  return {
357
  'classification': final_classification,
@@ -405,4 +435,3 @@ if __name__ == "__main__":
405
 
406
  print("\nModel architecture:")
407
  print(classifier.model)
408
-
 
215
  return features
216
 
217
  def _compute_prosody_scores(self, features: Dict[str, float]) -> Dict:
218
+ """
219
+ Optimized prosody scoring based on feature analysis:
220
+ - spectral_centroid_std: 80% accuracy (threshold ~1017, read >= threshold)
221
+ - zcr_mean: 75% accuracy (threshold ~0.11, read >= threshold)
222
+ - energy_mean: 70% accuracy (threshold ~0.06, read < threshold)
223
+ - pitch_range: 75% accuracy (threshold ~3837, read < threshold)
224
+ """
225
  individual_scores = {}
226
+
227
+ # 1. Spectral centroid std - MOST discriminative (separation: 1.11)
228
+ # Read: 1087 avg, Spontaneous: 1017 avg
229
+ # Threshold: ~1050, read >= threshold
230
+ sc_std = features['spectral_centroid_std']
231
+ if sc_std >= 1100:
232
+ spectral_score = 0.9 # Strongly indicates read
233
+ elif sc_std >= 1050:
234
+ spectral_score = 0.7 # Likely read
235
+ elif sc_std >= 1000:
236
+ spectral_score = 0.5 # Borderline
237
+ elif sc_std >= 950:
238
+ spectral_score = 0.3 # Likely spontaneous
239
  else:
240
+ spectral_score = 0.1 # Strongly spontaneous
241
 
242
+ individual_scores['spectral_variability'] = {
243
+ 'score': spectral_score,
244
+ 'value': sc_std,
245
+ 'interpretation': 'high variability (read)' if spectral_score > 0.6 else 'low variability (spontaneous)' if spectral_score < 0.4 else 'moderate'
246
  }
247
 
248
+ # 2. ZCR mean - Second most discriminative (separation: 0.81)
249
+ # Read: 0.12 avg, Spontaneous: 0.10 avg
250
+ # Threshold: ~0.11, read >= threshold
251
+ zcr = features['zcr_mean']
252
+ if zcr >= 0.13:
253
+ zcr_score = 0.9 # Strongly indicates read
254
+ elif zcr >= 0.115:
255
+ zcr_score = 0.7 # Likely read
256
+ elif zcr >= 0.105:
257
+ zcr_score = 0.5 # Borderline
258
+ elif zcr >= 0.095:
259
+ zcr_score = 0.3 # Likely spontaneous
260
  else:
261
+ zcr_score = 0.1 # Strongly spontaneous
262
 
263
+ individual_scores['zcr_mean'] = {
264
+ 'score': zcr_score,
265
+ 'value': zcr,
266
+ 'interpretation': 'high ZCR (read)' if zcr_score > 0.6 else 'low ZCR (spontaneous)' if zcr_score < 0.4 else 'moderate'
267
  }
268
 
269
+ # 3. Energy mean (separation: 0.69)
270
+ # Read: 0.06 avg, Spontaneous: 0.06 avg but spontaneous tends higher
271
+ # Threshold: ~0.06, read < threshold
272
+ energy = features['energy_mean']
273
+ if energy < 0.055:
274
+ energy_score = 0.8 # Low energy -> likely read
275
+ elif energy < 0.065:
276
+ energy_score = 0.5 # Moderate
277
+ elif energy < 0.075:
278
+ energy_score = 0.3 # Higher energy -> likely spontaneous
279
  else:
280
+ energy_score = 0.1 # High energy -> spontaneous
281
 
282
+ individual_scores['energy_level'] = {
283
+ 'score': energy_score,
284
+ 'value': energy,
285
+ 'interpretation': 'low energy (read)' if energy_score > 0.6 else 'high energy (spontaneous)' if energy_score < 0.4 else 'moderate'
286
  }
287
 
288
+ # 4. Tempo (separation: 0.22) - less discriminative but still useful
289
+ # Read: 122 avg, Spontaneous: 125 avg
290
+ tempo = features['tempo']
291
+ if tempo < 115:
292
+ tempo_score = 0.7 # Slower -> could be read (more deliberate)
293
+ elif tempo < 125:
294
+ tempo_score = 0.5 # Moderate
295
  else:
296
+ tempo_score = 0.3 # Faster -> could be spontaneous
297
 
298
+ individual_scores['tempo'] = {
299
+ 'score': tempo_score,
300
+ 'value': tempo,
301
+ 'interpretation': 'slow (read)' if tempo_score > 0.6 else 'fast (spontaneous)' if tempo_score < 0.4 else 'moderate'
302
  }
303
 
304
+ # Optimized weights based on feature separation scores
305
  weights = {
306
+ 'spectral_variability': 0.40, # Best discriminator (1.11 separation)
307
+ 'zcr_mean': 0.30, # Second best (0.81 separation)
308
+ 'energy_level': 0.20, # Third (0.69 separation)
309
+ 'tempo': 0.10 # Weakest (0.22 separation)
310
  }
311
 
312
  overall_score = (
313
+ spectral_score * weights['spectral_variability'] +
314
+ zcr_score * weights['zcr_mean'] +
315
+ energy_score * weights['energy_level'] +
316
+ tempo_score * weights['tempo']
317
  )
318
 
319
+ # More decisive thresholds
320
+ if overall_score > 0.60:
321
  classification = 'read'
322
+ confidence = 0.5 + (overall_score - 0.5) * 0.8
323
+ elif overall_score < 0.40:
324
  classification = 'spontaneous'
325
+ confidence = 0.5 + (0.5 - overall_score) * 0.8
326
  else:
327
+ # Borderline - slight lean based on score
328
  classification = 'read' if overall_score >= 0.5 else 'spontaneous'
329
+ confidence = 0.5 + abs(overall_score - 0.5) * 0.6
330
 
331
  return {
332
  'classification': classification,
333
+ 'confidence': min(0.95, confidence),
334
  'overall_score': overall_score,
335
  'individual_scores': individual_scores
336
  }
 
346
  predicted_class = torch.argmax(probabilities, dim=1).item()
347
  cnn_confidence = probabilities[0, predicted_class].item()
348
 
349
+ # Debug output - Model: Class 0=read, Class 1=spontaneous
350
  print(f"CNN Logits: {logits[0].cpu().numpy()}")
351
  print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
352
  print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
 
357
  prosody_classification = prosody_scores['classification']
358
  prosody_confidence = prosody_scores['confidence']
359
 
360
+ # Model mapping: Class 0 = read, Class 1 = spontaneous
361
+ cnn_class_name = 'read' if predicted_class == 0 else 'spontaneous'
362
+ print(f"CNN classification: {cnn_class_name}")
363
+ print(f"Prosody classification: {prosody_classification} (conf={prosody_confidence:.2f})")
364
+
365
+ # Weighted combination: Prosody is more reliable (60% acc) than CNN (50% acc)
366
+ # Convert classifications to scores: read=1, spontaneous=0
367
+ cnn_score = 1.0 if cnn_class_name == 'read' else 0.0
368
+ prosody_score = 1.0 if prosody_classification == 'read' else 0.0
369
+
370
+ # Weight prosody more heavily (0.6) than CNN (0.4)
371
+ # Also factor in confidence
372
+ weighted_score = (
373
+ cnn_score * cnn_confidence * 0.4 +
374
+ prosody_score * prosody_confidence * 0.6
375
+ ) / (cnn_confidence * 0.4 + prosody_confidence * 0.6)
376
+
377
+ if weighted_score > 0.5:
378
+ final_classification = 'read'
379
+ final_confidence = 0.5 + (weighted_score - 0.5)
380
  else:
381
+ final_classification = 'spontaneous'
382
+ final_confidence = 0.5 + (0.5 - weighted_score)
383
+
384
+ final_confidence = min(0.95, final_confidence)
 
385
 
386
  return {
387
  'classification': final_classification,
 
435
 
436
  print("\nModel architecture:")
437
  print(classifier.model)
 
pipeline.py CHANGED
@@ -1,8 +1,3 @@
1
- """
2
- Multimodal Authenticity Detection Pipeline
3
- Integrates CNN audio classification, Whisper ASR, and text authenticity analysis
4
- """
5
-
6
  from typing import Dict, Optional
7
  import time
8
  from audio_classifier import AudioClassifier
@@ -102,22 +97,32 @@ class AuthenticityDetectionPipeline:
102
  text_results: Dict
103
  ) -> Dict:
104
 
 
105
  if audio_results['classification'] == 'spontaneous':
106
  audio_score = audio_results['confidence']
107
  else: # read
108
  audio_score = 1.0 - audio_results['confidence']
109
 
110
- if asr_results['kopparapu_classification'] == 'spontaneous':
111
- speech_pattern_score = asr_results['kopparapu_score']
112
- else:
113
- speech_pattern_score = 1.0 - asr_results['kopparapu_score']
114
 
115
- text_auth_score = text_results['authenticity_score']
 
 
116
 
 
 
 
 
 
 
117
  composite_score = (
118
- audio_score * 0.30 + # CNN acoustic analysis
119
- speech_pattern_score * 0.30 + # Speech patterns (Kopparapu)
120
- text_auth_score * 0.40 # Text authenticity (AI detection)
 
 
121
  )
122
 
123
  if composite_score >= 0.7:
@@ -186,4 +191,3 @@ if __name__ == "__main__":
186
  whisper_model_size="base"
187
  )
188
  print("\nPipeline ready for audio analysis.")
189
-
 
 
 
 
 
 
1
  from typing import Dict, Optional
2
  import time
3
  from audio_classifier import AudioClassifier
 
97
  text_results: Dict
98
  ) -> Dict:
99
 
100
+ # CNN score: spontaneous = authentic (high), read = inauthentic (low)
101
  if audio_results['classification'] == 'spontaneous':
102
  audio_score = audio_results['confidence']
103
  else: # read
104
  audio_score = 1.0 - audio_results['confidence']
105
 
106
+ # Kopparapu score: 0=spontaneous, 1=read
107
+ # Invert so spontaneous (low kopparapu) = high authenticity
108
+ speech_pattern_score = 1.0 - asr_results['kopparapu_score']
 
109
 
110
+ # Filler words: higher ratio = more spontaneous = more authentic
111
+ filler_ratio = asr_results['filler_words']['ratio']
112
+ filler_score = min(1.0, filler_ratio / 0.05) # Normalize: 5%+ = max score
113
 
114
+ # Pause variability: higher = more spontaneous = more authentic
115
+ pause_var = asr_results['pause_patterns']['pause_variability']
116
+ pause_score = min(1.0, pause_var / 0.5) # Normalize: 0.5+ = max score
117
+
118
+ text_auth_score = text_results['authenticity_score']
119
+
120
  composite_score = (
121
+ audio_score * 0.15 + # CNN - weakest component
122
+ speech_pattern_score * 0.20 + # Kopparapu linguistic
123
+ filler_score * 0.10 + # Filler word ratio
124
+ pause_score * 0.05 + # Pause variability
125
+ text_auth_score * 0.50 # Text authenticity - strongest signal
126
  )
127
 
128
  if composite_score >= 0.7:
 
191
  whisper_model_size="base"
192
  )
193
  print("\nPipeline ready for audio analysis.")