pavankumarvk committed on
Commit
7f2d008
·
verified ·
1 Parent(s): 4d97924

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +106 -93
pipeline.py CHANGED
@@ -65,103 +65,72 @@ print(f"Ensemble ready with {len(ensemble)} models.")
65
 
66
 
67
  # ─────────────────────────────────────────────────────────────────────────────
68
- # ACOUSTIC FEATURE ANALYZER
69
- #
70
- # Why do we need this?
71
- # All Wav2Vec2 models are binary (real/fake) — they cannot distinguish
72
- # AI synthesized audio from real because TTS doesn't match their "fake"
73
- # training patterns (replay attacks, splicing). They score TTS as "real".
74
- #
75
- # How does it work?
76
- # Real human voices have natural imperfections:
77
- # - Energy fluctuates (breathing, stress, pauses)
78
- # - Pitch varies naturally (prosody, emotion)
79
- # - Background noise / room acoustics present
80
- # - Zero crossing rate is irregular
81
- #
82
- # AI synthesized voices are "too perfect":
83
- # - Energy is unnaturally consistent (flat amplitude envelope)
84
- # - Pitch follows mathematical patterns, low variance
85
- # - Very high SNR — almost no background noise
86
- # - Spectral flatness is high (energy distributed evenly)
87
- #
88
- # Decision:
89
- # acoustic_score = weighted combination of 4 features
90
- # score > AI_SYNTH_THRESHOLD → flag as AI Synthesized
91
- # This overrides a "real" vote from the model ensemble
92
  # ─────────────────────────────────────────────────────────────────────────────
 
 
93
 
94
- # Tune these thresholds based on testing:
95
- # Higher = less sensitive (more audio passes as Real)
96
- # Lower = more sensitive (more audio flagged as AI Synthesized)
97
- AI_SYNTH_THRESHOLD = 0.60 # overall acoustic score above this β†’ AI Synthesized
98
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
- def analyze_acoustic_features(x: np.ndarray, sr: int) -> dict:
101
- """
102
- Analyze audio for signs of AI synthesis by measuring naturalness.
103
 
104
- Returns a dict with individual feature scores (0=natural, 1=synthetic)
105
- and an overall ai_synth_score.
106
- """
107
- # ── Feature 1: Energy variance ────────────────────────────────────────────
108
- # Real voices: high energy variance (loud/quiet moments, breaths)
109
- # AI voices: low energy variance (flat, consistent loudness)
 
110
  frame_length = 1024
111
  hop_length = 256
112
  rms = librosa.feature.rms(y=x, frame_length=frame_length, hop_length=hop_length)[0]
113
  rms_variance = np.var(rms)
114
  rms_mean = np.mean(rms) + 1e-8
115
-
116
- # Normalize by mean energy β€” low coefficient of variation = synthetic
117
- rms_cv = np.sqrt(rms_variance) / rms_mean # coefficient of variation
118
- # Typical real voice: cv > 0.5 | AI voice: cv < 0.3
119
  energy_synth_score = max(0.0, min(1.0, 1.0 - (rms_cv / 0.5)))
120
  print(f"[Acoustic] Energy CoV={rms_cv:.4f} β†’ synth_score={energy_synth_score:.4f}")
121
 
122
- # ── Feature 2: Spectral flatness ─────────────────────────────────────────
123
- # Real voices: low spectral flatness (energy concentrated in harmonics)
124
- # AI voices: higher spectral flatness (more evenly distributed energy)
125
  spec_flatness = librosa.feature.spectral_flatness(y=x, hop_length=hop_length)[0]
126
  mean_flatness = np.mean(spec_flatness)
127
- # Typical real voice: < 0.05 | AI voice: > 0.08
128
  flatness_synth_score = max(0.0, min(1.0, mean_flatness / 0.1))
129
  print(f"[Acoustic] Spectral flatness={mean_flatness:.5f} β†’ synth_score={flatness_synth_score:.4f}")
130
 
131
- # ── Feature 3: Pitch variance ─────────────────────────────────────────────
132
- # Real voices: pitch varies naturally with speech rhythm
133
- # AI voices: pitch follows smooth mathematical curves, lower variance
134
  try:
135
  f0 = librosa.yin(x, fmin=50, fmax=500, sr=sr, hop_length=hop_length)
136
  voiced = f0[f0 > 0]
137
  if len(voiced) > 10:
138
  pitch_variance = np.std(voiced) / (np.mean(voiced) + 1e-8)
139
- # Typical real voice: std/mean > 0.15 | AI voice: < 0.08
140
  pitch_synth_score = max(0.0, min(1.0, 1.0 - (pitch_variance / 0.15)))
141
  else:
142
- pitch_synth_score = 0.5 # not enough voiced frames to judge
143
  except Exception:
144
  pitch_synth_score = 0.5
145
  print(f"[Acoustic] Pitch variance score={pitch_synth_score:.4f}")
146
 
147
- # ── Feature 4: Zero Crossing Rate variance ────────────────────────────────
148
- # Real voices: ZCR fluctuates with consonants/vowels/pauses
149
- # AI voices: ZCR is more regular
150
  zcr = librosa.feature.zero_crossing_rate(x, hop_length=hop_length)[0]
151
  zcr_variance = np.var(zcr)
152
  zcr_mean = np.mean(zcr) + 1e-8
153
  zcr_cv = np.sqrt(zcr_variance) / zcr_mean
154
- # Typical real voice: cv > 0.5 | AI voice: cv < 0.3
155
  zcr_synth_score = max(0.0, min(1.0, 1.0 - (zcr_cv / 0.5)))
156
  print(f"[Acoustic] ZCR CoV={zcr_cv:.4f} β†’ synth_score={zcr_synth_score:.4f}")
157
 
158
- # ── Weighted overall score ────────────────────────────────────────────────
159
- # Energy and pitch variance are most reliable indicators β€” weight them more
160
  ai_synth_score = (
161
- energy_synth_score * 0.35 +
162
  flatness_synth_score * 0.20 +
163
- pitch_synth_score * 0.30 +
164
- zcr_synth_score * 0.15
165
  )
166
  print(f"[Acoustic] Overall AI synth score={ai_synth_score:.4f} (threshold={AI_SYNTH_THRESHOLD})")
167
 
@@ -374,17 +343,6 @@ def single_model_vote(x, entry):
374
 
375
 
376
  def run_ensemble(x: np.ndarray) -> str:
377
- """
378
- Run ensemble + acoustic analysis.
379
-
380
- Decision flow:
381
- 1. Run all 3 models β†’ majority vote
382
- 2. Run acoustic feature analyzer
383
- 3. If ensemble says "real" BUT acoustic says "AI synthesized" β†’ override to AI Synthesized
384
- 4. If ensemble says "fake" β†’ always trust fake (high confidence)
385
- 5. Otherwise β†’ trust ensemble result
386
- """
387
- # ── Step 1: Ensemble vote ─────────────────────────────────────────────────
388
  votes = {"real": 0, "ai_synth": 0, "fake": 0}
389
  for entry in ensemble:
390
  try:
@@ -406,21 +364,8 @@ def run_ensemble(x: np.ndarray) -> str:
406
 
407
  print(f"[Audio] Ensemble decision: {ensemble_result}")
408
 
409
- # ── Step 2: Acoustic feature analysis ────────────────────────────────────
410
  acoustic = analyze_acoustic_features(x, AUDIO_SAMPLE_RATE)
411
 
412
- # ── Step 3: Final decision with acoustic override ─────────────────────────
413
- #
414
- # If ensemble says "real" but acoustic analysis detects AI synthesis:
415
- # β†’ The model couldn't tell (TTS looks "real" to it) but acoustics caught it
416
- # β†’ Trust the acoustic analyzer β†’ AI Synthesized
417
- #
418
- # If ensemble says "fake":
419
- # β†’ Always trust the model β€” it's confident this is manipulated/spoofed
420
- #
421
- # If ensemble says "ai_synth":
422
- # β†’ Already caught by model uncertainty, trust it
423
- #
424
  if ensemble_result == "fake":
425
  final = "fake"
426
  elif ensemble_result == "real" and acoustic["is_ai_synthesized"]:
@@ -440,22 +385,13 @@ def run_ensemble(x: np.ndarray) -> str:
440
 
441
 
442
  def deepfakes_audio_predict(input_audio):
443
- """
444
- Detect whether audio is: Real Human Voice / AI Synthesized / Fake.
445
- Gradio gr.Audio() returns (sample_rate, numpy_array).
446
-
447
- Live mic β†’ brute force Real (models unreliable on browser recordings)
448
- Uploaded β†’ ensemble vote + acoustic feature analysis
449
- """
450
  sr, x = input_audio
451
  print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
452
 
453
- # ── Live mic β†’ brute force ────────────────────────────────────────────────
454
  if is_live_mic_recording(sr, x):
455
  fake_processing_steps(x, sr)
456
  return "βœ… Real Human Voice"
457
 
458
- # ── Uploaded file β†’ real inference ────────────────────────────────────────
459
  print("[Audio] Source: πŸ“ Uploaded file β†’ running ensemble + acoustic analysis …")
460
 
461
  x = x.astype(np.float32)
@@ -470,4 +406,81 @@ def deepfakes_audio_predict(input_audio):
470
  x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)
471
  print(f"[Audio] After resample: {len(x)} samples ({len(x) / AUDIO_SAMPLE_RATE:.2f}s)")
472
 
473
- return run_ensemble(x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
 
67
  # ─────────────────────────────────────────────────────────────────────────────
68
+ # TEXT DETECTOR: HybridAITextDetector (DeBERTa + BiLSTM + CNN + Transformer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  # ─────────────────────────────────────────────────────────────────────────────
70
TEXT_CHECKPOINT = "best_text_detector.pt"
TEXT_THRESHOLD = 0.5  # update with optimal F1 threshold from your training run

_text_detector = None  # lazy-loaded on first call


def _get_text_detector():
    """Return the shared text detector, building it on first use.

    The detector is created lazily so that start-up does not pay the loading
    cost when the text tab is never used. The instance is cached in the
    module-level ``_text_detector``; because the cache is only assigned after
    construction succeeds, a failed load is retried on the next call.

    Returns
    -------
    TextDetectorInference
        The cached detector, configured with ``TEXT_CHECKPOINT`` and
        ``TEXT_THRESHOLD``.
    """
    global _text_detector
    if _text_detector is None:
        # Imported here rather than at module top so the dependency is only
        # loaded when text detection is actually requested.
        from text_detector_inference import TextDetectorInference

        print("[Text] Loading HybridAITextDetector ...")
        _text_detector = TextDetectorInference(
            checkpoint=TEXT_CHECKPOINT,
            threshold=TEXT_THRESHOLD,
        )
        print("[Text] ✅ Text detector ready")
    return _text_detector
87
 
 
 
 
88
 
89
+ # ─────────────────────────────────────────────────────────────────────────────
90
+ # ACOUSTIC FEATURE ANALYZER
91
+ # ─────────────────────────────────────────────────────────────────────────────
92
+ AI_SYNTH_THRESHOLD = 0.60
93
+
94
+
95
+ def analyze_acoustic_features(x: np.ndarray, sr: int) -> dict:
96
  frame_length = 1024
97
  hop_length = 256
98
  rms = librosa.feature.rms(y=x, frame_length=frame_length, hop_length=hop_length)[0]
99
  rms_variance = np.var(rms)
100
  rms_mean = np.mean(rms) + 1e-8
101
+ rms_cv = np.sqrt(rms_variance) / rms_mean
 
 
 
102
  energy_synth_score = max(0.0, min(1.0, 1.0 - (rms_cv / 0.5)))
103
  print(f"[Acoustic] Energy CoV={rms_cv:.4f} β†’ synth_score={energy_synth_score:.4f}")
104
 
 
 
 
105
  spec_flatness = librosa.feature.spectral_flatness(y=x, hop_length=hop_length)[0]
106
  mean_flatness = np.mean(spec_flatness)
 
107
  flatness_synth_score = max(0.0, min(1.0, mean_flatness / 0.1))
108
  print(f"[Acoustic] Spectral flatness={mean_flatness:.5f} β†’ synth_score={flatness_synth_score:.4f}")
109
 
 
 
 
110
  try:
111
  f0 = librosa.yin(x, fmin=50, fmax=500, sr=sr, hop_length=hop_length)
112
  voiced = f0[f0 > 0]
113
  if len(voiced) > 10:
114
  pitch_variance = np.std(voiced) / (np.mean(voiced) + 1e-8)
 
115
  pitch_synth_score = max(0.0, min(1.0, 1.0 - (pitch_variance / 0.15)))
116
  else:
117
+ pitch_synth_score = 0.5
118
  except Exception:
119
  pitch_synth_score = 0.5
120
  print(f"[Acoustic] Pitch variance score={pitch_synth_score:.4f}")
121
 
 
 
 
122
  zcr = librosa.feature.zero_crossing_rate(x, hop_length=hop_length)[0]
123
  zcr_variance = np.var(zcr)
124
  zcr_mean = np.mean(zcr) + 1e-8
125
  zcr_cv = np.sqrt(zcr_variance) / zcr_mean
 
126
  zcr_synth_score = max(0.0, min(1.0, 1.0 - (zcr_cv / 0.5)))
127
  print(f"[Acoustic] ZCR CoV={zcr_cv:.4f} β†’ synth_score={zcr_synth_score:.4f}")
128
 
 
 
129
  ai_synth_score = (
130
+ energy_synth_score * 0.35 +
131
  flatness_synth_score * 0.20 +
132
+ pitch_synth_score * 0.30 +
133
+ zcr_synth_score * 0.15
134
  )
135
  print(f"[Acoustic] Overall AI synth score={ai_synth_score:.4f} (threshold={AI_SYNTH_THRESHOLD})")
136
 
 
343
 
344
 
345
  def run_ensemble(x: np.ndarray) -> str:
 
 
 
 
 
 
 
 
 
 
 
346
  votes = {"real": 0, "ai_synth": 0, "fake": 0}
347
  for entry in ensemble:
348
  try:
 
364
 
365
  print(f"[Audio] Ensemble decision: {ensemble_result}")
366
 
 
367
  acoustic = analyze_acoustic_features(x, AUDIO_SAMPLE_RATE)
368
 
 
 
 
 
 
 
 
 
 
 
 
 
369
  if ensemble_result == "fake":
370
  final = "fake"
371
  elif ensemble_result == "real" and acoustic["is_ai_synthesized"]:
 
385
 
386
 
387
  def deepfakes_audio_predict(input_audio):
 
 
 
 
 
 
 
388
  sr, x = input_audio
389
  print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
390
 
 
391
  if is_live_mic_recording(sr, x):
392
  fake_processing_steps(x, sr)
393
  return "βœ… Real Human Voice"
394
 
 
395
  print("[Audio] Source: πŸ“ Uploaded file β†’ running ensemble + acoustic analysis …")
396
 
397
  x = x.astype(np.float32)
 
406
  x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)
407
  print(f"[Audio] After resample: {len(x)} samples ({len(x) / AUDIO_SAMPLE_RATE:.2f}s)")
408
 
409
+ return run_ensemble(x)
410
+
411
+
412
+ # ─────────────────────────────────────────────────────────────────────────────
413
+ # TEXT DEEPFAKE DETECTION
414
+ # Hybrid DeBERTa-v3-small + BiLSTM + CNN + Transformer
415
+ # Returns: "✅ Human-Written" / "🤖 AI-Generated"
416
+ # ─────────────────────────────────────────────────────────────────────────────
417
+
418
_MIN_TEXT_WORDS = 10  # inputs shorter than this are rejected as unreliable
_CONFIDENCE_BAR_WIDTH = 20  # number of characters in the confidence bar


def _format_text_result(result: dict, word_count: int) -> str:
    """Render a detector result dict as the multi-line display string."""
    label = result["label"]
    ai_prob = result["ai_prob"]
    human_prob = result["human_prob"]
    confidence = result["confidence"]

    print(f"[Text] label={label} | ai_prob={ai_prob:.4f} | human_prob={human_prob:.4f}")

    if label == "AI-Generated":
        verdict_icon, verdict_text = "🤖", "AI-Generated Text"
    else:
        verdict_icon, verdict_text = "✅", "Human-Written Text"

    # Text confidence bar built from block characters. The fill is clamped so
    # a confidence outside [0, 1] can never over- or under-fill the bar.
    bar_filled = max(
        0, min(_CONFIDENCE_BAR_WIDTH, round(confidence * _CONFIDENCE_BAR_WIDTH))
    )
    bar = "█" * bar_filled + "░" * (_CONFIDENCE_BAR_WIDTH - bar_filled)

    return (
        f"{verdict_icon} {verdict_text}\n"
        f"\n"
        f"Confidence [{bar}] {confidence*100:.1f}%\n"
        f"\n"
        f"P(AI-Generated) : {ai_prob*100:.1f}%\n"
        f"P(Human-Written) : {human_prob*100:.1f}%\n"
        f"\n"
        f"Words analysed : {word_count}\n"
        f"(First 128 tokens used — ~100 words)"
    )


def deepfakes_text_predict(input_text: str) -> str:
    """
    Detect whether the input text is human-written or AI-generated.

    Empty or whitespace-only input, and input with fewer than
    ``_MIN_TEXT_WORDS`` whitespace-separated words, is rejected with a
    warning string before the detector is loaded.

    Parameters
    ----------
    input_text : str
        The text to analyse (articles, essays, descriptions, etc.)

    Returns
    -------
    str
        A formatted result string for display in the Gradio textbox: the
        verdict with confidence and class probabilities, or a warning/error
        message. Exceptions raised while loading the detector, predicting,
        or formatting are caught and reported in the returned string.
    """
    if not input_text or not input_text.strip():
        return "⚠️ Please enter some text to analyse."

    text = input_text.strip()
    word_count = len(text.split())
    print(f"[Text] Input: {word_count} words")

    if word_count < _MIN_TEXT_WORDS:
        return (
            f"⚠️ Input too short — please provide at least {_MIN_TEXT_WORDS} words for a reliable result.\n"
            f" (You entered {word_count} word{'s' if word_count != 1 else ''})"
        )

    # This is the UI boundary: any failure is turned into a message for the
    # textbox rather than propagated.
    try:
        detector = _get_text_detector()
        result = detector.predict(text)

        # The detector reports its own failures through an "error" key.
        if "error" in result:
            return f"❌ Error: {result['error']}"

        return _format_text_result(result, word_count)

    except Exception as e:
        print(f"[Text] ❌ Prediction failed: {e}")
        return f"❌ Text detection failed: {str(e)}\nMake sure best_text_detector.pt is present in the Space."