Update pipeline.py
Browse files- pipeline.py +30 -41
pipeline.py
CHANGED
|
@@ -29,22 +29,11 @@ efficientnet_model = tf.keras.layers.TFSMLayer(
|
|
| 29 |
|
| 30 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 31 |
# Audio Model: Wav2Vec2 fine-tuned for deepfake detection
|
| 32 |
-
#
|
| 33 |
-
#
|
| 34 |
-
# RawNet2 was trained on ASVspoof 2019 — a dataset that predates modern TTS
|
| 35 |
-
# systems (ElevenLabs, Vall-E, XTTS, Bark, etc.). It has never seen this
|
| 36 |
-
# class of audio and consistently misclassifies it as "Real".
|
| 37 |
-
#
|
| 38 |
-
# Why Wav2Vec2?
|
| 39 |
-
# "mo-thecreator/deepfake-audio-detection" is a Wav2Vec2-base model
|
| 40 |
-
# fine-tuned on FakeAVCeleb + ASVspoof 2021 LA, covering:
|
| 41 |
-
# - Genuine human speech
|
| 42 |
-
# - Neural TTS (modern AI voices)
|
| 43 |
-
# - Voice conversion / cloning
|
| 44 |
-
# - Replay / splicing attacks
|
| 45 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 46 |
AUDIO_MODEL_ID = "mo-thecreator/deepfake-audio-detection"
|
| 47 |
-
AUDIO_SAMPLE_RATE = 16000
|
| 48 |
|
| 49 |
print(f"Loading audio model: {AUDIO_MODEL_ID} ...")
|
| 50 |
audio_feature_extractor = AutoFeatureExtractor.from_pretrained(AUDIO_MODEL_ID)
|
|
@@ -61,13 +50,18 @@ LABEL_MAP = {
|
|
| 61 |
}
|
| 62 |
|
| 63 |
# ─── Confidence thresholds ────────────────────────────────────────────────────
|
| 64 |
-
#
|
| 65 |
-
#
|
| 66 |
-
#
|
| 67 |
-
#
|
| 68 |
-
#
|
| 69 |
-
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
|
| 73 |
def convert_to_mp4(input_path):
|
|
@@ -173,12 +167,12 @@ def deepfakes_video_predict(input_video):
|
|
| 173 |
|
| 174 |
real_mean = np.mean(real_res)
|
| 175 |
fake_mean = np.mean(fake_res)
|
| 176 |
-
print(f"
|
| 177 |
|
| 178 |
if real_mean >= 0.5:
|
| 179 |
-
return "The video is REAL.
|
| 180 |
else:
|
| 181 |
-
return "The video is FAKE.
|
| 182 |
|
| 183 |
|
| 184 |
def deepfakes_image_predict(input_image):
|
|
@@ -187,36 +181,31 @@ def deepfakes_image_predict(input_image):
|
|
| 187 |
pred = efficientnet_model(np.expand_dims(face2, axis=0))
|
| 188 |
pred = list(pred.values())[0].numpy()[0]
|
| 189 |
real, fake = pred[0], pred[1]
|
|
|
|
|
|
|
| 190 |
if real > 0.5:
|
| 191 |
-
return "The image is REAL.
|
| 192 |
else:
|
| 193 |
-
return "The image is FAKE.
|
| 194 |
|
| 195 |
|
| 196 |
def classify_audio_3class(real_prob: float, fake_prob: float) -> str:
|
| 197 |
"""
|
| 198 |
-
Map 2-class probabilities → 3-class
|
| 199 |
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
|
|
|
| 203 |
"""
|
| 204 |
print(f"[Audio] real_prob={real_prob:.4f} fake_prob={fake_prob:.4f}")
|
| 205 |
|
| 206 |
if real_prob >= REAL_THRESHOLD:
|
| 207 |
-
return
|
| 208 |
-
|
| 209 |
elif fake_prob >= FAKE_THRESHOLD:
|
| 210 |
-
return
|
| 211 |
-
|
| 212 |
else:
|
| 213 |
-
|
| 214 |
-
ai_conf = round(max(fake_prob, 1 - real_prob) * 100, 2)
|
| 215 |
-
return (
|
| 216 |
-
f"🤖 AI Synthesized / Voice Cloned\n"
|
| 217 |
-
f"Confidence: {ai_conf}%\n"
|
| 218 |
-
f"(Model uncertainty indicates modern neural TTS or voice cloning)"
|
| 219 |
-
)
|
| 220 |
|
| 221 |
|
| 222 |
def deepfakes_audio_predict(input_audio):
|
|
|
|
| 29 |
|
| 30 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 31 |
# Audio Model: Wav2Vec2 fine-tuned for deepfake detection
|
| 32 |
+
# "mo-thecreator/deepfake-audio-detection"
|
| 33 |
+
# Fine-tuned on FakeAVCeleb + ASVspoof 2021 LA
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 35 |
AUDIO_MODEL_ID = "mo-thecreator/deepfake-audio-detection"
|
| 36 |
+
AUDIO_SAMPLE_RATE = 16000
|
| 37 |
|
| 38 |
print(f"Loading audio model: {AUDIO_MODEL_ID} ...")
|
| 39 |
audio_feature_extractor = AutoFeatureExtractor.from_pretrained(AUDIO_MODEL_ID)
|
|
|
|
| 50 |
}
|
| 51 |
|
| 52 |
# ─── Confidence thresholds ────────────────────────────────────────────────────
|
| 53 |
+
# REAL_THRESHOLD = 0.55 (loose)
|
| 54 |
+
# Lowered so genuine human voices are not incorrectly rejected.
|
| 55 |
+
# The model only needs to be 55% confident to call it real.
|
| 56 |
+
#
|
| 57 |
+
# FAKE_THRESHOLD = 0.90 (strict)
|
| 58 |
+
# Raised so real voices are never falsely flagged as fake.
|
| 59 |
+
# The model must be 90% confident before labelling audio as manipulated.
|
| 60 |
+
#
|
| 61 |
+
# Zone between the two → AI Synthesized / Voice Cloned
|
| 62 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 63 |
+
REAL_THRESHOLD = 0.55
|
| 64 |
+
FAKE_THRESHOLD = 0.90
|
| 65 |
|
| 66 |
|
| 67 |
def convert_to_mp4(input_path):
|
|
|
|
| 167 |
|
| 168 |
real_mean = np.mean(real_res)
|
| 169 |
fake_mean = np.mean(fake_res)
|
| 170 |
+
print(f"[Video] Real={real_mean:.4f} | Fake={fake_mean:.4f}")
|
| 171 |
|
| 172 |
if real_mean >= 0.5:
|
| 173 |
+
return "✅ The video is REAL."
|
| 174 |
else:
|
| 175 |
+
return "🚨 The video is FAKE."
|
| 176 |
|
| 177 |
|
| 178 |
def deepfakes_image_predict(input_image):
|
|
|
|
| 181 |
pred = efficientnet_model(np.expand_dims(face2, axis=0))
|
| 182 |
pred = list(pred.values())[0].numpy()[0]
|
| 183 |
real, fake = pred[0], pred[1]
|
| 184 |
+
print(f"[Image] Real={real:.4f} | Fake={fake:.4f}")
|
| 185 |
+
|
| 186 |
if real > 0.5:
|
| 187 |
+
return "✅ The image is REAL."
|
| 188 |
else:
|
| 189 |
+
return "🚨 The image is FAKE."
|
| 190 |
|
| 191 |
|
| 192 |
def classify_audio_3class(real_prob: float, fake_prob: float) -> str:
|
| 193 |
"""
|
| 194 |
+
Map 2-class probabilities → 3-class result.
|
| 195 |
|
| 196 |
+
Threshold logic:
|
| 197 |
+
real_prob >= 0.55 → Real Human Voice (loose — avoids false negatives)
|
| 198 |
+
fake_prob >= 0.90 → Fake / Manipulated (strict — avoids false positives)
|
| 199 |
+
in between → AI Synthesized / Cloned
|
| 200 |
"""
|
| 201 |
print(f"[Audio] real_prob={real_prob:.4f} fake_prob={fake_prob:.4f}")
|
| 202 |
|
| 203 |
if real_prob >= REAL_THRESHOLD:
|
| 204 |
+
return "✅ Real Human Voice"
|
|
|
|
| 205 |
elif fake_prob >= FAKE_THRESHOLD:
|
| 206 |
+
return "🚨 Fake / Manipulated Audio"
|
|
|
|
| 207 |
else:
|
| 208 |
+
return "🤖 AI Synthesized / Voice Cloned"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
|
| 211 |
def deepfakes_audio_predict(input_audio):
|