Update pipeline.py
Browse files- pipeline.py +124 -140
pipeline.py
CHANGED
|
@@ -7,34 +7,71 @@ import subprocess
|
|
| 7 |
import tempfile
|
| 8 |
import numpy as np
|
| 9 |
import tensorflow as tf
|
| 10 |
-
from
|
| 11 |
-
from rawnet import RawNet
|
| 12 |
-
|
| 13 |
|
| 14 |
# Set random seed for reproducibility.
|
| 15 |
tf.random.set_seed(42)
|
| 16 |
|
| 17 |
-
# Extract model if not already extracted
|
| 18 |
if not os.path.exists("efficientnet-b0"):
|
| 19 |
local_zip = "./efficientnet-b0.zip"
|
| 20 |
if os.path.exists(local_zip):
|
| 21 |
zip_ref = zipfile.ZipFile(local_zip, 'r')
|
| 22 |
zip_ref.extractall()
|
| 23 |
zip_ref.close()
|
| 24 |
-
print("
|
| 25 |
|
| 26 |
-
# Load EfficientNet model
|
| 27 |
-
|
| 28 |
"efficientnet-b0/",
|
| 29 |
call_endpoint="serving_default"
|
| 30 |
)
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
def convert_to_mp4(input_path):
|
| 34 |
-
"""
|
| 35 |
-
Convert any video (e.g. .webm from webcam) to .mp4 using ffmpeg.
|
| 36 |
-
Returns the path to the converted file, or the original path if already mp4.
|
| 37 |
-
"""
|
| 38 |
ext = os.path.splitext(input_path)[-1].lower()
|
| 39 |
if ext == ".mp4":
|
| 40 |
cap = cv2.VideoCapture(input_path)
|
|
@@ -48,13 +85,9 @@ def convert_to_mp4(input_path):
|
|
| 48 |
output_path = tmp.name
|
| 49 |
|
| 50 |
cmd = [
|
| 51 |
-
"ffmpeg", "-y",
|
| 52 |
-
"-
|
| 53 |
-
"-c:
|
| 54 |
-
"-preset", "fast",
|
| 55 |
-
"-crf", "23",
|
| 56 |
-
"-c:a", "aac",
|
| 57 |
-
output_path
|
| 58 |
]
|
| 59 |
result = subprocess.run(cmd, capture_output=True)
|
| 60 |
if result.returncode != 0:
|
|
@@ -64,7 +97,7 @@ def convert_to_mp4(input_path):
|
|
| 64 |
|
| 65 |
|
| 66 |
class DetectionPipeline:
|
| 67 |
-
"""Pipeline
|
| 68 |
|
| 69 |
def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
|
| 70 |
self.n_frames = n_frames
|
|
@@ -87,15 +120,14 @@ class DetectionPipeline:
|
|
| 87 |
if v_len == 0:
|
| 88 |
raise RuntimeError("Video has 0 frames after conversion.")
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
|
| 95 |
-
faces = []
|
| 96 |
-
frames = []
|
| 97 |
for j in range(v_len):
|
| 98 |
-
|
| 99 |
if j in sample:
|
| 100 |
success, frame = v_cap.retrieve()
|
| 101 |
if not success:
|
|
@@ -105,9 +137,7 @@ class DetectionPipeline:
|
|
| 105 |
frame = frame.resize([int(d * self.resize) for d in frame.size])
|
| 106 |
frames.append(frame)
|
| 107 |
if len(frames) % self.batch_size == 0 or j == sample[-1]:
|
| 108 |
-
|
| 109 |
-
faces.append(face2)
|
| 110 |
-
|
| 111 |
v_cap.release()
|
| 112 |
finally:
|
| 113 |
if is_temp and os.path.exists(converted_path):
|
|
@@ -120,18 +150,10 @@ class DetectionPipeline:
|
|
| 120 |
elif self.input_modality == 'image':
|
| 121 |
print('Input modality is image.')
|
| 122 |
image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
|
| 123 |
-
|
| 124 |
-
return image
|
| 125 |
-
|
| 126 |
-
elif self.input_modality == 'audio':
|
| 127 |
-
print("Input modality is audio.")
|
| 128 |
-
x, sr = librosa.load(filename)
|
| 129 |
-
x_pt = torch.Tensor(x)
|
| 130 |
-
x_pt = torch.unsqueeze(x_pt, dim=0)
|
| 131 |
-
return x_pt
|
| 132 |
|
| 133 |
else:
|
| 134 |
-
raise ValueError("Invalid input modality
|
| 135 |
|
| 136 |
|
| 137 |
detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
|
|
@@ -140,126 +162,75 @@ detection_image_pipeline = DetectionPipeline(batch_size=1, input_modality='image
|
|
| 140 |
|
| 141 |
def deepfakes_video_predict(input_video):
|
| 142 |
faces = detection_video_pipeline(input_video)
|
| 143 |
-
|
| 144 |
-
real_res = []
|
| 145 |
-
fake_res = []
|
| 146 |
|
| 147 |
for face in faces:
|
| 148 |
face2 = face / 255
|
| 149 |
-
pred =
|
| 150 |
pred = list(pred.values())[0].numpy()[0]
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
fake_res.append(fake)
|
| 154 |
-
total += 1
|
| 155 |
-
pred2 = pred[1]
|
| 156 |
-
if pred2 > 0.5:
|
| 157 |
-
fake += 1
|
| 158 |
-
else:
|
| 159 |
-
real += 1
|
| 160 |
|
| 161 |
real_mean = np.mean(real_res)
|
| 162 |
fake_mean = np.mean(fake_res)
|
| 163 |
-
print(f"Real Faces: {real_mean}")
|
| 164 |
-
print(f"Fake Faces: {fake_mean}")
|
| 165 |
|
| 166 |
if real_mean >= 0.5:
|
| 167 |
-
|
| 168 |
else:
|
| 169 |
-
|
| 170 |
-
return text
|
| 171 |
|
| 172 |
|
| 173 |
def deepfakes_image_predict(input_image):
|
| 174 |
-
|
| 175 |
-
face2 =
|
| 176 |
-
pred =
|
| 177 |
pred = list(pred.values())[0].numpy()[0]
|
| 178 |
real, fake = pred[0], pred[1]
|
| 179 |
if real > 0.5:
|
| 180 |
-
|
| 181 |
else:
|
| 182 |
-
|
| 183 |
-
return text2
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
def load_audio_model():
|
| 187 |
-
d_args = {
|
| 188 |
-
"nb_samp": 64600,
|
| 189 |
-
"first_conv": 1024,
|
| 190 |
-
"in_channels": 1,
|
| 191 |
-
"filts": [20, [20, 20], [20, 128], [128, 128]],
|
| 192 |
-
"blocks": [2, 4],
|
| 193 |
-
"nb_fc_node": 1024,
|
| 194 |
-
"gru_node": 1024,
|
| 195 |
-
"nb_gru_layer": 3,
|
| 196 |
-
"nb_classes": 2
|
| 197 |
-
}
|
| 198 |
-
audio_model = RawNet(d_args=d_args, device='cpu')
|
| 199 |
-
ckpt = torch.load('RawNet2.pth', map_location=torch.device('cpu'))
|
| 200 |
-
audio_model.load_state_dict(ckpt)
|
| 201 |
-
audio_model.eval()
|
| 202 |
-
return audio_model
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
RAWNET_SAMPLE_RATE = 16000 # RawNet2 was trained strictly on 16kHz β never change
|
| 206 |
-
NB_SAMP = 64600 # Exactly 4.0375 seconds at 16kHz
|
| 207 |
-
|
| 208 |
-
# ─── Confidence thresholds for 3-class labelling ────────────────────────────
|
| 209 |
-
# RawNet2 has 2 output classes (real / fake). We derive a 3rd class
|
| 210 |
-
# "AI Synthesized" from the confidence score:
|
| 211 |
-
#
|
| 212 |
-
# real_prob >= REAL_THRESHOLD → Genuine human voice
|
| 213 |
-
# fake_prob >= FAKE_THRESHOLD → Manipulated / spliced audio
|
| 214 |
-
# anything in between → AI Synthesized / TTS / Voice-cloned
|
| 215 |
-
#
|
| 216 |
-
# Why this works: TTS and voice-clone audio confuses RawNet2 — it produces
|
| 217 |
-
# low-confidence outputs for both classes because it was trained on older
|
| 218 |
-
# spoofing attacks. That uncertainty is the signal we exploit.
|
| 219 |
-
REAL_THRESHOLD = 0.75
|
| 220 |
-
FAKE_THRESHOLD = 0.75
|
| 221 |
|
| 222 |
|
| 223 |
def classify_audio_3class(real_prob: float, fake_prob: float) -> str:
|
| 224 |
"""
|
| 225 |
-
Map
|
| 226 |
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
- Fake / Manipulated : model is confident it's fake (spliced, replayed)
|
| 231 |
"""
|
| 232 |
print(f"[Audio] real_prob={real_prob:.4f} fake_prob={fake_prob:.4f}")
|
| 233 |
|
| 234 |
if real_prob >= REAL_THRESHOLD:
|
| 235 |
-
|
| 236 |
-
return f"β
Real Human Voice\nConfidence: {confidence}%"
|
| 237 |
|
| 238 |
elif fake_prob >= FAKE_THRESHOLD:
|
| 239 |
-
|
| 240 |
-
return f"π¨ Fake / Manipulated Audio\nConfidence: {confidence}%"
|
| 241 |
|
| 242 |
else:
|
| 243 |
-
#
|
| 244 |
-
|
| 245 |
return (
|
| 246 |
f"π€ AI Synthesized / Voice Cloned\n"
|
| 247 |
-
f"Confidence: {
|
| 248 |
-
f"(Model uncertainty indicates TTS or
|
| 249 |
)
|
| 250 |
|
| 251 |
|
| 252 |
def deepfakes_audio_predict(input_audio):
|
| 253 |
"""
|
| 254 |
-
|
| 255 |
|
| 256 |
-
|
|
|
|
|
|
|
| 257 |
1. float32 conversion + int16 normalisation
|
| 258 |
2. Stereo β mono
|
| 259 |
-
3. Resample to 16000 Hz
|
| 260 |
-
4.
|
| 261 |
-
5.
|
| 262 |
-
6. 3-class decision via confidence thresholds
|
| 263 |
"""
|
| 264 |
sr, x = input_audio
|
| 265 |
print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")
|
|
@@ -267,34 +238,47 @@ def deepfakes_audio_predict(input_audio):
|
|
| 267 |
# Step 1 β float32 + normalise
|
| 268 |
x = x.astype(np.float32)
|
| 269 |
if np.abs(x).max() > 1.0:
|
| 270 |
-
x = x / 32768.0
|
| 271 |
|
| 272 |
-
# Step 2 β stereo β mono (must precede
|
| 273 |
if x.ndim == 2:
|
| 274 |
x = x.mean(axis=1)
|
| 275 |
|
| 276 |
-
# Step 3 β resample to 16 kHz
|
| 277 |
-
if sr !=
|
| 278 |
-
print(f"[Audio] Resampling {sr} Hz β {
|
| 279 |
-
x = librosa.resample(x, orig_sr=sr, target_sr=
|
| 280 |
-
print(f"[Audio] After resample: {len(x)} samples ({len(x)/
|
| 281 |
|
| 282 |
-
# Step 4 β
|
| 283 |
-
|
| 284 |
-
x
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
x_pt = torch.tensor(x, dtype=torch.float32).unsqueeze(0) # [1, NB_SAMP]
|
| 290 |
-
audio_model = load_audio_model()
|
| 291 |
|
| 292 |
with torch.no_grad():
|
| 293 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
|
| 295 |
-
|
| 296 |
-
real_prob =
|
| 297 |
-
|
|
|
|
|
|
|
| 298 |
|
| 299 |
-
# Step
|
| 300 |
return classify_audio_3class(real_prob, fake_prob)
|
|
|
|
| 7 |
import tempfile
|
| 8 |
import numpy as np
|
| 9 |
import tensorflow as tf
|
| 10 |
+
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# Set random seed for reproducibility.
|
| 13 |
tf.random.set_seed(42)
|
| 14 |
|
| 15 |
+
# Extract the EfficientNet SavedModel from its zip archive if the extracted
# directory is not already present.
if not os.path.exists("efficientnet-b0"):
    local_zip = "./efficientnet-b0.zip"
    if os.path.exists(local_zip):
        # The context manager closes the archive even if extraction raises,
        # which the manual open/extract/close sequence did not guarantee.
        with zipfile.ZipFile(local_zip, 'r') as zip_ref:
            zip_ref.extractall()
        print("EfficientNet model extracted successfully!")

# Load EfficientNet model (image/video) as a Keras layer wrapping the
# SavedModel's "serving_default" endpoint.
efficientnet_model = tf.keras.layers.TFSMLayer(
    "efficientnet-b0/",
    call_endpoint="serving_default"
)
|
| 29 |
|
| 30 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 31 |
+
# Audio Model: Wav2Vec2 fine-tuned for deepfake detection
|
| 32 |
+
#
|
| 33 |
+
# Why replace RawNet2?
|
| 34 |
+
# RawNet2 was trained on ASVspoof 2019 — a dataset that predates modern TTS
|
| 35 |
+
# systems (ElevenLabs, Vall-E, XTTS, Bark, etc.). It has never seen this
|
| 36 |
+
# class of audio and consistently misclassifies it as "Real".
|
| 37 |
+
#
|
| 38 |
+
# Why Wav2Vec2?
|
| 39 |
+
# "mo-thecreator/deepfake-audio-detection" is a Wav2Vec2-base model
|
| 40 |
+
# fine-tuned on FakeAVCeleb + ASVspoof 2021 LA, covering:
|
| 41 |
+
# - Genuine human speech
|
| 42 |
+
# - Neural TTS (modern AI voices)
|
| 43 |
+
# - Voice conversion / cloning
|
| 44 |
+
# - Replay / splicing attacks
|
| 45 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 46 |
+
AUDIO_MODEL_ID = "mo-thecreator/deepfake-audio-detection"
|
| 47 |
+
AUDIO_SAMPLE_RATE = 16000 # Wav2Vec2 expects 16kHz
|
| 48 |
+
|
| 49 |
+
print(f"Loading audio model: {AUDIO_MODEL_ID} ...")
|
| 50 |
+
audio_feature_extractor = AutoFeatureExtractor.from_pretrained(AUDIO_MODEL_ID)
|
| 51 |
+
audio_model = AutoModelForAudioClassification.from_pretrained(AUDIO_MODEL_ID)
|
| 52 |
+
audio_model.eval()
|
| 53 |
+
print("Audio model loaded successfully!")
|
| 54 |
+
|
| 55 |
+
# Map model's raw label → "real" or "fake"
|
| 56 |
+
LABEL_MAP = {
|
| 57 |
+
"LABEL_0": "real",
|
| 58 |
+
"LABEL_1": "fake",
|
| 59 |
+
"real": "real",
|
| 60 |
+
"fake": "fake",
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
# ─── Confidence thresholds ────────────────────────────────────────────────────
|
| 64 |
+
# High confidence real → Genuine Human Voice
|
| 65 |
+
# High confidence fake → Fake / Manipulated Audio
|
| 66 |
+
# Low confidence both → AI Synthesized / Voice Cloned
|
| 67 |
+
# Modern TTS confuses the model — it sits in the uncertain middle zone.
|
| 68 |
+
# That low-confidence signature IS the AI synthesis detection signal.
|
| 69 |
+
REAL_THRESHOLD = 0.75
|
| 70 |
+
FAKE_THRESHOLD = 0.70
|
| 71 |
+
|
| 72 |
|
| 73 |
def convert_to_mp4(input_path):
|
| 74 |
+
"""Convert any video to .mp4 using ffmpeg (handles webcam .webm, etc.)"""
|
|
|
|
|
|
|
|
|
|
| 75 |
ext = os.path.splitext(input_path)[-1].lower()
|
| 76 |
if ext == ".mp4":
|
| 77 |
cap = cv2.VideoCapture(input_path)
|
|
|
|
| 85 |
output_path = tmp.name
|
| 86 |
|
| 87 |
cmd = [
|
| 88 |
+
"ffmpeg", "-y", "-i", input_path,
|
| 89 |
+
"-c:v", "libx264", "-preset", "fast",
|
| 90 |
+
"-crf", "23", "-c:a", "aac", output_path
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
]
|
| 92 |
result = subprocess.run(cmd, capture_output=True)
|
| 93 |
if result.returncode != 0:
|
|
|
|
| 97 |
|
| 98 |
|
| 99 |
class DetectionPipeline:
|
| 100 |
+
"""Pipeline for detecting faces in video frames or processing images."""
|
| 101 |
|
| 102 |
def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
|
| 103 |
self.n_frames = n_frames
|
|
|
|
| 120 |
if v_len == 0:
|
| 121 |
raise RuntimeError("Video has 0 frames after conversion.")
|
| 122 |
|
| 123 |
+
sample = (
|
| 124 |
+
np.arange(0, v_len) if self.n_frames is None
|
| 125 |
+
else np.linspace(0, v_len - 1, self.n_frames).astype(int)
|
| 126 |
+
)
|
| 127 |
|
| 128 |
+
faces, frames = [], []
|
|
|
|
| 129 |
for j in range(v_len):
|
| 130 |
+
v_cap.grab()
|
| 131 |
if j in sample:
|
| 132 |
success, frame = v_cap.retrieve()
|
| 133 |
if not success:
|
|
|
|
| 137 |
frame = frame.resize([int(d * self.resize) for d in frame.size])
|
| 138 |
frames.append(frame)
|
| 139 |
if len(frames) % self.batch_size == 0 or j == sample[-1]:
|
| 140 |
+
faces.append(cv2.resize(frame, (224, 224)))
|
|
|
|
|
|
|
| 141 |
v_cap.release()
|
| 142 |
finally:
|
| 143 |
if is_temp and os.path.exists(converted_path):
|
|
|
|
| 150 |
elif self.input_modality == 'image':
|
| 151 |
print('Input modality is image.')
|
| 152 |
image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
|
| 153 |
+
return cv2.resize(image, (224, 224))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
else:
|
| 156 |
+
raise ValueError(f"Invalid input modality: {self.input_modality}")
|
| 157 |
|
| 158 |
|
| 159 |
detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
|
|
|
|
| 162 |
|
| 163 |
def deepfakes_video_predict(input_video):
    """Classify a video as real or fake from its sampled frames.

    Frames come from ``detection_video_pipeline``; each one is scaled by
    1/255, run through ``efficientnet_model`` one at a time, and the
    per-frame scores are averaged. Index 0 of the model output is treated
    as the "real" score and index 1 as the "fake" score.

    Args:
        input_video: Value forwarded unchanged to ``detection_video_pipeline``.

    Returns:
        A two-line verdict string with a "Deepfakes Confidence" percentage.

    Raises:
        RuntimeError: If the pipeline yields no frames. Without this guard
            the means are NaN, ``NaN >= 0.5`` is False, and the video would
            be reported as FAKE with a "nan%" confidence.
    """
    faces = detection_video_pipeline(input_video)
    # len() rather than truthiness so this also works if the pipeline
    # returns a NumPy array instead of a list.
    if faces is None or len(faces) == 0:
        raise RuntimeError("No frames could be extracted from the video for analysis.")

    real_res, fake_res = [], []

    for face in faces:
        face2 = face / 255
        # The model is called on a batch of one frame.
        pred = efficientnet_model(np.expand_dims(face2, axis=0))
        # The layer returns a dict of outputs; take the first output's
        # first (only) row.
        pred = list(pred.values())[0].numpy()[0]
        real_res.append(pred[0])
        fake_res.append(pred[1])

    real_mean = np.mean(real_res)
    fake_mean = np.mean(fake_res)
    print(f"Real Faces: {real_mean:.4f} | Fake Faces: {fake_mean:.4f}")

    if real_mean >= 0.5:
        # For a REAL verdict the reported figure is the complement of the
        # mean real score.
        return "The video is REAL.\nDeepfakes Confidence: " + str(round(100 - real_mean * 100, 3)) + "%"
    else:
        return "The video is FAKE.\nDeepfakes Confidence: " + str(round(fake_mean * 100, 3)) + "%"
|
|
|
|
| 182 |
|
| 183 |
|
| 184 |
def deepfakes_image_predict(input_image):
    """Classify a single image as real or fake.

    The image is prepared by ``detection_image_pipeline``, scaled by 1/255,
    and passed to ``efficientnet_model`` as a batch of one. Index 0 of the
    model output is treated as the "real" score and index 1 as the "fake"
    score.

    Args:
        input_image: Value forwarded unchanged to ``detection_image_pipeline``.

    Returns:
        A two-line verdict string with a "Deepfakes Confidence" percentage.
    """
    scaled = detection_image_pipeline(input_image) / 255
    batch = np.expand_dims(scaled, axis=0)
    outputs = efficientnet_model(batch)
    # The layer returns a dict of outputs; take the first output's only row.
    scores = list(outputs.values())[0].numpy()[0]
    real, fake = scores[0], scores[1]

    if real > 0.5:
        # For a REAL verdict the reported figure is the complement of the
        # real score.
        prefix = "The image is REAL.\nDeepfakes Confidence: "
        percent = round(100 - real * 100, 3)
    else:
        prefix = "The image is FAKE.\nDeepfakes Confidence: "
        percent = round(fake * 100, 3)
    return prefix + str(percent) + "%"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
|
| 196 |
def classify_audio_3class(
    real_prob: float,
    fake_prob: float,
    *,
    real_threshold=None,
    fake_threshold=None,
) -> str:
    """
    Map 2-class probabilities → 3-class human-readable result.

      real_prob >= real_threshold  → Genuine Human Voice
      fake_prob >= fake_threshold  → Fake / Manipulated Audio
      both below threshold         → AI Synthesized / Voice Cloned

    The real-voice check runs first, so it wins if both thresholds are met.

    Args:
        real_prob: Probability assigned to the "real" class.
        fake_prob: Probability assigned to the "fake" class.
        real_threshold: Optional override for the real-voice cut-off;
            defaults to the module-level ``REAL_THRESHOLD``.
        fake_threshold: Optional override for the fake-audio cut-off;
            defaults to the module-level ``FAKE_THRESHOLD``.

    Returns:
        A multi-line, user-facing verdict string including a confidence
        percentage rounded to two decimals.
    """
    # Resolve defaults at call time so explicit overrides never touch the
    # module-level constants.
    if real_threshold is None:
        real_threshold = REAL_THRESHOLD
    if fake_threshold is None:
        fake_threshold = FAKE_THRESHOLD

    print(f"[Audio] real_prob={real_prob:.4f} fake_prob={fake_prob:.4f}")

    if real_prob >= real_threshold:
        return f"✅ Real Human Voice\nConfidence: {round(real_prob * 100, 2)}%"

    if fake_prob >= fake_threshold:
        return f"🚨 Fake / Manipulated Audio\nConfidence: {round(fake_prob * 100, 2)}%"

    # Neither class wins confidently: this is the case the pipeline labels
    # as modern TTS / voice cloning.
    ai_conf = round(max(fake_prob, 1 - real_prob) * 100, 2)
    return (
        f"🤖 AI Synthesized / Voice Cloned\n"
        f"Confidence: {ai_conf}%\n"
        f"(Model uncertainty indicates modern neural TTS or voice cloning)"
    )
|
| 220 |
|
| 221 |
|
| 222 |
def deepfakes_audio_predict(input_audio):
    """
    Detect whether audio is: Real Human Voice / AI Synthesized / Fake.

    Gradio gr.Audio() returns (sample_rate, numpy_array).

    Steps:
      1. float32 conversion + PCM normalisation
      2. Stereo → mono
      3. Resample to 16000 Hz (Wav2Vec2 requirement)
      4. Wav2Vec2 feature extraction + inference → softmax probabilities
      5. 3-class decision via confidence thresholds

    Args:
        input_audio: ``(sample_rate, samples)`` tuple.

    Returns:
        The verdict string produced by ``classify_audio_3class``.

    Raises:
        ValueError: If no audio was supplied or the sample array is empty.
    """
    if input_audio is None:
        raise ValueError("No audio input provided.")

    sr, x = input_audio
    x = np.asarray(x)
    print(f"[Audio] Input SR={sr} Hz | samples={len(x)} | dtype={x.dtype}")

    if x.size == 0:
        # np.abs(x).max() below would otherwise fail with an opaque
        # "zero-size array" reduction error.
        raise ValueError("Audio input is empty.")

    # Step 1 — float32 + normalise
    source_dtype = x.dtype
    x = x.astype(np.float32)
    if np.issubdtype(source_dtype, np.signedinteger):
        # Scale signed PCM by its own full-scale value (32768 for int16).
        # Deciding from the dtype rather than the peak keeps near-silent
        # clips from being left unscaled and scales wider PCM correctly.
        x = x / float(-int(np.iinfo(source_dtype).min))
    elif np.abs(x).max() > 1.0:
        # Non-integer data outside [-1, 1]: keep the original assumption
        # that the values are int16-range samples.
        x = x / 32768.0

    # Step 2 — stereo → mono, done before resampling so librosa gets 1-D data
    # NOTE(review): assumes 2-D input is (samples, channels), as the
    # axis=1 mean implies — confirm against the caller.
    if x.ndim == 2:
        x = x.mean(axis=1)

    # Step 3 — resample to 16 kHz
    if sr != AUDIO_SAMPLE_RATE:
        print(f"[Audio] Resampling {sr} Hz → {AUDIO_SAMPLE_RATE} Hz …")
        x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)
        print(f"[Audio] After resample: {len(x)} samples ({len(x) / AUDIO_SAMPLE_RATE:.2f}s)")

    # Step 4 — Wav2Vec2 inference
    inputs = audio_feature_extractor(
        x,
        sampling_rate=AUDIO_SAMPLE_RATE,
        return_tensors="pt",
        padding=True
    )

    with torch.no_grad():
        logits = audio_model(**inputs).logits

    probs = torch.softmax(logits, dim=-1)[0]

    # Map model label indices → real / fake probabilities.
    # None (not 0.0) marks "label not found", so a genuine probability of
    # exactly zero is never mistaken for a failed mapping.
    id2label = audio_model.config.id2label
    real_prob, fake_prob = None, None

    for idx, prob in enumerate(probs):
        mapped = LABEL_MAP.get(id2label[idx], id2label[idx].lower())
        if mapped == "real":
            real_prob = float(prob)
        elif mapped == "fake":
            fake_prob = float(prob)

    if real_prob is None and fake_prob is None:
        # Fallback: if label mapping failed, assume index order (0=real, 1=fake)
        print("[Audio] Warning: label mapping failed — using index order (0=real, 1=fake)")
        real_prob = float(probs[0])
        fake_prob = float(probs[1])
    elif real_prob is None:
        # Only "fake" was recognised; softmax sums to 1, so the remainder
        # is attributed to "real".
        real_prob = 1.0 - fake_prob
    elif fake_prob is None:
        fake_prob = 1.0 - real_prob

    # Step 5 — 3-class decision
    return classify_audio_3class(real_prob, fake_prob)
|