Spaces:

DeepSoft-AI
/

deepdetect

Sleeping

App Files Files Community

Kimata committed on Jul 18, 2023

Commit

8871fa0

1 Parent(s): 10efb9c

update changes

Browse files

Files changed (4) hide show

__pycache__/inference.cpython-39.pyc +0 -0
app.py +3 -3
inference.py +33 -52
requirements.txt +2 -1

__pycache__/inference.cpython-39.pyc ADDED Viewed

Binary file (6.2 kB). View file

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ title="Multimodal deepfake detector"
 description="Deepfake detection for videos, images and audio modalities."
-video_interface = gr.Interface(pipeline.deepfakes_video_predict,
                     gr.Video(),
                     "text",
                     examples = ["videos/celeb_synthesis.mp4", "videos/real-1.mp4"],
@@ -14,14 +14,14 @@ video_interface = gr.Interface(pipeline.deepfakes_video_predict,
                     )
-image_interface = gr.Interface(pipeline.deepfakes_image_predict,
                     gr.Image(),
                     "text",
                     examples = ["images/lady.jpg", "images/fake_image.jpg"],
                     cache_examples=False
                     )
-audio_interface = gr.Interface(pipeline.deepfakes_audio_predict,
                                gr.Audio(),
                                "text",
                                examples = ["audios/DF_E_2000027.flac", "audios/DF_E_2000031.flac"],

 description="Deepfake detection for videos, images and audio modalities."
+video_interface = gr.Interface(inference.deepfakes_video_predict,
                     gr.Video(),
                     "text",
                     examples = ["videos/celeb_synthesis.mp4", "videos/real-1.mp4"],
                     )
+image_interface = gr.Interface(inference.deepfakes_image_predict,
                     gr.Image(),
                     "text",
                     examples = ["images/lady.jpg", "images/fake_image.jpg"],
                     cache_examples=False
                     )
+audio_interface = gr.Interface(inference.deepfakes_spec_predict,
                                gr.Audio(),
                                "text",
                                examples = ["audios/DF_E_2000027.flac", "audios/DF_E_2000031.flac"],

inference.py CHANGED Viewed

@@ -5,8 +5,6 @@ import argparse
 import numpy as np
 import torch.nn as nn
 from models.TMC import ETMC
-from torchsummary import summary
 from models import image
 #Set random seed for reproducibility.
@@ -90,66 +88,70 @@ def load_spec_modality_model(args):
     spec_encoder.eval()
     return spec_encoder
 def preprocess_img(face):
     # Pre-change version of the image preprocessing helper; lines with a '-' gutter are the ones this commit removes.
     face = face / 255  # scale the pixel values by 1/255
     face = cv2.resize(face, (256, 256))  # resize to a fixed 256x256
-    face = face.permute(2, 0, 1) #(W, H, C) -> (C, W, H); NOTE(review): .permute is a torch.Tensor method, but face is the cv2.resize result here, presumably a NumPy array - confirm
-    face = torch.unsqueeze(face, dim = 0)  # add a leading axis of size 1; runs before the torch.Tensor conversion on the next line
-    face_pt = torch.Tensor(face)
     return face_pt
 def preprocess_audio(audio_file):
     # Pre-change version: adds a leading axis of size 1 and converts the result with torch.Tensor.
-    audio = torch.unsqueeze(audio_file, dim = 0)  # NOTE(review): unsqueeze is applied before the torch.Tensor conversion, so audio_file presumably must already be a tensor - confirm
-    audio_pt = torch.Tensor(audio)
     return audio_pt
 def deepfakes_spec_predict(input_audio):
     # Pre-change version of the audio predictor; lines with a '-' gutter are the ones this commit removes.
-    audio = preprocess_audio(input_audio)
-    #Load audio and multimodal model (done on every call in this version).
-    multimodal = load_multimodal_model()
-    spec_model = load_spec_modality_model()  # NOTE(review): called without arguments, while load_spec_modality_model is declared with an `args` parameter - confirm
     spec_grads = spec_model.forward(audio)
     multimodal_grads = multimodal.spec_depth[0].forward(spec_grads)
     out = nn.Softmax()(multimodal_grads)  # no explicit dim is passed to Softmax
     max = torch.argmax(out, dim = -1) #Index of the max value in the tensor. Note that this name shadows the builtin max().
     max_value = out[max] #Actual value of the tensor.
     if max_value > 0.5:
         preds = round(100 - (max_value*100), 3)
-        text2 = f"The audio is REAL. \n Deepfakes Confidence: {preds}%"
     else:
         preds = round(max_value*100, 3)
-        text2 = "The audio is FAKE. \n Deepfakes Confidence: {preds}%"  # not an f-string, so the text "{preds}" is emitted literally
-    return max, max_value, text2  # returns a 3-tuple: index, value and message
 def deepfakes_image_predict(input_image):
     # Pre-change version of the image predictor; lines with a '-' gutter are the ones this commit removes.
     face = preprocess_img(input_image)
-    #Load image and multimodal model (done on every call in this version).
-    multimodal = load_multimodal_model()
-    img_model = load_img_modality_model()
     img_grads = img_model.forward(face)
     multimodal_grads = multimodal.clf_rgb[0].forward(img_grads)
     out = nn.Softmax()(multimodal_grads)  # no explicit dim is passed to Softmax
-    max = torch.argmax(out, dim = -1) #Index of the max value in the tensor. Note that this name shadows the builtin max().
     max_value = out[max] #Actual value of the tensor.
     if max_value > 0.5:
         preds = round(100 - (max_value*100), 3)
-        text2 = f"The image is REAL. \n Deepfakes Confidence: {preds}%"
     else:
         preds = round(max_value*100, 3)
-        text2 = "The image is FAKE. \n Deepfakes Confidence: {preds}%"  # not an f-string, so the text "{preds}" is emitted literally
-    return max, max_value, text2  # returns a 3-tuple: index, value and message
 def preprocess_video(input_video, n_frames = 5):
@@ -181,9 +183,7 @@ def preprocess_video(input_video, n_frames = 5):
 def deepfakes_video_predict(input_video):
     '''Perform inference on a video.'''
     video_frames = preprocess_video(input_video)
-    #Load image and multimodal model.
-    multimodal = load_multimodal_model()
-    img_model = load_img_modality_model()
     real_grads = []
     fake_grads = []
@@ -192,38 +192,19 @@ def deepfakes_video_predict(input_video):
         multimodal_grads = multimodal.clf_rgb[0].forward(img_grads)
         out = nn.Softmax()(multimodal_grads)
-        real_grads.append(out[0].detach().numpy())
-        fake_grads.append(out[1].detch().numpy())
-        # max = torch.argmax(out, dim = -1) #Index of the max value in the tensor.
-        # max_value = out[max] #Actual value of the tensor.
     real_grads_mean = np.mean(real_grads)
     fake_grads_mean = np.mean(fake_grads)
     if real_grads_mean > fake_grads_mean:
         res = round(real_grads_mean * 100, 3)
-        text = f"The video is REAL. \n Deepfakes Confidence: {res}%"
     else:
         res = round(100 - (real_grads_mean * 100), 3)
-        text = f"The video is FAKE. \n Deepfakes Confidence: {res}%"
     return text
-def cli_main():  # command-line entry point; removed by this commit
-    parser = argparse.ArgumentParser(description="Train Models")
-    get_args(parser)  # get_args is defined outside this view; presumably it registers the options on the parser
-    args, remaining_args = parser.parse_known_args()
-    assert remaining_args == [], remaining_args  # fails if any argument was left unparsed
-    # image_multimodal_inference(args)
-    # spec_multimodal_inference(args)
-    model_summary(args)  # the only action still enabled; model_summary is defined outside this view
-if __name__ == "__main__":  # runs only when the file is executed as a script; removed by this commit
-    import warnings
-    warnings.filterwarnings("ignore")  # suppress warnings before running the CLI
-    cli_main()

 import numpy as np
 import torch.nn as nn
 from models.TMC import ETMC
 from models import image
 #Set random seed for reproducibility.
     spec_encoder.eval()
     return spec_encoder
+#Load models once at module level; the predict functions below read multimodal, spec_model and img_model as globals.
+parser = argparse.ArgumentParser(description="Train Models")
+get_args(parser)
+args, remaining_args = parser.parse_known_args()  # reads the process command line
+assert remaining_args == [], remaining_args  # NOTE(review): raises during import if any unrecognized command-line argument is present
+multimodal = load_multimodal_model(args)
+spec_model = load_spec_modality_model(args)
+img_model = load_img_modality_model(args)
 def preprocess_img(face):
     """Turn an image array into a scaled, batched tensor.

     The input is divided by 255, resized to 256x256 with OpenCV, has its
     last axis moved to the front, and is given a leading axis of size 1.
     Presumably `face` is an (H, W, C) array from the caller; confirm
     against the component that supplies it.
     """
     scaled = face / 255
     resized = cv2.resize(scaled, (256, 256))
     channels_first = resized.transpose(2, 0, 1)  # last axis becomes the first
     as_tensor = torch.Tensor(channels_first)
     return torch.unsqueeze(as_tensor, dim=0)
 def preprocess_audio(audio_file):
     """Convert audio samples to a tensor with a leading axis of size 1."""
     samples = torch.Tensor(audio_file)
     return torch.unsqueeze(samples, dim=0)
 def deepfakes_spec_predict(input_audio):
     """Classify an audio clip as REAL or FAKE.

     Args:
         input_audio: The value delivered by the ``gr.Audio()`` input, a
             ``(sample_rate, samples)`` pair.

     Returns:
         ``"The audio is REAL."`` or ``"The audio is FAKE."``.
     """
     # Gradio hands audio over as (sample_rate, data): the samples are the
     # second item. The previous code unpacked the sample rate as the audio.
     _, samples = input_audio
     audio = preprocess_audio(samples)

     with torch.no_grad():
         spec_grads = spec_model.forward(audio)
         multimodal_grads = multimodal.spec_depth[0].forward(spec_grads)
         # Explicit dim: normalize over the class axis.
         probs = torch.softmax(multimodal_grads, dim=-1)

     # Flatten so the comparison works with or without a leading batch axis
     # of size 1. The previous np.argmax over a single element was always 0,
     # which made every prediction "FAKE".
     probs = probs.detach().cpu().reshape(-1)

     # NOTE(review): index 0 is taken as the REAL class and index 1 as FAKE,
     # following deepfakes_video_predict - confirm against the training labels.
     if probs[0].item() > probs[1].item():
         return "The audio is REAL."
     return "The audio is FAKE."
 def deepfakes_image_predict(input_image):
     """Classify an image as REAL or FAKE.

     Args:
         input_image: The value delivered by the ``gr.Image()`` input; it is
             passed unchanged to ``preprocess_img``.

     Returns:
         ``"The image is REAL."`` or ``"The image is FAKE."``.
     """
     face = preprocess_img(input_image)

     with torch.no_grad():
         img_grads = img_model.forward(face)
         multimodal_grads = multimodal.clf_rgb[0].forward(img_grads)
         # Explicit dim: normalize over the class axis.
         probs = torch.softmax(multimodal_grads, dim=-1)

     # Flatten so the comparison works with or without a leading batch axis
     # of size 1. The previous np.argmax over a single element was always 0,
     # which made every prediction "FAKE".
     probs = probs.detach().cpu().reshape(-1)

     # NOTE(review): index 0 is taken as the REAL class and index 1 as FAKE,
     # following deepfakes_video_predict - confirm against the training labels.
     if probs[0].item() > probs[1].item():
         return "The image is REAL."
     return "The image is FAKE."
 def preprocess_video(input_video, n_frames = 5):
 def deepfakes_video_predict(input_video):
     '''Perform inference on a video.'''
     video_frames = preprocess_video(input_video)
     real_grads = []
     fake_grads = []
         multimodal_grads = multimodal.clf_rgb[0].forward(img_grads)
         out = nn.Softmax()(multimodal_grads)
+        real_grads.append(out.cpu().detach().numpy()[0])
+        print(f"Video out tensor shape is: {out.shape}, {out}")
+        fake_grads.append(out.cpu().detach().numpy()[0])
     real_grads_mean = np.mean(real_grads)
     fake_grads_mean = np.mean(fake_grads)
     if real_grads_mean > fake_grads_mean:
         res = round(real_grads_mean * 100, 3)
+        text = f"The video is REAL."
     else:
         res = round(100 - (real_grads_mean * 100), 3)
+        text = f"The video is FAKE."
     return text

requirements.txt CHANGED Viewed

@@ -6,4 +6,5 @@ moviepy
 librosa
 ffmpeg
 albumentations
-opencv-python

 librosa
 ffmpeg
 albumentations
+opencv-python
+torchsummary