# Renu05's picture
# Update app.py
# 3a47659 verified
import gradio as gr
import inference_2 as inference
# App title rendered as the page heading inside the Blocks layout below.
title = "Multimodal Deepfake Detector"
# Markdown shown at the top of the app: usage workflow, a primer on the three
# deepfake categories this tool targets, and a comparison table. Rendered
# verbatim via gr.Markdown, so the text must remain valid Markdown.
description = """
Deepfake detection for videos, images, and audio modalities.
**Example Workflow:**
1. **Upload Your Media File:**
- Choose and upload a video, image, or audio file by clicking the "Upload" button in the respective tab.
2. **Select the Tab for Analysis:**
- **Image Inference**: For analyzing images for face swapping or facial manipulation.
- **Video Inference**: For detecting deepfakes in videos (e.g., face swaps or expressions).
- **Audio Inference**: For detecting voice cloning or other audio manipulations.
3. **Review Results:**
- The tool will process your file and provide a result, indicating if the media is real or fake with details on detected manipulations.
4. **Test Example Files:**
- You can also try the preloaded example files to see how the model works with real and fake samples.
---
**Types of Deepfakes**
1. **Face Swapping**
- **Purpose:** Replaces one person’s face with another.
- **Process:** Extracts facial features (e.g., eyes, nose, mouth) from a source face and blends them into the target while maintaining expressions and structure.
- **Applications:**
- Creating deepfake videos where someone appears to be performing actions they didn’t.
- Used in movies or for entertainment purposes.
2. **Facial Manipulation**
- **Purpose:** Alters or modifies the expressions or movements of the face without changing the person’s identity.
- **Process:** AI detects facial landmarks and adjusts them to create new appearances or expressions (e.g., changing mouth movements or eye positions).
- **Applications:**
- Lip-syncing to match audio in dubbing.
- Adjusting expressions for storytelling in video or animation.
3. **Voice Cloning (Audio Deepfake)**
- **Purpose:** Replicates a person’s voice, allowing AI to generate speech that sounds exactly like the target person.
- **Process:** AI models are trained on samples of a person’s voice to mimic tone, pitch, accent, and speech patterns. Text-to-speech tools (e.g., WaveNet, Tacotron) are commonly used.
- **Applications:**
- **Positive:** Voiceovers for audiobooks or films, enhancing digital assistants, or helping individuals with speech loss.
- **Negative:** Impersonation for scams, fake phone calls, or spreading misinformation.
---
### Comparison Table of Deepfake Techniques
| **Feature** | **Face Swapping** | **Facial Manipulation** | **Voice Cloning** |
|------------------------|--------------------------------------------|-------------------------------------------|----------------------------------------|
| **Primary Goal** | Replace a face entirely with another. | Modify facial expressions or movements. | Replicate a person’s voice. |
| **Identity Impact** | Changes the person’s identity. | Retains the same identity. | Imitates speech, not appearance. |
| **Complexity** | Requires blending two separate faces. | Alters one face. | Needs accurate voice data input. |
| **Example Use Case** | Fake celebrity videos. | Lip-syncing or adjusting emotions. | Fraudulent calls or voiceovers. |
---
"""
# Markdown rendered at the bottom of the app (acknowledgments / team credits),
# displayed after the tabbed inference interfaces.
description_bottom = """
**Acknowledgments:**
This tool is powered by advanced AI algorithms for deepfake detection.
**Team Project Contribution:**
This project is a collaborative effort by a dedicated team of engineers and AI enthusiasts.
We contributed significantly to:
- Developing the detection pipeline for images and audio deepfakes.
- Testing and optimizing the model for real-world datasets.
- Designing the user interface to ensure an intuitive experience for users.
We are proud of our combined efforts to create a reliable tool for identifying deepfakes across multiple modalities.
"""
# Define one gr.Interface per modality. Every interface follows the same
# recipe — a single media input, a plain-text verdict output, and two bundled
# example files (one real, one fake) with example caching disabled — so the
# construction is factored into a small local helper.
def _build_interface(predict_fn, input_component, example_files):
    """Return a gr.Interface with a text output and uncached examples."""
    return gr.Interface(
        predict_fn,
        input_component,
        "text",
        examples=example_files,
        cache_examples=False,
    )

video_interface = _build_interface(
    inference.deepfakes_video_predict,
    gr.Video(),
    ["videos/celeb_synthesis.mp4", "videos/real-1.mp4"],
)
image_interface = _build_interface(
    inference.deepfakes_image_predict,
    gr.Image(),
    ["images/lady.jpg", "images/fake_image.jpg"],
)
audio_interface = _build_interface(
    inference.deepfakes_spec_predict,
    gr.Audio(),
    ["audios/DF_E_2000027.flac", "audios/DF_E_2000031.flac"],
)
# Combine into a Blocks container to include the description
with gr.Blocks() as app:
    gr.Markdown(f"# <center>{title}</center>")
    gr.Markdown(description)

    # Illustrative figures, two per row.
    # FIX: the original passed a list of components positionally to
    # gr.Column([...]); gr.Column is a layout context manager and does not
    # accept children that way, so the image/markdown pairs were never laid
    # out inside columns. Components created under `with gr.Column():` are
    # placed in that column as intended.
    with gr.Row():
        with gr.Column():
            gr.Image("images/Deepfake 1.png", label="Real Example", elem_id="real-image", show_label=False, interactive=False)
            gr.Markdown("**Description:** A deepfake example where the face has been swapped with another. It demonstrates facial manipulation, where emotions are altered.")
        with gr.Column():
            gr.Image("images/fakeface.jpg", label="Deepfake Example", elem_id="fake-image", show_label=False, interactive=False)
            gr.Markdown("**Description:** The process of detecting deepfake images by splitting the dataset into real and fake faces, training a hyper-parameterized AI model.")

    # Additional images and descriptions, same two-column layout.
    with gr.Row():
        with gr.Column():
            gr.Image("images/fakeaudio1.png", label="Additional Real Image", elem_id="extra-image-1", show_label=False, interactive=False)
            gr.Markdown("**Description:** Two-phase approach for synthetic speech detection: the Sound Segmentation Phase and the Synthetic Speech Detection Phase.")
        with gr.Column():
            gr.Image("images/fakeaudio.png", label="Additional Deepfake Image", elem_id="extra-image-2", show_label=False, interactive=False)
            gr.Markdown("**Description:** Audio deepfake detection: training learns from real/fake data, while detection classifies audio using extracted features.")

    # One tab per modality, each hosting the matching gr.Interface.
    gr.TabbedInterface(
        interface_list=[image_interface, video_interface, audio_interface],
        tab_names=['Image Inference', 'Video Inference', 'Audio Inference']
    )
    gr.Markdown(description_bottom)
# Launch the app only when run as a script (share=False keeps it local —
# no public Gradio share link is created).
if __name__ == '__main__':
    app.launch(share=False)