import gradio as gr
import inference_2 as inference

title = "Multimodal Deepfake Detector"

description = """
Deepfake detection for videos, images, and audio modalities.

**Example Workflow:**

1. **Upload Your Media File:**
   - Choose and upload a video, image, or audio file by clicking the "Upload" button in the respective tab.
2. **Select the Tab for Analysis:**
   - **Image Inference**: For analyzing images for face swapping or facial manipulation.
   - **Video Inference**: For detecting deepfakes in videos (e.g., face swaps or expressions).
   - **Audio Inference**: For detecting voice cloning or other audio manipulations.
3. **Review Results:**
   - The tool will process your file and report whether the media is real or fake, with details on any detected manipulations.
4. **Test Example Files:**
   - You can also try the preloaded example files to see how the model behaves on real and fake samples.

---

**Types of Deepfakes**

1. **Face Swapping**
   - **Purpose:** Replaces one person’s face with another.
   - **Process:** Extracts facial features (e.g., eyes, nose, mouth) from a source face and blends them into the target while maintaining expressions and structure.
   - **Applications:**
     - Creating deepfake videos where someone appears to be performing actions they didn’t.
     - Used in movies or for entertainment purposes.
2. **Facial Manipulation**
   - **Purpose:** Alters or modifies the expressions or movements of the face without changing the person’s identity.
   - **Process:** AI detects facial landmarks and adjusts them to create new appearances or expressions (e.g., changing mouth movements or eye positions).
   - **Applications:**
     - Lip-syncing to match audio in dubbing.
     - Adjusting expressions for storytelling in video or animation.
3. **Voice Cloning (Audio Deepfake)**
   - **Purpose:** Replicates a person’s voice, allowing AI to generate speech that sounds exactly like the target person.
   - **Process:** AI models are trained on samples of a person’s voice to mimic tone, pitch, accent, and speech patterns. Text-to-speech tools (e.g., WaveNet, Tacotron) are commonly used.
   - **Applications:**
     - **Positive:** Voiceovers for audiobooks or films, enhancing digital assistants, or helping individuals with speech loss.
     - **Negative:** Impersonation for scams, fake phone calls, or spreading misinformation.

---

### Comparison Table of Deepfake Techniques

| **Feature**          | **Face Swapping**                      | **Facial Manipulation**                 | **Voice Cloning**                |
|----------------------|----------------------------------------|-----------------------------------------|----------------------------------|
| **Primary Goal**     | Replace a face entirely with another.  | Modify facial expressions or movements. | Replicate a person’s voice.      |
| **Identity Impact**  | Changes the person’s identity.         | Retains the same identity.              | Imitates speech, not appearance. |
| **Complexity**       | Requires blending two separate faces.  | Alters one face.                        | Needs accurate voice data input. |
| **Example Use Case** | Fake celebrity videos.                 | Lip-syncing or adjusting emotions.      | Fraudulent calls or voiceovers.  |

---
"""

description_bottom = """
**Acknowledgments:** This tool is powered by advanced AI algorithms for deepfake detection.

**Team Project Contribution:**

This project is a collaborative effort by a dedicated team of engineers and AI enthusiasts. We contributed significantly to:

- Developing the detection pipeline for image and audio deepfakes.
- Testing and optimizing the model for real-world datasets.
- Designing the user interface to ensure an intuitive experience for users.

We are proud of our combined efforts to create a reliable tool for identifying deepfakes across multiple modalities.
""" # Define interfaces for each modality video_interface = gr.Interface( inference.deepfakes_video_predict, gr.Video(), "text", examples=["videos/celeb_synthesis.mp4", "videos/real-1.mp4"], cache_examples=False ) image_interface = gr.Interface( inference.deepfakes_image_predict, gr.Image(), "text", examples=["images/lady.jpg", "images/fake_image.jpg"], cache_examples=False ) audio_interface = gr.Interface( inference.deepfakes_spec_predict, gr.Audio(), "text", examples=["audios/DF_E_2000027.flac", "audios/DF_E_2000031.flac"], cache_examples=False ) # Combine into a Blocks container to include the description with gr.Blocks() as app: gr.Markdown(f"#