"""
app.py — Multimodal Deepfake Detection — Gradio Frontend
=========================================================
Supports:
  • Audio-only detection  (upload WAV / FLAC / MP3)
  • Video-only detection  (upload MP4 — runs GenConViT in-process, loaded via importlib)
  • Multimodal fusion     (upload video with audio track)

Hosting:
  HuggingFace Spaces (recommended) — set HF_SPACE=1 to auto-detect.
  Local:   python app.py

Model weights are downloaded at startup from HuggingFace Hub.
"""

import os
import sys
import json
import time
import tempfile
import subprocess
import textwrap
from pathlib import Path

# ---------------------------------------------------------------------------
# CRITICAL: Monkey-patch gradio_client BEFORE importing gradio.
# Root cause: gradio_client/utils.py:_json_schema_to_python_type() receives
# a bool (False) as `schema` when processing Video/Audio component schemas
# that contain `"additionalProperties": false`. The function then does
# `if "const" in schema` which crashes because booleans are not iterable.
# This bug lives in: gradio_client<=0.9.1 (fixed in 0.10.0 / gradio>=5.0).
# Since we target gradio 4.44.x for Python 3.10, we patch it in-process.
# ---------------------------------------------------------------------------
def _patch_gradio_client():
    """Make gradio_client's JSON-schema converter tolerate non-dict schemas.

    ``gradio_client.utils._json_schema_to_python_type`` is replaced by a
    wrapper that returns the string ``"any"`` whenever ``schema`` is not a
    dict (for example the bool coming from ``"additionalProperties": false``)
    and otherwise delegates to the original function.

    Any exception raised while patching is swallowed on purpose so that the
    application can still start.
    """
    try:
        import gradio_client.utils as client_utils

        original_converter = client_utils._json_schema_to_python_type

        def tolerant_converter(schema, defs=None):
            # The original converter only understands dict schemas; anything
            # else gets a harmless placeholder type name instead of a crash.
            if isinstance(schema, dict):
                return original_converter(schema, defs)
            return "any"

        client_utils._json_schema_to_python_type = tolerant_converter
    except Exception:
        # If gradio_client is not yet installed or already patched, skip.
        pass


_patch_gradio_client()

import gradio as gr


# ---------------------------------------------------------------------------
# Project paths
# ---------------------------------------------------------------------------
# Folder holding this file, and the project root one level above it.
APP_DIR = Path(__file__).parent.resolve()
PROJ_DIR = APP_DIR.parent

# Per-modality folders located directly under the project root.
AUDIO_DIR, FUSION_DIR, VIDEO_DIR = (
    PROJ_DIR / folder
    for folder in ("audio_detection", "fusion", "video_detection")
)
GENCONVIT_WEIGHT_DIR = VIDEO_DIR.joinpath("GenConViT", "weight")

# Put the fusion and audio folders at the front of the import path
# (fusion first, audio second — the same order two successive
# ``insert(0, ...)`` calls for audio then fusion would produce).
sys.path[:0] = [str(FUSION_DIR), str(AUDIO_DIR)]


# ---------------------------------------------------------------------------
# GenConViT weight bootstrap — download at startup if missing
# ---------------------------------------------------------------------------
# Checkpoint file name -> download URL for each pretrained GenConViT weight.
_GENCONVIT_WEIGHTS = {
    "genconvit_ed_inference.pth":  "https://huggingface.co/Deressa/GenConViT/resolve/main/genconvit_ed_inference.pth",
    "genconvit_vae_inference.pth": "https://huggingface.co/Deressa/GenConViT/resolve/main/genconvit_vae_inference.pth",
}


def _ensure_genconvit_weights():
    """Download GenConViT pretrained weights if they are not already present.

    Each missing file is first downloaded to a sibling ``<name>.part`` file
    and only renamed to its final name once the transfer has completed.
    A failed or interrupted download therefore never leaves a truncated file
    under the final name, which a later start-up would otherwise report as
    "already present".

    Download failures are printed and otherwise ignored, so the caller keeps
    running even when a weight file could not be fetched.
    """
    import urllib.request

    GENCONVIT_WEIGHT_DIR.mkdir(parents=True, exist_ok=True)
    for fname, url in _GENCONVIT_WEIGHTS.items():
        dest = GENCONVIT_WEIGHT_DIR / fname
        if dest.exists():
            print(f"[Video] Weight already present: {fname}")
            continue

        print(f"[Video] Downloading {fname} from HuggingFace …")
        partial = dest.with_name(dest.name + ".part")
        try:
            urllib.request.urlretrieve(url, str(partial))
            # Publish the file under its final name only after a full download.
            partial.replace(dest)
            print(f"[Video] ✓ {fname} downloaded ({dest.stat().st_size // 1_048_576} MB)")
        except Exception as exc:
            print(f"[Video] ✗ Failed to download {fname}: {exc}")
            # Discard whatever was written so far; it may not exist at all.
            try:
                partial.unlink()
            except OSError:
                pass

# Called at module import time, before any of the definitions below are used.
_ensure_genconvit_weights()


# ---------------------------------------------------------------------------
# Lazy model loading
# ---------------------------------------------------------------------------
# Module-level caches; each stays None until its getter is first called.
_audio_detector = None
_fusion_module  = None


def get_audio_detector():
    """Return the shared AudioDeepfakeDetector, creating it on first call.

    The detector is constructed with ``device="cuda"`` when
    ``_cuda_available()`` returns True and ``device="cpu"`` otherwise, then
    stored in the module-level ``_audio_detector`` for reuse.
    """
    global _audio_detector
    if _audio_detector is not None:
        return _audio_detector

    # Imported lazily so the module is only loaded when audio is first needed.
    from inference import AudioDeepfakeDetector

    target_device = "cuda" if _cuda_available() else "cpu"
    _audio_detector = AudioDeepfakeDetector(device=target_device)
    return _audio_detector


def get_fusion():
    """Return the shared MultimodalFusion instance, creating it on first call.

    The instance is built with ``strategy="weighted_average"`` and
    ``alpha=0.5`` and stored in the module-level ``_fusion_module``.
    """
    global _fusion_module
    if _fusion_module is not None:
        return _fusion_module

    # Imported lazily so the module is only loaded when fusion is first needed.
    from fusion import MultimodalFusion

    fusion_settings = {"strategy": "weighted_average", "alpha": 0.5}
    _fusion_module = MultimodalFusion(**fusion_settings)
    return _fusion_module


def _cuda_available():
    """Return ``torch.cuda.is_available()``, or False when torch cannot be imported."""
    has_cuda = False
    try:
        import torch

        has_cuda = torch.cuda.is_available()
    except ImportError:
        # No torch: keep the False default.
        pass
    return has_cuda


# ---------------------------------------------------------------------------
# Inference helpers
# ---------------------------------------------------------------------------

def _score_bar(score_fake: float) -> str:
    """Build a text-based confidence bar for display.

    Args:
        score_fake: Fake score, expected in 0..1 where 1 means 100% fake.

    Returns:
        A string such as ``"[██████████░░░░░░░░░░]  50.0% Fake"``. The bar is
        always exactly 20 cells wide: the number of filled cells is clamped to
        0..20, so a score outside 0..1 can no longer produce a bar that is too
        long. The percentage after the bar shows the score as given.
    """
    total_cells = 20
    filled = int(round(score_fake * total_cells))
    # Clamp: a negative repeat count yields "" in Python, which previously made
    # the other half of the bar overshoot the 20-cell width.
    filled = min(max(filled, 0), total_cells)
    bar = "█" * filled + "░" * (total_cells - filled)
    return f"[{bar}]  {score_fake*100:.1f}% Fake"


def run_audio_inference(audio_path: str):
    """Run Nes2Net on a local audio file. Returns result dict.

    The working directory is switched to ``AUDIO_DIR`` while the detector is
    obtained and run (presumably because the detector resolves files relative
    to that folder — confirm against ``audio_detection/inference.py``). It is
    always set to ``PROJ_DIR`` afterwards, including when loading the detector
    or predicting raises, so a failed request cannot leave the process stuck
    in ``AUDIO_DIR``.

    Args:
        audio_path: Path of the audio file to analyse, or None.

    Returns:
        Whatever the detector's ``predict`` returns, or None when
        ``audio_path`` is None.

    Raises:
        Any exception raised by ``get_audio_detector()`` or ``predict``.
    """
    if audio_path is None:
        return None
    os.chdir(str(AUDIO_DIR))
    try:
        return get_audio_detector().predict(audio_path)
    finally:
        os.chdir(str(PROJ_DIR))


def run_video_inference_via_subprocess(video_path: str):
    """
    Run GenConViT video inference on ``video_path``.

    ``video_detection/inference.py`` is loaded from its file path under the
    module name "video_inference" rather than through a normal import, so it
    does not clash with the audio module that is also called ``inference``.

    Returns the detector's ``predict`` result. If anything in the loading or
    prediction steps raises, a fallback dict is returned instead whose
    "error" entry holds the exception text and whose "label" is "Unavailable".
    """
    script_path = PROJ_DIR / "video_detection" / "inference.py"
    try:
        from importlib import util as import_util

        # A unique module name plus an absolute file path keeps this load
        # independent of whatever 'inference' module is already cached.
        module_spec = import_util.spec_from_file_location(
            "video_inference", str(script_path)
        )
        video_module = import_util.module_from_spec(module_spec)
        module_spec.loader.exec_module(video_module)

        detector_options = {
            "ed_weight": "genconvit_ed_inference",
            "vae_weight": "genconvit_vae_inference",
            "num_frames": 15,
            "fp16": False,
        }
        detector = video_module.VideoDeepfakeDetector(**detector_options)
        return detector.predict(video_path)
    except Exception as exc:
        # Same keys, in the same order, as the original fallback payload.
        failure = {"error": str(exc), "label": "Unavailable"}
        failure.update(
            dict.fromkeys(("prob_fake", "prob_real", "normalized_score"))
        )
        failure["faces_detected"] = False
        return failure


def extract_audio_from_video(video_path: str) -> str | None:
    """Extract 16kHz mono WAV from a video file using ffmpeg."""
    out_path = os.path.join(tempfile.gettempdir(), "deepfake_extracted.wav")
    try:
        subprocess.run(
            [
                "ffmpeg", "-y", "-i", video_path,
                "-vn", "-acodec", "pcm_s16le",
                "-ar", "16000", "-ac", "1",
                out_path,
            ],
            capture_output=True, check=True,
        )
        return out_path
    except Exception:
        return None


# ---------------------------------------------------------------------------
# Gradio callbacks
# ---------------------------------------------------------------------------

def predict_audio(audio_file):
    """Called when user submits an audio file.

    Args:
        audio_file: Path of the submitted audio file, or None when nothing
            was provided.

    Returns:
        A 3-tuple ``(summary, details, button_update)``: the HTML (or plain
        message) for the result panel, the detector result as indented JSON
        (empty string when there is none), and a ``gr.update`` that shows the
        "raw output" button only after a successful prediction.
    """
    from html import escape

    if audio_file is None:
        return (
            "No file uploaded.",
            "",
            gr.update(visible=False),
        )
    try:
        t0 = time.time()
        result = run_audio_inference(audio_file)
        elapsed = time.time() - t0

        if result is None:
            return "Model not loaded.", "", gr.update(visible=False)

        label      = result.get("label", "Unknown")
        real_score = result.get("real_score", 0.5)
        fake_score = 1 - real_score
        # Fall back to the distance from 0.5, rescaled to 0..1.
        conf       = result.get("confidence", abs(real_score - 0.5) * 2)

        is_fake = label.upper() == "FAKE"
        verdict = "FAKE AUDIO DETECTED" if is_fake else "GENUINE AUDIO"
        colour  = "#ff4b4b" if is_fake else "#21c354"
        status  = "fake" if is_fake else "real"

        summary_html = f"""
        <div class="glass-panel status-{status}">
          <div class="result-title" style="color: {colour};">{verdict}</div>
          <div class="score-container">
            <span class="score-label">Confidence Metrics</span>
            <span class="score-value" style="color: {colour};">{conf:.1%}</span>
          </div>
          <div class="score-container">
            <span class="score-label">P(Fake) vs P(Real)</span>
            <span class="score-value">{fake_score:.4f} &nbsp;&nbsp;|&nbsp;&nbsp; {real_score:.4f}</span>
          </div>
          <div class="progress-track">
            <div class="progress-fill" style="width: {fake_score * 100}%;"></div>
          </div>
          <div style="text-align: right; margin-top: 12px; font-size: 0.85rem; color: #64748b;">
            Execution Time: {elapsed:.2f}s
          </div>
        </div>
        """
        details = json.dumps(result, indent=2)
        return summary_html, details, gr.update(visible=True)
    except Exception as exc:
        # Exception text can echo file names or other external data and is
        # rendered as HTML, so it must be escaped.
        return f"<b>Error:</b> {escape(str(exc))}", "", gr.update(visible=False)


def predict_video(video_file):
    """Called when user submits a video file.

    Args:
        video_file: Path of the submitted video, or None when nothing was
            provided.

    Returns:
        A 3-tuple ``(summary, details, button_update)``: the HTML (or plain
        message) for the result panel, the detector result as indented JSON
        (empty string when there is none), and a ``gr.update`` controlling
        the visibility of the "raw output" button. When the detector result
        carries an "error" entry, an explanatory message is shown together
        with the raw result.
    """
    from html import escape

    if video_file is None:
        return "No file uploaded.", "", gr.update(visible=False)
    try:
        t0 = time.time()
        result = run_video_inference_via_subprocess(video_file)
        elapsed = time.time() - t0

        if result.get("error"):
            # The error text is rendered as HTML, so escape it first.
            return (
                f"<b style='color:#ff4b4b'>Video model unavailable locally.</b>"
                f"<br><small>{escape(str(result['error']))}</small>"
                f"<br><br>To evaluate videos, run the notebook on Colab/Kaggle.",
                json.dumps(result, indent=2),
                gr.update(visible=True),
            )

        label      = result.get("label", "Unknown")
        prob_fake  = result.get("prob_fake", 0.5)
        prob_real  = result.get("prob_real", 0.5)
        faces      = result.get("faces_detected", False)

        is_fake   = label.upper() == "FAKE"
        verdict   = "DEEPFAKE VIDEO DETECTED" if is_fake else "GENUINE VIDEO"
        colour    = "#ff4b4b" if is_fake else "#21c354"
        status    = "fake" if is_fake else "real"
        face_text = "Face Found" if faces else "No Face"

        summary_html = f"""
        <div class="glass-panel status-{status}">
          <div class="result-title" style="color: {colour};">{verdict}</div>
          <div class="score-container">
            <span class="score-label">Facial Extraction Target</span>
            <span class="score-value">{face_text}</span>
          </div>
          <div class="score-container">
            <span class="score-label">P(Fake) vs P(Real)</span>
            <span class="score-value">{prob_fake:.4f} &nbsp;&nbsp;|&nbsp;&nbsp; {prob_real:.4f}</span>
          </div>
          <div class="progress-track">
            <div class="progress-fill" style="width: {prob_fake * 100}%;"></div>
          </div>
          <div style="text-align: right; margin-top: 12px; font-size: 0.85rem; color: #64748b;">
            Execution Time: {elapsed:.2f}s
          </div>
        </div>
        """
        return summary_html, json.dumps(result, indent=2), gr.update(visible=True)
    except Exception as exc:
        # Exception text can echo file names or other external data and is
        # rendered as HTML, so it must be escaped.
        return f"<b>Error:</b> {escape(str(exc))}", "", gr.update(visible=False)


def predict_multimodal(video_file):
    """Fuse audio + video scores from a single video file.

    The audio track is extracted to a WAV file, both detectors are run, and
    their results are combined with
    ``MultimodalFusion.from_detector_results`` (weighted average, alpha 0.5).
    The extracted WAV file is deleted again before returning, whether or not
    the analysis succeeded.

    Args:
        video_file: Path of the submitted video, or None when nothing was
            provided.

    Returns:
        A 3-tuple ``(summary, details, button_update)``: the HTML (or plain
        message) for the result panel, the fused result as indented JSON
        (empty string on failure), and a ``gr.update`` that shows the
        "raw output" button only after a successful run.
    """
    from html import escape

    if video_file is None:
        return "No file uploaded.", "", gr.update(visible=False)

    audio_path = None
    try:
        t0 = time.time()

        # Extract audio
        audio_path = extract_audio_from_video(video_file)

        # Run both modalities; audio is skipped when extraction failed.
        audio_result = run_audio_inference(audio_path) if audio_path else None
        video_result = run_video_inference_via_subprocess(video_file)

        # Fuse
        from fusion import MultimodalFusion
        _, fused = MultimodalFusion.from_detector_results(
            audio_result, video_result, strategy="weighted_average", alpha=0.5
        )

        elapsed = time.time() - t0
        label  = fused["label"]
        score  = fused["fused_score"]
        mods   = ", ".join(fused["modalities_used"]) or "none"

        # The panel shows the complement of the fused score as the fake score.
        fake_score = 1 - score
        is_fake = label.upper() == "FAKE"
        verdict = "DEEPFAKE DETECTED" if is_fake else "GENUINE MEDIA"
        colour  = "#ff4b4b" if is_fake else "#21c354"
        status  = "fake" if is_fake else "real"

        summary_html = f"""
        <div class="glass-panel status-{status}">
          <div class="result-title" style="color: {colour};">{verdict}</div>
          <div class="score-container">
            <span class="score-label">Multimodal Fused Score (Fake)</span>
            <span class="score-value" style="color: {colour};">{fake_score * 100:.2f}%</span>
          </div>
          <div class="progress-track">
            <div class="progress-fill" style="width: {fake_score * 100}%;"></div>
          </div>
          
          <div style="display: flex; gap: 16px; margin-top: 16px;">
              <div class="score-container" style="flex: 1;">
                <span class="score-label">Audio P(Real)</span>
                <span class="score-value">{fused.get('audio_score', 'N/A')}</span>
              </div>
              <div class="score-container" style="flex: 1;">
                <span class="score-label">Video P(Real)</span>
                <span class="score-value">{fused.get('video_score', 'N/A')}</span>
              </div>
          </div>
          
          <div style="text-align: right; margin-top: 16px; font-size: 0.8rem; color: #64748b;">
            Active Streams: {mods} | Inference Time: {elapsed:.2f}s
          </div>
        </div>
        """
        return summary_html, json.dumps(fused, indent=2), gr.update(visible=True)
    except Exception as exc:
        # Exception text can echo file names or other external data and is
        # rendered as HTML, so it must be escaped.
        return f"<b>Error:</b> {escape(str(exc))}", "", gr.update(visible=False)
    finally:
        # The extracted audio is only needed for this request; remove it so
        # temporary files do not pile up.
        if audio_path:
            try:
                os.remove(audio_path)
            except OSError:
                pass


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------

# Values that several theme settings share.
_PAGE_GRADIENT = "linear-gradient(135deg, #0f172a 0%, #1e1b4b 100%)"
_BLOCK_FILL = "rgba(30, 41, 59, 0.7)"

# Overrides applied on top of the Soft base theme.
_THEME_OVERRIDES = {
    # Page background (same gradient for light and dark mode)
    "body_background_fill": _PAGE_GRADIENT,
    "body_background_fill_dark": _PAGE_GRADIENT,
    # Blocks
    "block_background_fill": _BLOCK_FILL,
    "block_background_fill_dark": _BLOCK_FILL,
    "block_border_color": "rgba(148, 163, 184, 0.2)",
    "block_border_width": "1px",
    "block_label_text_color": "#cbd5e1",
    "block_shadow": "0 8px 32px 0 rgba(0, 0, 0, 0.3)",
    # Primary buttons
    "button_primary_background_fill": "linear-gradient(90deg, #8b5cf6 0%, #3b82f6 100%)",
    "button_primary_background_fill_hover": "linear-gradient(90deg, #7c3aed 0%, #2563eb 100%)",
    "button_primary_text_color": "#ffffff",
    # Inputs and panels
    "input_background_fill": "rgba(15, 23, 42, 0.6)",
    "input_border_color": "rgba(99, 102, 241, 0.3)",
    "panel_background_fill": "rgba(30, 41, 59, 0.4)",
}

# Soft base theme (violet / blue / slate, Outfit font) with the overrides above.
THEME = gr.themes.Soft(
    primary_hue="violet",
    secondary_hue="blue",
    neutral_hue="slate",
    font=[gr.themes.GoogleFont("Outfit"), "sans-serif"],
).set(**_THEME_OVERRIDES)

# Markdown blurb describing the system; begins and ends with a newline.
DESCRIPTION = "\n".join(
    (
        "",
        "## Multimodal Deepfake Detection System",
        "**Nes2Net (audio)** | **GenConViT (video)** | **Late fusion**",
        "",
        "> Upload an audio clip, a video, or a video with audio to detect deepfakes.",
        "",
    )
)


# Stylesheet handed to gr.Blocks(css=...) in build_ui(). It loads the Outfit
# font, restyles the Gradio container and its wrappers, and defines the classes
# used by the hero banner (hero-*), by the result cards that the predict_*
# callbacks emit (glass-panel, status-fake/status-real, result-title, score-*,
# progress-*), and by the multimodal launch button (fusion-btn).
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600;800&display=swap');

/* Force overriding Gradio's internal container instead of just 'body' */
.gradio-container {
    background-color: #0b0f19 !important;
    background-image: radial-gradient(at 0% 0%, rgba(17, 24, 39, 1) 0, transparent 50%), radial-gradient(at 100% 0%, rgba(30, 27, 75, 1) 0, transparent 50%), radial-gradient(at 50% 100%, rgba(15, 23, 42, 1) 0, transparent 50%) !important;
    background-attachment: fixed !important;
    color: #f8fafc !important;
    font-family: 'Outfit', sans-serif !important;
}

/* Force standard Gradio wrappers to be slightly transparent to see background */
.wrap, .panel, .gap, .form {
    background-color: rgba(15, 23, 42, 0.4) !important;
    border-color: rgba(255, 255, 255, 0.05) !important;
}

.hero-header {
    text-align: center;
    padding: 30px 10px;
    margin-bottom: 30px;
    background: rgba(15, 23, 42, 0.4) !important;
    border-radius: 16px;
    border: 1px solid rgba(255, 255, 255, 0.05) !important;
    backdrop-filter: blur(20px);
    box-shadow: 0 10px 40px rgba(0,0,0,0.5);
}
.hero-title {
    font-size: 3rem;
    font-weight: 800;
    text-transform: uppercase;
    letter-spacing: 2px;
    background: linear-gradient(to right, #8b5cf6, #3b82f6, #06b6d4) !important;
    -webkit-background-clip: text !important;
    background-clip: text !important;
    -webkit-text-fill-color: transparent !important;
    margin-bottom: 10px;
}
.hero-subtitle {
    font-size: 1.1rem;
    color: #94a3b8 !important;
    font-weight: 300;
}

.glass-panel {
    background: linear-gradient(145deg, rgba(30, 41, 59, 0.6) 0%, rgba(15, 23, 42, 0.8) 100%);
    backdrop-filter: blur(24px);
    border: 1px solid rgba(148, 163, 184, 0.1);
    border-radius: 20px;
    padding: 32px;
    box-shadow: 0 15px 35px rgba(0,0,0,0.5), inset 0 1px 0 rgba(255,255,255,0.05);
    margin-bottom: 24px;
    transition: transform 0.3s ease, box-shadow 0.3s ease;
}
.glass-panel:hover {
    transform: translateY(-5px);
    box-shadow: 0 20px 40px rgba(0,0,0,0.6), inset 0 1px 0 rgba(255,255,255,0.1);
}

.status-fake { border-top: 4px solid #f43f5e; box-shadow: 0 10px 40px rgba(244, 63, 94, 0.15); }
.status-real { border-top: 4px solid #10b981; box-shadow: 0 10px 40px rgba(16, 185, 129, 0.15); }
.result-title { font-size: 2.2rem; font-weight: 800; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 24px; text-align: center; text-shadow: 0 4px 10px rgba(0,0,0,0.4); }

.score-container { display: flex; justify-content: space-between; align-items: center; padding: 16px 20px; background: rgba(0, 0, 0, 0.3); border-radius: 12px; margin-top: 16px; border: 1px solid rgba(255,255,255,0.03); }
.score-label { font-size: 1rem; color: #94a3b8; font-weight: 600; text-transform: uppercase; letter-spacing: 1px; }
.score-value { font-size: 1.5rem; font-weight: 800; color: #f8fafc; }

@keyframes fillout { from { width: 0; opacity: 0; } to { opacity: 1; } }
.progress-track { width: 100%; height: 14px; background: rgba(0, 0, 0, 0.5); border-radius: 7px; overflow: hidden; margin-top: 12px; box-shadow: inset 0 2px 4px rgba(0,0,0,0.5); }
.progress-fill { height: 100%; border-radius: 7px; animation: fillout 1.2s cubic-bezier(0.16, 1, 0.3, 1) forwards; position: relative; }
.status-fake .progress-fill { background: linear-gradient(90deg, #be123c, #f43f5e); }
.status-real .progress-fill { background: linear-gradient(90deg, #047857, #10b981); }

.gradio-container .prose * { padding: 0 !important; }

.fusion-btn {
    background: linear-gradient(90deg, #8b5cf6, #3b82f6) !important;
    border: none !important;
    box-shadow: 0 0 15px rgba(139, 92, 246, 0.5) !important;
    animation: pulseGlow 2s infinite;
}
@keyframes pulseGlow {
    0% { box-shadow: 0 0 15px rgba(139, 92, 246, 0.5); }
    50% { box-shadow: 0 0 30px rgba(139, 92, 246, 0.9); }
    100% { box-shadow: 0 0 15px rgba(139, 92, 246, 0.5); }
}
"""

def build_ui():
    """Assemble the Gradio Blocks app with its three detection tabs.

    Each tab has an input column (upload widget plus an analyse button) and a
    result column (result HTML, a hidden raw-JSON textbox and a hidden button
    that reveals that textbox).

    Returns:
        The constructed ``gr.Blocks`` instance; it is not launched here.
    """

    def make_output_column():
        """Create the right-hand result column inside the current row."""
        with gr.Column(scale=2):
            summary = gr.HTML(label="Result")
            raw_json = gr.Textbox(
                label="Raw JSON output",
                lines=10,
                visible=False,
                interactive=False,
            )
            reveal = gr.Button("Show raw output", size="sm", visible=False)
        return summary, raw_json, reveal

    def wire(run_button, callback, source, summary, raw_json, reveal):
        """Register the reveal handler, then the prediction handler."""
        reveal.click(lambda: gr.update(visible=True), outputs=raw_json)
        run_button.click(
            callback,
            inputs=source,
            outputs=[summary, raw_json, reveal],
        )

    # Injecting CSS strictly inline to bypass Windows path resolution bugs in Gradio 4
    with gr.Blocks(theme=THEME, css=CUSTOM_CSS, title="Multimodal Deepfake Detection") as demo:

        # Premium Hero Banner
        gr.HTML("""
        <div class="hero-header">
            <div class="hero-title">Multimodal Deepfake Detection</div>
            <div class="hero-subtitle">Industry-grade neural security layer fusing Nes2Net acoustics and GenConViT optics.</div>
        </div>
        """)

        with gr.Tabs():
            # ── Tab 1: Audio ──────────────────────────────────────────
            with gr.TabItem("Audio Detection"):
                gr.Markdown("### Upload a speech sample to detect AI-synthesised audio.")
                with gr.Row():
                    with gr.Column(scale=1):
                        audio_input = gr.Audio(
                            label="Upload Audio",
                            type="filepath",
                            sources=["upload", "microphone"],
                        )
                        audio_btn = gr.Button("Analyse Audio", variant="primary")
                    audio_outputs = make_output_column()

                wire(audio_btn, predict_audio, audio_input, *audio_outputs)

            # ── Tab 2: Video ──────────────────────────────────────────
            with gr.TabItem("Video Detection"):
                gr.Markdown(
                    "### Upload a video to detect face manipulation.\n"
                    "> **Note:** Full GPU inference requires Colab/Kaggle. "
                    "The local model may report 'Unavailable'."
                )
                with gr.Row():
                    with gr.Column(scale=1):
                        video_input = gr.Video(label="Upload Video")
                        video_btn = gr.Button("Analyse Video", variant="primary")
                    video_outputs = make_output_column()

                wire(video_btn, predict_video, video_input, *video_outputs)

            # ── Tab 3: Multimodal ────────────────────────────────────
            with gr.TabItem("Multimodal Fusion"):
                gr.Markdown(
                    "### Upload a video with audio to get a fused verdict.\n"
                    "Both the audio track and video frames will be analysed "
                    "and combined via weighted-average score fusion."
                )
                with gr.Row():
                    with gr.Column(scale=1):
                        mm_input = gr.Video(label="Upload Video (with audio)")
                        mm_btn = gr.Button(
                            "Launch Deep Multimodal Fusion 🚀",
                            variant="primary",
                            elem_classes=["fusion-btn"],
                        )
                    mm_outputs = make_output_column()

                wire(mm_btn, predict_multimodal, mm_input, *mm_outputs)

        gr.Markdown("""
---
**Model Architecture:**  
Audio — Wav2Vec 2.0 (XLSR-300M) + Nes2Net (ASVspoof 2021 DF checkpoint)  
Video — GenConViT (ED + VAE ensemble, GenConViT weights)  
Fusion — Weighted-average late fusion (α = 0.5)

**B.Tech Project** — Multimodal Deepfake Detection
        """)

    return demo


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Server settings used when the file is run directly as a script.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
        # disables /api endpoint — prevents gradio_client bool-schema crash
        "show_api": False,
    }

    demo = build_ui()
    demo.launch(**launch_options)