"""Gate G3 regression harness — EfficientNetAutoAttB4 accuracy on anchor set.

Acceptance criteria (MERGE_PLAN §9.1 G3):
  - >=88% accuracy on the anchor set
  - <=8% real->fake false-positive rate

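Anchor set priority (per class; the first source that yields results is used):
  1. FFPP   — training/datasets/ffpp/c40/{real,fake}/ when present (full G3 gate, up to 50 images per class)
  2. LOCAL  — bundled ICPR2020 notebook/samples/ frames (fallback; minimal set, too small to run the G3 thresholds)
  3. DFDC   — training/datasets/dfdc/ when present (listed for completeness; not read by the anchor_set fixture in this file)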

NOTE: ThisPersonDoesNotExist.com (StyleGAN2) is NOT valid for G3 — EfficientNetAutoAttB4
is trained on DFDC video face-swaps and does NOT generalise to GAN-portrait detection.
The full G3 gate requires FFPP c40 data (run scripts/fit_calibrator.py first).

Run from backend/:
    .venv/Scripts/python.exe -m pytest tests/test_efficientnet_regression.py -v
"""
from __future__ import annotations

import io
import sys
import time
import urllib.request
from pathlib import Path
from typing import Tuple

import numpy as np
import pytest

# Directory two levels above this file. It is pushed to the front of sys.path
# (presumably so the `services.*` imports used by the fixtures resolve) and
# serves as the anchor for every data path below.
_BACKEND_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(_BACKEND_ROOT))

# ---------------------------------------------------------------------------
# Anchor image sources
# ---------------------------------------------------------------------------
# Local: ICPR2020 sample frames bundled with the model checkout. Their labels
# are taken from the scores the model itself assigns:
#   lynaeydofd_fr0.jpg → EfficientNet scores 0.011 (REAL)
#   mqzvfufzoq_fr0.jpg → EfficientNet scores 0.873 (FAKE)
_ICPR_SAMPLES = _BACKEND_ROOT.joinpath("models", "icpr2020dfdc", "notebook", "samples")
LOCAL_REAL_IMAGES = [_ICPR_SAMPLES.joinpath("lynaeydofd_fr0.jpg")]
LOCAL_FAKE_IMAGES = [_ICPR_SAMPLES.joinpath("mqzvfufzoq_fr0.jpg")]

# FFPP c40 frames on local disk (full G3 gate — only present once the
# training/datasets download scripts have been run).
_FFPP_C40 = _BACKEND_ROOT.joinpath("training", "datasets", "ffpp", "c40")
_FFPP_REAL = _FFPP_C40 / "real"
_FFPP_FAKE = _FFPP_C40 / "fake"
# File suffixes (lower-case, with the dot) that count as anchor images.
_IMAGE_EXTS = {".jpg", ".jpeg", ".png"}

# Network: thispersondoesnotexist.com — G2 gate (face detection) only.
# Deliberately NOT part of the G3 accuracy gate: StyleGAN2 portraits come from
# a different distribution than the DFDC video face-swaps the model was
# trained on.
TPDNE_URL = "https://thispersondoesnotexist.com/"


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------

def _fetch(url: str, timeout: int = 20) -> bytes:
    """Download *url* and return the complete response body as bytes.

    The request is sent with the ``DeepShield-Test/1.0`` User-Agent header and
    *timeout* (seconds) is handed to ``urllib.request.urlopen``. Any exception
    raised by ``urlopen`` or by reading the body propagates to the caller.
    """
    request = urllib.request.Request(
        url,
        headers={"User-Agent": "DeepShield-Test/1.0"},
    )
    response = urllib.request.urlopen(request, timeout=timeout)
    with response:
        payload = response.read()
    return payload


@pytest.fixture(scope="module")
def detector():
    """Build one ``EfficientNetDetector`` and share it across this module.

    The import is deferred to fixture time, so merely collecting this module
    does not import ``services.efficientnet_service``.
    """
    from services.efficientnet_service import EfficientNetDetector

    instance = EfficientNetDetector()
    return instance


@pytest.fixture(scope="module")
def anchor_set(detector) -> Tuple[list, list]:
    """Score the anchor images once per module.

    Sources are tried independently for each class (real / fake):

    1. FFPP c40 images (training/datasets/ffpp/c40/{real,fake}/) — full G3
       gate, capped at 50 images per class.
    2. Bundled ICPR2020 notebook samples — minimal sanity check, used only
       when the FFPP directory for that class produced no results.

    Args:
        detector: The module-scoped ``detector`` fixture; only its
            ``detect_image`` method is used here.

    Returns:
        ``(real_results, fake_results)`` — two lists holding whatever
        ``detector.detect_image`` returned for each scored image.
    """
    import logging

    from PIL import Image

    log = logging.getLogger(__name__)

    def score_image(path: Path):
        """Decode *path* as RGB and return the detector's result for it."""
        # Use the context manager so the file handle is released as soon as
        # the pixels are decoded instead of whenever the GC gets to it.
        with Image.open(path) as img:
            rgb = img.convert("RGB")
        return detector.detect_image(rgb)

    def score_dir(directory: Path, limit: int = 50) -> list:
        """Score up to *limit* images found recursively under *directory*."""
        if not directory.is_dir():
            return []
        # Sort for a deterministic selection; is_file() keeps directories that
        # merely carry an image-like suffix from using up slots.
        candidates = sorted(
            p for p in directory.rglob("*")
            if p.suffix.lower() in _IMAGE_EXTS and p.is_file()
        )[:limit]
        results = []
        for p in candidates:
            try:
                results.append(score_image(p))
            except Exception as exc:
                # Best-effort: one unreadable or unscorable file must not
                # abort the gate, but leave a trace so a shrinking anchor set
                # can be diagnosed.
                log.warning("anchor_set: skipped %s (%r)", p, exc)
        return results

    # --- FFPP c40 (full gate) ---
    real_results = score_dir(_FFPP_REAL)
    fake_results = score_dir(_FFPP_FAKE)

    # --- Fallback: bundled ICPR2020 samples ---
    # Errors here are NOT swallowed: a broken bundled sample should surface.
    if not real_results:
        real_results = [score_image(p) for p in LOCAL_REAL_IMAGES if p.exists()]
    if not fake_results:
        fake_results = [score_image(p) for p in LOCAL_FAKE_IMAGES if p.exists()]

    return real_results, fake_results


# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------

class TestEfficientNetRegression:
    """Gate checks G1, G2, G3 and G8 plus a no-face sanity check.

    All tests rely on the module-scoped ``detector`` fixture (directly, or
    through ``anchor_set``), so the model is loaded at most once per run.
    """

    def test_g1_detector_loads(self, detector):
        """G1: model loaded without crash."""
        assert detector is not None
        assert detector.net is not None
        assert detector.face_extractor is not None

    def test_g2_blazeface_detects_face_on_tpdne(self, detector):
        """G2: BlazeFace detects ≥1 face on a TPDNE image.

        The test is skipped, not failed, when the image cannot be downloaded:
        a missing network connection says nothing about face detection.
        """
        from PIL import Image

        try:
            data = _fetch(TPDNE_URL)
        except OSError as exc:
            # URLError, HTTPError and socket timeouts all derive from OSError.
            pytest.skip(f"TPDNE image unavailable ({exc!r}); G2 needs network access")

        pil = Image.open(io.BytesIO(data)).convert("RGB")
        img_np = np.array(pil)
        frame_data = detector.face_extractor.process_image(img=img_np)
        faces = frame_data.get("faces", [])
        assert len(faces) >= 1, "BlazeFace detected 0 faces on TPDNE image"

    def test_g3_accuracy_and_fpr(self, anchor_set):
        """G3: ≥88% accuracy and ≤8% real→fake FPR on the anchor set.

        Skipped when fewer than five anchor images produced a score, since
        percentages over so few samples are not meaningful.
        """
        real_results, fake_results = anchor_set

        # Only count images that produced a score (a None score, e.g. when no
        # face was found, is excluded from G3).
        real_scored = [r for r in real_results if r.get("score") is not None]
        fake_scored = [r for r in fake_results if r.get("score") is not None]

        total = len(real_scored) + len(fake_scored)
        if total < 5:
            pytest.skip("Too few face-detectable images in anchor set for meaningful G3 check")

        # Correctness is judged on the detector's own verdict string.
        real_correct = sum(1 for r in real_scored if r["result"] == "REAL")
        fake_correct = sum(1 for r in fake_scored if r["result"] == "FAKE")

        accuracy = (real_correct + fake_correct) / total * 100

        # max(..., 1) avoids a ZeroDivisionError when no real image was scored;
        # the FPR is then reported as 0%.
        fpr = (len(real_scored) - real_correct) / max(len(real_scored), 1) * 100

        print(f"\n  Anchor set: {len(real_scored)} real | {len(fake_scored)} fake")
        print(f"  Accuracy:   {accuracy:.1f}%  (need >=88%)")
        print(f"  FPR:        {fpr:.1f}%  (need <=8%)")
        for label, scored in (("REAL", real_scored), ("FAKE", fake_scored)):
            for r in scored:
                mark = "✓" if r["result"] == label else "✗"
                print(f"    [{label}] {mark} score={r['score']:.3f} cal={r.get('calibrator_applied')}")

        assert accuracy >= 88.0, f"G3 accuracy {accuracy:.1f}% < 88%"
        assert fpr <= 8.0, f"G3 FPR {fpr:.1f}% > 8%"

    def test_no_face_returns_gracefully(self, detector):
        """Noise image with no face should return error='no_face', not raise."""
        from PIL import Image

        # Fixed seed keeps the noise image identical across runs, so a failure
        # here is reproducible. The exclusive upper bound of 256 covers the
        # full uint8 range.
        rng = np.random.default_rng(0)
        pixels = rng.integers(0, 256, size=(224, 224, 3), dtype=np.uint8)
        noise = Image.fromarray(pixels)
        result = detector.detect_image(noise)
        assert result["error"] == "no_face"
        assert result["score"] is None

    def test_g8_memory_under_threshold(self, detector):
        """G8: RSS after model load < 2500 MB.

        Requesting the ``detector`` fixture guarantees the model is loaded
        before the measurement, even when this test is run on its own.
        """
        import psutil

        rss_mb = psutil.Process().memory_info().rss / 1024 / 1024
        print(f"\n  RSS: {rss_mb:.0f} MB")
        assert rss_mb < 2500, f"G8: RSS {rss_mb:.0f} MB exceeds 2500 MB threshold"