# Scraped from Hugging Face Space "Simma7/deepfake_gaurd" (status: Sleeping).
# File size: 7,607 Bytes
# Revision: acf615d


import os
import re
import numpy as np
from PIL import Image


def _check_visual_noise(pil_img: Image.Image) -> tuple:
    """
    Detects copy-paste artefacts via local noise variance analysis.

    Forged regions often have suspiciously low or mismatched noise
    variance compared to genuine document background. The image is
    converted to greyscale and tiled into non-overlapping 32x32 blocks
    (partial blocks on the right/bottom edge are ignored). The coefficient
    of variation (CV = std / mean) of the per-block pixel variances is
    mapped onto a monotonically increasing score:

    - CV <= 2.0       -> 0.0
    - 2.0 < CV <= 3.5 -> rises linearly from 0.0 to 0.5
    - CV > 3.5        -> rises linearly from 0.5, saturating at 1.0 (CV 7.5)

    Args:
        pil_img: Image to analyse; only ``convert("L")`` is called on it.

    Returns:
        (score 0-1, detail string). When the image is smaller than one
        block in either dimension the score is 0.0 and the detail says
        the image is too small.
    """
    img_np = np.array(pil_img.convert("L"), dtype=np.float32)
    h, w = img_np.shape

    block = 32
    # "+ 1" keeps the last full block when a dimension is an exact
    # multiple of the block size (e.g. a 32x32 image yields one block).
    variances = [
        np.var(img_np[y:y + block, x:x + block])
        for y in range(0, h - block + 1, block)
        for x in range(0, w - block + 1, block)
    ]

    if not variances:
        return 0.0, "Could not analyse noise (image too small)"

    variances = np.array(variances)
    # High coefficient of variation → suspicious variance jumps.
    # The epsilon guards against division by zero on perfectly flat images.
    coef_var = float(np.std(variances) / (np.mean(variances) + 1e-8))

    # Thresholds tuned on document images
    if coef_var > 3.5:
        # Continue from the 0.5 reached at the top of the moderate band so
        # a "high" reading never scores below a "moderate" one.
        score = min(1.0, 0.5 + (coef_var - 3.5) / 4.0 * 0.5)
        detail = f"High variance inconsistency (CV={coef_var:.2f}) — possible copy-paste region"
    elif coef_var > 2.0:
        score = (coef_var - 2.0) / 1.5 * 0.5
        detail = f"Moderate variance anomaly (CV={coef_var:.2f})"
    else:
        score = 0.0
        detail = f"Noise pattern normal (CV={coef_var:.2f})"

    return score, detail


def _parse_pdf_date(raw: str):
    """
    Parse a PDF date string (``D:YYYYMMDDHHmmSSOHH'mm'``) into a datetime.

    Every field after the year is optional; missing month/day default to
    1 and missing time fields to 0. A ``+``/``-`` UTC offset following a
    complete timestamp is applied so the result is expressed in UTC; a
    date without an offset (or with ``Z``) is returned as written.

    Returns:
        A naive ``datetime``, or None when *raw* has no ``D:YYYY`` prefix
        or contains out-of-range calendar values.
    """
    from datetime import datetime, timedelta

    match = re.search(
        r"D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?"
        r"(?:([+-])(\d{2})'?(\d{2})?)?",
        raw,
    )
    if match is None:
        return None

    year, month, day, hour, minute, second, sign, off_h, off_m = match.groups()
    try:
        stamp = datetime(
            int(year), int(month or 1), int(day or 1),
            int(hour or 0), int(minute or 0), int(second or 0),
        )
        # Only trust an offset that follows a full timestamp; otherwise a
        # stray "-" (e.g. an ISO-style "2021-03-15") would be misread.
        if sign and second is not None:
            offset = timedelta(hours=int(off_h), minutes=int(off_m or 0))
            stamp = stamp - offset if sign == "+" else stamp + offset
    except (ValueError, OverflowError):
        return None
    return stamp


def _check_pdf_metadata(path: str) -> tuple:
    """
    Checks PDF metadata for common forgery signals:
    - Creator/producer tool mismatch (e.g. created by Microsoft tooling
      but produced by LibreOffice)                              (+0.40)
    - Modification date earlier than creation date, compared at full
      timestamp precision                                       (+0.35)
    - Missing creator and producer fields                       (+0.20)

    Args:
        path: Filesystem path of the PDF to inspect.

    Returns:
        (score 0-1, detail string). Any failure (PyPDF2 missing, file
        unreadable, malformed PDF) is reported as a score of 0.0 with a
        "PDF metadata check skipped: ..." detail instead of raising.
    """
    try:
        import PyPDF2
        with open(path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            meta = reader.metadata or {}
            # Read the fields while the file is still open, in case the
            # reader resolves values lazily. "or ''" keeps a None value
            # from turning into the truthy string "none".
            creator = str(meta.get("/Creator") or "").lower()
            producer = str(meta.get("/Producer") or "").lower()
            created = str(meta.get("/CreationDate") or "")
            modified = str(meta.get("/ModDate") or "")

        signals = []
        score = 0.0

        # Check 1: creator and producer mismatch (strong forgery signal)
        if creator and producer:
            known_suites = [
                ("microsoft", "libreoffice"), ("libreoffice", "adobe"),
                ("adobe", "libreoffice"), ("word", "ghostscript"),
            ]
            if any(c in creator and p in producer for c, p in known_suites):
                signals.append(f"Creator/producer mismatch: '{creator}' vs '{producer}'")
                score += 0.4

        # Check 2: modification predates creation. Unparseable dates are
        # skipped rather than treated as a signal.
        created_at = _parse_pdf_date(created)
        modified_at = _parse_pdf_date(modified)
        if (
            created_at is not None
            and modified_at is not None
            and modified_at < created_at
        ):
            signals.append(f"ModDate ({modified}) predates CreationDate ({created})")
            score += 0.35

        # Check 3: no standard metadata at all
        if not creator and not producer:
            signals.append("No creator/producer metadata — stripped or generated programmatically")
            score += 0.2

        score = min(1.0, score)
        detail = "; ".join(signals) if signals else "PDF metadata appears normal"
        return score, detail

    except Exception as e:
        # Best-effort check: never let a metadata problem abort the analysis.
        return 0.0, f"PDF metadata check skipped: {e}"


def _check_text_consistency(pil_img: Image.Image) -> tuple:
    """
    Uses pytesseract OCR to detect text size inconsistencies.

    Collects the bounding-box heights of OCR entries with confidence
    above 60 and height above 5, then maps the coefficient of variation
    (CV = std / mean) of those heights onto a monotonically increasing
    score:

    - CV <= 0.35       -> 0.0
    - 0.35 < CV <= 0.6 -> rises linearly from 0.0 to 0.4
    - CV > 0.6         -> rises linearly from 0.4, saturating at 1.0 (CV 1.2)

    Args:
        pil_img: Image passed straight to ``pytesseract.image_to_data``.

    Returns:
        (score 0-1, detail string). Fewer than five usable entries give
        (0.0, "Insufficient text regions ..."); any OCR failure
        (pytesseract missing, engine error) gives
        (0.0, "OCR check skipped (...)") instead of raising.
    """
    try:
        import pytesseract
        data = pytesseract.image_to_data(
            pil_img, output_type=pytesseract.Output.DICT
        )

        heights = []
        for h, conf in zip(data["height"], data["conf"]):
            # Confidence may arrive as int, float or numeric string
            # depending on the pytesseract version; normalise first so a
            # string value does not abort the whole check.
            try:
                confidence = float(conf)
            except (TypeError, ValueError):
                continue
            if confidence > 60 and h > 5:
                heights.append(h)

        if len(heights) < 5:
            return 0.0, "Insufficient text regions for OCR analysis"

        heights = np.array(heights, dtype=float)
        cv = float(np.std(heights) / (np.mean(heights) + 1e-8))

        if cv > 0.6:
            # Continue from the 0.4 reached at the top of the moderate
            # band so a "high" reading never scores below a "moderate" one.
            score = min(1.0, 0.4 + (cv - 0.6) / 0.6 * 0.6)
            detail = f"High font size variance (CV={cv:.2f}) — inconsistent text insertion likely"
        elif cv > 0.35:
            score = (cv - 0.35) / 0.25 * 0.4
            detail = f"Moderate font inconsistency (CV={cv:.2f})"
        else:
            score = 0.0
            detail = f"Text layout appears consistent (CV={cv:.2f})"

        return score, detail

    except Exception as e:
        # Best-effort check: OCR problems must not abort the analysis.
        return 0.0, f"OCR check skipped ({e})"


def _render_pdf_page(path: str) -> Image.Image:
    """
    Render first page of a PDF as a PIL Image.

    Uses PyMuPDF (``fitz``) at 2x scale. The document handle is always
    closed, even when rendering fails.

    Args:
        path: Filesystem path of the PDF.

    Returns:
        The first page as an RGB PIL image.

    Raises:
        Whatever PyMuPDF raises for an unreadable or empty document
        (e.g. indexing page 0 of a PDF with no pages), or whatever
        ``Image.open`` raises in the fallback path.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        # Fallback if PyMuPDF not installed — load as image directly.
        # NOTE(review): Pillow documents PDF as a write-only format, so
        # this likely raises for genuine PDFs — confirm the intent.
        return Image.open(path).convert("RGB")

    doc = fitz.open(path)
    try:
        page = doc[0]
        mat = fitz.Matrix(2, 2)   # 2x scale for better OCR
        pix = page.get_pixmap(matrix=mat)
        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # Release the document even if page access or rendering raised.
        doc.close()


def _threat_level(score: float):
    """
    Map a risk score to a (threat label, action label) pair.

    Bands are half-open on the upper side: scores below 0.20 are NONE,
    below 0.45 LOW, below 0.70 MEDIUM, and everything else HIGH.
    """
    # (exclusive upper bound, labels) in ascending order; first match wins.
    bands = (
        (0.20, ("NONE", " ALLOW")),
        (0.45, ("LOW", " LOG")),
        (0.70, ("MEDIUM", "ALERT")),
    )
    for upper_bound, labels in bands:
        if score < upper_bound:
            return labels
    return "HIGH", " BLOCK"


def detect_document(file_path: str) -> str:
    """
    Analyse one document and return a formatted forensic report.

    Files whose extension is ``.pdf`` are rendered to an image and also
    get the metadata check; every other file is opened as an image. The
    individual check scores are combined with fixed weights
    (PDF: 0.35 noise / 0.35 text / 0.30 metadata; image: 0.55 noise /
    0.45 text). A fused score of 0.40 or more is reported as forged.

    Any exception raised along the way is caught and returned as an
    error string rather than propagated.
    """
    try:
        suffix = os.path.splitext(file_path)[1].lower()
        pdf_input = suffix == ".pdf"

        if pdf_input:
            pil_img = _render_pdf_page(file_path)
            meta_score, meta_detail = _check_pdf_metadata(file_path)
        else:
            pil_img = Image.open(file_path).convert("RGB")
            meta_score, meta_detail = 0.0, "N/A (not a PDF)"

        noise_score, noise_detail = _check_visual_noise(pil_img)
        text_score, text_detail = _check_text_consistency(pil_img)

        if pdf_input:
            # All three checks relevant for PDFs
            fused = (noise_score * 0.35 + text_score * 0.35 + meta_score * 0.30)
        else:
            # Only visual checks for images (no metadata)
            fused = (noise_score * 0.55 + text_score * 0.45)

        forged = fused >= 0.40
        threat, action = _threat_level(fused)

        # 20-cell gauge: filled cells for the score, hollow for the rest.
        filled = int(fused * 20)
        bars = "█" * filled + "░" * (20 - filled)

        if forged:
            verdict = " FORGED"
            closing = "  FORGERY INDICATORS DETECTED. Recommend human review."
        else:
            verdict = "AUTHENTIC"
            closing = "✅  No significant forgery indicators found."

        rule = "─" * 40
        lines = [
            "",
            " DOCUMENT FORENSIC REPORT",
            "=" * 40,
            "",
            f"Verdict    : {verdict}",
            f"Risk Score : {fused:.2%}  [{bars}]",
            f"Threat     : {threat}",
            f"Action     : {action}",
            "",
            rule,
            "FORENSIC CHECKS",
            rule,
            "",
            "   Visual Noise Analysis",
            f"   Score  : {noise_score:.2%}",
            f"   Detail : {noise_detail}",
            "",
            "   Text/Font Consistency (OCR)",
            f"   Score  : {text_score:.2%}",
            f"   Detail : {text_detail}",
            "",
            "   PDF Metadata Integrity",
            f"   Score  : {meta_score:.2%}",
            f"   Detail : {meta_detail}",
            "",
            rule,
            closing,
            "",
        ]
        # Surrounding whitespace (including the banner's leading space)
        # is trimmed from the final report.
        return "\n".join(lines).strip()

    except Exception as err:
        return f" Document analysis error: {err!s}"