Spaces:

Simma7
/

deepfake_gaurd

Sleeping

App Files Files Community

Simma7 committed on Apr 19

Commit

acf615d

verified ·

1 Parent(s): 7c6ca25

Create document.py

Browse files

Files changed (1) hide show

prog/document.py +234 -0

prog/document.py ADDED Viewed

	@@ -0,0 +1,234 @@

+import os
+import re
+import numpy as np
+from PIL import Image
+def _check_visual_noise(pil_img: Image.Image) -> tuple:
+    """
+    Detects copy-paste artefacts via local noise variance analysis.
+    Forged regions often have suspiciously low or mismatched
+    noise variance compared to genuine document background.
+    Returns (score 0-1, detail string).
+    """
+    import cv2
+    img_np = np.array(pil_img.convert("L"), dtype=np.float32)
+    h, w = img_np.shape
+    block = 32
+    variances = []
+    for y in range(0, h - block, block):
+        for x in range(0, w - block, block):
+            patch = img_np[y:y+block, x:x+block]
+            variances.append(np.var(patch))
+    if not variances:
+        return 0.0, "Could not analyse noise (image too small)"
+    variances = np.array(variances)
+    # High coefficient of variation → suspicious variance jumps
+    coef_var = np.std(variances) / (np.mean(variances) + 1e-8)
+    # Thresholds tuned on document images
+    if coef_var > 3.5:
+        score = min(1.0, (coef_var - 3.5) / 4.0)
+        detail = f"High variance inconsistency (CV={coef_var:.2f}) — possible copy-paste region"
+    elif coef_var > 2.0:
+        score = (coef_var - 2.0) / 1.5 * 0.5
+        detail = f"Moderate variance anomaly (CV={coef_var:.2f})"
+    else:
+        score = 0.0
+        detail = f"Noise pattern normal (CV={coef_var:.2f})"
+    return score, detail
+def _check_pdf_metadata(path: str) -> tuple:
+    """
+    Checks PDF metadata for common forgery signals:
+    - Creation tool mismatch (e.g. Adobe → LibreOffice date newer than creation)
+    - Missing standard metadata fields
+    - Modification date earlier than creation date
+    Returns (score 0-1, detail string).
+    """
+    try:
+        import PyPDF2
+        with open(path, "rb") as f:
+            reader = PyPDF2.PdfReader(f)
+            meta = reader.metadata or {}
+        signals = []
+        score = 0.0
+        creator = str(meta.get("/Creator", "")).lower()
+        producer = str(meta.get("/Producer", "")).lower()
+        created  = str(meta.get("/CreationDate", ""))
+        modified = str(meta.get("/ModDate", ""))
+        # Check 1: creator and producer mismatch (strong forgery signal)
+        if creator and producer:
+            known_suites = [
+                ("microsoft", "libreoffice"), ("libreoffice", "adobe"),
+                ("adobe", "libreoffice"), ("word", "ghostscript"),
+            ]
+            for c, p in known_suites:
+                if c in creator and p in producer:
+                    signals.append(f"Creator/producer mismatch: '{creator}' vs '{producer}'")
+                    score += 0.4
+                    break
+        # Check 2: modification predates creation
+        if created and modified and len(created) > 4 and len(modified) > 4:
+            try:
+                c_year = int(re.search(r"D:(\d{4})", created).group(1))
+                m_year = int(re.search(r"D:(\d{4})", modified).group(1))
+                if m_year < c_year:
+                    signals.append(f"ModDate ({m_year}) predates CreationDate ({c_year})")
+                    score += 0.35
+            except Exception:
+                pass
+        # Check 3: no standard metadata at all
+        if not creator and not producer:
+            signals.append("No creator/producer metadata — stripped or generated programmatically")
+            score += 0.2
+        score = min(1.0, score)
+        detail = "; ".join(signals) if signals else "PDF metadata appears normal"
+        return score, detail
+    except Exception as e:
+        return 0.0, f"PDF metadata check skipped: {e}"
+def _check_text_consistency(pil_img: Image.Image) -> tuple:
+    """
+    Uses pytesseract OCR to detect font size/style inconsistencies
+    within text regions. Genuine documents have consistent baseline
+    spacing; forged insertions often deviate.
+    Returns (score 0-1, detail string).
+    """
+    try:
+        import pytesseract
+        data = pytesseract.image_to_data(
+            pil_img, output_type=pytesseract.Output.DICT
+        )
+        heights = [
+            h for h, conf in zip(data["height"], data["conf"])
+            if conf > 60 and h > 5
+        ]
+        if len(heights) < 5:
+            return 0.0, "Insufficient text regions for OCR analysis"
+        heights = np.array(heights, dtype=float)
+        cv = np.std(heights) / (np.mean(heights) + 1e-8)
+        if cv > 0.6:
+            score = min(1.0, (cv - 0.6) / 0.6)
+            detail = f"High font size variance (CV={cv:.2f}) — inconsistent text insertion likely"
+        elif cv > 0.35:
+            score = (cv - 0.35) / 0.25 * 0.4
+            detail = f"Moderate font inconsistency (CV={cv:.2f})"
+        else:
+            score = 0.0
+            detail = f"Text layout appears consistent (CV={cv:.2f})"
+        return score, detail
+    except Exception as e:
+        return 0.0, f"OCR check skipped ({e})"
def _render_pdf_page(path: str) -> Image.Image:
    """
    Render the first page of a PDF as an RGB PIL Image.

    Uses PyMuPDF (``fitz``) at 2x scale. If PyMuPDF is not installed, the
    path is opened directly with PIL instead.

    Args:
        path: Filesystem path of the PDF.

    Returns:
        The rendered first page as an RGB ``Image.Image``.

    Raises:
        Whatever PyMuPDF or PIL raise for unreadable or empty documents;
        nothing is swallowed here.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        # Fallback if PyMuPDF not installed — load as image directly.
        # NOTE(review): Pillow does not appear to ship a PDF decoder, so
        # this is expected to raise for real PDFs — confirm intended.
        return Image.open(path).convert("RGB")

    doc = fitz.open(path)
    try:
        page = doc[0]
        mat = fitz.Matrix(2, 2)   # 2x scale for better OCR
        pix = page.get_pixmap(matrix=mat)
        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # Always release the document, even when rendering fails.
        doc.close()
+def _threat_level(score: float):
+    if score < 0.20:
+        return "NONE", " ALLOW"
+    elif score < 0.45:
+        return "LOW", " LOG"
+    elif score < 0.70:
+        return "MEDIUM", "ALERT"
+    else:
+        return "HIGH", " BLOCK"
def detect_document(file_path: str) -> str:
    """
    Main entry point. Accepts image files (JPG/PNG) or PDF.

    A path ending in ``.pdf`` (case-insensitive) has its first page rendered
    and its metadata checked; any other path is opened as an image. The
    visual-noise and OCR checks run on the resulting image, and the scores
    are fused with fixed weights (35/35/30 for PDFs, 55/45 for images).
    A fused score of 0.40 or more is reported as FORGED.

    Args:
        file_path: Path of the document to analyse.

    Returns:
        A formatted forensic analysis report string. On any failure the
        string `` Document analysis error: <error>`` is returned instead;
        this function does not raise.
    """
    try:
        ext = os.path.splitext(file_path)[1].lower()
        is_pdf = ext == ".pdf"
        meta_score, meta_detail = 0.0, "N/A (not a PDF)"
        if is_pdf:
            pil_img = _render_pdf_page(file_path)
            meta_score, meta_detail = _check_pdf_metadata(file_path)
        else:
            # convert() returns an independent copy, so the source image (and
            # its underlying file handle) can be released straight away.
            with Image.open(file_path) as source_img:
                pil_img = source_img.convert("RGB")

        noise_score,  noise_detail  = _check_visual_noise(pil_img)
        text_score,   text_detail   = _check_text_consistency(pil_img)

        if is_pdf:
            # All three checks relevant for PDFs
            fused = (noise_score * 0.35 + text_score * 0.35 + meta_score * 0.30)
        else:
            # Only visual checks for images (no metadata)
            fused = (noise_score * 0.55 + text_score * 0.45)

        prediction = "FORGED" if fused >= 0.40 else "AUTHENTIC"
        threat, action = _threat_level(fused)

        # 20-cell text gauge of the fused score.
        filled = int(fused * 20)
        bars = "█" * filled + "░" * (20 - filled)

        report = f"""
 DOCUMENT FORENSIC REPORT
{"="*40}
Verdict    : {' FORGED' if prediction == 'FORGED' else 'AUTHENTIC'}
Risk Score : {fused:.2%}  [{bars}]
Threat     : {threat}
Action     : {action}
{"─"*40}
FORENSIC CHECKS
{"─"*40}
   Visual Noise Analysis
   Score  : {noise_score:.2%}
   Detail : {noise_detail}
   Text/Font Consistency (OCR)
   Score  : {text_score:.2%}
   Detail : {text_detail}
   PDF Metadata Integrity
   Score  : {meta_score:.2%}
   Detail : {meta_detail}
{"─"*40}
{'  FORGERY INDICATORS DETECTED. Recommend human review.' if prediction == 'FORGED' else '✅  No significant forgery indicators found.'}
"""
        return report.strip()
    except Exception as e:
        # Top-level boundary: surface the failure as text for the caller/UI.
        return f" Document analysis error: {str(e)}"