import os
import re

import numpy as np
from PIL import Image


def _as_float(value, default: float = -1.0) -> float:
    """Convert *value* to float, returning *default* when it is not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _check_visual_noise(pil_img: Image.Image) -> tuple:
    """
    Detect copy-paste artefacts via local noise variance analysis.

    The image is converted to greyscale and split into non-overlapping
    32x32 blocks; the coefficient of variation (CV) of the per-block
    variances is mapped to a suspicion score. Forged regions often have
    suspiciously low or mismatched noise variance compared to genuine
    document background.

    Returns (score 0-1, detail string).
    """
    img_np = np.array(pil_img.convert("L"), dtype=np.float32)
    h, w = img_np.shape
    block = 32

    # "+ 1" so the last full block is included when a dimension is an exact
    # multiple of the block size (a 32x32 image yields exactly one block).
    variances = [
        np.var(img_np[y:y + block, x:x + block])
        for y in range(0, h - block + 1, block)
        for x in range(0, w - block + 1, block)
    ]

    if not variances:
        return 0.0, "Could not analyse noise (image too small)"

    variances = np.array(variances)

    # High coefficient of variation -> suspicious variance jumps.
    # The epsilon keeps a perfectly flat image from dividing by zero.
    coef_var = float(np.std(variances) / (np.mean(variances) + 1e-8))

    # Thresholds tuned on document images.
    if coef_var > 3.5:
        # Continue from the 0.5 reached at the top of the moderate band so
        # the score never drops when the anomaly gets stronger; saturates
        # at CV = 7.5.
        score = min(1.0, 0.5 + (coef_var - 3.5) / 4.0 * 0.5)
        detail = f"High variance inconsistency (CV={coef_var:.2f}) — possible copy-paste region"
    elif coef_var > 2.0:
        score = (coef_var - 2.0) / 1.5 * 0.5
        detail = f"Moderate variance anomaly (CV={coef_var:.2f})"
    else:
        score = 0.0
        detail = f"Noise pattern normal (CV={coef_var:.2f})"

    return score, detail


def _check_pdf_metadata(path: str) -> tuple:
    """
    Check PDF metadata for common forgery signals.

    Signals (scores are summed and capped at 1.0):
    - Creator/producer tool mismatch from a list of known pairs (+0.40)
    - Modification year earlier than creation year (+0.35)
    - Neither creator nor producer present (+0.20)

    Any failure (missing PyPDF2, unreadable file, ...) is reported in the
    detail string with a score of 0.0 rather than raised.

    Returns (score 0-1, detail string).
    """
    try:
        import PyPDF2

        with open(path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            meta = reader.metadata or {}

            def field(key: str) -> str:
                # A null/absent entry must count as "missing", not as the
                # literal text "None".
                value = meta.get(key)
                return "" if value is None else str(value)

            creator = field("/Creator").lower()
            producer = field("/Producer").lower()
            created = field("/CreationDate")
            modified = field("/ModDate")

        signals = []
        score = 0.0

        # Check 1: creator and producer mismatch (strong forgery signal)
        if creator and producer:
            known_suites = (
                ("microsoft", "libreoffice"),
                ("libreoffice", "adobe"),
                ("adobe", "libreoffice"),
                ("word", "ghostscript"),
            )
            if any(c in creator and p in producer for c, p in known_suites):
                signals.append(f"Creator/producer mismatch: '{creator}' vs '{producer}'")
                score += 0.4

        # Check 2: modification predates creation.
        # Only the year of the "D:YYYY..." stamp is compared; unparsable
        # dates are skipped explicitly instead of swallowing an exception.
        c_match = re.search(r"D:(\d{4})", created)
        m_match = re.search(r"D:(\d{4})", modified)
        if c_match and m_match:
            c_year = int(c_match.group(1))
            m_year = int(m_match.group(1))
            if m_year < c_year:
                signals.append(f"ModDate ({m_year}) predates CreationDate ({c_year})")
                score += 0.35

        # Check 3: no standard metadata at all
        if not creator and not producer:
            signals.append("No creator/producer metadata — stripped or generated programmatically")
            score += 0.2

        score = min(1.0, score)
        detail = "; ".join(signals) if signals else "PDF metadata appears normal"
        return score, detail

    except Exception as e:
        # Deliberate best-effort: a metadata failure must not abort the
        # whole analysis, so it is surfaced in the detail text instead.
        return 0.0, f"PDF metadata check skipped: {e}"


def _check_text_consistency(pil_img: Image.Image) -> tuple:
    """
    Use pytesseract OCR to detect text-height inconsistencies.

    The heights of OCR boxes with confidence above 60 and height above
    5 px are collected; the coefficient of variation of those heights is
    mapped to a suspicion score. At least five boxes are required.

    Any failure (pytesseract missing, OCR error, ...) is reported in the
    detail string with a score of 0.0 rather than raised.

    Returns (score 0-1, detail string).
    """
    try:
        import pytesseract

        data = pytesseract.image_to_data(
            pil_img, output_type=pytesseract.Output.DICT
        )
        # Confidences are coerced to float so that a string-typed value
        # cannot raise TypeError and silently disable the whole check.
        heights = [
            height
            for height, conf in zip(data["height"], data["conf"])
            if _as_float(conf) > 60 and height > 5
        ]
        if len(heights) < 5:
            return 0.0, "Insufficient text regions for OCR analysis"

        heights = np.array(heights, dtype=float)
        cv = float(np.std(heights) / (np.mean(heights) + 1e-8))

        if cv > 0.6:
            # Continue from the 0.4 reached at the top of the moderate band
            # so the score is monotonic in CV; saturates at CV = 1.2.
            score = min(1.0, 0.4 + (cv - 0.6) / 0.6 * 0.6)
            detail = f"High font size variance (CV={cv:.2f}) — inconsistent text insertion likely"
        elif cv > 0.35:
            score = (cv - 0.35) / 0.25 * 0.4
            detail = f"Moderate font inconsistency (CV={cv:.2f})"
        else:
            score = 0.0
            detail = f"Text layout appears consistent (CV={cv:.2f})"

        return score, detail

    except Exception as e:
        # Deliberate best-effort, same policy as the metadata check.
        return 0.0, f"OCR check skipped ({e})"


def _render_pdf_page(path: str) -> Image.Image:
    """
    Render the first page of a PDF as an RGB PIL Image.

    Uses PyMuPDF (``fitz``) at 2x scale. If PyMuPDF is not installed the
    path is opened directly with PIL instead. Errors from rendering or
    from the fallback propagate to the caller.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        # Fallback if PyMuPDF not installed — load as image directly.
        # NOTE(review): PIL is not expected to decode PDFs, so this most
        # likely raises for real PDF input — confirm intended behaviour.
        with Image.open(path) as fallback:
            return fallback.convert("RGB")

    doc = fitz.open(path)
    try:
        page = doc[0]
        mat = fitz.Matrix(2, 2)  # 2x scale for better OCR
        pix = page.get_pixmap(matrix=mat)
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    finally:
        # Release the document even when rendering fails.
        doc.close()


def _threat_level(score: float) -> tuple:
    """
    Map a fused risk score to a (threat label, action label) pair.

    Bands: < 0.20 NONE, < 0.45 LOW, < 0.70 MEDIUM, otherwise HIGH.
    """
    # NOTE(review): the leading spaces in some action labels are kept
    # exactly as found; they look like leftovers of removed icons — confirm.
    if score < 0.20:
        return "NONE", " ALLOW"
    if score < 0.45:
        return "LOW", " LOG"
    if score < 0.70:
        return "MEDIUM", "ALERT"
    return "HIGH", " BLOCK"


def detect_document(file_path: str) -> str:
    """
    Main entry point. Accepts image files (JPG/PNG) or PDF.

    Runs the visual-noise and OCR checks on the image (for PDFs: on the
    rendered first page, plus the metadata check), fuses the scores with
    fixed weights and formats the result.

    Returns a formatted forensic analysis report string. Any exception is
    caught and returned as an error message string instead of being raised.
    """
    try:
        ext = os.path.splitext(file_path)[1].lower()
        is_pdf = ext == ".pdf"

        meta_score, meta_detail = 0.0, "N/A (not a PDF)"

        if is_pdf:
            pil_img = _render_pdf_page(file_path)
            meta_score, meta_detail = _check_pdf_metadata(file_path)
        else:
            # convert() returns an independent, fully loaded image, so the
            # underlying file handle can be released right away.
            with Image.open(file_path) as src:
                pil_img = src.convert("RGB")

        noise_score, noise_detail = _check_visual_noise(pil_img)
        text_score, text_detail = _check_text_consistency(pil_img)

        if is_pdf:
            # All three checks relevant for PDFs
            fused = noise_score * 0.35 + text_score * 0.35 + meta_score * 0.30
        else:
            # Only visual checks for images (no metadata)
            fused = noise_score * 0.55 + text_score * 0.45

        prediction = "FORGED" if fused >= 0.40 else "AUTHENTIC"
        threat, action = _threat_level(fused)

        filled = int(fused * 20)
        bars = "█" * filled + "░" * (20 - filled)

        heavy_rule = "=" * 40
        light_rule = "─" * 40
        if prediction == "FORGED":
            verdict = " FORGED"
            summary = " FORGERY INDICATORS DETECTED. Recommend human review."
        else:
            verdict = "AUTHENTIC"
            summary = "✅ No significant forgery indicators found."

        report_lines = [
            "DOCUMENT FORENSIC REPORT",
            heavy_rule,
            f"Verdict : {verdict}",
            f"Risk Score : {fused:.2%} [{bars}]",
            f"Threat : {threat}",
            f"Action : {action}",
            light_rule,
            "FORENSIC CHECKS",
            light_rule,
            "Visual Noise Analysis",
            f"Score : {noise_score:.2%}",
            f"Detail : {noise_detail}",
            "Text/Font Consistency (OCR)",
            f"Score : {text_score:.2%}",
            f"Detail : {text_detail}",
            "PDF Metadata Integrity",
            f"Score : {meta_score:.2%}",
            f"Detail : {meta_detail}",
            light_rule,
            summary,
        ]
        return "\n".join(report_lines).strip()

    except Exception as e:
        # Top-level boundary: callers always get a string back.
        return f" Document analysis error: {str(e)}"