Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import numpy as np | |
| from PIL import Image | |
def _check_visual_noise(pil_img: "Image.Image") -> tuple:
    """
    Detect copy-paste artefacts via local noise-variance analysis.

    The image is converted to greyscale and tiled into non-overlapping
    32x32 blocks (partial blocks at the right/bottom edges are ignored).
    The pixel variance of every block is computed, and the coefficient of
    variation (std / mean) of those block variances is mapped to a score.
    The rationale is that forged regions often have suspiciously low or
    mismatched noise variance compared to genuine document background.

    Args:
        pil_img: Image to analyse; only its ``convert("L")`` method is used.

    Returns:
        ``(score, detail)`` where ``score`` is a float in [0, 1] and
        ``detail`` is a human-readable explanation.
    """
    block = 32
    img_np = np.array(pil_img.convert("L"), dtype=np.float32)
    h, w = img_np.shape

    # Count *every* full block, including the last row/column and the case
    # where the image is exactly one block wide or tall.
    rows, cols = h // block, w // block
    if rows == 0 or cols == 0:
        return 0.0, "Could not analyse noise (image too small)"

    # View the cropped image as (rows, block, cols, block) so the variance of
    # each tile is a single reduction over the two in-tile axes.
    tiles = img_np[: rows * block, : cols * block].reshape(rows, block, cols, block)
    variances = tiles.var(axis=(1, 3), dtype=np.float64).ravel()

    # High coefficient of variation => suspicious variance jumps between tiles.
    # The epsilon keeps a perfectly flat image from dividing by zero.
    coef_var = float(np.std(variances) / (np.mean(variances) + 1e-8))

    # Thresholds tuned on document images.
    if coef_var > 3.5:
        score = min(1.0, (coef_var - 3.5) / 4.0)
        detail = (
            f"High variance inconsistency (CV={coef_var:.2f}) "
            "— possible copy-paste region"
        )
    elif coef_var > 2.0:
        # Scales linearly from 0 to 0.5 across the moderate band.
        score = (coef_var - 2.0) / 1.5 * 0.5
        detail = f"Moderate variance anomaly (CV={coef_var:.2f})"
    else:
        score = 0.0
        detail = f"Noise pattern normal (CV={coef_var:.2f})"
    return float(score), detail
# PDF date strings look like D:YYYYMMDDHHmmSSOHH'mm'; every field after the
# year is optional, and O is '+', '-' or 'Z'.
_PDF_DATE_RE = re.compile(
    r"D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?"
    r"\s*(?:([Zz+-])(\d{2})?'?(\d{2})?)?"
)


def _pdf_date_to_utc(raw: str):
    """Parse a PDF date string into ``(utc_datetime, has_offset)``, or ``None`` if unparseable."""
    from datetime import datetime, timedelta

    match = _PDF_DATE_RE.search(raw)
    if match is None:
        return None
    year, month, day, hour, minute, second, sign, off_h, off_m = match.groups()
    try:
        stamp = datetime(
            int(year), int(month or 1), int(day or 1),
            int(hour or 0), int(minute or 0), int(second or 0),
        )
        if sign in ("+", "-"):
            offset = timedelta(hours=int(off_h or 0), minutes=int(off_m or 0))
            # "+05'30'" means local time is ahead of UTC, so subtract to normalise.
            stamp = stamp - offset if sign == "+" else stamp + offset
    except (ValueError, OverflowError):
        # Out-of-range fields (month 13, year 0, ...) are treated as unparseable.
        return None
    return stamp, sign is not None


def _mod_predates_creation(created: str, modified: str) -> bool:
    """Return True when ``modified`` is reliably earlier than ``created``."""
    from datetime import timedelta

    c_parsed = _pdf_date_to_utc(created)
    m_parsed = _pdf_date_to_utc(modified)
    if c_parsed is None or m_parsed is None:
        return False
    (c_time, c_has_tz), (m_time, m_has_tz) = c_parsed, m_parsed
    # When only one stamp carries a UTC offset the two are not directly
    # comparable; require a gap larger than the widest real-world offset
    # (UTC+14) before calling it suspicious.
    slack = timedelta(0) if c_has_tz == m_has_tz else timedelta(hours=14)
    return c_time - m_time > slack


def _check_pdf_metadata(path: str) -> tuple:
    """
    Check PDF metadata for common forgery signals.

    Signals and their contribution to the score (capped at 1.0):
      - Creator/producer tool mismatch from a fixed list of suite pairs
        (e.g. creator mentions Adobe, producer mentions LibreOffice): +0.40
      - Modification date earlier than creation date: +0.35
      - Neither creator nor producer present: +0.20

    Args:
        path: Filesystem path of the PDF to inspect.

    Returns:
        ``(score, detail)`` where ``score`` is a float in [0, 1] and
        ``detail`` lists the triggered signals. Any failure (PyPDF2 missing,
        unreadable file, malformed PDF) yields ``(0.0, "PDF metadata check
        skipped: ...")`` instead of raising.
    """
    try:
        import PyPDF2

        with open(path, "rb") as f:
            meta = PyPDF2.PdfReader(f).metadata or {}
            # ``or ""`` stops an absent/None entry from becoming the truthy
            # string "None", which would mask the missing-metadata signal.
            creator = str(meta.get("/Creator") or "").lower()
            producer = str(meta.get("/Producer") or "").lower()
            created = str(meta.get("/CreationDate") or "")
            modified = str(meta.get("/ModDate") or "")

        signals = []
        score = 0.0

        # Check 1: creator and producer mismatch (strong forgery signal).
        if creator and producer:
            known_suites = [
                ("microsoft", "libreoffice"), ("libreoffice", "adobe"),
                ("adobe", "libreoffice"), ("word", "ghostscript"),
            ]
            if any(c in creator and p in producer for c, p in known_suites):
                signals.append(f"Creator/producer mismatch: '{creator}' vs '{producer}'")
                score += 0.4

        # Check 2: modification predates creation (full timestamp, not just year).
        if created and modified and _mod_predates_creation(created, modified):
            signals.append(f"ModDate ({modified}) predates CreationDate ({created})")
            score += 0.35

        # Check 3: no standard metadata at all.
        if not creator and not producer:
            signals.append(
                "No creator/producer metadata — stripped or generated programmatically"
            )
            score += 0.2

        score = min(1.0, score)
        detail = "; ".join(signals) if signals else "PDF metadata appears normal"
        return score, detail
    except Exception as e:
        # Deliberately best-effort: a metadata failure must not abort the
        # remaining forensic checks, so report it in the detail string.
        return 0.0, f"PDF metadata check skipped: {e}"
def _check_text_consistency(pil_img: "Image.Image") -> tuple:
    """
    Use pytesseract OCR to look for inconsistent text sizes.

    Word boxes reported by ``pytesseract.image_to_data`` are filtered to
    those with confidence above 60 and height above 5 px; the coefficient
    of variation (std / mean) of the remaining heights is mapped to a score.
    The rationale is that forged insertions often deviate in size from the
    surrounding genuine text.

    Args:
        pil_img: Image handed to pytesseract.

    Returns:
        ``(score, detail)`` where ``score`` is a float in [0, 1]. Any failure
        (pytesseract or the tesseract binary missing, bad input) yields
        ``(0.0, "OCR check skipped (...)")`` instead of raising.
    """
    try:
        import pytesseract

        data = pytesseract.image_to_data(
            pil_img, output_type=pytesseract.Output.DICT
        )

        heights = []
        for raw_height, raw_conf in zip(data["height"], data["conf"]):
            # NOTE(review): some pytesseract releases appear to return
            # confidences as strings (e.g. "-1" for non-text boxes); comparing
            # those with ``> 60`` raises TypeError and would silently disable
            # the whole check, so coerce to numbers first.
            try:
                conf = float(raw_conf)
                height = float(raw_height)
            except (TypeError, ValueError):
                continue
            if conf > 60 and height > 5:
                heights.append(height)

        if len(heights) < 5:
            return 0.0, "Insufficient text regions for OCR analysis"

        heights = np.array(heights, dtype=float)
        cv = float(np.std(heights) / (np.mean(heights) + 1e-8))

        if cv > 0.6:
            score = min(1.0, (cv - 0.6) / 0.6)
            detail = (
                f"High font size variance (CV={cv:.2f}) "
                "— inconsistent text insertion likely"
            )
        elif cv > 0.35:
            # Scales linearly from 0 to 0.4 across the moderate band.
            score = (cv - 0.35) / 0.25 * 0.4
            detail = f"Moderate font inconsistency (CV={cv:.2f})"
        else:
            score = 0.0
            detail = f"Text layout appears consistent (CV={cv:.2f})"
        return float(score), detail
    except Exception as e:
        # Deliberately best-effort: OCR being unavailable must not abort the
        # other forensic checks.
        return 0.0, f"OCR check skipped ({e})"
def _render_pdf_page(path: str) -> Image.Image:
    """
    Render the first page of a PDF as an RGB PIL image.

    Uses PyMuPDF (``fitz``) at 2x scale when it is installed; otherwise the
    file is handed to Pillow directly.

    Args:
        path: Filesystem path of the PDF.

    Returns:
        The rendered first page as a PIL image.

    Raises:
        Whatever PyMuPDF or Pillow raise for unreadable or empty documents;
        the caller is expected to handle those.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        # Fallback if PyMuPDF is not installed: load as an image directly.
        # NOTE(review): Pillow is not expected to decode PDF files, so this
        # path probably raises for real PDFs — confirm the intended fallback.
        with Image.open(path) as src:
            return src.convert("RGB")

    doc = fitz.open(path)
    try:
        page = doc[0]
        mat = fitz.Matrix(2, 2)  # 2x scale for better OCR
        pix = page.get_pixmap(matrix=mat)
        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # Close the document even when rendering fails part-way through.
        doc.close()
def _threat_level(score: float):
    """
    Translate a fused risk score into a ``(threat label, action)`` pair.

    Bands are checked in ascending order with an exclusive upper bound:
    below 0.20, below 0.45, below 0.70; anything else falls in the top band.
    """
    bands = (
        (0.20, ("NONE", " ALLOW")),
        (0.45, ("LOW", " LOG")),
        (0.70, ("MEDIUM", "ALERT")),
    )
    for upper_bound, outcome in bands:
        if score < upper_bound:
            return outcome
    return "HIGH", " BLOCK"
def detect_document(file_path: str) -> str:
    """
    Main entry point: analyse an image file or a PDF for forgery signals.

    A ``.pdf`` extension (case-insensitive) selects the PDF path: the first
    page is rendered and the metadata check runs as well. Any other file is
    opened as an image. The individual check scores are fused with fixed
    weights (PDF: noise 0.35, text 0.35, metadata 0.30; image: noise 0.55,
    text 0.45) and a fused score of 0.40 or more is reported as FORGED.

    Args:
        file_path: Path of the image or PDF to analyse.

    Returns:
        A formatted forensic report. Failures never propagate: they are
        returned as a `` Document analysis error: ...`` string instead.
    """
    try:
        is_pdf = os.path.splitext(file_path)[1].lower() == ".pdf"

        if is_pdf:
            pil_img = _render_pdf_page(file_path)
            meta_score, meta_detail = _check_pdf_metadata(file_path)
        else:
            # The context manager releases the file handle as soon as the
            # converted copy has been made.
            with Image.open(file_path) as src:
                pil_img = src.convert("RGB")
            meta_score, meta_detail = 0.0, "N/A (not a PDF)"

        noise_score, noise_detail = _check_visual_noise(pil_img)
        text_score, text_detail = _check_text_consistency(pil_img)

        if is_pdf:
            # All three checks are relevant for PDFs.
            fused = noise_score * 0.35 + text_score * 0.35 + meta_score * 0.30
        else:
            # Only visual checks for images (no metadata).
            fused = noise_score * 0.55 + text_score * 0.45

        forged = fused >= 0.40
        threat, action = _threat_level(fused)

        # 20-cell gauge; filled and empty cells must be different glyphs or
        # the bar carries no information.
        filled = max(0, min(20, int(fused * 20)))
        bars = "█" * filled + "░" * (20 - filled)
        rule = "─" * 40

        if forged:
            summary = " FORGERY INDICATORS DETECTED. Recommend human review."
        else:
            summary = "✓ No significant forgery indicators found."

        lines = [
            "DOCUMENT FORENSIC REPORT",
            "=" * 40,
            f"Verdict : {' FORGED' if forged else 'AUTHENTIC'}",
            f"Risk Score : {fused:.2%} [{bars}]",
            f"Threat : {threat}",
            f"Action : {action}",
            rule,
            "FORENSIC CHECKS",
            rule,
            "Visual Noise Analysis",
            f"Score : {noise_score:.2%}",
            f"Detail : {noise_detail}",
            "Text/Font Consistency (OCR)",
            f"Score : {text_score:.2%}",
            f"Detail : {text_detail}",
            "PDF Metadata Integrity",
            f"Score : {meta_score:.2%}",
            f"Detail : {meta_detail}",
            rule,
            summary,
        ]
        return "\n".join(lines)
    except Exception as e:
        # Top-level boundary: callers receive a printable message, never a
        # traceback.
        return f" Document analysis error: {e}"