deepfake_gaurd / prog /document.py
Simma7's picture
Create document.py
acf615d verified
import os
import re
import numpy as np
from PIL import Image
def _check_visual_noise(pil_img: Image.Image, block: int = 32) -> tuple:
    """
    Score how unevenly local pixel variance is spread across an image.

    The image is converted to greyscale and cut into non-overlapping
    ``block`` x ``block`` tiles; partial tiles on the right and bottom
    edges are ignored. The variance of every tile is computed, and the
    coefficient of variation (std / mean) of those variances is mapped to
    a score: a large spread between tiles is treated as a hint that a
    region was pasted in from elsewhere.

    Args:
        pil_img: Source image. Only ``convert("L")`` is called on it.
        block: Tile edge length in pixels. Must be positive.

    Returns:
        ``(score, detail)`` where ``score`` is a float in [0, 1] and
        ``detail`` is a human-readable explanation.

    Raises:
        ValueError: If ``block`` is not positive.
    """
    if block <= 0:
        raise ValueError(f"block must be positive, got {block}")

    gray = np.array(pil_img.convert("L"), dtype=np.float32)
    height, width = gray.shape

    # Floor division counts every full tile, including one that ends exactly
    # on the image edge (an image of exactly block x block has one tile).
    rows, cols = height // block, width // block
    if rows == 0 or cols == 0:
        return 0.0, "Could not analyse noise (image too small)"

    # Crop to a whole number of tiles, then view the image as a
    # (rows, block, cols, block) grid so each tile's variance is one reduction.
    tiles = gray[: rows * block, : cols * block].reshape(rows, block, cols, block)
    variances = tiles.var(axis=(1, 3), dtype=np.float64)

    # High coefficient of variation -> tile variances jump around a lot.
    # The epsilon keeps a perfectly flat image (mean variance 0) from dividing by zero.
    coef_var = float(variances.std() / (variances.mean() + 1e-8))

    # Thresholds (2.0 / 3.5) are carried over unchanged from the original heuristic.
    if coef_var > 3.5:
        score = min(1.0, (coef_var - 3.5) / 4.0)
        detail = (
            f"High variance inconsistency (CV={coef_var:.2f}) "
            "— possible copy-paste region"
        )
    elif coef_var > 2.0:
        # Ramps linearly from 0 at CV=2.0 up to 0.5 at CV=3.5.
        score = (coef_var - 2.0) / 1.5 * 0.5
        detail = f"Moderate variance anomaly (CV={coef_var:.2f})"
    else:
        score = 0.0
        detail = f"Noise pattern normal (CV={coef_var:.2f})"
    return score, detail
# Creator/producer substring pairs that the heuristic treats as a sign the
# file was re-exported by a different tool suite than the one that authored it.
_SUSPICIOUS_TOOL_PAIRS = (
    ("microsoft", "libreoffice"),
    ("libreoffice", "adobe"),
    ("adobe", "libreoffice"),
    ("word", "ghostscript"),
)

# PDF date string: D:YYYY[MM[DD[HH[mm[SS]]]]] optionally followed by a UTC
# offset written as +HH'mm' / -HH'mm' (apostrophes optional here) or "Z".
_PDF_DATE_RE = re.compile(
    r"D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?"
    r"(?:([+-])(\d{2})'?(\d{2})?|([Zz]))?"
)


def _parse_pdf_date(raw: str):
    """
    Parse a PDF date string into ``(local_datetime, utc_offset)``.

    ``utc_offset`` is a ``timedelta`` when the string carries an offset or
    "Z", otherwise ``None``. Returns ``None`` when no ``D:YYYY`` prefix is
    found or the fields do not form a valid calendar date.
    """
    from datetime import datetime, timedelta

    match = _PDF_DATE_RE.search(raw)
    if match is None:
        return None

    # Omitted trailing fields fall back to month 1, day 1, 00:00:00.
    defaults = (1, 1, 1, 0, 0, 0)
    parts = [
        int(text) if text else default
        for text, default in zip(match.groups()[:6], defaults)
    ]
    try:
        local = datetime(*parts)
    except ValueError:
        # e.g. month 13, day 32 or year 0000
        return None

    sign, tz_hours, tz_minutes, zulu = match.group(7, 8, 9, 10)
    if zulu:
        return local, timedelta(0)
    if sign:
        offset = timedelta(hours=int(tz_hours), minutes=int(tz_minutes or 0))
        return local, (-offset if sign == "-" else offset)
    return local, None


def _moddate_precedes_creation(created: str, modified: str) -> bool:
    """Return True when ``modified`` parses to an earlier instant than ``created``."""
    from datetime import timedelta

    created_parsed = _parse_pdf_date(created)
    modified_parsed = _parse_pdf_date(modified)
    if created_parsed is None or modified_parsed is None:
        # Missing or unparseable dates are not evidence either way.
        return False

    (c_local, c_offset), (m_local, m_offset) = created_parsed, modified_parsed
    # With utc = local - offset:  m_utc < c_utc  <=>  m_local - c_local < m_offset - c_offset.
    # Offsets are only applied when both dates declare one; otherwise the
    # wall-clock values are compared as written.
    if c_offset is not None and m_offset is not None:
        offset_gap = m_offset - c_offset
    else:
        offset_gap = timedelta(0)
    return (m_local - c_local) < offset_gap


def _check_pdf_metadata(path: str) -> tuple:
    """
    Score a PDF's document-information metadata for forgery signals.

    Signals and their weights (summed, capped at 1.0):
      - +0.40  creator/producer pair matches one of ``_SUSPICIOUS_TOOL_PAIRS``
      - +0.35  ``/ModDate`` is earlier than ``/CreationDate`` (compared down
               to the second, not just the year)
      - +0.20  both ``/Creator`` and ``/Producer`` are empty or missing

    Args:
        path: Filesystem path of the PDF to inspect.

    Returns:
        ``(score, detail)`` where ``score`` is a float in [0, 1] and
        ``detail`` lists the triggered signals separated by "; ". If the
        file cannot be opened or read with PyPDF2 (including PyPDF2 not
        being installed), returns ``(0.0, "PDF metadata check skipped: ...")``.
    """
    try:
        import PyPDF2

        with open(path, "rb") as f:
            meta = PyPDF2.PdfReader(f).metadata or {}
            # Pull the fields out while the file is still open, in case the
            # reader resolves values lazily. `or ""` maps a missing/None
            # entry to "" instead of the truthy string "None".
            creator, producer, created, modified = (
                str(meta.get(key) or "")
                for key in ("/Creator", "/Producer", "/CreationDate", "/ModDate")
            )
    except Exception as e:
        # Best-effort check: any read/parse failure downgrades to "skipped"
        # so the caller can still report the other checks.
        return 0.0, f"PDF metadata check skipped: {e}"

    creator, producer = creator.lower(), producer.lower()
    signals = []
    score = 0.0

    # Check 1: creator and producer come from mismatched tool suites.
    if any(c in creator and p in producer for c, p in _SUSPICIOUS_TOOL_PAIRS):
        signals.append(f"Creator/producer mismatch: '{creator}' vs '{producer}'")
        score += 0.4

    # Check 2: modification predates creation.
    if _moddate_precedes_creation(created, modified):
        signals.append(f"ModDate ({modified}) predates CreationDate ({created})")
        score += 0.35

    # Check 3: no standard tool metadata at all.
    if not creator and not producer:
        signals.append(
            "No creator/producer metadata — stripped or generated programmatically"
        )
        score += 0.2

    detail = "; ".join(signals) if signals else "PDF metadata appears normal"
    return min(1.0, score), detail
def _check_text_consistency(pil_img: Image.Image) -> tuple:
    """
    Score how much OCR word-box heights vary across the image.

    Runs ``pytesseract.image_to_data`` on the image, keeps the boxes whose
    confidence is above 60 and whose height is above 5 pixels, and maps the
    coefficient of variation (std / mean) of those heights to a score. A
    large spread in box heights is treated as a hint of inserted text.

    Args:
        pil_img: Image passed straight to pytesseract.

    Returns:
        ``(score, detail)`` where ``score`` is a float in [0, 1] and
        ``detail`` is a human-readable explanation. If OCR cannot run
        (pytesseract missing, OCR error, ...), returns
        ``(0.0, "OCR check skipped (...)")``. If fewer than five boxes
        qualify, returns ``(0.0, "Insufficient text regions for OCR analysis")``.
    """
    try:
        import pytesseract

        data = pytesseract.image_to_data(
            pil_img, output_type=pytesseract.Output.DICT
        )
        heights = []
        for raw_height, raw_conf in zip(data["height"], data["conf"]):
            # Depending on the pytesseract/Tesseract version, values can
            # arrive as int, float or str (e.g. "-1", "96.06"); coerce
            # before comparing so a str never hits `>` against a number.
            try:
                height = float(raw_height)
                confidence = float(raw_conf)
            except (TypeError, ValueError):
                continue
            if confidence > 60 and height > 5:
                heights.append(height)
    except Exception as e:
        # Best-effort check: OCR being unavailable must not abort the report.
        return 0.0, f"OCR check skipped ({e})"

    if len(heights) < 5:
        return 0.0, "Insufficient text regions for OCR analysis"

    heights = np.array(heights, dtype=float)
    # Epsilon guards the division; heights are > 5 here so the mean is never 0.
    cv = float(np.std(heights) / (np.mean(heights) + 1e-8))

    if cv > 0.6:
        score = min(1.0, (cv - 0.6) / 0.6)
        detail = (
            f"High font size variance (CV={cv:.2f}) "
            "— inconsistent text insertion likely"
        )
    elif cv > 0.35:
        # Ramps linearly from 0 at CV=0.35 up to 0.4 at CV=0.6.
        score = (cv - 0.35) / 0.25 * 0.4
        detail = f"Moderate font inconsistency (CV={cv:.2f})"
    else:
        score = 0.0
        detail = f"Text layout appears consistent (CV={cv:.2f})"
    return score, detail
def _render_pdf_page(path: str) -> Image.Image:
    """
    Render the first page of a PDF as an RGB PIL image.

    Uses PyMuPDF (``fitz``) at 2x scale. If PyMuPDF is not installed, falls
    back to opening ``path`` directly with PIL.

    Args:
        path: Filesystem path of the PDF.

    Returns:
        The rendered first page as an RGB ``Image``.

    Raises:
        Whatever PyMuPDF or PIL raise for an unreadable file or an
        out-of-range page index; nothing is swallowed here.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        # Fallback if PyMuPDF is not installed: hand the path to PIL as-is.
        # NOTE(review): this only succeeds if PIL can decode the file; for a
        # real PDF it will most likely raise — confirm intended behaviour.
        with Image.open(path) as opened:
            return opened.convert("RGB")

    doc = fitz.open(path)
    try:
        page = doc[0]
        # 2x scale for better OCR downstream.
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    finally:
        # Close the document even when page access or rendering raises.
        doc.close()
def _threat_level(score: float):
    """
    Map a risk score to a ``(threat, action)`` label pair.

    Bands use exclusive upper bounds: below 0.20, below 0.45, below 0.70,
    and everything else (including values that compare False against every
    bound) falls into the highest band.
    """
    # (exclusive upper bound, labels), checked in ascending order.
    bands = (
        (0.20, ("NONE", " ALLOW")),
        (0.45, ("LOW", " LOG")),
        (0.70, ("MEDIUM", "ALERT")),
    )
    for upper_bound, labels in bands:
        if score < upper_bound:
            return labels
    return "HIGH", " BLOCK"
def detect_document(file_path: str) -> str:
    """
    Analyse a document file and return a formatted forensic report.

    Files whose extension is ``.pdf`` (case-insensitive) are rendered via
    ``_render_pdf_page`` and additionally scored by ``_check_pdf_metadata``;
    every other path is opened with PIL as an image. The visual-noise and
    OCR text checks run on the resulting image, and the individual scores
    are fused with fixed weights:

      - PDF:   0.35 * noise + 0.35 * text + 0.30 * metadata
      - image: 0.55 * noise + 0.45 * text

    A fused score of 0.40 or more yields the verdict "FORGED".

    Args:
        file_path: Path of the image or PDF to analyse.

    Returns:
        The multi-line report. Any exception raised during analysis is
        caught and returned as `` Document analysis error: <message>``
        instead of propagating.
    """
    try:
        is_pdf = os.path.splitext(file_path)[1].lower() == ".pdf"

        meta_score, meta_detail = 0.0, "N/A (not a PDF)"
        if is_pdf:
            pil_img = _render_pdf_page(file_path)
            meta_score, meta_detail = _check_pdf_metadata(file_path)
        else:
            # convert() returns an independent image, so the file handle can
            # be released as soon as the conversion is done.
            with Image.open(file_path) as opened:
                pil_img = opened.convert("RGB")

        noise_score, noise_detail = _check_visual_noise(pil_img)
        text_score, text_detail = _check_text_consistency(pil_img)

        if is_pdf:
            # All three checks are relevant for PDFs.
            fused = noise_score * 0.35 + text_score * 0.35 + meta_score * 0.30
        else:
            # Only visual checks for images (no metadata).
            fused = noise_score * 0.55 + text_score * 0.45

        # NOTE(review): the verdict flips at 0.40 while _threat_level only
        # leaves the "LOW" band at 0.45, so 0.40 <= fused < 0.45 reports
        # FORGED with a LOW threat. Left as-is; confirm which is intended.
        is_forged = fused >= 0.40
        threat, action = _threat_level(fused)

        # 20-cell bar: one filled cell per 5% of risk.
        filled = int(fused * 20)
        bars = "█" * filled + "░" * (20 - filled)

        heavy_rule = "=" * 40
        light_rule = "─" * 40
        if is_forged:
            verdict = " FORGED"
            summary = " FORGERY INDICATORS DETECTED. Recommend human review."
        else:
            verdict = "AUTHENTIC"
            summary = "✅ No significant forgery indicators found."

        lines = [
            "DOCUMENT FORENSIC REPORT",
            heavy_rule,
            f"Verdict : {verdict}",
            f"Risk Score : {fused:.2%} [{bars}]",
            f"Threat : {threat}",
            f"Action : {action}",
            light_rule,
            "FORENSIC CHECKS",
            light_rule,
            "Visual Noise Analysis",
            f"Score : {noise_score:.2%}",
            f"Detail : {noise_detail}",
            "Text/Font Consistency (OCR)",
            f"Score : {text_score:.2%}",
            f"Detail : {text_detail}",
            "PDF Metadata Integrity",
            f"Score : {meta_score:.2%}",
            f"Detail : {meta_detail}",
            light_rule,
            summary,
        ]
        return "\n".join(lines).strip()
    except Exception as e:
        # Top-level boundary: callers always get a string back.
        return f" Document analysis error: {str(e)}"