Spaces:
Sleeping
Sleeping
Create document.py
Browse files- prog/document.py +234 -0
prog/document.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
import numpy as np
|
| 5 |
+
from PIL import Image
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _check_visual_noise(pil_img: Image.Image, block: int = 32) -> tuple:
    """
    Score copy-paste artefacts from the spread of per-tile pixel variance.

    The image is converted to greyscale and cut into non-overlapping
    ``block`` x ``block`` tiles. The variance of every tile is computed and
    the coefficient of variation (std / mean) of those variances is mapped
    to a score: a high spread means some regions are much smoother or much
    noisier than the rest of the page.

    Args:
        pil_img: Image to analyse; only ``convert("L")`` is called on it.
        block: Tile edge length in pixels (default 32, the original value).

    Returns:
        ``(score, detail)`` where ``score`` is a float in ``[0, 1]`` and
        ``detail`` is a human-readable explanation.

    Raises:
        ValueError: If ``block`` is not a positive integer.
    """
    if block <= 0:
        raise ValueError("block must be a positive integer")

    img_np = np.array(pil_img.convert("L"), dtype=np.float32)
    height, width = img_np.shape

    # "+ 1" keeps the last full tile when a dimension is an exact multiple
    # of the tile size (a 32x32 image yields one tile rather than none).
    variances = np.array([
        np.var(img_np[top:top + block, left:left + block])
        for top in range(0, height - block + 1, block)
        for left in range(0, width - block + 1, block)
    ])

    if variances.size == 0:
        return 0.0, "Could not analyse noise (image too small)"

    # Coefficient of variation of the tile variances; the epsilon avoids a
    # division by zero on perfectly flat images.
    coef_var = float(np.std(variances) / (np.mean(variances) + 1e-8))

    # Thresholds (2.0 / 3.5) are the original author's, described as tuned
    # on document images. The high band continues from the moderate band's
    # ceiling (0.5) so the score never drops as the CV increases.
    if coef_var > 3.5:
        score = min(1.0, 0.5 + (coef_var - 3.5) / 4.0 * 0.5)
        detail = f"High variance inconsistency (CV={coef_var:.2f}) — possible copy-paste region"
    elif coef_var > 2.0:
        score = (coef_var - 2.0) / 1.5 * 0.5
        detail = f"Moderate variance anomaly (CV={coef_var:.2f})"
    else:
        score = 0.0
        detail = f"Noise pattern normal (CV={coef_var:.2f})"

    return score, detail
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def _check_pdf_metadata(path: str) -> tuple:
    """
    Check PDF document-information metadata for common forgery signals.

    Signals and their weights:
      - Creator/producer pair from a list of suspicious tool combinations
        (e.g. created in Microsoft Word, produced by LibreOffice): +0.40
      - Modification timestamp earlier than the creation timestamp: +0.35
      - Neither creator nor producer present: +0.20

    Args:
        path: Filesystem path of the PDF to inspect.

    Returns:
        ``(score, detail)`` where ``score`` is a float in ``[0, 1]`` and
        ``detail`` lists the triggered signals. Any failure (PyPDF2 missing,
        unreadable file, parse error) yields ``(0.0, "PDF metadata check
        skipped: ...")`` instead of raising.
    """
    try:
        import PyPDF2

        with open(path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            meta = reader.metadata or {}
            # Pull the fields while the file is still open, in case the
            # reader resolves values lazily. "or ''" also maps an explicit
            # null value to an empty string instead of the text "None".
            creator = str(meta.get("/Creator") or "").lower()
            producer = str(meta.get("/Producer") or "").lower()
            created = str(meta.get("/CreationDate") or "")
            modified = str(meta.get("/ModDate") or "")

        signals = []
        score = 0.0

        # Check 1: creator and producer come from mismatched tool suites.
        suspicious_pairs = (
            ("microsoft", "libreoffice"), ("libreoffice", "adobe"),
            ("adobe", "libreoffice"), ("word", "ghostscript"),
        )
        if any(c in creator and p in producer for c, p in suspicious_pairs):
            signals.append(f"Creator/producer mismatch: '{creator}' vs '{producer}'")
            score += 0.4

        # Check 2: modification predates creation. Full timestamps are
        # compared (not just the year) so same-year backdating is caught.
        created_at = _parse_pdf_date(created)
        modified_at = _parse_pdf_date(modified)
        if (
            created_at is not None
            and modified_at is not None
            and modified_at < created_at
        ):
            signals.append(f"ModDate ({modified}) predates CreationDate ({created})")
            score += 0.35

        # Check 3: no standard metadata at all.
        if not creator and not producer:
            signals.append("No creator/producer metadata — stripped or generated programmatically")
            score += 0.2

        score = min(1.0, score)
        detail = "; ".join(signals) if signals else "PDF metadata appears normal"
        return score, detail

    except Exception as e:
        # Best-effort check: never let a metadata problem abort the analysis.
        return 0.0, f"PDF metadata check skipped: {e}"


def _parse_pdf_date(raw: str):
    """Parse a PDF date string (``D:YYYYMMDDHHmmSSOHH'mm'``) to naive UTC.

    Every field after the year is optional; missing fields default to the
    first month/day and to zero for time components. A missing or ``Z``
    offset is treated as UTC. Returns ``None`` when ``raw`` holds no
    ``D:YYYY`` prefix or the fields do not form a valid date.
    """
    from datetime import datetime, timedelta

    match = re.search(
        r"D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?"
        r"(?:([+\-Zz])(\d{2})?'?(\d{2})?)?",
        raw,
    )
    if match is None:
        return None

    year, month, day, hour, minute, second, sign, off_h, off_m = match.groups()
    try:
        local = datetime(
            int(year), int(month or 1), int(day or 1),
            int(hour or 0), int(minute or 0), int(second or 0),
        )
        offset = timedelta(hours=int(off_h or 0), minutes=int(off_m or 0))
        # Local time = UTC + offset, so subtract a "+" offset to reach UTC.
        if sign == "+":
            return local - offset
        if sign == "-":
            return local + offset
        return local
    except (ValueError, OverflowError):
        return None
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def _check_text_consistency(pil_img: Image.Image) -> tuple:
    """
    Score font-size inconsistency from OCR bounding-box heights.

    Runs ``pytesseract.image_to_data`` on the image, keeps the box heights
    of entries recognised with confidence above 60 and taller than 5 px,
    and maps the coefficient of variation (std / mean) of those heights to
    a score. A large spread suggests text of differing sizes on the page.

    Args:
        pil_img: Image passed straight to pytesseract.

    Returns:
        ``(score, detail)`` where ``score`` is a float in ``[0, 1]``. Fewer
        than five usable boxes, or any failure (pytesseract missing, OCR
        error), yields a score of ``0.0`` with an explanatory detail string
        instead of raising.
    """
    try:
        import pytesseract

        data = pytesseract.image_to_data(
            pil_img, output_type=pytesseract.Output.DICT
        )

        heights = []
        for box_height, raw_conf in zip(data["height"], data["conf"]):
            # Confidence may arrive as int, float or str depending on the
            # pytesseract version; comparing a str with 60 would raise and
            # silently disable the whole check, so normalise first.
            try:
                confidence = float(raw_conf)
            except (TypeError, ValueError):
                continue
            if confidence > 60 and box_height > 5:
                heights.append(box_height)

        if len(heights) < 5:
            return 0.0, "Insufficient text regions for OCR analysis"

        heights = np.array(heights, dtype=float)
        cv = float(np.std(heights) / (np.mean(heights) + 1e-8))

        # The high band continues from the moderate band's ceiling (0.4) so
        # the score never drops as the CV increases.
        if cv > 0.6:
            score = min(1.0, 0.4 + (cv - 0.6) / 0.6 * 0.6)
            detail = f"High font size variance (CV={cv:.2f}) — inconsistent text insertion likely"
        elif cv > 0.35:
            score = (cv - 0.35) / 0.25 * 0.4
            detail = f"Moderate font inconsistency (CV={cv:.2f})"
        else:
            score = 0.0
            detail = f"Text layout appears consistent (CV={cv:.2f})"

        return score, detail

    except Exception as e:
        # Best-effort check: OCR being unavailable must not abort analysis.
        return 0.0, f"OCR check skipped ({e})"
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def _render_pdf_page(path: str) -> Image.Image:
    """Render the first page of a PDF as an RGB PIL image.

    Uses PyMuPDF (``fitz``) at 2x scale when it is installed. When it is
    not, falls back to opening ``path`` directly with Pillow.

    Args:
        path: Filesystem path of the PDF.

    Returns:
        The rendered first page as an RGB ``Image``.

    Raises:
        Whatever PyMuPDF or Pillow raise for an unreadable or empty file;
        errors are not translated here.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        # Fallback if PyMuPDF is not installed: load as an image directly.
        # NOTE(review): Pillow is not expected to decode PDF input, so this
        # path will most likely raise for real PDFs — confirm intent.
        with Image.open(path) as fallback:
            return fallback.convert("RGB")

    doc = fitz.open(path)
    try:
        page = doc[0]
        zoom = fitz.Matrix(2, 2)  # 2x scale for better OCR
        # alpha=False guarantees 3 bytes per pixel, matching mode "RGB".
        pix = page.get_pixmap(matrix=zoom, alpha=False)
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    finally:
        # Release the document even when rendering fails.
        doc.close()
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def _threat_level(score: float) -> tuple:
    """Map a fused risk score to a ``(threat label, action)`` pair.

    Bands (lower bound inclusive, upper bound exclusive):
      - below 0.20      -> ("NONE", " ALLOW")
      - 0.20 up to 0.45 -> ("LOW", " LOG")
      - 0.45 up to 0.70 -> ("MEDIUM", "ALERT")
      - 0.70 and above  -> ("HIGH", " BLOCK")

    A NaN score fails every comparison and therefore lands in the last
    band. The label strings are kept exactly as found in the source.
    """
    if score < 0.20:
        return "NONE", " ALLOW"
    if score < 0.45:
        return "LOW", " LOG"
    if score < 0.70:
        return "MEDIUM", "ALERT"
    return "HIGH", " BLOCK"
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def detect_document(file_path: str) -> str:
    """
    Analyse a document for forgery indicators and return a text report.

    Files whose extension is ``.pdf`` are rendered to an image (first page)
    and additionally get a metadata check; every other file is opened as an
    image. The individual check scores are fused with fixed weights:
    noise 0.35 / text 0.35 / metadata 0.30 for PDFs, and noise 0.55 /
    text 0.45 for images. A fused score of 0.40 or more is reported as
    forged.

    Args:
        file_path: Path to an image file or a PDF.

    Returns:
        The formatted forensic report, or a one-line
        ``" Document analysis error: ..."`` string if anything raised.
        This function itself never raises.
    """
    try:
        is_pdf = os.path.splitext(file_path)[1].lower() == ".pdf"
        meta_score, meta_detail = 0.0, "N/A (not a PDF)"

        if is_pdf:
            pil_img = _render_pdf_page(file_path)
            meta_score, meta_detail = _check_pdf_metadata(file_path)
        else:
            # convert() returns a new, fully loaded image, so the file
            # handle can be closed as soon as the block exits.
            with Image.open(file_path) as opened:
                pil_img = opened.convert("RGB")

        noise_score, noise_detail = _check_visual_noise(pil_img)
        text_score, text_detail = _check_text_consistency(pil_img)

        if is_pdf:
            # All three checks are relevant for PDFs.
            fused = noise_score * 0.35 + text_score * 0.35 + meta_score * 0.30
        else:
            # Only the visual checks apply to images (no metadata).
            fused = noise_score * 0.55 + text_score * 0.45

        is_forged = fused >= 0.40
        # NOTE(review): 0.40 <= fused < 0.45 yields a FORGED verdict together
        # with a LOW threat level; the thresholds are left as found.
        threat, action = _threat_level(fused)

        # 20-cell progress bar; distinct glyphs for filled and empty cells,
        # clamped so an out-of-range score cannot distort the bar length.
        filled = min(20, max(0, int(fused * 20)))
        bars = "█" * filled + "░" * (20 - filled)

        rule = "─" * 40
        if is_forged:
            verdict = " FORGED"
            summary = " FORGERY INDICATORS DETECTED. Recommend human review."
        else:
            verdict = "AUTHENTIC"
            summary = "✅ No significant forgery indicators found."

        report_lines = [
            "DOCUMENT FORENSIC REPORT",
            "=" * 40,
            "",
            f"Verdict : {verdict}",
            f"Risk Score : {fused:.2%} [{bars}]",
            f"Threat : {threat}",
            f"Action : {action}",
            "",
            rule,
            "FORENSIC CHECKS",
            rule,
            "",
            "Visual Noise Analysis",
            f"Score : {noise_score:.2%}",
            f"Detail : {noise_detail}",
            "",
            "Text/Font Consistency (OCR)",
            f"Score : {text_score:.2%}",
            f"Detail : {text_detail}",
            "",
            "PDF Metadata Integrity",
            f"Score : {meta_score:.2%}",
            f"Detail : {meta_detail}",
            "",
            rule,
            summary,
        ]
        return "\n".join(report_lines).strip()

    except Exception as e:
        # Top-level boundary: callers always get a string back.
        return f" Document analysis error: {str(e)}"
|