Simma7 committed on
Commit
acf615d
·
verified ·
1 Parent(s): 7c6ca25

Create document.py

Browse files
Files changed (1) hide show
  1. prog/document.py +234 -0
prog/document.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import re
4
+ import numpy as np
5
+ from PIL import Image
6
+
7
+
8
def _check_visual_noise(pil_img: "Image.Image", block_size: int = 32) -> tuple:
    """
    Detect copy-paste artefacts via local variance analysis.

    The image is converted to greyscale, cut into non-overlapping
    ``block_size`` x ``block_size`` tiles, and the variance of every tile is
    computed. The coefficient of variation (std / mean) of those per-tile
    variances is then mapped to a suspicion score.

    Args:
        pil_img: Image to analyse; only ``convert("L")`` is called on it.
        block_size: Edge length of the square tiles, in pixels.

    Returns:
        ``(score, detail)`` where ``score`` is a float in [0, 1] and
        ``detail`` is a human-readable explanation.

    Raises:
        ValueError: If ``block_size`` is not a positive integer.
    """
    if block_size < 1:
        raise ValueError("block_size must be a positive integer")

    gray = np.asarray(pil_img.convert("L"), dtype=np.float32)
    height, width = gray.shape

    # "+ 1" keeps the last full tile: without it an image whose side is an
    # exact multiple of block_size loses its final row/column of tiles, and
    # an image of exactly block_size pixels yields no tiles at all.
    tile_variances = [
        float(np.var(gray[top:top + block_size, left:left + block_size]))
        for top in range(0, height - block_size + 1, block_size)
        for left in range(0, width - block_size + 1, block_size)
    ]

    if not tile_variances:
        return 0.0, "Could not analyse noise (image too small)"

    tile_variances = np.asarray(tile_variances, dtype=np.float64)
    # High coefficient of variation -> suspicious jumps in local variance.
    # The epsilon avoids division by zero on perfectly flat images.
    coef_var = float(np.std(tile_variances) / (np.mean(tile_variances) + 1e-8))

    if coef_var > 3.5:
        # Continue from the 0.5 reached at the top of the moderate band so
        # that a larger CV never produces a smaller score; saturates at 1.0
        # for CV >= 7.5.
        score = min(1.0, 0.5 + (coef_var - 3.5) / 4.0 * 0.5)
        detail = f"High variance inconsistency (CV={coef_var:.2f}) — possible copy-paste region"
    elif coef_var > 2.0:
        # Linear ramp from 0.0 (CV = 2.0) to 0.5 (CV = 3.5).
        score = (coef_var - 2.0) / 1.5 * 0.5
        detail = f"Moderate variance anomaly (CV={coef_var:.2f})"
    else:
        score = 0.0
        detail = f"Noise pattern normal (CV={coef_var:.2f})"

    return score, detail
46
+
47
+
48
# Leading part of a PDF date string: D:YYYY[MM[DD[HH[mm[SS]]]]] optionally
# followed by a UT relationship (Z, or +/- with HH'mm' offset).
_PDF_DATE_RE = re.compile(
    r"D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?"
    r"(?:([+\-Zz])(?:(\d{2})'?(\d{2})?)?)?"
)


def _parse_pdf_date(raw: str):
    """Parse a PDF date string into a datetime, or return None if it is unusable.

    The result is timezone-aware only when the string carries an explicit
    UT relationship; otherwise it is naive.
    """
    from datetime import datetime, timedelta, timezone

    match = _PDF_DATE_RE.search(raw)
    if match is None:
        return None

    year, month, day, hour, minute, second, sign, off_h, off_m = match.groups()
    try:
        tzinfo = None
        if sign in ("Z", "z"):
            tzinfo = timezone.utc
        elif sign in ("+", "-") and off_h is not None:
            offset = timedelta(hours=int(off_h), minutes=int(off_m or 0))
            tzinfo = timezone(-offset if sign == "-" else offset)
        # Omitted trailing fields default to the earliest valid value.
        return datetime(
            int(year), int(month or 1), int(day or 1),
            int(hour or 0), int(minute or 0), int(second or 0),
            tzinfo=tzinfo,
        )
    except ValueError:
        # Out-of-range field (e.g. month 13) or offset: treat as unparseable.
        return None


def _check_pdf_metadata(path: str) -> tuple:
    """
    Check PDF metadata for common forgery signals.

    Signals and their score contributions:
      - Creator/producer suite mismatch (+0.40)
      - Modification date earlier than creation date (+0.35)
      - Neither creator nor producer present (+0.20)

    Args:
        path: Filesystem path of the PDF to inspect.

    Returns:
        ``(score, detail)`` with ``score`` in [0, 1]. Any failure (PyPDF2
        missing, unreadable file, malformed PDF) is reported as
        ``(0.0, "PDF metadata check skipped: ...")`` rather than raised.
    """
    try:
        import PyPDF2

        with open(path, "rb") as f:
            meta = PyPDF2.PdfReader(f).metadata or {}
            # "or ''" keeps a stored null from turning into the truthy
            # string "None".
            creator = str(meta.get("/Creator") or "").lower()
            producer = str(meta.get("/Producer") or "").lower()
            created = str(meta.get("/CreationDate") or "")
            modified = str(meta.get("/ModDate") or "")

        signals = []
        score = 0.0

        # Check 1: creator and producer come from different tool suites.
        if creator and producer:
            mismatched_suites = (
                ("microsoft", "libreoffice"), ("libreoffice", "adobe"),
                ("adobe", "libreoffice"), ("word", "ghostscript"),
            )
            if any(c in creator and p in producer for c, p in mismatched_suites):
                signals.append(f"Creator/producer mismatch: '{creator}' vs '{producer}'")
                score += 0.4

        # Check 2: modification predates creation, compared on the full
        # timestamp rather than on the year alone.
        created_at = _parse_pdf_date(created)
        modified_at = _parse_pdf_date(modified)
        if created_at is not None and modified_at is not None:
            if created_at.tzinfo is None or modified_at.tzinfo is None:
                # Aware and naive datetimes cannot be ordered; when either
                # side lacks an offset, fall back to wall-clock comparison.
                created_at = created_at.replace(tzinfo=None)
                modified_at = modified_at.replace(tzinfo=None)
            if modified_at < created_at:
                signals.append(
                    f"ModDate ({modified_at.isoformat()}) predates "
                    f"CreationDate ({created_at.isoformat()})"
                )
                score += 0.35

        # Check 3: no standard metadata at all.
        if not creator and not producer:
            signals.append("No creator/producer metadata — stripped or generated programmatically")
            score += 0.2

        score = min(1.0, score)
        detail = "; ".join(signals) if signals else "PDF metadata appears normal"
        return score, detail

    except Exception as e:
        # Deliberate best-effort boundary: metadata analysis must never
        # abort the overall document report.
        return 0.0, f"PDF metadata check skipped: {e}"
104
+
105
+
106
def _check_text_consistency(pil_img: "Image.Image") -> tuple:
    """
    Use pytesseract OCR to detect font-size inconsistencies in text regions.

    The heights of confidently recognised OCR boxes (confidence > 60,
    height > 5) are collected and their coefficient of variation is mapped
    to a suspicion score.

    Args:
        pil_img: Image passed to ``pytesseract.image_to_data``.

    Returns:
        ``(score, detail)`` with ``score`` in [0, 1]. If OCR is unavailable
        or fails, returns ``(0.0, "OCR check skipped (...)")`` instead of
        raising.
    """
    try:
        import pytesseract

        data = pytesseract.image_to_data(
            pil_img, output_type=pytesseract.Output.DICT
        )

        heights = []
        for box_height, conf in zip(data["height"], data["conf"]):
            # NOTE(review): some pytesseract versions appear to return
            # confidences as strings ("96", "-1"); comparing those with an
            # int raises TypeError, which the outer handler would turn into
            # a permanently skipped check. Coerce explicitly instead.
            try:
                confidence = float(conf)
            except (TypeError, ValueError):
                continue
            if confidence > 60 and box_height > 5:
                heights.append(box_height)

        if len(heights) < 5:
            return 0.0, "Insufficient text regions for OCR analysis"

        heights = np.asarray(heights, dtype=float)
        cv = float(np.std(heights) / (np.mean(heights) + 1e-8))

        if cv > 0.6:
            # Continue from the 0.4 reached at the top of the moderate band
            # so a larger CV never scores lower; saturates at 1.0 for
            # CV >= 1.2.
            score = min(1.0, 0.4 + (cv - 0.6) / 0.6 * 0.6)
            detail = f"High font size variance (CV={cv:.2f}) — inconsistent text insertion likely"
        elif cv > 0.35:
            # Linear ramp from 0.0 (CV = 0.35) to 0.4 (CV = 0.6).
            score = (cv - 0.35) / 0.25 * 0.4
            detail = f"Moderate font inconsistency (CV={cv:.2f})"
        else:
            score = 0.0
            detail = f"Text layout appears consistent (CV={cv:.2f})"

        return score, detail

    except Exception as e:
        # Deliberate best-effort boundary: OCR is optional and must not
        # abort the overall document report.
        return 0.0, f"OCR check skipped ({e})"
143
+
144
+
145
def _render_pdf_page(path: str) -> "Image.Image":
    """Render the first page of a PDF as an RGB PIL Image.

    Uses PyMuPDF (``fitz``) at 2x scale. If PyMuPDF is not installed, the
    path is handed to ``Image.open`` instead.

    Args:
        path: Filesystem path of the PDF.

    Returns:
        The rendered first page as an RGB image.

    Raises:
        Whatever PyMuPDF or Pillow raise for unreadable or empty documents;
        errors are not swallowed here.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        # Fallback if PyMuPDF is not installed: load as an image directly.
        # NOTE(review): Pillow documents PDF as a write-only format, so this
        # most likely raises for real PDFs — confirm and consider a clearer
        # error message.
        with Image.open(path) as fallback:
            return fallback.convert("RGB")

    doc = fitz.open(path)
    try:
        page = doc[0]
        scale = fitz.Matrix(2, 2)  # 2x scale for better OCR
        pix = page.get_pixmap(matrix=scale)
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    finally:
        # Close the document even when rendering fails.
        doc.close()
159
+
160
+
161
def _threat_level(score: float):
    """Map a fused risk score to a ``(threat, action)`` pair.

    Bands are half-open on the upper side: a score below 0.20 is NONE,
    below 0.45 is LOW, below 0.70 is MEDIUM, and anything else is HIGH.
    """
    bands = (
        (0.20, "NONE", " ALLOW"),
        (0.45, "LOW", " LOG"),
        (0.70, "MEDIUM", "ALERT"),
    )
    for upper_bound, threat, action in bands:
        if score < upper_bound:
            return threat, action
    # Not below any bound (including NaN, which fails every comparison).
    return "HIGH", " BLOCK"
170
+
171
+
172
def detect_document(file_path: str) -> str:
    """
    Main entry point. Accepts image files (JPG/PNG) or PDF.

    A file is treated as a PDF only when its extension is ``.pdf``. PDFs
    get three checks (visual noise, OCR text consistency, metadata) fused
    with weights 0.35 / 0.35 / 0.30; other files get the two visual checks
    with weights 0.55 / 0.45. A fused score of 0.40 or more is reported as
    FORGED.

    Args:
        file_path: Path to the document to analyse.

    Returns:
        A formatted forensic analysis report string. Any exception raised
        during analysis is caught and returned as an error message string
        rather than propagated.
    """
    try:
        is_pdf = os.path.splitext(file_path)[1].lower() == ".pdf"
        meta_score, meta_detail = 0.0, "N/A (not a PDF)"

        if is_pdf:
            pil_img = _render_pdf_page(file_path)
            meta_score, meta_detail = _check_pdf_metadata(file_path)
        else:
            # convert() returns an independent copy, so the file handle can
            # be released as soon as the pixels are loaded.
            with Image.open(file_path) as source_img:
                pil_img = source_img.convert("RGB")

        noise_score, noise_detail = _check_visual_noise(pil_img)
        text_score, text_detail = _check_text_consistency(pil_img)

        if is_pdf:
            # All three checks are relevant for PDFs.
            fused = noise_score * 0.35 + text_score * 0.35 + meta_score * 0.30
        else:
            # Only visual checks for images (no metadata).
            fused = noise_score * 0.55 + text_score * 0.45

        is_forged = fused >= 0.40
        threat, action = _threat_level(fused)

        # 20-cell bar; clamped so an out-of-range score cannot distort it.
        filled = max(0, min(20, int(fused * 20)))
        bars = "█" * filled + "░" * (20 - filled)
        rule = "─" * 40

        if is_forged:
            verdict = " FORGED"
            summary = " FORGERY INDICATORS DETECTED. Recommend human review."
        else:
            verdict = "AUTHENTIC"
            summary = "✅ No significant forgery indicators found."

        report_lines = [
            "DOCUMENT FORENSIC REPORT",
            "=" * 40,
            "",
            f"Verdict : {verdict}",
            f"Risk Score : {fused:.2%} [{bars}]",
            f"Threat : {threat}",
            f"Action : {action}",
            "",
            rule,
            "FORENSIC CHECKS",
            rule,
            "",
            "Visual Noise Analysis",
            f"Score : {noise_score:.2%}",
            f"Detail : {noise_detail}",
            "",
            "Text/Font Consistency (OCR)",
            f"Score : {text_score:.2%}",
            f"Detail : {text_detail}",
            "",
            "PDF Metadata Integrity",
            f"Score : {meta_score:.2%}",
            f"Detail : {meta_detail}",
            "",
            rule,
            summary,
        ]
        return "\n".join(report_lines).strip()

    except Exception as e:
        # Top-level boundary: callers always receive a string.
        return f" Document analysis error: {e}"