File size: 7,607 Bytes
acf615d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234

import os
import re
import numpy as np
from PIL import Image


def _check_visual_noise(pil_img: Image.Image) -> tuple:
    """
    Score copy-paste artefacts via block-wise grey-level variance.

    The image is converted to greyscale and tiled into non-overlapping
    32x32 blocks. The coefficient of variation (std / mean) of the
    per-block variances is then mapped onto a suspicion score: the more
    the block variances differ from one another, the higher the score.

    Score mapping (monotonic in CV):
        CV <= 2.0        -> 0.0
        2.0 < CV <= 3.5  -> rises linearly from 0.0 to 0.5
        CV > 3.5         -> rises linearly from 0.5, saturating at 1.0
                            when CV reaches 7.5

    Args:
        pil_img: Image to analyse; converted to mode "L" internally.

    Returns:
        (score, detail): score is a float in [0, 1]; detail is a
        human-readable explanation that includes the measured CV.
    """
    block = 32
    img_np = np.array(pil_img.convert("L"), dtype=np.float32)
    h, w = img_np.shape

    # "+ 1" so that a tile ending exactly on the image edge is included;
    # without it an image of exactly 32x32 would yield no tiles at all.
    variances = [
        np.var(img_np[y:y + block, x:x + block])
        for y in range(0, h - block + 1, block)
        for x in range(0, w - block + 1, block)
    ]

    if not variances:
        return 0.0, "Could not analyse noise (image too small)"

    variances = np.asarray(variances, dtype=np.float64)
    # High coefficient of variation -> suspicious variance jumps.
    # The epsilon keeps a perfectly flat image (mean variance 0) finite.
    coef_var = float(np.std(variances) / (np.mean(variances) + 1e-8))

    if coef_var > 3.5:
        # Continue from the top of the moderate band (0.5) so that a
        # "high" reading can never score below a "moderate" one.
        score = 0.5 + 0.5 * min(1.0, (coef_var - 3.5) / 4.0)
        detail = f"High variance inconsistency (CV={coef_var:.2f}) — possible copy-paste region"
    elif coef_var > 2.0:
        score = (coef_var - 2.0) / 1.5 * 0.5
        detail = f"Moderate variance anomaly (CV={coef_var:.2f})"
    else:
        score = 0.0
        detail = f"Noise pattern normal (CV={coef_var:.2f})"

    return score, detail


def _check_pdf_metadata(path: str) -> tuple:
    """
    Checks PDF metadata for common forgery signals:
    - Creation tool mismatch (e.g. Adobe β†’ LibreOffice date newer than creation)
    - Missing standard metadata fields
    - Modification date earlier than creation date
    Returns (score 0-1, detail string).
    """
    try:
        import PyPDF2
        with open(path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            meta = reader.metadata or {}

        signals = []
        score = 0.0

        creator = str(meta.get("/Creator", "")).lower()
        producer = str(meta.get("/Producer", "")).lower()
        created  = str(meta.get("/CreationDate", ""))
        modified = str(meta.get("/ModDate", ""))

        # Check 1: creator and producer mismatch (strong forgery signal)
        if creator and producer:
            known_suites = [
                ("microsoft", "libreoffice"), ("libreoffice", "adobe"),
                ("adobe", "libreoffice"), ("word", "ghostscript"),
            ]
            for c, p in known_suites:
                if c in creator and p in producer:
                    signals.append(f"Creator/producer mismatch: '{creator}' vs '{producer}'")
                    score += 0.4
                    break

        # Check 2: modification predates creation
        if created and modified and len(created) > 4 and len(modified) > 4:
            try:
                c_year = int(re.search(r"D:(\d{4})", created).group(1))
                m_year = int(re.search(r"D:(\d{4})", modified).group(1))
                if m_year < c_year:
                    signals.append(f"ModDate ({m_year}) predates CreationDate ({c_year})")
                    score += 0.35
            except Exception:
                pass

        # Check 3: no standard metadata at all
        if not creator and not producer:
            signals.append("No creator/producer metadata β€” stripped or generated programmatically")
            score += 0.2

        score = min(1.0, score)
        detail = "; ".join(signals) if signals else "PDF metadata appears normal"
        return score, detail

    except Exception as e:
        return 0.0, f"PDF metadata check skipped: {e}"


def _check_text_consistency(pil_img: Image.Image) -> tuple:
    """
    Score font-size inconsistency from OCR bounding-box heights.

    Runs pytesseract's ``image_to_data`` on the image, keeps boxes with
    confidence above 60 and height above 5 pixels, and maps the
    coefficient of variation (std / mean) of those heights onto a score.

    Score mapping (monotonic in CV):
        CV <= 0.35        -> 0.0
        0.35 < CV <= 0.6  -> rises linearly from 0.0 to 0.4
        CV > 0.6          -> rises linearly from 0.4, saturating at 1.0
                             when CV reaches 1.2

    The check is best-effort: if pytesseract is unavailable or OCR fails
    for any reason, a zero score with an explanatory detail is returned.

    Args:
        pil_img: Image to run OCR on.

    Returns:
        (score, detail): score is a float in [0, 1]; detail is a
        human-readable explanation.
    """
    try:
        import pytesseract
        data = pytesseract.image_to_data(
            pil_img, output_type=pytesseract.Output.DICT
        )

        heights = []
        for h, conf in zip(data["height"], data["conf"]):
            # NOTE(review): some pytesseract/Tesseract combinations appear
            # to return confidences as strings (e.g. "96.5") — confirm.
            # Coercing here keeps one odd value from aborting the check.
            try:
                conf_value = float(conf)
            except (TypeError, ValueError):
                continue
            if conf_value > 60 and h > 5:
                heights.append(h)

        if len(heights) < 5:
            return 0.0, "Insufficient text regions for OCR analysis"

        heights = np.array(heights, dtype=float)
        cv = float(np.std(heights) / (np.mean(heights) + 1e-8))

        if cv > 0.6:
            # Continue from the top of the moderate band (0.4) so that a
            # "high" reading can never score below a "moderate" one.
            score = 0.4 + 0.6 * min(1.0, (cv - 0.6) / 0.6)
            detail = f"High font size variance (CV={cv:.2f}) — inconsistent text insertion likely"
        elif cv > 0.35:
            score = (cv - 0.35) / 0.25 * 0.4
            detail = f"Moderate font inconsistency (CV={cv:.2f})"
        else:
            score = 0.0
            detail = f"Text layout appears consistent (CV={cv:.2f})"

        return score, detail

    except Exception as e:
        # Deliberate best-effort boundary: OCR is optional.
        return 0.0, f"OCR check skipped ({e})"


def _render_pdf_page(path: str) -> Image.Image:
    """
    Render the first page of a PDF as an RGB PIL image.

    Uses PyMuPDF (``fitz``) at 2x scale. If PyMuPDF is not installed,
    falls back to opening ``path`` directly with PIL.

    Args:
        path: Filesystem path of the document to render.

    Returns:
        The rendered page (or the directly loaded image) in RGB mode.

    Raises:
        Whatever PyMuPDF or PIL raise for unreadable or empty documents;
        the caller is expected to handle these.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        # Fallback if PyMuPDF is not installed — load as image directly.
        # convert() returns a new image, so the file can be closed here.
        with Image.open(path) as raw:
            return raw.convert("RGB")

    doc = fitz.open(path)
    try:
        page = doc[0]
        mat = fitz.Matrix(2, 2)   # 2x scale for better OCR
        pix = page.get_pixmap(matrix=mat)
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    finally:
        # Close the document even if rendering the page fails.
        doc.close()


def _threat_level(score: float):
    if score < 0.20:
        return "NONE", " ALLOW"
    elif score < 0.45:
        return "LOW", " LOG"
    elif score < 0.70:
        return "MEDIUM", "ALERT"
    else:
        return "HIGH", " BLOCK"


def detect_document(file_path: str) -> str:
    """
    Main entry point. Accepts image files (JPG/PNG) or PDF.

    PDFs (detected by a ".pdf" extension) are rendered to an image and
    additionally run through the metadata check; every other file is
    opened directly as an image. The individual check scores are fused
    with fixed weights:
        PDF   : noise 0.35, text 0.35, metadata 0.30
        image : noise 0.55, text 0.45
    A fused score of 0.40 or more is reported as FORGED.

    Args:
        file_path: Path of the document to analyse.

    Returns:
        A formatted forensic analysis report string. Any exception is
        caught and returned as a one-line error message instead.
    """
    try:
        ext = os.path.splitext(file_path)[1].lower()
        is_pdf = ext == ".pdf"
        meta_score, meta_detail = 0.0, "N/A (not a PDF)"

        if is_pdf:
            pil_img = _render_pdf_page(file_path)
            meta_score, meta_detail = _check_pdf_metadata(file_path)
        else:
            # convert() returns a new image, so the underlying file can be
            # closed as soon as the conversion is done.
            with Image.open(file_path) as raw:
                pil_img = raw.convert("RGB")

        noise_score,  noise_detail  = _check_visual_noise(pil_img)
        text_score,   text_detail   = _check_text_consistency(pil_img)

        if is_pdf:
            # All three checks relevant for PDFs
            fused = (noise_score * 0.35 + text_score * 0.35 + meta_score * 0.30)
        else:
            # Only visual checks for images (no metadata)
            fused = (noise_score * 0.55 + text_score * 0.45)

        # NOTE(review): the FORGED cut-off (0.40) falls inside the LOW
        # threat band (< 0.45), so a FORGED verdict can carry a LOW
        # threat — confirm this is intended.
        prediction = "FORGED" if fused >= 0.40 else "AUTHENTIC"
        threat, action = _threat_level(fused)

        # 20-cell bar built from single-character block glyphs; clamp so
        # an out-of-range score cannot produce a malformed bar.
        filled = max(0, min(20, int(fused * 20)))
        bars = "█" * filled + "░" * (20 - filled)

        report = f"""
 DOCUMENT FORENSIC REPORT
{"="*40}

Verdict    : {' FORGED' if prediction == 'FORGED' else 'AUTHENTIC'}
Risk Score : {fused:.2%}  [{bars}]
Threat     : {threat}
Action     : {action}

{"─"*40}
FORENSIC CHECKS
{"─"*40}

   Visual Noise Analysis
   Score  : {noise_score:.2%}
   Detail : {noise_detail}

   Text/Font Consistency (OCR)
   Score  : {text_score:.2%}
   Detail : {text_detail}

   PDF Metadata Integrity
   Score  : {meta_score:.2%}
   Detail : {meta_detail}

{"─"*40}
{'  FORGERY INDICATORS DETECTED. Recommend human review.' if prediction == 'FORGED' else '✅  No significant forgery indicators found.'}
"""
        return report.strip()

    except Exception as e:
        # Top-level boundary: report the failure as text so callers that
        # display the result never see a traceback.
        return f" Document analysis error: {str(e)}"