DocSentry / tampering.py
SpandanM110's picture
Round 2: fraud ring graph, AI-gen detector, provenance ledger, architecture doc
e97f963
Raw
History Blame Contribute Delete
13.1 kB
"""
tampering.py - Smart document tampering for the Tamper Forge Studio.
Each tamper function returns a dict:
{
"image": PIL.Image,
"src_box": (x0,y0,x1,y1) or None,
"dst_box": (x0,y0,x1,y1) or None,
"description": str,
"intensity": str,
}
"""
import io, os, random, shutil
import numpy as np
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont, ImageFilter
import cv2
# Probe for a Tesseract binary once, at import time. ``_TESS_OK`` ends up True
# only when pytesseract imports AND an executable could be located, so the
# OCR-dependent helper can bail out early instead of failing on every call.
try:
    import pytesseract
except ImportError:
    # pytesseract is optional; without it OCR is simply switched off.
    _TESS_OK = False
else:
    for _c in (
        shutil.which("tesseract"),
        r"C:\\Program Files\\Tesseract-OCR\\tesseract.exe",
        r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe",
        os.path.expanduser(r"~\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe"),
    ):
        if not (_c and os.path.isfile(_c)):
            continue
        # First existing candidate wins; point pytesseract at it explicitly.
        pytesseract.pytesseract.tesseract_cmd = _c
        _TESS_OK = True
        break
    else:
        # No candidate was a regular file: accept whatever PATH resolves, if anything.
        _TESS_OK = bool(shutil.which("tesseract"))
def _font(size=18):
    """Return a TrueType font at ``size`` points, or Pillow's built-in default.

    The candidate paths are tried in order; the first one Pillow can open is
    returned. If none can be opened, ``ImageFont.load_default()`` is used.
    """
    candidates = (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        r"C:\\Windows\\Fonts\\arial.ttf",
        "DejaVuSans.ttf",
        "arial.ttf",
    )
    for candidate in candidates:
        try:
            face = ImageFont.truetype(candidate, size)
        except OSError:
            # Font file not available under this name; try the next one.
            continue
        return face
    return ImageFont.load_default()
# Per-intensity tuning knobs, looked up through _profile():
#   box_scale   - multiplier applied to the pixel size of edited regions/fonts
#   jpeg_q      - JPEG quality used by tamper_compression
#   amount_jump - multiplier applied to the base amount written by tamper_text_edit
_INTENSITY = dict(
    subtle=dict(box_scale=0.7, jpeg_q=75, amount_jump=1.5),
    moderate=dict(box_scale=1.0, jpeg_q=50, amount_jump=3.0),
    aggressive=dict(box_scale=1.4, jpeg_q=25, amount_jump=8.0),
)
def _profile(i):
    """Return the tuning dict for intensity ``i``; unknown names get "moderate"."""
    try:
        return _INTENSITY[i]
    except KeyError:
        return _INTENSITY["moderate"]
def _find_salient_box(img, w_target=180, h_target=80):
    """Return a box around the highest-variance region of ``img``.

    High local variance is used as a cheap proxy for a seal, signature, or
    stamp. The image is converted to greyscale, subsampled so its longest
    side is roughly 400 px, and a box-filter variance map is computed; the
    box is centred on the variance peak.

    Args:
        img: image object exposing ``convert("L")`` (a PIL image).
        w_target: desired box width in pixels.
        h_target: desired box height in pixels.

    Returns:
        ``(x0, y0, x1, y1)`` as plain ints, always clamped to the image
        bounds. For images too small for the variance window, the box is
        centred on the image instead.
    """
    gray = np.array(img.convert("L"))
    H, W = gray.shape
    half_w, half_h = w_target // 2, h_target // 2

    def _centred(cx, cy):
        # Clamp on every path so callers can slice arrays with the result
        # without hitting negative (wrap-around) or out-of-range indices.
        return (max(0, cx - half_w), max(0, cy - half_h),
                min(W, cx + half_w), min(H, cy + half_h))

    scale = max(1, max(H, W) // 400)
    small = gray[::scale, ::scale].astype(np.float32)
    sh, sw = small.shape
    kernel = max(15, min(sh, sw) // 8)
    if kernel >= sh or kernel >= sw:
        # Image is smaller than the variance window: fall back to the centre.
        return _centred(W // 2, H // 2)

    # Var[X] = E[X^2] - E[X]^2, both expectations taken over a kernel window.
    mean = cv2.boxFilter(small, -1, (kernel, kernel))
    sq_mean = cv2.boxFilter(small ** 2, -1, (kernel, kernel))
    var = sq_mean - mean ** 2
    # Zero a kernel-wide margin so the peak is never picked at the image edge.
    var[:kernel, :] = 0
    var[-kernel:, :] = 0
    var[:, :kernel] = 0
    var[:, -kernel:] = 0
    py, px = np.unravel_index(var.argmax(), var.shape)
    # Map the peak back to full-resolution coordinates as plain Python ints.
    return _centred(int(px) * scale, int(py) * scale)
def _find_text_box_via_ocr(img, keywords=("Rs", "Date", "Amount", "Total", "Principal", "Stamp")):
    """Return a box covering the first OCR token that contains a keyword.

    Matching is a case-insensitive substring test against each token that
    Tesseract reports. The returned box starts at the token and extends to
    eight times its width, presumably so that the value printed after the
    keyword is covered as well.

    Args:
        img: PIL image to scan.
        keywords: strings to look for inside OCR tokens.

    Returns:
        ``(x0, y0, x1, y1)`` clipped to the image on the right and bottom, or
        ``None`` when OCR is unavailable, fails, or nothing matches.
    """
    if not _TESS_OK:
        return None
    try:
        import pytesseract
        data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
    except Exception:
        # OCR is best-effort: any failure just means "no box", and the caller
        # falls back to a fixed strip.
        return None
    needles = [kw.lower() for kw in keywords]
    for i, txt in enumerate(data.get("text", [])):
        token = txt.lower()
        if not any(needle in token for needle in needles):
            continue
        x, y, w, h = data["left"][i], data["top"][i], data["width"][i], data["height"][i]
        # Clamp both far edges; the bottom edge used to be left unclamped.
        return (x, y, min(img.width, x + w * 8), min(img.height, y + h + 4))
    return None
def tamper_copy_move(img, intensity="moderate", rng=None):
    """Copy the most salient region of the image onto another part of it.

    The source is the high-variance box found by ``_find_salient_box``; the
    copy is placed on the horizontally opposite side and shifted vertically
    by one box height plus 20 px, with a 40 px margin where the image is
    large enough.

    Args:
        img: PIL image to tamper (not modified; an RGB copy is edited).
        intensity: key into the intensity table; scales the box size.
        rng: accepted for signature parity with the other tamper functions;
            this tamper is deterministic and does not use it.

    Returns:
        Tamper dict with "image", "src_box", "dst_box", "description" and
        "intensity". Both boxes lie inside the image.
    """
    img = img.convert("RGB")
    arr = np.array(img)
    H, W = arr.shape[:2]
    p = _profile(intensity)
    bw, bh = int(180 * p["box_scale"]), int(90 * p["box_scale"])

    sx0, sy0, sx1, sy1 = (int(v) for v in _find_salient_box(img, bw, bh))
    # Defensive clamp: a negative start would wrap around when slicing.
    sx0, sy0 = max(0, sx0), max(0, sy0)
    sx1, sy1 = min(W, sx1), min(H, sy1)
    patch = arr[sy0:sy1, sx0:sx1].copy()
    ph, pw = patch.shape[:2]

    # Preferred placement: opposite side horizontally, one box away vertically.
    tx0 = 40 if sx0 > W // 2 else max(40, W - pw - 40)
    ty0 = sy0 + (1 if sy0 < H // 2 else -1) * (bh + 20)
    ty0 = max(40, min(H - bh - 40, ty0))
    # On small images the margins cannot be honoured; keep the patch fully
    # inside the image so the assignment below never sees mismatched shapes.
    tx0 = max(0, min(tx0, W - pw))
    ty0 = max(0, min(ty0, H - ph))
    tx1, ty1 = tx0 + pw, ty0 + ph

    arr[ty0:ty1, tx0:tx1] = patch
    return {"image": Image.fromarray(arr),
            "src_box": (sx0, sy0, sx1, sy1),
            "dst_box": (tx0, ty0, tx1, ty1),
            "description": f"Duplicated a {bw}x{bh}px high-variance region (likely a seal/signature) to another quadrant.",
            "intensity": intensity}
def tamper_text_edit(img, intensity="moderate", rng=None):
    """White out a text field and print an inflated rupee amount over it.

    The field is located with OCR when available; otherwise a fixed strip at
    35% of the image height (15%-60% of its width) is used.

    Args:
        img: PIL image to tamper (an RGB copy is edited).
        intensity: key into the intensity table; scales the fallback strip
            height, the font size and the amount written.
        rng: optional ``random.Random``; not consulted by this tamper.

    Returns:
        Tamper dict whose "dst_box" is the overwritten box and whose
        "src_box" is ``None``.
    """
    rng = rng or random.Random()
    canvas = img.convert("RGB").copy()
    p = _profile(intensity)
    scale = p["box_scale"]
    width, height = canvas.size

    target = _find_text_box_via_ocr(canvas)
    used_ocr = target is not None
    if not used_ocr:
        top = int(height * 0.35)
        target = (int(width * 0.15), top, int(width * 0.60), top + int(36 * scale))
    left, upper, _right, _lower = target

    pen = ImageDraw.Draw(canvas)
    pen.rectangle(target, fill="white")
    new_amount = int(10_00_000 * p["amount_jump"])
    new_text = f"Rs {new_amount:,}"
    pen.text((left + 6, upper + 4), new_text, font=_font(int(20 * scale)), fill="black")

    return {
        "image": canvas,
        "src_box": None,
        "dst_box": target,
        "description": f"Located a text/amount field via {'OCR' if used_ocr else 'fallback'} and rewrote it as '{new_text}'.",
        "intensity": intensity,
    }
def tamper_splice(img, donor=None, intensity="moderate", rng=None):
    """Paste a rectangle taken from a different image into the lower area.

    Donor selection, in order: the ``donor`` argument; a random PNG larger
    than 5000 bytes from ``sample_data/originals``; a synthetic flat-colour
    image. The donor is resized to the target size and the same rectangle is
    copied across.

    Args:
        img: PIL image to tamper (an RGB copy is edited).
        donor: optional PIL image to splice from, in any mode.
        intensity: key into the intensity table; scales the spliced box.
        rng: optional ``random.Random`` for reproducible placement.

    Returns:
        Tamper dict; "src_box" and "dst_box" are the same in-image rectangle.
    """
    rng = rng or random.Random()
    img = img.convert("RGB")
    p = _profile(intensity)
    W, H = img.size

    if donor is None:
        sample_dir = Path("sample_data/originals")
        if sample_dir.exists():
            # Sorted so that a seeded rng picks the same file on every run;
            # glob order is otherwise filesystem-dependent.
            cands = sorted(f for f in sample_dir.glob("*.png") if f.stat().st_size > 5000)
            if cands:
                # Context manager releases the file handle once pixels are loaded.
                with Image.open(rng.choice(cands)) as fh:
                    donor = fh.convert("RGB")
    if donor is None:
        donor = Image.fromarray(np.full((H, W, 3),
                                        [rng.randint(180, 255), rng.randint(120, 200), rng.randint(80, 160)],
                                        dtype=np.uint8))
    # A caller-supplied donor may be RGBA/L/P; force three channels so the
    # array shapes match when the region is copied.
    donor = donor.convert("RGB").resize((W, H))

    arr = np.array(img)
    darr = np.array(donor)
    bw, bh = int(220 * p["box_scale"]), int(80 * p["box_scale"])
    x = rng.randint(40, max(41, W - bw - 40))
    y = rng.randint(int(H * 0.55), max(int(H * 0.55) + 1, H - bh - 40))
    # Keep the rectangle inside small images so the reported box is truthful.
    x0 = min(x, max(0, W - bw))
    y0 = min(y, max(0, H - bh))
    x1, y1 = min(W, x0 + bw), min(H, y0 + bh)

    arr[y0:y1, x0:x1] = darr[y0:y1, x0:x1]
    return {"image": Image.fromarray(arr),
            "src_box": (x0, y0, x1, y1), "dst_box": (x0, y0, x1, y1),
            "description": f"Spliced a {bw}x{bh}px region from a different document into the lower area.",
            "intensity": intensity}
def tamper_compression(img, intensity="moderate", rng=None):
    """Round-trip the image through an in-memory JPEG at reduced quality.

    Args:
        img: PIL image to re-encode.
        intensity: key into the intensity table; selects the JPEG quality.
        rng: unused; present for signature parity with other tamper functions.

    Returns:
        Tamper dict with the re-decoded RGB image and no boxes.
    """
    rgb = img.convert("RGB")
    p = _profile(intensity)
    stream = io.BytesIO()
    rgb.save(stream, "JPEG", quality=p["jpeg_q"])
    stream.seek(0)
    recompressed = Image.open(stream).convert("RGB")
    return {
        "image": recompressed,
        "src_box": None,
        "dst_box": None,
        "description": f"Re-saved at JPEG quality {p['jpeg_q']} (post-edit hide-the-evidence pattern).",
        "intensity": intensity,
    }
def tamper_metadata_strip(img, intensity="moderate", rng=None):
    """Re-encode the image as a quality-92 JPEG in memory and decode it again.

    Args:
        img: PIL image to re-encode.
        intensity: echoed back in the result; does not affect the output.
        rng: unused; present for signature parity with other tamper functions.

    Returns:
        Tamper dict with the re-decoded RGB image and no boxes.
    """
    stream = io.BytesIO()
    img.convert("RGB").save(stream, "JPEG", quality=92)
    stream.seek(0)
    cleaned = Image.open(stream).convert("RGB")
    return {
        "image": cleaned,
        "src_box": None,
        "dst_box": None,
        "description": "Stripped all EXIF metadata (mimics photo-editor export).",
        "intensity": intensity,
    }
def tamper_custom_region(img, box, intensity="moderate", rng=None):
    """Blur a caller-supplied region and stamp an "EDITED" label on it.

    The box comes from user input, so its corners may be given in any order
    and may extend past the image; it is normalised and clipped first.

    Args:
        img: PIL image to tamper (an RGB copy is edited).
        box: four numbers ``(x0, y0, x1, y1)`` in pixel coordinates.
        intensity: "subtle", "moderate" or "aggressive"; picks the blur
            radius (unknown values use 3.0).
        rng: unused; present for signature parity with other tamper functions.

    Returns:
        Tamper dict whose "dst_box" is the normalised, clipped region.

    Raises:
        ValueError: if the box has no area inside the image.
    """
    img = img.convert("RGB").copy()
    W, H = img.size
    ax, ay, bx, by = [int(v) for v in box]
    # Accept corners in either order (e.g. a right-to-left drag) and clip to
    # the image so the blur never mixes in out-of-image padding.
    x0, x1 = max(0, min(ax, bx)), min(W, max(ax, bx))
    y0, y1 = max(0, min(ay, by)), min(H, max(ay, by))
    if x1 <= x0 or y1 <= y0:
        raise ValueError("custom_box has no area inside the image")

    crop = img.crop((x0, y0, x1, y1))
    radius = {"subtle": 1.5, "moderate": 3.0, "aggressive": 6.0}.get(intensity, 3.0)
    crop = crop.filter(ImageFilter.GaussianBlur(radius=radius))
    img.paste(crop, (x0, y0))

    d = ImageDraw.Draw(img)
    # Keep the label rectangle well-formed for regions narrower/shorter than
    # the 4 px inset; recent Pillow versions reject x1 < x0 or y1 < y0.
    label_box = [min(x0 + 4, x1), min(y0 + 4, y1), min(x1, x0 + 60), min(y1, y0 + 24)]
    d.rectangle(label_box, fill="white")
    d.text((x0 + 8, y0 + 6), "EDITED", font=_font(12), fill="black")
    return {"image": img, "src_box": None, "dst_box": (x0, y0, x1, y1),
            "description": f"User-drawn region blurred (sigma={radius}) and overpainted - adversarial test.",
            "intensity": intensity}
# Catalogue of tamper kinds: dispatch key -> (short label, one-line explanation).
# The keys are the names accepted by tamper_dispatch().
TAMPER_FUNCTIONS = dict(
    copy_move=(
        "Copy-move",
        "Duplicates a salient region (e.g. seal). Classic forgery.",
    ),
    text_edit=(
        "Text edit",
        "Whites out a value and writes a new one. Loan-document fraud.",
    ),
    splice=(
        "Splice",
        "Pastes a region from a different doc. Noise inconsistency.",
    ),
    compression=(
        "Re-save",
        "Low JPEG quality to hide tampering. ELA catches it.",
    ),
    metadata=(
        "Strip EXIF",
        "Removes EXIF metadata. EXIF audit catches it.",
    ),
    custom=(
        "Custom (adversarial)",
        "User-drawn region blurred and overpainted. Adversarial.",
    ),
)
def tamper_dispatch(name, img, intensity="moderate", donor=None, custom_box=None, rng=None):
    """Run the tamper function registered under ``name``.

    Args:
        name: one of "copy_move", "text_edit", "splice", "compression",
            "metadata" or "custom".
        img: image handed to the selected tamper function.
        intensity: forwarded unchanged.
        donor: forwarded to the "splice" tamper only.
        custom_box: region for the "custom" tamper; required for it.
        rng: forwarded unchanged.

    Returns:
        Whatever the selected tamper function returns.

    Raises:
        ValueError: for an unknown ``name``, or for "custom" without a box.
    """
    def _run_custom():
        if custom_box is None:
            raise ValueError("custom needs custom_box")
        return tamper_custom_region(img, custom_box, intensity, rng)

    # Calls are wrapped in callables so only the selected one is evaluated.
    routes = (
        ("copy_move", lambda: tamper_copy_move(img, intensity, rng)),
        ("text_edit", lambda: tamper_text_edit(img, intensity, rng)),
        ("splice", lambda: tamper_splice(img, donor, intensity, rng)),
        ("compression", lambda: tamper_compression(img, intensity, rng)),
        ("metadata", lambda: tamper_metadata_strip(img, intensity, rng)),
        ("custom", _run_custom),
    )
    for key, run in routes:
        if name == key:
            return run()
    raise ValueError(f"unknown tamper: {name}")
def tamper_chain(img, names, intensity="moderate", rng=None, donor=None, custom_box=None):
    """Apply several tampers in sequence, feeding each output into the next.

    Args:
        img: starting PIL image.
        names: iterable of tamper names understood by ``tamper_dispatch``;
            a single string is treated as a one-element chain.
        intensity: forwarded to every step.
        rng: optional ``random.Random`` shared by all steps.
        donor: optional donor image forwarded to every step (used by "splice").
        custom_box: optional region forwarded to every step; required when
            the chain contains "custom".

    Returns:
        Tamper dict for the final image. "src_box"/"dst_box" hold the last
        non-empty box reported by any step, "description" joins the step
        names with " -> ", and "steps" lists each step's name, description
        and dst_box.

    Raises:
        ValueError: propagated from ``tamper_dispatch`` for an unknown name
            or for "custom" without ``custom_box``.
    """
    rng = rng or random.Random()
    if isinstance(names, str):
        # A bare string would otherwise be iterated character by character.
        names = [names]
    current = img.convert("RGB")
    steps, last_src, last_dst = [], None, None
    for n in names:
        out = tamper_dispatch(n, current, intensity=intensity,
                              donor=donor, custom_box=custom_box, rng=rng)
        current = out["image"]
        steps.append({"name": n, "description": out["description"], "dst_box": out["dst_box"]})
        if out["dst_box"]:
            last_dst = out["dst_box"]
        if out["src_box"]:
            last_src = out["src_box"]
    return {"image": current, "src_box": last_src, "dst_box": last_dst,
            "description": " -> ".join(s["name"] for s in steps),
            "intensity": intensity, "steps": steps}
def annotate_before_after(orig_img, tamper_meta, box_width=4):
    """Draw labelled boxes on copies of the original and tampered images.

    The original gets a green "SOURCE" box around ``tamper_meta["src_box"]``
    and the tampered image a red "TAMPERED" box around
    ``tamper_meta["dst_box"]``; a missing or empty box is skipped.

    Args:
        orig_img: untouched PIL image.
        tamper_meta: tamper dict containing "image" and, optionally, the boxes.
        box_width: outline width in pixels.

    Returns:
        ``(annotated_original, annotated_tampered)`` as new RGB images.
    """
    before = orig_img.convert("RGB").copy()
    after = tamper_meta["image"].convert("RGB").copy()

    # (drawing surface, metadata key, colour, label strip width, label text)
    overlays = (
        (ImageDraw.Draw(before), "src_box", (0, 200, 0), 90, "SOURCE"),
        (ImageDraw.Draw(after), "dst_box", (220, 30, 30), 110, "TAMPERED"),
    )
    for pen, key, colour, tag_width, tag in overlays:
        region = tamper_meta.get(key)
        if not region:
            continue
        x0, y0, x1, y1 = region
        pen.rectangle(region, outline=colour, width=box_width)
        # Label strip sits just above the box, pinned to the top edge if needed.
        pen.rectangle([x0, max(0, y0 - 20), x0 + tag_width, y0], fill=colour)
        pen.text((x0 + 4, max(0, y0 - 18)), tag, font=_font(12), fill="white")
    return before, after
def overlay_heatmap_on_image(base_img, heat_2d, alpha=0.55, cmap="hot"):
    """Blend a colour-mapped heat map over an image.

    The heat map is min-max normalised (when its maximum is positive),
    resized to the image size, coloured with a matplotlib colormap, and
    composited with per-pixel opacity proportional to heat times ``alpha``.

    Args:
        base_img: PIL image to draw over.
        heat_2d: 2-D array-like of heat values.
        alpha: opacity multiplier for the hottest pixels.
        cmap: matplotlib colormap name.

    Returns:
        New RGB PIL image with the overlay applied.
    """
    # Use the modern colormap registry (matplotlib >= 3.5),
    # fall back to the deprecated cm.get_cmap on older versions.
    try:
        import matplotlib as mpl
        cmap_fn = mpl.colormaps[cmap]
    except (AttributeError, KeyError):
        import matplotlib.cm as cm
        cmap_fn = cm.get_cmap(cmap)

    base = base_img.convert("RGBA")
    W, H = base.size
    h = np.asarray(heat_2d, dtype=np.float32)
    if h.max() > 0:
        h = (h - h.min()) / (h.max() - h.min() + 1e-9)
    h_resized = cv2.resize(h, (W, H), interpolation=cv2.INTER_CUBIC)
    # Cubic interpolation can overshoot slightly below 0 / above 1; without
    # clipping, the uint8 casts below wrap around and produce stray opaque
    # pixels in cold areas.
    h_resized = np.clip(h_resized, 0.0, 1.0)

    rgba = (cmap_fn(h_resized) * 255).astype(np.uint8)
    # Clip again so an alpha above 1 (or below 0) cannot overflow the channel.
    rgba[..., 3] = np.clip(h_resized * 255 * alpha, 0, 255).astype(np.uint8)
    overlay = Image.fromarray(rgba, mode="RGBA")
    return Image.alpha_composite(base, overlay).convert("RGB")
def detector_scorecard(image_path):
    """Run every forensic detector on one image and normalise the results.

    Each detector's raw output is scaled to a 0..1 "score"; a detector counts
    as having "caught" the image when its score is at least 0.4. The Random
    Forest entry is best-effort and is omitted when the model returns
    ``None`` or raises.

    Args:
        image_path: path passed unchanged to the ``forensics`` functions.

    Returns:
        Dict mapping detector label to
        ``{"score": float, "raw": ..., "what": str, "caught": bool}``.
    """
    import logging
    import forensics

    scores = {}

    _, ela_score = forensics.error_level_analysis(image_path)
    scores["ELA"] = {"score": min(ela_score / 25.0, 1.0), "raw": round(ela_score, 2),
                     "what": "JPEG re-save artefacts"}

    _, cm_count, _ = forensics.copy_move_detect(image_path)
    scores["Copy-move (ORB)"] = {"score": min(cm_count / 50.0, 1.0), "raw": cm_count,
                                 "what": "Duplicated regions"}

    _, noise_ratio = forensics.noise_inconsistency(image_path)
    scores["Noise inconsistency"] = {"score": min(noise_ratio * 4, 1.0),
                                     "raw": round(noise_ratio, 3),
                                     "what": "Splicing / region mismatch"}

    exif_flags = forensics.exif_sanity(image_path)
    scores["EXIF metadata"] = {"score": 0.0 if exif_flags == ["exif clean"] else 0.6,
                               "raw": "; ".join(exif_flags),
                               "what": "Edit-tool fingerprints"}

    try:
        ml = forensics.predict_with_model(image_path)
        if ml is not None:
            scores["Random Forest"] = {"score": ml["tamper_probability"],
                                       "raw": ml["verdict"],
                                       "what": "Learned forensic-feature blend"}
    except Exception:
        # The learned model is optional, so a failure must not break the
        # scorecard - but record it instead of discarding it silently.
        logging.getLogger(__name__).warning(
            "Random Forest scoring skipped for %s", image_path, exc_info=True)

    for v in scores.values():
        v["caught"] = v["score"] >= 0.4
    return scores