Spaces:

SpandanM110
/

DocSentry

Sleeping

App Files Files Community

DocSentry / tampering.py

SpandanM110

Round 2: fraud ring graph, AI-gen detector, provenance ledger, architecture doc

e97f963 9 days ago

Raw

History Blame Contribute Delete

13.1 kB

	"""
	tampering.py - Smart document tampering for the Tamper Forge Studio.

	Each tamper function returns a dict:
	{
	"image": PIL.Image,
	"src_box": (x0,y0,x1,y1) or None,
	"dst_box": (x0,y0,x1,y1) or None,
	"description": str,
	"intensity": str,
	}
	"""

	import io, os, random, shutil
	import numpy as np
	from pathlib import Path
	from PIL import Image, ImageDraw, ImageFont, ImageFilter
	import cv2

	try:
	import pytesseract
	_TESS_OK = False
	for _c in (shutil.which("tesseract"),
	r"C:\\Program Files\\Tesseract-OCR\\tesseract.exe",
	r"C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe",
	os.path.expanduser(r"~\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe")):
	if _c and os.path.isfile(_c):
	pytesseract.pytesseract.tesseract_cmd = _c
	_TESS_OK = True
	break
	if not _TESS_OK and shutil.which("tesseract"):
	_TESS_OK = True
	except ImportError:
	_TESS_OK = False


	def _font(size=18):
	    """Return a TrueType font at the requested size, or PIL's default font.

	    The candidate list is tried in order (DejaVu Sans, then Arial, by absolute
	    path and then by bare file name); the first one ``ImageFont.truetype`` can
	    open wins.
	    """
	    candidates = (
	        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
	        r"C:\\Windows\\Fonts\\arial.ttf",
	        "DejaVuSans.ttf",
	        "arial.ttf",
	    )
	    for candidate in candidates:
	        try:
	            loaded = ImageFont.truetype(candidate, size)
	        except OSError:
	            # This font could not be opened here; move on to the next one.
	            continue
	        return loaded
	    return ImageFont.load_default()


	_INTENSITY = {
	"subtle": {"box_scale": 0.7, "jpeg_q": 75, "amount_jump": 1.5},
	"moderate": {"box_scale": 1.0, "jpeg_q": 50, "amount_jump": 3.0},
	"aggressive": {"box_scale": 1.4, "jpeg_q": 25, "amount_jump": 8.0},
	}
	def _profile(i): return _INTENSITY.get(i, _INTENSITY["moderate"])


	def _find_salient_box(img, w_target=180, h_target=80):
	"""High-variance region: likely a seal, signature, or stamp."""
	arr = np.array(img.convert("L"))
	H, W = arr.shape
	scale = max(1, max(H, W) // 400)
	small = arr[::scale, ::scale]
	sh, sw = small.shape
	kernel = max(15, min(sh, sw) // 8)
	if kernel >= sh or kernel >= sw:
	return (W//2 - w_target//2, H//2 - h_target//2,
	W//2 + w_target//2, H//2 + h_target//2)
	mean = cv2.boxFilter(small.astype(np.float32), -1, (kernel, kernel))
	sq_mean = cv2.boxFilter((small.astype(np.float32) ** 2), -1, (kernel, kernel))
	var = sq_mean - mean ** 2
	var[:kernel,:] = 0; var[-kernel:,:] = 0
	var[:,:kernel] = 0; var[:,-kernel:] = 0
	py, px = np.unravel_index(var.argmax(), var.shape)
	cx, cy = px * scale, py * scale
	return (max(0, cx - w_target//2), max(0, cy - h_target//2),
	min(W, cx + w_target//2), min(H, cy + h_target//2))


	def _find_text_box_via_ocr(img, keywords=("Rs", "Date", "Amount", "Total", "Principal", "Stamp")):
	    """Return a box anchored at the first OCR word containing a keyword.

	    Matching is a case-insensitive substring test. The box starts at the
	    matched word, extends eight word-widths to the right (capped at the image
	    width) and four pixels below the word -- presumably to cover the value
	    that follows the label.

	    Returns:
	        ``(x0, y0, x1, y1)``, or ``None`` when OCR is unavailable, the OCR
	        call fails, or no word matches.
	    """
	    if not _TESS_OK:
	        return None
	    try:
	        import pytesseract
	        ocr = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
	    except Exception:
	        # OCR is best-effort: any failure means "no box found".
	        return None
	    for idx, word in enumerate(ocr.get("text", [])):
	        if not any(kw.lower() in word.lower() for kw in keywords):
	            continue
	        left, top = ocr["left"][idx], ocr["top"][idx]
	        width, height = ocr["width"][idx], ocr["height"][idx]
	        return (left, top, min(img.width, left + width * 8), top + height + 4)
	    return None


	def tamper_copy_move(img, intensity="moderate", rng=None):
	    """Copy-move forgery: duplicate a salient region elsewhere in the image.

	    Args:
	        img: PIL image to tamper with (converted to RGB, not modified).
	        intensity: key into the intensity profiles; scales the copied box.
	        rng: accepted for a uniform tamper signature; this tamper is
	            deterministic and does not use it.

	    Returns:
	        Tamper dict with ``image``, ``src_box``, ``dst_box``, ``description``
	        and ``intensity``. ``description`` quotes the requested box size; the
	        boxes give the actual pixels copied, which can be smaller when the
	        region is clipped by the image edge.
	    """
	    img = img.convert("RGB")
	    arr = np.array(img)
	    H, W = arr.shape[:2]
	    p = _profile(intensity)
	    bw, bh = int(180 * p["box_scale"]), int(90 * p["box_scale"])
	    sx0, sy0, sx1, sy1 = (int(v) for v in _find_salient_box(img, bw, bh))
	    patch = arr[sy0:sy1, sx0:sx1].copy()
	    # Use the real patch size from here on: the box may have been clipped.
	    ph, pw = patch.shape[:2]
	    # Source in the right half -> paste near the left edge, and vice versa.
	    tx0 = 40 if sx0 > W // 2 else max(40, W - pw - 40)
	    # Shift one box height (plus a 20px gap) towards the vertical centre.
	    ty0 = sy0 + (1 if sy0 < H // 2 else -1) * (bh + 20)
	    ty0 = max(40, min(H - bh - 40, ty0))
	    # On images too small for the 40px margins the window above would spill
	    # past the edge and the assignment below would fail on a shape mismatch,
	    # so pull the destination back inside the image.
	    tx0 = max(0, min(tx0, W - pw))
	    ty0 = max(0, min(ty0, H - ph))
	    tx1, ty1 = tx0 + pw, ty0 + ph
	    arr[ty0:ty1, tx0:tx1] = patch
	    return {"image": Image.fromarray(arr),
	            "src_box": (sx0, sy0, sx1, sy1),
	            "dst_box": (tx0, ty0, tx1, ty1),
	            "description": f"Duplicated a {bw}x{bh}px high-variance region (likely a seal/signature) to another quadrant.",
	            "intensity": intensity}


	def tamper_text_edit(img, intensity="moderate", rng=None):
	    """Text-edit forgery: white out a field and write a new amount over it.

	    The field is located with OCR when possible; otherwise a fixed strip
	    (15%-60% of the width, starting 35% down the page) is used. The amount is
	    10,00,000 multiplied by the profile's ``amount_jump``.

	    Returns:
	        Tamper dict; ``src_box`` is ``None`` and ``dst_box`` is the
	        overwritten field.
	    """
	    # rng is accepted for a uniform tamper signature; nothing below draws from it.
	    rng = rng or random.Random()
	    canvas = img.convert("RGB").copy()
	    profile = _profile(intensity)
	    width, height = canvas.size
	    box = _find_text_box_via_ocr(canvas)
	    used_ocr = box is not None
	    if not used_ocr:
	        top = int(height * 0.35)
	        bottom = top + int(36 * profile["box_scale"])
	        box = (int(width * 0.15), top, int(width * 0.60), bottom)
	    left, upper, _right, _lower = box
	    pen = ImageDraw.Draw(canvas)
	    pen.rectangle(box, fill="white")
	    new_amount = int(10_00_000 * profile["amount_jump"])
	    new_text = f"Rs {new_amount:,}"
	    label_font = _font(int(20 * profile["box_scale"]))
	    pen.text((left + 6, upper + 4), new_text, font=label_font, fill="black")
	    method = "OCR" if used_ocr else "fallback"
	    return {
	        "image": canvas,
	        "src_box": None,
	        "dst_box": box,
	        "description": f"Located a text/amount field via {method} and rewrote it as '{new_text}'.",
	        "intensity": intensity,
	    }


	def tamper_splice(img, donor=None, intensity="moderate", rng=None):
	    """Splice forgery: paste a region of another document into the lower area.

	    Args:
	        img: PIL image to tamper with (converted to RGB and copied).
	        donor: optional PIL image to splice from, in any mode. When omitted, a
	            random PNG larger than 5000 bytes from ``sample_data/originals``
	            is used, or a flat random colour if none is available.
	        intensity: key into the intensity profiles; scales the spliced box.
	        rng: optional ``random.Random`` for reproducible choices.

	    Returns:
	        Tamper dict; ``src_box`` and ``dst_box`` are the same rectangle
	        because the donor is resized to the target and sampled in place.
	    """
	    rng = rng or random.Random()
	    img = img.convert("RGB").copy()
	    p = _profile(intensity)
	    W, H = img.size
	    if donor is None:
	        sample_dir = Path("sample_data/originals")
	        if sample_dir.exists():
	            # Sorted so a seeded rng picks the same file regardless of the
	            # order the filesystem lists the directory in.
	            cands = sorted(f for f in sample_dir.glob("*.png") if f.stat().st_size > 5000)
	            if cands:
	                # Context manager closes the file once the pixels are copied.
	                with Image.open(rng.choice(cands)) as donor_file:
	                    donor = donor_file.convert("RGB")
	    if donor is None:
	        donor = Image.fromarray(np.full((H, W, 3),
	                                        [rng.randint(180, 255), rng.randint(120, 200), rng.randint(80, 160)],
	                                        dtype=np.uint8))
	    # A caller-supplied donor may be greyscale or RGBA; force three channels
	    # so the array assignment below has matching shapes.
	    donor = donor.convert("RGB").resize((W, H))
	    arr = np.array(img)
	    darr = np.array(donor)
	    bw, bh = int(220 * p["box_scale"]), int(80 * p["box_scale"])
	    x = rng.randint(40, max(41, W - bw - 40))
	    y = rng.randint(int(H * 0.55), max(int(H * 0.55) + 1, H - bh - 40))
	    # Keep the reported box inside the image even when the image is smaller
	    # than the margins assumed above (numpy already clips the slices).
	    x, y = min(x, W), min(y, H)
	    x1, y1 = min(W, x + bw), min(H, y + bh)
	    arr[y:y1, x:x1] = darr[y:y1, x:x1]
	    return {"image": Image.fromarray(arr),
	            "src_box": (x, y, x1, y1), "dst_box": (x, y, x1, y1),
	            "description": f"Spliced a {bw}x{bh}px region from a different document into the lower area.",
	            "intensity": intensity}


	def tamper_compression(img, intensity="moderate", rng=None):
	    """Re-encode the image as JPEG at the profile's ``jpeg_q`` quality.

	    The round trip happens in memory; ``rng`` is accepted for a uniform tamper
	    signature and is not used.

	    Returns:
	        Tamper dict with both boxes ``None`` (the whole image is affected).
	    """
	    rgb = img.convert("RGB")
	    quality = _profile(intensity)["jpeg_q"]
	    stream = io.BytesIO()
	    rgb.save(stream, "JPEG", quality=quality)
	    stream.seek(0)
	    recompressed = Image.open(stream).convert("RGB")
	    return {
	        "image": recompressed,
	        "src_box": None,
	        "dst_box": None,
	        "description": f"Re-saved at JPEG quality {quality} (post-edit hide-the-evidence pattern).",
	        "intensity": intensity,
	    }


	def tamper_metadata_strip(img, intensity="moderate", rng=None):
	    """Round-trip the image through an in-memory JPEG at quality 92.

	    Only the RGB pixels are re-encoded; no metadata is handed to ``save``.
	    ``intensity`` is echoed back unchanged and ``rng`` is not used.

	    Returns:
	        Tamper dict with both boxes ``None``.
	    """
	    rgb = img.convert("RGB")
	    stream = io.BytesIO()
	    rgb.save(stream, "JPEG", quality=92)
	    stream.seek(0)
	    stripped = Image.open(stream).convert("RGB")
	    return {
	        "image": stripped,
	        "src_box": None,
	        "dst_box": None,
	        "description": "Stripped all EXIF metadata (mimics photo-editor export).",
	        "intensity": intensity,
	    }


	def tamper_custom_region(img, box, intensity="moderate", rng=None):
	    """Blur a caller-chosen region and stamp an "EDITED" label on it.

	    Args:
	        img: PIL image to tamper with (converted to RGB and copied).
	        box: four numbers ``(x0, y0, x1, y1)``; corners may be given in any
	            order and may extend past the image, the box is normalised and
	            clamped.
	        intensity: "subtle", "moderate" or "aggressive" selects the blur
	            radius (1.5, 3.0 or 6.0); anything else uses 3.0.
	        rng: accepted for a uniform tamper signature; not used.

	    Returns:
	        Tamper dict; ``dst_box`` is the normalised, clamped region.

	    Raises:
	        ValueError: if the box has no area inside the image.
	    """
	    img = img.convert("RGB").copy()
	    W, H = img.size
	    bx0, by0, bx1, by1 = [int(v) for v in box]
	    # A region dragged right-to-left or bottom-to-top arrives with swapped
	    # corners; crop() and rectangle() need x0 <= x1 and y0 <= y1.
	    x0, x1 = sorted((bx0, bx1))
	    y0, y1 = sorted((by0, by1))
	    x0, y0 = max(0, x0), max(0, y0)
	    x1, y1 = min(W, x1), min(H, y1)
	    if x1 <= x0 or y1 <= y0:
	        raise ValueError("custom_box has no area inside the image")
	    crop = img.crop((x0, y0, x1, y1))
	    radius = {"subtle": 1.5, "moderate": 3.0, "aggressive": 6.0}.get(intensity, 3.0)
	    crop = crop.filter(ImageFilter.GaussianBlur(radius=radius))
	    img.paste(crop, (x0, y0))
	    # The label sits 4px inside the region; skip it when the region is too
	    # small to hold it rather than passing inverted coordinates to PIL.
	    label_box = [x0 + 4, y0 + 4, min(x1, x0 + 60), min(y1, y0 + 24)]
	    if label_box[2] >= label_box[0] and label_box[3] >= label_box[1]:
	        d = ImageDraw.Draw(img)
	        d.rectangle(label_box, fill="white")
	        d.text((x0 + 8, y0 + 6), "EDITED", font=_font(12), fill="black")
	    return {"image": img, "src_box": None, "dst_box": (x0, y0, x1, y1),
	            "description": f"User-drawn region blurred (sigma={radius}) and overpainted - adversarial test.",
	            "intensity": intensity}


	# Catalogue of tamper modes: key -> (display label, short explanation).
	# The keys are the names tamper_dispatch accepts; insertion order is kept.
	TAMPER_FUNCTIONS = {
	    key: (label, blurb)
	    for key, label, blurb in (
	        ("copy_move", "Copy-move", "Duplicates a salient region (e.g. seal). Classic forgery."),
	        ("text_edit", "Text edit", "Whites out a value and writes a new one. Loan-document fraud."),
	        ("splice", "Splice", "Pastes a region from a different doc. Noise inconsistency."),
	        ("compression", "Re-save", "Low JPEG quality to hide tampering. ELA catches it."),
	        ("metadata", "Strip EXIF", "Removes EXIF metadata. EXIF audit catches it."),
	        ("custom", "Custom (adversarial)", "User-drawn region blurred and overpainted. Adversarial."),
	    )
	}


	def tamper_dispatch(name, img, intensity="moderate", donor=None, custom_box=None, rng=None):
	    """Run the tamper identified by ``name`` on ``img`` and return its dict.

	    ``donor`` is only passed to "splice" and ``custom_box`` only to "custom".

	    Raises:
	        ValueError: if ``name`` is "custom" without a ``custom_box``, or if
	            ``name`` is not a known tamper.
	    """
	    if name == "custom" and custom_box is None:
	        raise ValueError("custom needs custom_box")
	    # Lambdas keep each call lazy: only the selected tamper is evaluated.
	    handlers = {
	        "copy_move": lambda: tamper_copy_move(img, intensity, rng),
	        "text_edit": lambda: tamper_text_edit(img, intensity, rng),
	        "splice": lambda: tamper_splice(img, donor, intensity, rng),
	        "compression": lambda: tamper_compression(img, intensity, rng),
	        "metadata": lambda: tamper_metadata_strip(img, intensity, rng),
	        "custom": lambda: tamper_custom_region(img, custom_box, intensity, rng),
	    }
	    try:
	        handler = handlers[name]
	    except (KeyError, TypeError):
	        # TypeError covers unhashable names, which are simply "unknown".
	        handler = None
	    if handler is None:
	        raise ValueError(f"unknown tamper: {name}")
	    return handler()


	def tamper_chain(img, names, intensity="moderate", rng=None, donor=None, custom_box=None):
	    """Apply several tampers in sequence, each on the previous one's output.

	    Args:
	        img: PIL image to start from.
	        names: iterable of tamper names understood by ``tamper_dispatch``.
	        intensity: intensity passed to every step.
	        rng: optional ``random.Random`` shared by all steps.
	        donor: optional donor image forwarded to every step (used by "splice").
	        custom_box: optional box forwarded to every step; required when
	            ``names`` contains "custom", otherwise that step raises.

	    Returns:
	        Tamper dict for the final image. ``src_box`` and ``dst_box`` are the
	        last non-empty boxes reported by any step, ``description`` joins the
	        step names with " -> ", and ``steps`` lists each step's name,
	        description and ``dst_box``.

	    Raises:
	        ValueError: propagated from ``tamper_dispatch`` for an unknown name or
	            a "custom" step without ``custom_box``.
	    """
	    rng = rng or random.Random()
	    current = img.convert("RGB")
	    steps, last_src, last_dst = [], None, None
	    for n in names:
	        # donor / custom_box must be forwarded: without them a chain could
	        # never contain "custom" and "splice" could not take a chosen donor.
	        out = tamper_dispatch(n, current, intensity=intensity, donor=donor,
	                              custom_box=custom_box, rng=rng)
	        current = out["image"]
	        steps.append({"name": n, "description": out["description"], "dst_box": out["dst_box"]})
	        if out["dst_box"]:
	            last_dst = out["dst_box"]
	        if out["src_box"]:
	            last_src = out["src_box"]
	    return {"image": current, "src_box": last_src, "dst_box": last_dst,
	            "description": " -> ".join(s["name"] for s in steps),
	            "intensity": intensity, "steps": steps}


	def annotate_before_after(orig_img, tamper_meta, box_width=4):
	    """Return annotated copies of the original and the tampered image.

	    The original gets a green "SOURCE" frame around ``src_box`` and the
	    tampered image a red "TAMPERED" frame around ``dst_box``; a missing or
	    empty box leaves that image unmarked. Neither input image is modified.

	    Returns:
	        ``(annotated_original, annotated_tampered)`` as RGB images.
	    """
	    def _mark(canvas, box, colour, tag_width, tag):
	        # Frame the box, then put a filled tag strip with white text just
	        # above its top-left corner (kept on-canvas when the box is at y=0).
	        pen = ImageDraw.Draw(canvas)
	        x0, y0, x1, y1 = box
	        pen.rectangle(box, outline=colour, width=box_width)
	        pen.rectangle([x0, max(0, y0 - 20), x0 + tag_width, y0], fill=colour)
	        pen.text((x0 + 4, max(0, y0 - 18)), tag, font=_font(12), fill="white")

	    before = orig_img.convert("RGB").copy()
	    after = tamper_meta["image"].convert("RGB").copy()
	    src_box = tamper_meta.get("src_box")
	    if src_box:
	        _mark(before, src_box, (0, 200, 0), 90, "SOURCE")
	    dst_box = tamper_meta.get("dst_box")
	    if dst_box:
	        _mark(after, dst_box, (220, 30, 30), 110, "TAMPERED")
	    return before, after


	def overlay_heatmap_on_image(base_img, heat_2d, alpha=0.55, cmap="hot"):
	    """Blend a colour-mapped heat map over an image.

	    Args:
	        base_img: PIL image to draw on (not modified).
	        heat_2d: 2-D array-like of scores; it is min-max normalised, so only
	            relative values matter. A constant map produces no overlay.
	        alpha: opacity of the hottest pixels; cooler pixels are
	            proportionally more transparent.
	        cmap: matplotlib colormap name.

	    Returns:
	        RGB PIL image the same size as ``base_img``.
	    """
	    # Use the modern colormap registry (matplotlib >= 3.5),
	    # fall back to the deprecated cm.get_cmap on older versions.
	    try:
	        import matplotlib as mpl
	        cmap_fn = mpl.colormaps[cmap]
	    except (AttributeError, KeyError):
	        import matplotlib.cm as cm
	        cmap_fn = cm.get_cmap(cmap)
	    base = base_img.convert("RGBA")
	    W, H = base.size
	    h = np.asarray(heat_2d, dtype=np.float32)
	    # Normalise unconditionally so maps whose maximum is <= 0 are rescaled
	    # too; the epsilon keeps a constant map at 0 instead of dividing by zero.
	    lo, hi = float(h.min()), float(h.max())
	    h = ((h - lo) / (hi - lo + 1e-9)).astype(np.float32)
	    h_resized = cv2.resize(h, (W, H), interpolation=cv2.INTER_CUBIC)
	    # Cubic interpolation overshoots slightly outside [0, 1]; unclipped, the
	    # negative values wrap around in the uint8 alpha channel and show up as
	    # opaque specks in cold areas.
	    h_resized = np.clip(h_resized, 0.0, 1.0)
	    rgba = (cmap_fn(h_resized) * 255).astype(np.uint8)
	    rgba[..., 3] = np.clip(h_resized * 255 * alpha, 0, 255).astype(np.uint8)
	    # Mode is inferred as RGBA from the (H, W, 4) uint8 array.
	    overlay = Image.fromarray(rgba)
	    return Image.alpha_composite(base, overlay).convert("RGB")


	def detector_scorecard(image_path):
	    """Run every forensic detector on an image and summarise the results.

	    Args:
	        image_path: passed unchanged to each ``forensics`` function.

	    Returns:
	        Dict keyed by detector name ("ELA", "Copy-move (ORB)",
	        "Noise inconsistency", "EXIF metadata" and, when the model produces a
	        result, "Random Forest"). Each value holds ``score`` (raw value
	        scaled and capped at 1.0), ``raw``, ``what`` (a short label) and
	        ``caught`` (``score >= 0.4``).

	    NOTE(review): the return shapes of the ``forensics`` functions are
	    assumed from how they are unpacked here -- confirm against that module.
	    """
	    import logging
	    import forensics
	    scores = {}
	    _, ela_score = forensics.error_level_analysis(image_path)
	    scores["ELA"] = {"score": min(ela_score / 25.0, 1.0), "raw": round(ela_score, 2),
	                     "what": "JPEG re-save artefacts"}
	    _, cm_count, _ = forensics.copy_move_detect(image_path)
	    scores["Copy-move (ORB)"] = {"score": min(cm_count / 50.0, 1.0), "raw": cm_count,
	                                 "what": "Duplicated regions"}
	    _, noise_ratio = forensics.noise_inconsistency(image_path)
	    scores["Noise inconsistency"] = {"score": min(noise_ratio * 4, 1.0),
	                                     "raw": round(noise_ratio, 3),
	                                     "what": "Splicing / region mismatch"}
	    exif_flags = forensics.exif_sanity(image_path)
	    scores["EXIF metadata"] = {"score": 0.0 if exif_flags == ["exif clean"] else 0.6,
	                               "raw": "; ".join(exif_flags),
	                               "what": "Edit-tool fingerprints"}
	    try:
	        ml = forensics.predict_with_model(image_path)
	        if ml is not None:
	            scores["Random Forest"] = {"score": ml["tamper_probability"],
	                                       "raw": ml["verdict"],
	                                       "what": "Learned forensic-feature blend"}
	    except Exception:
	        # The learned model is best-effort: the scorecard is still useful
	        # without it, but record the failure instead of hiding it.
	        logging.getLogger(__name__).warning(
	            "Random Forest detector skipped for %s", image_path, exc_info=True)
	    for v in scores.values():
	        v["caught"] = v["score"] >= 0.4
	    return scores