Spaces:

Simma7
/

deepfake_gaurd

Sleeping

App Files Files Community

deepfake_gaurd / prog /document.py

Simma7

Create document.py

acf615d verified about 1 month ago

raw

history blame contribute delete

7.61 kB


	import os
	import re
	import numpy as np
	from PIL import Image


	def _check_visual_noise(pil_img: Image.Image) -> tuple:
	    """
	    Score copy-paste artefacts from the spread of local pixel variance.

	    The image is converted to greyscale and cut into non-overlapping
	    32x32 tiles. The pixel variance of every tile is computed, and the
	    coefficient of variation (std / mean) of those variances is mapped
	    onto a 0-1 score: the more the tile variances differ from one
	    another, the higher the score.

	    Args:
	        pil_img: Image to analyse. Only ``convert("L")`` is called on it.

	    Returns:
	        ``(score, detail)`` where ``score`` is a float in [0, 1] and
	        ``detail`` is a human-readable explanation. Images smaller than
	        one tile in either dimension yield ``0.0`` and a "too small"
	        message.
	    """
	    tile = 32
	    gray = np.asarray(pil_img.convert("L"), dtype=np.float32)
	    height, width = gray.shape

	    # Count only complete tiles; a partial strip on the right/bottom edge
	    # is ignored, but the last *full* row and column are included.
	    rows, cols = height // tile, width // tile
	    if rows == 0 or cols == 0:
	        return 0.0, "Could not analyse noise (image too small)"

	    # Reshape the cropped image into a (rows, tile, cols, tile) grid so the
	    # variance of every tile is computed in one vectorised call.
	    grid = gray[: rows * tile, : cols * tile].reshape(rows, tile, cols, tile)
	    variances = grid.var(axis=(1, 3), dtype=np.float64)

	    # High coefficient of variation → suspicious variance jumps.
	    # The epsilon keeps a perfectly flat image from dividing by zero.
	    coef_var = float(variances.std() / (variances.mean() + 1e-8))

	    # Thresholds tuned on document images
	    if coef_var > 3.5:
	        score = min(1.0, (coef_var - 3.5) / 4.0)
	        detail = f"High variance inconsistency (CV={coef_var:.2f}) — possible copy-paste region"
	    elif coef_var > 2.0:
	        score = (coef_var - 2.0) / 1.5 * 0.5
	        detail = f"Moderate variance anomaly (CV={coef_var:.2f})"
	    else:
	        score = 0.0
	        detail = f"Noise pattern normal (CV={coef_var:.2f})"

	    return score, detail


	def _parse_pdf_date(raw: str):
	    """
	    Parse a PDF date string of the form ``D:YYYYMMDDHHmmSSOHH'mm'``.

	    Everything after the year is optional; missing fields default to the
	    start of the period (month/day 1, time 00:00:00).

	    Returns:
	        ``(local, utc, precision)`` where ``local`` is the wall-clock
	        datetime as written, ``utc`` is the same instant shifted by the
	        stated offset (``None`` when the string carries no ``Z``/offset),
	        and ``precision`` is how many of the six date/time fields were
	        actually present. Returns ``None`` if the string is not a valid
	        PDF date.
	    """
	    from datetime import datetime, timedelta

	    match = re.match(
	        r"(?:D:)?(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?"
	        r"(?:(Z)|([+-])(\d{2})'?(\d{2})?'?)?",
	        raw.strip(),
	    )
	    if match is None:
	        return None

	    given = [int(part) for part in match.groups()[:6] if part is not None]
	    defaults = [1, 1, 1, 0, 0, 0]
	    try:
	        local = datetime(*(given + defaults[len(given):]))
	        utc = None
	        if match.group(7):
	            utc = local
	        elif match.group(8):
	            minutes = int(match.group(9)) * 60 + int(match.group(10) or 0)
	            shift = timedelta(minutes=minutes)
	            # "+HH'mm'" means local time is ahead of UT, so subtract.
	            utc = local - shift if match.group(8) == "+" else local + shift
	    except (ValueError, OverflowError):
	        # Out-of-range field (e.g. month 13) or a date outside datetime's range.
	        return None
	    return local, utc, len(given)


	def _check_pdf_metadata(path: str) -> tuple:
	    """
	    Check a PDF's document-information dictionary for forgery signals.

	    Signals and their score contributions:
	    - Creator/producer pairs from different tool suites (+0.40)
	    - Modification date earlier than creation date (+0.35)
	    - Neither creator nor producer present (+0.20)

	    Args:
	        path: Filesystem path of the PDF.

	    Returns:
	        ``(score, detail)`` with ``score`` capped at 1.0. Any failure
	        (PyPDF2 missing, unreadable file, ...) is reported as score 0.0
	        with a "PDF metadata check skipped" detail rather than raised.
	    """
	    try:
	        import PyPDF2

	        wanted = ("/Creator", "/Producer", "/CreationDate", "/ModDate")
	        with open(path, "rb") as f:
	            meta = PyPDF2.PdfReader(f).metadata or {}
	            # Read the values while the file is still open.
	            raw = {key: meta.get(key) for key in wanted}

	        # ``or ""`` keeps an explicit None from turning into the text "None".
	        creator = str(raw["/Creator"] or "").lower()
	        producer = str(raw["/Producer"] or "").lower()
	        created = str(raw["/CreationDate"] or "")
	        modified = str(raw["/ModDate"] or "")

	        signals = []
	        score = 0.0

	        # Check 1: creator and producer mismatch (strong forgery signal)
	        if creator and producer:
	            known_suites = [
	                ("microsoft", "libreoffice"), ("libreoffice", "adobe"),
	                ("adobe", "libreoffice"), ("word", "ghostscript"),
	            ]
	            if any(c in creator and p in producer for c, p in known_suites):
	                signals.append(f"Creator/producer mismatch: '{creator}' vs '{producer}'")
	                score += 0.4

	        # Check 2: modification predates creation
	        created_at = _parse_pdf_date(created)
	        modified_at = _parse_pdf_date(modified)
	        if created_at is not None and modified_at is not None:
	            c_local, c_utc, c_precision = created_at
	            m_local, m_utc, m_precision = modified_at
	            # Compare true instants only when both dates state an offset;
	            # otherwise fall back to the wall-clock values as written.
	            if c_utc is not None and m_utc is not None:
	                c_when, m_when = c_utc, m_utc
	            else:
	                c_when, m_when = c_local, m_local
	            # Compare only down to the coarser of the two precisions so a
	            # year-only date is never "earlier" than a full timestamp
	            # within that same year.
	            precision = min(c_precision, m_precision)
	            if m_when.timetuple()[:precision] < c_when.timetuple()[:precision]:
	                signals.append(f"ModDate ({modified}) predates CreationDate ({created})")
	                score += 0.35

	        # Check 3: no standard metadata at all
	        if not creator and not producer:
	            signals.append("No creator/producer metadata — stripped or generated programmatically")
	            score += 0.2

	        score = min(1.0, score)
	        detail = "; ".join(signals) if signals else "PDF metadata appears normal"
	        return score, detail

	    except Exception as e:
	        # Deliberate best-effort: a failed metadata read must not abort
	        # the overall analysis.
	        return 0.0, f"PDF metadata check skipped: {e}"


	def _check_text_consistency(pil_img: Image.Image) -> tuple:
	    """
	    Score how much OCR word-box heights vary across the image.

	    pytesseract's ``image_to_data`` is run on the image; the heights of
	    boxes with confidence above 60 and height above 5 are collected, and
	    the coefficient of variation (std / mean) of those heights is mapped
	    onto a 0-1 score.

	    Args:
	        pil_img: Image passed straight to ``pytesseract.image_to_data``.

	    Returns:
	        ``(score, detail)``. Fewer than five usable boxes gives score 0.0
	        with an "Insufficient text regions" detail. Any failure
	        (pytesseract missing, OCR error, ...) is reported as score 0.0
	        with an "OCR check skipped" detail rather than raised.
	    """
	    try:
	        import pytesseract

	        data = pytesseract.image_to_data(
	            pil_img, output_type=pytesseract.Output.DICT
	        )

	        heights = []
	        for box_height, conf in zip(data["height"], data["conf"]):
	            # Confidence may arrive as int, float or numeric string
	            # depending on the pytesseract version; normalise it so the
	            # comparison cannot raise and silently disable the check.
	            try:
	                confidence = float(conf)
	            except (TypeError, ValueError):
	                continue
	            if confidence > 60 and box_height > 5:
	                heights.append(box_height)

	        if len(heights) < 5:
	            return 0.0, "Insufficient text regions for OCR analysis"

	        heights = np.array(heights, dtype=float)
	        cv = float(np.std(heights) / (np.mean(heights) + 1e-8))

	        if cv > 0.6:
	            score = min(1.0, (cv - 0.6) / 0.6)
	            detail = f"High font size variance (CV={cv:.2f}) — inconsistent text insertion likely"
	        elif cv > 0.35:
	            score = (cv - 0.35) / 0.25 * 0.4
	            detail = f"Moderate font inconsistency (CV={cv:.2f})"
	        else:
	            score = 0.0
	            detail = f"Text layout appears consistent (CV={cv:.2f})"

	        return score, detail

	    except Exception as e:
	        # Deliberate best-effort: OCR is optional and must not abort the
	        # overall analysis.
	        return 0.0, f"OCR check skipped ({e})"


	def _render_pdf_page(path: str) -> Image.Image:
	    """
	    Render the first page of a PDF as an RGB PIL image.

	    Uses PyMuPDF (``fitz``) at 2x scale. If PyMuPDF is not installed the
	    file is handed to ``Image.open`` instead, which only succeeds when
	    the file is in a format PIL can read.

	    Args:
	        path: Filesystem path of the PDF.

	    Returns:
	        The rendered page as an RGB image.

	    Raises:
	        ValueError: If the document contains no pages.
	        Exception: Whatever PyMuPDF or PIL raise for unreadable input.
	    """
	    try:
	        import fitz  # PyMuPDF
	    except ImportError:
	        # Fallback if PyMuPDF not installed — load as image directly.
	        # convert() returns an independent copy, so the source handle
	        # can be closed immediately.
	        with Image.open(path) as source:
	            return source.convert("RGB")

	    doc = fitz.open(path)
	    try:
	        if len(doc) == 0:
	            raise ValueError("PDF has no pages to render")
	        mat = fitz.Matrix(2, 2)  # 2x scale for better OCR
	        # alpha=False guarantees 3 bytes per pixel, matching mode "RGB".
	        pix = doc[0].get_pixmap(matrix=mat, alpha=False)
	        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
	    finally:
	        # Close the document even when rendering fails.
	        doc.close()


	def _threat_level(score: float):
	    """
	    Translate a fused risk score into a ``(threat, action)`` label pair.

	    Bands are checked in ascending order and each upper bound is
	    exclusive: below 0.20, below 0.45, below 0.70, and everything else.
	    """
	    bands = (
	        (0.20, ("NONE", " ALLOW")),
	        (0.45, ("LOW", " LOG")),
	        (0.70, ("MEDIUM", "ALERT")),
	    )
	    for upper_bound, labels in bands:
	        if score < upper_bound:
	            return labels
	    # Nothing matched: the score is at or above the last bound.
	    return "HIGH", " BLOCK"


	def detect_document(file_path: str) -> str:
	    """
	    Main entry point. Accepts image files (JPG/PNG) or PDF.

	    A file whose extension is ``.pdf`` (case-insensitive) has its first
	    page rendered and its metadata checked; anything else is opened as
	    an image. The noise, OCR and (for PDFs) metadata scores are combined
	    into one weighted risk score; a score of 0.40 or more is reported
	    as forged.

	    Args:
	        file_path: Filesystem path of the document to analyse.

	    Returns:
	        A formatted forensic report. Failures are never raised; they
	        are returned as a "Document analysis error" string.
	    """
	    try:
	        is_pdf = os.path.splitext(file_path)[1].lower() == ".pdf"
	        meta_score, meta_detail = 0.0, "N/A (not a PDF)"

	        if is_pdf:
	            pil_img = _render_pdf_page(file_path)
	            meta_score, meta_detail = _check_pdf_metadata(file_path)
	        else:
	            # convert() yields an independent in-memory copy, so the file
	            # handle can be released as soon as the block exits.
	            with Image.open(file_path) as source:
	                pil_img = source.convert("RGB")

	        noise_score, noise_detail = _check_visual_noise(pil_img)
	        text_score, text_detail = _check_text_consistency(pil_img)

	        if is_pdf:
	            # All three checks relevant for PDFs
	            fused = noise_score * 0.35 + text_score * 0.35 + meta_score * 0.30
	        else:
	            # Only visual checks for images (no metadata)
	            fused = noise_score * 0.55 + text_score * 0.45

	        forged = fused >= 0.40
	        threat, action = _threat_level(fused)

	        filled = int(fused * 20)
	        bars = "█" * filled + "░" * (20 - filled)

	        # Build the report line by line so its layout does not depend on
	        # how this source file happens to be indented.
	        rule = "─" * 40
	        lines = [
	            "DOCUMENT FORENSIC REPORT",
	            "=" * 40,
	            "",
	            f"Verdict : {' FORGED' if forged else 'AUTHENTIC'}",
	            f"Risk Score : {fused:.2%} [{bars}]",
	            f"Threat : {threat}",
	            f"Action : {action}",
	            "",
	            rule,
	            "FORENSIC CHECKS",
	            rule,
	            "",
	            "Visual Noise Analysis",
	            f"Score : {noise_score:.2%}",
	            f"Detail : {noise_detail}",
	            "",
	            "Text/Font Consistency (OCR)",
	            f"Score : {text_score:.2%}",
	            f"Detail : {text_detail}",
	            "",
	            "PDF Metadata Integrity",
	            f"Score : {meta_score:.2%}",
	            f"Detail : {meta_detail}",
	            "",
	            rule,
	            (
	                " FORGERY INDICATORS DETECTED. Recommend human review."
	                if forged
	                else "✅ No significant forgery indicators found."
	            ),
	        ]
	        return "\n".join(lines)

	    except Exception as e:
	        # Top-level boundary: callers always get a string back.
	        return f" Document analysis error: {e}"