# deepshield/models/heatmap_generator.py
# Grad-CAM++ heatmap and activation-box overlay generation for DeepShield's
# ViT, EfficientNet, and DenseNet deepfake-detection models.
from __future__ import annotations
import base64
import io
from typing import Literal, Optional
import cv2
import numpy as np
import torch
from loguru import logger
from PIL import Image
from pytorch_grad_cam import GradCAMPlusPlus
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
from config import settings
from models.model_loader import get_model_loader
class _HFLogitsWrapper(torch.nn.Module):
"""Wrap a HuggingFace image classification model so forward() returns logits
as a plain tensor (pytorch_grad_cam expects tensor outputs, not dicts/dataclasses).
"""
def __init__(self, model: torch.nn.Module) -> None:
super().__init__()
self.model = model
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: # type: ignore[override]
return self.model(pixel_values=pixel_values).logits
def _vit_reshape_transform(tensor: torch.Tensor, height: int = 14, width: int = 14) -> torch.Tensor:
"""Grad-CAM expects (B, C, H, W); ViT hidden states are (B, 1+H*W, C).
Drop the CLS token and reshape tokens into a spatial grid.
"""
result = tensor[:, 1:, :]
b, n, c = result.shape
result = result.reshape(b, height, width, c)
result = result.permute(0, 3, 1, 2) # (B, C, H, W)
return result
def _find_class_index(model: torch.nn.Module, label_tokens: tuple[str, ...]) -> Optional[int]:
"""Find the first class index whose label contains one of `label_tokens`."""
id2label: dict[int, str] = getattr(getattr(model, "config", None), "id2label", {}) or {}
for idx, label in id2label.items():
lowered = str(label).lower()
if any(token in lowered for token in label_tokens):
return int(idx)
return None
def _preprocess_for_cam(pil_img: Image.Image, processor) -> tuple[torch.Tensor, np.ndarray]:
    """Run the HF processor on `pil_img` and build a matching RGB float canvas.

    Returns (input_tensor, rgb_float): the model-ready pixel batch moved to
    `settings.DEVICE`, and an (H, W, 3) float32 array in [0, 1] resized to the
    model input geometry — needed later for overlay composition.
    """
    processed = processor(images=pil_img, return_tensors="pt")
    input_tensor = processed["pixel_values"].to(settings.DEVICE)

    # Processors usually expose `size` as a dict; anything else falls back to 224.
    size_cfg = getattr(processor, "size", {"height": 224, "width": 224})
    if isinstance(size_cfg, dict):
        height = size_cfg.get("height", 224)
        width = size_cfg.get("width", 224)
    else:
        height = width = 224

    canvas = pil_img.resize((width, height), Image.BILINEAR)
    rgb_float = np.array(canvas).astype(np.float32) / 255.0  # (H, W, 3) in [0, 1]
    return input_tensor, rgb_float
def _encode_overlay_to_base64(overlay: np.ndarray) -> str:
    """Serialize a uint8 RGB/RGBA overlay as a PNG data URL string."""
    png_buffer = io.BytesIO()
    Image.fromarray(overlay).save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue()).decode("ascii")
    return "data:image/png;base64," + encoded
def _compute_gradcam_pp(
    pil_img: Image.Image,
    target_class_idx: Optional[int] = None,
) -> tuple[np.ndarray, np.ndarray]:
    """Run Grad-CAM++ on the ViT image model, averaging up to 3 trailing layers.

    Returns (grayscale_cam, rgb_float): a (H, W) activation map in [0, 1] plus
    the model-resolution RGB float image for overlay composition.
    """
    loader = get_model_loader()
    model, processor = loader.load_image_model()
    model.eval()
    # Grad-CAM needs gradients through the frozen model even without training.
    for param in model.parameters():
        param.requires_grad_(True)

    input_tensor, rgb_float = _preprocess_for_cam(pil_img, processor)
    grid = int(model.config.image_size / model.config.patch_size)

    # Averaging the last few encoder layers gives smoother maps than a single
    # layer; layernorm_before is the usual ViT hook point for CAM.
    encoder_layers = model.vit.encoder.layer
    last_n = min(3, len(encoder_layers))
    target_layers = [encoder_layers[-(offset + 1)].layernorm_before for offset in range(last_n)]

    wrapped = _HFLogitsWrapper(model)

    # Prefer the caller-supplied class; otherwise look for a "fake"-like label.
    if target_class_idx is None:
        target_class_idx = _find_class_index(
            model,
            ("fake", "deepfake", "manipulated", "ai", "generated", "synthetic"),
        )
    if target_class_idx is not None:
        targets = [ClassifierOutputTarget(int(target_class_idx))]
    else:
        targets = None

    with GradCAMPlusPlus(
        model=wrapped,
        target_layers=target_layers,
        reshape_transform=lambda t: _vit_reshape_transform(t, grid, grid),
    ) as cam:
        grayscale_cam = cam(input_tensor=input_tensor, targets=targets)[0]  # (H, W) in [0, 1]
    return grayscale_cam, rgb_float
def _face_bbox_from_detections(frame_data: dict, orig_h: int, orig_w: int) -> Optional[tuple[int,int,int,int]]:
"""Extract (ymin, xmin, ymax, xmax) in pixel coords from BlazeFace frame_data."""
detections = frame_data.get("detections", [])
if len(detections) == 0:
return None
d = detections[0] # first (highest-confidence) face
ymin = int(max(0, d[0]))
xmin = int(max(0, d[1]))
ymax = int(min(orig_h, d[2]))
xmax = int(min(orig_w, d[3]))
if ymax <= ymin or xmax <= xmin:
return None
return ymin, xmin, ymax, xmax
def _compute_gradcam_pp_efficientnet(
    pil_img: Image.Image,
) -> tuple[np.ndarray, Optional[tuple[int,int,int,int]], Literal["attention", "gradcam++"]]:
    """Grad-CAM++ for EfficientNetAutoAttB4.

    Detects a face with the model's face extractor, then runs Grad-CAM++ on the
    last EfficientNet block over the 224x224 face crop.

    Returns (grayscale_cam_224, face_bbox_pixels_or_None, heatmap_source).
    grayscale_cam_224 is in the 224x224 coordinate space of the face crop.
    face_bbox_pixels is (ymin, xmin, ymax, xmax) in original image pixels.

    Raises:
        RuntimeError: when the EfficientNet model is not loaded.
        ValueError: ("no_face") when no face is detected in the image.
    """
    loader = get_model_loader()
    eff = loader.load_efficientnet()
    if eff is None:
        raise RuntimeError("EfficientNet not loaded")
    if pil_img.mode != "RGB":
        pil_img = pil_img.convert("RGB")
    img_np = np.array(pil_img)
    orig_h, orig_w = img_np.shape[:2]
    # Face detection; frame_data holds both the cropped faces and raw detections.
    frame_data = eff.face_extractor.process_image(img=img_np)
    faces: list = frame_data.get("faces", [])
    if not faces:
        raise ValueError("no_face")
    face_bbox = _face_bbox_from_detections(frame_data, orig_h, orig_w)
    # NOTE(review): relies on the private _face_tensor preprocessing helper of
    # the EfficientNet wrapper — confirm it stays stable across model updates.
    face_t = eff._face_tensor(faces[0]).unsqueeze(0).to(eff.device)
    try:
        net = eff.net
        # Last block of the EfficientNet backbone = highest-level spatial features.
        target_layers = [net.efficientnet._blocks[-1]]
        # Enable gradients on input and weights so Grad-CAM can backprop.
        face_t.requires_grad_(True)
        for p in net.parameters():
            p.requires_grad_(True)
        with GradCAMPlusPlus(model=net, target_layers=target_layers) as cam:
            grayscale_cam = cam(input_tensor=face_t, targets=None)[0]
        return grayscale_cam, face_bbox, "gradcam++"
    except Exception as e:
        # Best-effort fallback: a flat 0.5 map keeps the caller's overlay
        # pipeline alive instead of propagating the CAM failure.
        logger.warning(f"EfficientNet Grad-CAM++ failed ({e}), using uniform fallback")
        grayscale_cam = np.ones((224, 224), dtype=np.float32) * 0.5
        return grayscale_cam, face_bbox, "gradcam++"
def _cam_to_full_image(
    grayscale_cam: np.ndarray,
    pil_img: Image.Image,
    face_bbox: Optional[tuple[int,int,int,int]] = None,
) -> tuple[np.ndarray, np.ndarray]:
    """Project a grayscale cam onto the original image's pixel grid.

    With `face_bbox` (EfficientNet path) the face-crop cam is placed inside the
    box and the background stays 0. Without a bbox (ViT path) the cam is simply
    resized to the full image.

    Returns (cam_full [H,W] float32, orig_np [H,W,3] float32 in [0,1]).
    """
    orig_w, orig_h = pil_img.size
    orig_np = np.array(pil_img.convert("RGB")).astype(np.float32) / 255.0
    if face_bbox is None:
        # Full-image cam: stretch to the original dimensions.
        cam_full = cv2.resize(grayscale_cam, (orig_w, orig_h), interpolation=cv2.INTER_LINEAR)
    else:
        ymin, xmin, ymax, xmax = face_bbox
        box_h, box_w = ymax - ymin, xmax - xmin
        cam_full = np.zeros((orig_h, orig_w), dtype=np.float32)
        cam_full[ymin:ymax, xmin:xmax] = cv2.resize(
            grayscale_cam, (box_w, box_h), interpolation=cv2.INTER_LINEAR
        )
    return cam_full, orig_np
def _compute_gradcam_pp_densenet(
    pil_img: Image.Image,
) -> tuple[np.ndarray, str]:
    """Grad-CAM++ on the DenseNet121 face-GAN model.

    The model emits a single logit where higher means "more real" (fake
    probability = sigmoid(-logit)), so Grad-CAM targets the negated logit to
    make gradients highlight fake evidence. Target layer is features.norm5,
    the final BN after the last DenseBlock (7x7x1024 activation map).

    Returns (grayscale_cam, source_tag).

    Raises:
        RuntimeError: when the DenseNet model is unavailable.
    """
    loader = get_model_loader()
    result = loader.load_densenet()
    if result is None:
        raise RuntimeError("DenseNet model unavailable")
    model, meta = result

    from services.densenet_service import _preprocess

    image_size = int(meta.get("image_size", 224))
    input_tensor = _preprocess(pil_img, image_size, settings.DEVICE)
    model.eval()
    # Grad-CAM needs gradients through the frozen weights.
    for param in model.parameters():
        param.requires_grad_(True)

    # Final BN after all DenseBlocks (Keras equivalent: conv5_block16_concat).
    target_layers = [model.features.norm5]

    class _NegatedLogitWrapper(torch.nn.Module):
        # Flips the logit sign so Grad-CAM gradients point at fake evidence.
        def __init__(self, m: torch.nn.Module) -> None:
            super().__init__()
            self.m = m

        def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore[override]
            return -self.m(x)

    wrapped = _NegatedLogitWrapper(model)
    with GradCAMPlusPlus(model=wrapped, target_layers=target_layers) as cam:
        grayscale_cam = cam(input_tensor=input_tensor, targets=None)[0]  # (H, W) in [0, 1]
    return grayscale_cam, "gradcam++_densenet"
def _vit_fallback_cam(
    pil_img: Image.Image,
    target_class_idx: Optional[int],
) -> Optional[tuple[np.ndarray, np.ndarray]]:
    """Best-effort ViT Grad-CAM++ used when a specialised model's heatmap fails.

    Returns (cam_full, orig_np) at original image resolution, or None when the
    fallback itself fails (the failure is logged, never raised).
    """
    try:
        grayscale_cam, _ = _compute_gradcam_pp(pil_img, target_class_idx)
        return _cam_to_full_image(grayscale_cam, pil_img, None)
    except Exception as fe:
        logger.warning(f"ViT fallback heatmap also failed: {fe}")
        return None


def generate_heatmap_base64(
    pil_img: Image.Image,
    target_class_idx: Optional[int] = None,
    model_family: Literal["vit", "efficientnet", "densenet"] = "vit",
) -> tuple[str, str]:
    """Produce a base64 data-URL PNG of the Grad-CAM++ overlay at original image resolution.

    Args:
        pil_img: input image.
        target_class_idx: class index to explain; auto-detected when None.
        model_family: which model's cam to compute; "densenet" and
            "efficientnet" fall back to the ViT cam on failure / missing face.

    Returns (base64_png, heatmap_source); ("", "none") or ("", "fallback")
    when no heatmap could be produced.
    """
    if model_family == "densenet":
        try:
            grayscale_cam, source = _compute_gradcam_pp_densenet(pil_img)
            cam_full, orig_np = _cam_to_full_image(grayscale_cam, pil_img, None)
        except Exception as e:
            logger.warning(f"DenseNet heatmap failed ({e}) — falling back to ViT Grad-CAM++")
            fallback = _vit_fallback_cam(pil_img, target_class_idx)
            if fallback is None:
                return "", "none"
            cam_full, orig_np = fallback
            source = "vit_fallback"
    elif model_family == "efficientnet":
        try:
            grayscale_cam, face_bbox, source = _compute_gradcam_pp_efficientnet(pil_img)
            cam_full, orig_np = _cam_to_full_image(grayscale_cam, pil_img, face_bbox)
        except ValueError:
            # BlazeFace found no face — fall back to ViT Grad-CAM on the full image.
            logger.info("EfficientNet heatmap: no face detected — falling back to ViT Grad-CAM++")
            fallback = _vit_fallback_cam(pil_img, target_class_idx)
            if fallback is None:
                return "", "none"
            cam_full, orig_np = fallback
            source = "vit_fallback"
        except Exception as e:
            logger.warning(f"EfficientNet heatmap failed: {e}")
            return "", "fallback"
    else:
        grayscale_cam, _ = _compute_gradcam_pp(pil_img, target_class_idx)
        source = "gradcam++"
        cam_full, orig_np = _cam_to_full_image(grayscale_cam, pil_img, None)

    # Transparent RGBA overlay so CSS can blend it without darkening the base
    # image: JET colormap gives the hue, cam intensity (boosted 1.8x) the alpha.
    heatmap_colored = cv2.applyColorMap(np.uint8(255 * cam_full), cv2.COLORMAP_JET)
    heatmap_colored = cv2.cvtColor(heatmap_colored, cv2.COLOR_BGR2RGB)
    alpha = np.clip(cam_full * 1.8 * 255, 0, 255).astype(np.uint8)
    overlay_rgba = np.dstack((heatmap_colored, alpha))
    logger.info(f"Heatmap generated ({overlay_rgba.shape[1]}x{overlay_rgba.shape[0]}) source={source}")
    return _encode_overlay_to_base64(overlay_rgba), source
def generate_boxes_base64(
    pil_img: Image.Image,
    target_class_idx: Optional[int] = None,
    top_k: int = 5,
    threshold: float = 0.4,
) -> str:
    """Draw Grad-CAM++ activation bounding boxes on the full original image.

    Computes the ViT cam (full-image coverage), resizes it to the original
    resolution, thresholds it, and draws up to `top_k` boxes around the
    largest activated regions, colour-coded by mean activation strength.
    """
    grayscale_cam, _ = _compute_gradcam_pp(pil_img, target_class_idx)

    # The original image is the drawing canvas — resize the cam to match it.
    orig_w, orig_h = pil_img.size
    canvas = np.array(pil_img.convert("RGB")).copy()
    cam_full = cv2.resize(grayscale_cam, (orig_w, orig_h), interpolation=cv2.INTER_LINEAR)

    mask = (cam_full >= threshold).astype(np.uint8) * 255
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        logger.info("No significant activation regions found for bounding boxes")
        return _encode_overlay_to_base64(canvas)

    # Keep only the top_k largest regions.
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:top_k]

    # Scale drawing parameters with the image resolution.
    line_w = max(2, orig_w // 300)
    font_scale = max(0.5, orig_w / 1200)

    for contour in contours:
        x, y, bw, bh = cv2.boundingRect(contour)
        strength = cam_full[y:y + bh, x:x + bw].mean()
        if strength >= 0.7:
            color = (220, 40, 40)    # red: strong activation
        elif strength >= 0.5:
            color = (240, 140, 20)   # orange: medium
        else:
            color = (230, 200, 40)   # yellow: weak
        cv2.rectangle(canvas, (x, y), (x + bw, y + bh), color, line_w)
        label = f"{strength * 100:.0f}%"
        cv2.putText(canvas, label, (x, max(y - 6, 14)),
                    cv2.FONT_HERSHEY_SIMPLEX, font_scale, color, line_w, cv2.LINE_AA)

    logger.info(f"Bounding boxes generated: {len(contours)} regions on {orig_w}x{orig_h} image")
    return _encode_overlay_to_base64(canvas)