Spaces:
Running
Running
| import sys | |
| from pathlib import Path | |
| # 1. Injeta o caminho ANTES de qualquer import local ou externo | |
| ROOT_DIR = Path(__file__).resolve().parent.parent | |
| CLIP_SURGERY_PATH = str(ROOT_DIR / "CLIP_Surgery") | |
| if CLIP_SURGERY_PATH not in sys.path: | |
| sys.path.insert(0, CLIP_SURGERY_PATH) | |
| import clip as clip_surgery | |
| import cv2 | |
| import mediapipe.python.solutions.face_mesh as mp_face_mesh | |
| import numpy as np | |
| import torch | |
| from PIL import Image | |
| from segmenter import FaceSegmenter | |
| from huggingface_hub import hf_hub_download | |
| from torchvision import transforms | |
| from torchvision.transforms import InterpolationMode | |
| from config import ( | |
| CLIP_MEAN, | |
| CLIP_STD, | |
| DEVICE, | |
| FAKE_PROMPT_KEYWORDS, | |
| REAL_PROMPTS, | |
| SURGERY_PROMPTS, | |
| SURGERY_RES, | |
| ) | |
| # ════════════════════════════════════════════════════════════ | |
| # MEDIAPIPE — Máscara facial + Regiões | |
| # ════════════════════════════════════════════════════════════ | |
| _face_mesh = None | |
| _segmenter_instance = None | |
| def get_segmenter(): | |
| """Lazy loading rigoroso do BiSeNet. Só carrega quando chamado a primeira vez.""" | |
| global _segmenter_instance | |
| if _segmenter_instance is None: | |
| print(" A instanciar BiSeNet FaceSegmenter...") | |
| model_path = hf_hub_download(repo_id="liamu/Deepfake-Pesos", filename="79999_iter.pth") | |
| _segmenter_instance = FaceSegmenter(model_path=model_path, device=DEVICE) | |
| print(" BiSeNet pronto.") | |
| return _segmenter_instance | |
| def get_region_masks(img_rgb: np.ndarray) -> dict: | |
| # Chama o método na instância correta | |
| return get_segmenter().get_masks(img_rgb) | |
| def get_face_mesh(): | |
| global _face_mesh | |
| if _face_mesh is None: | |
| # Usa a importacao direta do modulo | |
| _face_mesh = mp_face_mesh.FaceMesh( | |
| static_image_mode=True, | |
| max_num_faces=1, | |
| refine_landmarks=True, | |
| min_detection_confidence=0.5, | |
| ) | |
| return _face_mesh | |
| def build_face_mask(img_rgb: np.ndarray) -> np.ndarray: | |
| """ | |
| Cria máscara da região facial via MediaPipe convexHull. | |
| Expande 10% para cima para incluir a testa. | |
| Fallback: máscara de uns se face não detectada. | |
| """ | |
| h, w = img_rgb.shape[:2] | |
| mesh = get_face_mesh() | |
| result = mesh.process(img_rgb) | |
| if not result.multi_face_landmarks: | |
| return np.ones((h, w), dtype=np.float32) | |
| lm = result.multi_face_landmarks[0].landmark | |
| points = np.array( | |
| [(int(lm_point.x * w), int(lm_point.y * h)) for lm_point in lm], dtype=np.int32 | |
| ) | |
| hull = cv2.convexHull(points) | |
| y_min = hull[:, 0, 1].min() | |
| y_max = hull[:, 0, 1].max() | |
| face_h = y_max - y_min | |
| # 10% padding para cima | |
| forehead_expansion = int(face_h * 0.10) | |
| hull_expanded = hull.copy() | |
| top_mask = hull_expanded[:, 0, 1] < (y_min + face_h * 0.35) | |
| hull_expanded[top_mask, 0, 1] = np.maximum( | |
| 0, hull_expanded[top_mask, 0, 1] - forehead_expansion | |
| ) | |
| mask = np.zeros((h, w), dtype=np.uint8) | |
| cv2.fillConvexPoly(mask, hull_expanded, 1) | |
| mask_f = cv2.GaussianBlur(mask.astype(np.float32), (31, 31), 0) | |
| mask_f = mask_f / (mask_f.max() + 1e-8) | |
| return mask_f | |
| # ════════════════════════════════════════════════════════════ | |
| # CLIP SURGERY — Heatmaps visuais | |
| # ════════════════════════════════════════════════════════════ | |
| _surgery_model = None | |
| _surgery_preprocess = None | |
| def get_surgery_model(): | |
| global _surgery_model, _surgery_preprocess | |
| if _surgery_model is None: | |
| print(" A carregar CLIP Surgery CS-ViT-L/14...") | |
| _surgery_model, _ = clip_surgery.load("CS-ViT-L/14", device=DEVICE) | |
| _surgery_model.eval() | |
| _surgery_preprocess = transforms.Compose( | |
| [ | |
| transforms.Resize( | |
| (SURGERY_RES, SURGERY_RES), | |
| interpolation=InterpolationMode.BICUBIC, | |
| ), | |
| transforms.ToTensor(), | |
| transforms.Normalize(CLIP_MEAN, CLIP_STD), | |
| ] | |
| ) | |
| print(" CLIP Surgery pronto.") | |
| return _surgery_model, _surgery_preprocess | |
| def generate_heatmap(img_rgb, method: str = ""): | |
| """ | |
| Gera heatmap via CLIP Surgery com análise contrastiva. | |
| Heatmap contrastivo = manipulação - real | |
| → Só ficam activas as zonas onde manipulação > real | |
| → Elimina a contradição de "AI face" e "real human face" activarem igual | |
| Devolve: contrastive, per_text, scores, prompts, top_heatmap | |
| contrastive — heatmap manipulação - real (zonas genuinamente suspeitas) | |
| per_text — dict {prompt: heatmap normalizado} | |
| scores — dict {prompt: score médio na face} | |
| prompts — lista de prompts usados | |
| top_heatmap — heatmap do prompt de manipulação com maior score | |
| """ | |
| prompts = SURGERY_PROMPTS | |
| sm, sp = get_surgery_model() | |
| h, w = img_rgb.shape[:2] | |
| tensor = sp(Image.fromarray(img_rgb)).unsqueeze(0).to(DEVICE) | |
| with torch.no_grad(): | |
| img_feats = sm.encode_image(tensor) | |
| img_feats = img_feats / img_feats.norm(dim=-1, keepdim=True) | |
| txt_feats = clip_surgery.encode_text_with_prompt_ensemble( | |
| sm, prompts, DEVICE | |
| ) # Features de texto para cada prompt → shape (n_prompts, 512) | |
| similarity = clip_surgery.clip_feature_surgery( | |
| img_feats, txt_feats | |
| ) # L2 e faz similiaridade para cada patch e prompt, removendo CLS para isolar o que é especifico | |
| sim_map = clip_surgery.get_similarity_map( | |
| similarity[:, 1:, :], (h, w) | |
| ) # remove CLS | |
| face_mask = build_face_mask(img_rgb) | |
| face_pixels = face_mask > 0.5 | |
| sim_np = sim_map[0].cpu().numpy() | |
| fake_maps = [] | |
| real_maps = [] | |
| per_text = {} | |
| scores = {} | |
| for n, text in enumerate(prompts): | |
| m = sim_np[:, :, n] # heatmap para o prompt n | |
| m = (m - m.min()) / (m.max() - m.min() + 1e-8) # normaliza para 0-1 | |
| m = m * face_mask # Nao considera zonas fora da face | |
| m = (m - m.min()) / (m.max() - m.min() + 1e-8) # re-normaliza após a mascara | |
| per_text[text] = m.astype(np.float32) | |
| scores[text] = float(m[face_pixels].mean()) if face_pixels.any() else 0.0 | |
| is_manip = any(kw.lower() in text.lower() for kw in FAKE_PROMPT_KEYWORDS) | |
| is_real = text in REAL_PROMPTS | |
| if is_manip: | |
| fake_maps.append(m) | |
| elif is_real: | |
| real_maps.append(m) | |
| # Heatmap de manipulação médio | |
| manip_mean = ( | |
| np.mean(fake_maps, axis=0).astype(np.float32) | |
| if fake_maps | |
| else np.zeros((h, w), dtype=np.float32) | |
| ) | |
| # Heatmap real médio | |
| real_mean = ( | |
| np.mean(real_maps, axis=0).astype(np.float32) | |
| if real_maps | |
| else np.zeros((h, w), dtype=np.float32) | |
| ) | |
| # Heatmap contrastivo: manipulação - real → só zonas genuinamente suspeitas | |
| contrastive = manip_mean - real_mean | |
| contrastive = np.clip(contrastive, 0, None) # só valores positivos | |
| if contrastive.max() > 1e-8: | |
| contrastive = (contrastive / contrastive.max()).astype(np.float32) | |
| contrastive = contrastive * face_mask | |
| # Top heatmap: prompt de manipulação com maior score | |
| manip_scores = { | |
| t: s | |
| for t, s in scores.items() | |
| if any(kw.lower() in t.lower() for kw in FAKE_PROMPT_KEYWORDS) | |
| } | |
| top_prompt_name = ( | |
| max(manip_scores, key=manip_scores.get) | |
| if manip_scores | |
| else max(scores, key=scores.get) | |
| ) | |
| top_heatmap = per_text[top_prompt_name] | |
| return contrastive, per_text, scores, prompts, top_heatmap | |
| def score_regions_manipulation(img_hires, heatmap, masks, scores): | |
| """ | |
| img_hires: A imagem original (usada para referencia de tamanho). | |
| heatmap: O mapa de calor gerado pelo CLIP. | |
| masks: O dicionario de mascaras gerado pelo BiSeNet. | |
| scores: Limiares de pontuacao ou configuracoes adicionais. | |
| """ | |
| reg_scores = {} | |
| # 1. Garantir que o heatmap esta na mesma escala que as mascaras | |
| h, w = heatmap.shape[:2] | |
| # 2. Calcular scores por regiao | |
| for name, mask in masks.items(): | |
| # Redimensionar a mascara do BiSeNet (512x512) para o tamanho do heatmap | |
| mask_resized = cv2.resize(mask, (w, h), interpolation=cv2.INTER_NEAREST) | |
| # Aplicar mascara | |
| masked_heatmap = heatmap * mask_resized | |
| # Calcular score (Percentil 95 para capturar picos de anomalia) | |
| if np.sum(mask_resized) > 0: | |
| score = np.percentile(masked_heatmap[mask_resized > 0], 95) | |
| else: | |
| score = 0.0 | |
| reg_scores[name] = {"contrast": score} | |
| return reg_scores | |