Spaces:

lucasddmc
/

ViTViz

Sleeping

App Files Files Community

lucasddmc committed on Nov 14, 2025

Commit

e11edb1

1 Parent(s): b5c4373

feat: primeira versão

Browse files

Files changed (7) hide show

app.py +560 -0
requirements.txt +9 -0
utils/attacks.py +130 -0
utils/inference.py +27 -0
utils/model_loader.py +153 -0
utils/preprocessing.py +31 -0
utils/visualization.py +256 -0

app.py ADDED Viewed

	@@ -0,0 +1,560 @@

+import gradio as gr
+import torch
+from PIL import Image
+from typing import Optional, List, Tuple
+from utils.model_loader import load_model_and_labels
+from utils.preprocessing import get_default_transform, preprocess_image
+from utils.inference import predict_topk
+from utils.attacks import PGDIterations
+from utils.visualization import extract_attention_maps, attention_rollout, create_attention_overlay, extract_attention_for_iterations, create_iteration_attention_overlays
+# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Preprocessing transform built once at import time and shared by every handler below.
+transform = get_default_transform()
+def _to_path(file_like: Optional[object]) -> Optional[str]:
+    """Extrai caminho de um objeto vindo do Gradio File (string, dict com 'name' ou objeto com atributo .name)."""
+    if file_like is None:
+        return None
+    if isinstance(file_like, str):
+        return file_like
+    # dict do gradio {'name': '/tmp/xxx', ...}
+    if isinstance(file_like, dict) and 'name' in file_like:
+        return file_like['name']
+    # objetos com .name
+    path = getattr(file_like, 'name', None)
+    if isinstance(path, str):
+        return path
+    return None
+def classify_image(model_file, image, labels_file=None):
+    """
+    Carrega o modelo e classifica a imagem
+    Args:
+        model_file: caminho para o arquivo .pth do modelo (pode ser state_dict ou modelo completo)
+        image: imagem PIL ou caminho para a imagem
+        labels_file: arquivo opcional com nomes das classes (.txt ou .json)
+    Returns:
+        str: label de classificação
+    """
+    try:
+        if model_file is None:
+            return "Por favor, envie um arquivo de modelo (.pth)"
+        # Extrair paths dos componentes de arquivo do Gradio
+        model_path = _to_path(model_file)
+        labels_path = _to_path(labels_file)
+        # Carregar modelo e labels
+        model, class_names, label_source = load_model_and_labels(model_path, labels_path, device=DEVICE)
+        # Processar imagem
+        if not (isinstance(image, str) or isinstance(image, Image.Image)):
+            return "Por favor, envie uma imagem válida"
+        img_tensor = preprocess_image(image, transform=transform).to(DEVICE)
+        # Inferência
+        top_prob, top_idx, num_classes, probabilities = predict_topk(model, img_tensor, top_k=5, device=DEVICE)
+        top_k = len(top_prob)
+        result = f"**Top {top_k} Predições:**\n\n"
+        result += f"**Modelo:** {num_classes} classes detectadas\n"
+        # Indicar origem dos labels
+        if class_names:
+            if label_source == 'file':
+                result += f"**Labels:** Carregados do arquivo\n\n"
+            else:
+                result += f"**Labels:** Encontrados no checkpoint\n\n"
+        else:
+            result += f"**Labels:** Não disponíveis (mostrando índices)\n\n"
+        for i, (prob, idx) in enumerate(zip(top_prob, top_idx)):
+            class_idx = idx.item()
+            if class_names and class_idx in class_names:
+                class_label = class_names[class_idx]
+                result += f"{i+1}. **{class_label}** (classe {class_idx}): {prob.item()*100:.2f}%\n"
+            else:
+                result += f"{i+1}. **Classe {class_idx}**: {prob.item()*100:.2f}%\n"
+        return result
+    except Exception as e:
+        import traceback
+        return f"❌ Erro ao processar modelo:\n```\n{str(e)}\n\n{traceback.format_exc()}\n```"
+def visualize_attention(
+    model_file,
+    image,
+    labels_file,
+    discard_ratio: float,
+    head_fusion: str,
+    alpha_overlay: float
+) -> Tuple[Image.Image, str]:
+    """
+    Visualiza o mapa de atenção do modelo usando Attention Rollout.
+    Args:
+        model_file: arquivo .pth do modelo
+        image: imagem PIL
+        labels_file: arquivo opcional de labels
+        discard_ratio: proporção de atenções fracas a descartar
+        head_fusion: como agregar heads ('mean', 'max', 'min')
+        alpha_overlay: transparência da sobreposição
+    Returns:
+        (attention_overlay_image, result_text)
+    """
+    try:
+        if model_file is None:
+            return None, "Por favor, envie um arquivo de modelo (.pth)"
+        if image is None:
+            return None, "Por favor, envie uma imagem"
+        # Carregar modelo e labels
+        model_path = _to_path(model_file)
+        labels_path = _to_path(labels_file)
+        model, class_names, label_source = load_model_and_labels(model_path, labels_path, device=DEVICE)
+        # Processar imagem
+        img_tensor = preprocess_image(image, transform=transform).to(DEVICE)
+        # Predição
+        top_prob, top_idx, num_classes, _ = predict_topk(model, img_tensor, top_k=1, device=DEVICE)
+        pred_class = top_idx[0].item()
+        pred_prob = top_prob[0].item()
+        # Extrair mapas de atenção (retorna lista, não dict)
+        attentions = extract_attention_maps(model, img_tensor)
+        # Aplicar Attention Rollout
+        attention_mask = attention_rollout(
+            attentions,
+            discard_ratio=discard_ratio,
+            head_fusion=head_fusion
+        )
+        # Criar overlay
+        attention_overlay = create_attention_overlay(
+            image,
+            attention_mask,
+            alpha=alpha_overlay,
+            colormap='jet'
+        )
+        # Formatar resultado
+        result = "## 🔍 Visualização de Atenção (Attention Rollout)\n\n"
+        result += f"**Predição do Modelo:**\n"
+        if class_names and pred_class in class_names:
+            result += f"- Classe: **{class_names[pred_class]}** (índice {pred_class})\n"
+        else:
+            result += f"- Classe: **{pred_class}**\n"
+        result += f"- Confiança: {pred_prob*100:.2f}%\n\n"
+        result += f"**Configuração da Visualização:**\n"
+        result += f"- Head Fusion: {head_fusion}\n"
+        result += f"- Discard Ratio: {discard_ratio:.1%}\n"
+        result += f"- Transparência: {alpha_overlay:.2f}\n\n"
+        result += "**Interpretação:**\n"
+        result += "As regiões em vermelho/amarelo indicam onde o modelo está 'olhando' para fazer a classificação.\n"
+        return attention_overlay, result
+    except Exception as e:
+        import traceback
+        error_msg = f"❌ Erro ao visualizar atenção:\n```\n{str(e)}\n\n{traceback.format_exc()}\n```"
+        return None, error_msg
+def run_pgd_attack(
+    model_file,
+    image,
+    labels_file,
+    eps: float,
+    alpha: float,
+    steps: int,
+    discard_ratio: float,
+    head_fusion: str,
+    alpha_overlay: float
+) -> Tuple[List[Image.Image], str, Image.Image, List[Image.Image]]:
+    """
+    Executa ataque PGD untargeted e extrai atenção de cada iteração.
+    Args:
+        model_file: arquivo .pth do modelo
+        image: imagem PIL
+        labels_file: arquivo opcional de labels
+        eps: epsilon (perturbação máxima)
+        alpha: step size
+        steps: número de iterações
+        discard_ratio: proporção de atenções fracas a descartar
+        head_fusion: como agregar heads ('mean', 'max', 'min')
+        alpha_overlay: transparência da sobreposição
+    Returns:
+        (iteration_images, result_text, final_adv_image, attention_overlays)
+    """
+    try:
+        if model_file is None:
+            return [], "Por favor, envie um arquivo de modelo (.pth)", None, []
+        if image is None:
+            return [], "Por favor, envie uma imagem", None, []
+        # Carregar modelo e labels
+        model_path = _to_path(model_file)
+        labels_path = _to_path(labels_file)
+        model, class_names, label_source = load_model_and_labels(model_path, labels_path, device=DEVICE)
+        # Processar imagem
+        img_tensor = preprocess_image(image, transform=transform).to(DEVICE)
+        # Predição original
+        top_prob_orig, top_idx_orig, num_classes, _ = predict_topk(model, img_tensor, top_k=1, device=DEVICE)
+        orig_class = top_idx_orig[0].item()
+        orig_prob = top_prob_orig[0].item()
+        # Configurar ataque PGD untargeted
+        attack = PGDIterations(model, eps=eps, alpha=alpha, steps=steps)
+        # Para untargeted, usamos a classe original como "label verdadeiro"
+        # O PGD maximizará a loss para essa classe (fazendo o modelo errar)
+        original_label = torch.tensor([orig_class], device=DEVICE)
+        # Executar ataque
+        adv_tensor, iteration_images = attack(img_tensor, original_label)
+        # Extrair atenção para todas as iterações (incluindo original)
+        attention_masks = extract_attention_for_iterations(
+            model,
+            attack.iteration_tensors,
+            discard_ratio=discard_ratio,
+            head_fusion=head_fusion
+        )
+        # Criar overlays de atenção
+        attention_overlays = create_iteration_attention_overlays(
+            iteration_images,
+            attention_masks,
+            alpha=alpha_overlay
+        )
+        # Predição adversarial
+        top_prob_adv, top_idx_adv, _, _ = predict_topk(model, adv_tensor, top_k=1, device=DEVICE)
+        adv_class = top_idx_adv[0].item()
+        adv_prob = top_prob_adv[0].item()
+        # Converter imagem adversarial final para PIL
+        from utils.attacks import tensor_to_pil
+        final_adv_image = tensor_to_pil(adv_tensor[0])
+        # Formatar resultado
+        result = "## 🎯 Resultado do Ataque PGD (Untargeted)\n\n"
+        result += f"**Configuração:**\n"
+        result += f"- Tipo: Untargeted (objetivo: fazer o modelo errar)\n"
+        result += f"- Epsilon (ε): {eps:.4f}\n"
+        result += f"- Alpha (α): {alpha:.4f}\n"
+        result += f"- Steps: {steps}\n\n"
+        result += f"**Predição Original:**\n"
+        if class_names and orig_class in class_names:
+            result += f"- Classe: **{class_names[orig_class]}** (índice {orig_class})\n"
+        else:
+            result += f"- Classe: **{orig_class}**\n"
+        result += f"- Confiança: {orig_prob*100:.2f}%\n\n"
+        result += f"**Predição Adversarial:**\n"
+        if class_names and adv_class in class_names:
+            result += f"- Classe: **{class_names[adv_class]}** (índice {adv_class})\n"
+        else:
+            result += f"- Classe: **{adv_class}**\n"
+        result += f"- Confiança: {adv_prob*100:.2f}%\n\n"
+        if adv_class != orig_class:
+            result += "✅ **Ataque bem-sucedido!** Modelo mudou a predição.\n\n"
+        else:
+            result += "⚠️ **Ataque falhou.** Modelo manteve a mesma predição.\n\n"
+        result += f"**Visualização de Atenção:**\n"
+        result += f"- Total de iterações capturadas: {len(attention_overlays)}\n"
+        result += f"- Use o slider abaixo para explorar como a atenção evolui durante o ataque\n"
+        result += f"- Iteração 0 = Imagem original\n"
+        result += f"- Iteração {steps} = Imagem adversarial final\n"
+        return iteration_images, result, final_adv_image, attention_overlays
+    except Exception as e:
+        import traceback
+        error_msg = f"❌ Erro ao executar ataque:\n```\n{str(e)}\n\n{traceback.format_exc()}\n```"
+        return [], error_msg, None, []
+def create_app():
+    """Build the Gradio Blocks interface and return it (not yet launched).
+    Three tabs are declared: plain classification, PGD attack with
+    per-iteration attention, and stand-alone attention visualization. Each
+    tab has its own upload widgets and wires one button to the matching
+    handler (classify_image, run_pgd_attack, visualize_attention).
+    """
+    with gr.Blocks(title="ViTViz - Classifier & Attacks", theme=gr.themes.Soft()) as app:
+        gr.Markdown("""
+        # 🔍 ViTViz: Vision Transformer Classifier & Adversarial Attacks
+        """)
+        with gr.Tabs():
+            # Tab 1: plain classification
+            with gr.Tab("📊 Classificação"):
+                gr.Markdown("### Upload um modelo e uma imagem para classificação")
+                with gr.Row():
+                    with gr.Column():
+                        model_upload_classify = gr.File(
+                            label="Upload Model (.pth/.pt)",
+                            file_types=[".pth", ".pt"]
+                        )
+                        labels_upload_classify = gr.File(
+                            label="Upload Class Labels (opcional - .txt/.json)",
+                            file_types=[".txt", ".json"]
+                        )
+                        image_upload_classify = gr.Image(
+                            label="Upload Image",
+                            type="pil"
+                        )
+                        classify_btn = gr.Button("🚀 Classificar", variant="primary")
+                    with gr.Column():
+                        output_text_classify = gr.Markdown(label="Resultado")
+                # Event: classify the image (input order matches classify_image's signature)
+                classify_btn.click(
+                    fn=classify_image,
+                    inputs=[model_upload_classify, image_upload_classify, labels_upload_classify],
+                    outputs=[output_text_classify]
+                )
+            # Tab 2: adversarial attack
+            with gr.Tab("⚔️ Adversarial Attack + Attention"):
+                gr.Markdown("### Execute ataques adversariais e visualize como a atenção evolui")
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        model_upload_attack = gr.File(
+                            label="Upload Model (.pth/.pt)",
+                            file_types=[".pth", ".pt"]
+                        )
+                        labels_upload_attack = gr.File(
+                            label="Upload Class Labels (opcional - .txt/.json)",
+                            file_types=[".txt", ".json"]
+                        )
+                        image_upload_attack = gr.Image(
+                            label="Upload Image",
+                            type="pil"
+                        )
+                        gr.Markdown("#### ⚔️ Configuração do Ataque")
+                        with gr.Row():
+                            # Epsilon and alpha move in 1/255 increments.
+                            eps_input = gr.Slider(
+                                minimum=0.0,
+                                maximum=1.0,
+                                value=8/255,
+                                step=1/255,
+                                label="Epsilon (ε)"
+                            )
+                            alpha_input = gr.Slider(
+                                minimum=0.0,
+                                maximum=0.1,
+                                value=2/255,
+                                step=1/255,
+                                label="Alpha (α)"
+                            )
+                        steps_input = gr.Slider(
+                            minimum=1,
+                            maximum=100,
+                            value=10,
+                            step=1,
+                            label="Steps"
+                        )
+                        gr.Markdown("#### 👁️ Configuração da Atenção")
+                        head_fusion_attack = gr.Radio(
+                            choices=["mean", "max", "min"],
+                            value="max",
+                            label="Head Fusion"
+                        )
+                        with gr.Row():
+                            discard_ratio_attack = gr.Slider(
+                                minimum=0.0,
+                                maximum=1.0,
+                                value=0.9,
+                                step=0.05,
+                                label="Discard Ratio"
+                            )
+                            alpha_overlay_attack = gr.Slider(
+                                minimum=0.0,
+                                maximum=1.0,
+                                value=0.7,
+                                step=0.05,
+                                label="Alpha Overlay"
+                            )
+                        attack_btn = gr.Button("🚀 Executar Análise Completa", variant="primary", size="lg")
+                    with gr.Column(scale=2):
+                        output_text_attack = gr.Markdown(label="Resultado")
+                        with gr.Row():
+                            with gr.Column():
+                                gr.Markdown("**Imagem Adversarial Final**")
+                                final_adv_image = gr.Image(type="pil", show_label=False)
+                            with gr.Column():
+                                gr.Markdown("**Todas as Iterações**")
+                                iteration_gallery = gr.Gallery(
+                                    columns=5,
+                                    height="auto",
+                                    show_label=False
+                                )
+                # Attention-evolution section
+                gr.Markdown("---")
+                gr.Markdown("### 🔍 Evolução da Atenção Durante o Ataque")
+                gr.Markdown("_Compare a imagem da iteração (esquerda) com o mapa de atenção (direita)_")
+                # ImageSlider for side-by-side comparison
+                iteration_comparison = gr.ImageSlider(
+                    label="Iteração vs Atenção",
+                    type="pil",
+                    interactive=False
+                )
+                # maximum=10 is only the initial bound; it is reset after each attack (see the chain below).
+                iteration_slider = gr.Slider(
+                    minimum=0,
+                    maximum=10,
+                    step=1,
+                    value=0,
+                    label="Iteração do Ataque"
+                )
+                # States that hold the images between events
+                attention_overlays_state = gr.State([])
+                iteration_images_state = gr.State([])
+                # Update the ImageSlider according to the iteration slider
+                def update_iteration_view(iteration_idx, iteration_images, attention_overlays):
+                    # Nothing to show until an attack has populated both states.
+                    if not iteration_images or not attention_overlays:
+                        return None
+                    idx = int(iteration_idx)
+                    # Clamp to the last recorded iteration.
+                    # NOTE(review): only the upper bound against iteration_images is
+                    # checked; attention_overlays is assumed to have the same length.
+                    if idx >= len(iteration_images):
+                        idx = len(iteration_images) - 1
+                    # ImageSlider expects a (left_image, right_image) tuple
+                    return (iteration_images[idx], attention_overlays[idx])
+                # Events: run the attack, then copy the images into the gallery,
+                # then resize/reset the iteration slider, then draw iteration 0.
+                attack_btn.click(
+                    fn=run_pgd_attack,
+                    inputs=[
+                        model_upload_attack, image_upload_attack, labels_upload_attack,
+                        eps_input, alpha_input, steps_input,
+                        discard_ratio_attack, head_fusion_attack, alpha_overlay_attack
+                    ],
+                    outputs=[iteration_images_state, output_text_attack, final_adv_image, attention_overlays_state]
+                ).then(
+                    fn=lambda x: x,
+                    inputs=[iteration_images_state],
+                    outputs=[iteration_gallery]
+                ).then(
+                    fn=lambda imgs: gr.update(maximum=len(imgs)-1 if imgs else 0, value=0),
+                    inputs=[iteration_images_state],
+                    outputs=[iteration_slider]
+                ).then(
+                    fn=update_iteration_view,
+                    inputs=[iteration_slider, iteration_images_state, attention_overlays_state],
+                    outputs=[iteration_comparison]
+                )
+                # Refresh when the slider is moved
+                iteration_slider.change(
+                    fn=update_iteration_view,
+                    inputs=[iteration_slider, iteration_images_state, attention_overlays_state],
+                    outputs=[iteration_comparison],
+                    show_progress="hidden"  # Hide the loading indicator - the image simply swaps
+                )
+            # Tab 3: attention visualization
+            with gr.Tab("👁️ Attention Visualization"):
+                gr.Markdown("### Visualize onde o modelo está 'olhando' na imagem")
+                with gr.Row():
+                    with gr.Column():
+                        model_upload_attention = gr.File(
+                            label="Upload Model (.pth/.pt)",
+                            file_types=[".pth", ".pt"]
+                        )
+                        labels_upload_attention = gr.File(
+                            label="Upload Class Labels (opcional - .txt/.json)",
+                            file_types=[".txt", ".json"]
+                        )
+                        image_upload_attention = gr.Image(
+                            label="Upload Image",
+                            type="pil"
+                        )
+                        gr.Markdown("#### Configuração da Visualização")
+                        head_fusion_input = gr.Radio(
+                            choices=["mean", "max", "min"],
+                            value="max",
+                            label="Head Fusion - Como agregar múltiplas cabeças de atenção"
+                        )
+                        discard_ratio_input = gr.Slider(
+                            minimum=0.0,
+                            maximum=1.0,
+                            value=0.9,
+                            step=0.05,
+                            label="Discard Ratio - Proporção de atenções fracas a descartar"
+                        )
+                        alpha_overlay_input = gr.Slider(
+                            minimum=0.0,
+                            maximum=1.0,
+                            value=0.7,
+                            step=0.05,
+                            label="Peso da Imagem Original (alpha) - 0.7 = 70% imagem, 30% heatmap"
+                        )
+                        attention_btn = gr.Button("👁️ Visualizar Atenção", variant="primary")
+                    with gr.Column():
+                        output_text_attention = gr.Markdown(label="Resultado")
+                        attention_output = gr.Image(label="Mapa de Atenção Sobreposto", type="pil")
+                # Event: visualize attention (input order matches visualize_attention's signature)
+                attention_btn.click(
+                    fn=visualize_attention,
+                    inputs=[
+                        model_upload_attention,
+                        image_upload_attention,
+                        labels_upload_attention,
+                        discard_ratio_input,
+                        head_fusion_input,
+                        alpha_overlay_input
+                    ],
+                    outputs=[attention_output, output_text_attention]
+                )
+    return app
+# Script entry point: build the UI and serve it on all interfaces, port 7861,
+# without requesting a public share link.
+if __name__ == "__main__":
+    app = create_app()
+    app.launch(
+        server_name="0.0.0.0",
+        server_port=7861,
+        share=False
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+gradio
+torch>=2.0.0
+torchvision>=0.15.0
+timm>=0.9.0
+torchattacks>=3.5.0
+numpy>=1.24.0
+opencv-python>=4.8.0
+matplotlib>=3.7.0
+Pillow>=10.0.0

utils/attacks.py ADDED Viewed

	@@ -0,0 +1,130 @@

+import torch
+import torchattacks
+from PIL import Image
+from typing import List, Tuple
+import numpy as np
+def denormalize_imagenet(tensor: torch.Tensor) -> torch.Tensor:
+    """
+    Reverte a normalização ImageNet de um tensor.
+    Args:
+        tensor: Tensor normalizado (CxHxW) com mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+    Returns:
+        Tensor desnormalizado com valores em [0, 1]
+    """
+    mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1).to(tensor.device)
+    std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1).to(tensor.device)
+    # Inverte: x_norm = (x - mean) / std  =>  x = x_norm * std + mean
+    denorm = tensor * std + mean
+    # Clip para garantir [0, 1]
+    return torch.clamp(denorm, 0, 1)
+def tensor_to_pil(tensor: torch.Tensor, denormalize: bool = True) -> Image.Image:
+    """
+    Converte tensor (CxHxW) para PIL Image RGB.
+    Args:
+        tensor: Tensor com shape (C, H, W)
+        denormalize: Se True, aplica desnormalização ImageNet antes da conversão
+    Returns:
+        PIL Image no espaço RGB [0, 255]
+    """
+    if denormalize:
+        tensor = denormalize_imagenet(tensor)
+    # tensor shape: (C, H, W) com valores [0, 1]
+    img_np = tensor.cpu().detach().numpy()
+    img_np = np.transpose(img_np, (1, 2, 0))  # HxWxC
+    img_np = (img_np * 255).clip(0, 255).astype(np.uint8)
+    return Image.fromarray(img_np, mode='RGB')
+class PGDIterations(torchattacks.PGD):
+    """
+    Extensão do ataque PGD padrão que captura e retorna
+    as imagens adversariais de cada iteração como lista de PIL Images.
+    """
+    def __init__(self, model, eps=0.05, alpha=0.005, steps=10, random_start=True):
+        # Inicializa PGD padrão com os parâmetros
+        super().__init__(model, eps=eps, alpha=alpha, steps=steps, random_start=random_start)
+        self.iteration_images: List[Image.Image] = []
+        self.iteration_tensors: List[torch.Tensor] = []
+    def forward(self, images, labels) -> Tuple[torch.Tensor, List[Image.Image]]:
+        """
+        Executa o ataque PGD e retorna:
+        - adv_images: tensor adversarial final
+        - iteration_images: lista de PIL Images (uma por iteração do ataque)
+        Implementação adaptada para trabalhar com imagens normalizadas ImageNet.
+        """
+        images = images.clone().detach().to(self.device)
+        labels = labels.clone().detach().to(self.device)
+        # Para targeted attack (se implementarmos no futuro)
+        if self.targeted:
+            target_labels = self.get_target_label(images, labels)
+        loss = torch.nn.CrossEntropyLoss()
+        adv_images = images.clone().detach()
+        # Desnormalizar para aplicar eps e clipping no espaço correto [0,1]
+        mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(self.device)
+        std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(self.device)
+        # Converter para espaço [0,1]
+        images_denorm = images * std + mean
+        adv_images_denorm = images_denorm.clone().detach()
+        if self.random_start:
+            # Starting at a uniformly random point no espaço [0,1]
+            adv_images_denorm = adv_images_denorm + torch.empty_like(adv_images_denorm).uniform_(-self.eps, self.eps)
+            adv_images_denorm = torch.clamp(adv_images_denorm, min=0, max=1).detach()
+        self.iteration_images = []
+        self.iteration_tensors = []
+        # Salvar iteração 0 (imagem original)
+        pil_img_orig = tensor_to_pil(images_denorm[0], denormalize=False)
+        self.iteration_images.append(pil_img_orig)
+        self.iteration_tensors.append(images.clone().detach())
+        for _ in range(self.steps):
+            # Normalizar para passar pelo modelo
+            adv_images = (adv_images_denorm - mean) / std
+            adv_images.requires_grad = True
+            outputs = self.get_logits(adv_images)
+            # Calculate loss
+            if self.targeted:
+                cost = -loss(outputs, target_labels)
+            else:
+                cost = loss(outputs, labels)
+            # Update adversarial images
+            grad = torch.autograd.grad(cost, adv_images,
+                                       retain_graph=False, create_graph=False)[0]
+            # Voltar para espaço desnormalizado para aplicar perturbação
+            adv_images_denorm = adv_images_denorm.detach() + self.alpha * grad.sign() * std
+            delta = torch.clamp(adv_images_denorm - images_denorm, min=-self.eps, max=self.eps)
+            adv_images_denorm = torch.clamp(images_denorm + delta, min=0, max=1).detach()
+            # Normalizar para salvar tensor
+            adv_images_normalized = (adv_images_denorm - mean) / std
+            # Capturar imagem e tensor desta iteração
+            pil_img = tensor_to_pil(adv_images_denorm[0], denormalize=False)
+            self.iteration_images.append(pil_img)
+            self.iteration_tensors.append(adv_images_normalized.clone().detach())
+        # Retornar imagem normalizada para o modelo
+        adv_images = (adv_images_denorm - mean) / std
+        return adv_images, self.iteration_images

utils/inference.py ADDED Viewed

	@@ -0,0 +1,27 @@

+from typing import Tuple, Optional
+import torch
+def predict_topk(
+    model: torch.nn.Module,
+    img_tensor: torch.Tensor,
+    top_k: int = 5,
+    device: Optional[torch.device] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, int, torch.Tensor]:
+    """Retorna top_k probabilidades e índices, número total de classes e vetor de probabilidades completo.
+    Saída: (top_prob, top_idx, num_classes, probabilities)
+    """
+    if device is not None:
+        img_tensor = img_tensor.to(device)
+    model.eval()
+    with torch.no_grad():
+        output = model(img_tensor)
+        if isinstance(output, tuple):
+            output = output[0]
+        logits = output[0]
+        probabilities = torch.nn.functional.softmax(logits, dim=0)
+        num_classes = probabilities.shape[0]
+        k = min(top_k, num_classes)
+        top_prob, top_idx = torch.topk(probabilities, k)
+        return top_prob, top_idx, num_classes, probabilities

utils/model_loader.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import pickle
+import torch
+import timm
+from typing import Optional, Tuple, Dict, Any
+# Device used when a caller does not pass one: CUDA when available, else CPU.
+DEVICE_DEFAULT = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+class CustomUnpickler(pickle.Unpickler):
+    """Unpickler que ignora classes customizadas ausentes criando dummies dinamicamente."""
+    def find_class(self, module, name):
+        try:
+            return super().find_class(module, name)
+        except Exception:
+            # Cria uma classe dummy com o mesmo nome para permitir o unpickle
+            return type(name, (), {})
+def load_checkpoint(model_path: str, device: Optional[torch.device] = None) -> Any:
+    """Carrega um checkpoint/modelo do caminho informado, com fallback para unpickler customizado.
+    Retorna o objeto carregado (modelo completo, state_dict ou dict de checkpoint).
+    """
+    device = device or DEVICE_DEFAULT
+    try:
+        return torch.load(model_path, map_location=device, weights_only=False)
+    except (AttributeError, ModuleNotFoundError, RuntimeError):
+        # Fallback quando há classes ausentes ou conflitos de versão
+        with open(model_path, 'rb') as f:
+            return CustomUnpickler(f).load()
+def infer_num_classes(state_dict: Dict[str, torch.Tensor]) -> int:
+    """Infere o número de classes a partir do state_dict (camada de head).
+    Caso não encontre, retorna 1000 (padrão ImageNet).
+    """
+    for key, tensor in state_dict.items():
+        if 'head' in key and 'weight' in key and hasattr(tensor, 'shape'):
+            return tensor.shape[0]
+    return 1000
def extract_class_names(checkpoint: Any) -> Optional[Dict[int, str]]:
    """Try to extract class names stored inside a checkpoint dict.

    A fixed list of well-known keys is probed in order. A value is accepted
    when it is a non-empty list/tuple of names, a non-empty ``idx -> name``
    dict (all keys are ints) or a non-empty ``name -> idx`` dict (all values
    are ints, which is then inverted). Anything else (tensors, scalars,
    str -> str dicts, empty containers) is skipped and the next key is tried,
    so callers always receive either a mapping or ``None``.

    Args:
        checkpoint: Object returned by the checkpoint loader.

    Returns:
        ``idx -> name`` mapping, or ``None`` when nothing usable was found.
    """
    if not isinstance(checkpoint, dict):
        return None
    possible_keys = (
        'class_names', 'classes', 'class_to_idx', 'idx_to_class',
        'label_names', 'labels', 'class_labels',
    )
    for key in possible_keys:
        if key not in checkpoint:
            continue
        labels = checkpoint[key]
        if isinstance(labels, (list, tuple)) and labels:
            return dict(enumerate(labels))
        if isinstance(labels, dict) and labels:
            # Already idx -> name
            if all(isinstance(k, int) for k in labels):
                return labels  # type: ignore[return-value]
            # name -> idx: invert it
            if all(isinstance(v, int) for v in labels.values()):
                return {v: k for k, v in labels.items()}
        # Unrecognised payload (e.g. a tensor of training targets stored under
        # 'labels'): returning it as-is would break callers that test the
        # result for truthiness, so keep looking under the remaining keys.
    return None
def load_class_names_from_file(labels_file: Optional[str]) -> Optional[Dict[int, str]]:
    """Load class names from a labels file.

    Supported formats:
      * ``.json`` (suffix matched case-insensitively): a list of names, an
        ``"idx" -> name`` dict (non-numeric keys are ignored) or a
        ``name -> idx`` dict (inverted).
      * anything else: plain text, one class name per non-blank line.

    Files are decoded as ``utf-8-sig`` so that a leading UTF-8 BOM neither
    breaks JSON parsing nor ends up inside the first label.

    Args:
        labels_file: Path to the labels file; falsy values mean "no file".

    Returns:
        ``idx -> name`` mapping, or ``None`` when no file was given, the file
        could not be read/parsed, or its content has an unsupported layout.
    """
    if not labels_file:
        return None
    import json
    try:
        with open(labels_file, 'r', encoding='utf-8-sig') as f:
            if not labels_file.lower().endswith('.json'):
                return dict(enumerate(line.strip() for line in f if line.strip()))
            data = json.load(f)
    except Exception:
        # Labels are optional: any read/parse failure deliberately degrades to
        # "no labels" instead of aborting the model load.
        return None
    if isinstance(data, list):
        return dict(enumerate(data))
    if isinstance(data, dict):
        out: Dict[int, str] = {}
        for k, v in data.items():
            try:
                out[int(k)] = v
            except (TypeError, ValueError):
                # Ignore non-numeric keys
                continue
        if out:
            return out
        # Fallback for name -> idx dicts
        if all(isinstance(v, int) for v in data.values()):
            return {v: k for k, v in data.items()}
    return None
def _vit_from_state_dict(state_dict: Any) -> torch.nn.Module:
    """Instantiate a timm ``vit_base_patch16_224`` sized for ``state_dict`` and load the weights."""
    num_classes = infer_num_classes(state_dict)
    model = timm.create_model('vit_base_patch16_224', pretrained=False, num_classes=num_classes)
    model.load_state_dict(state_dict)
    return model


def build_model_from_checkpoint(checkpoint: Any, device: Optional[torch.device] = None) -> torch.nn.Module:
    """Build an eval-mode model from a checkpoint object.

    Accepted layouts:
      * a complete model saved via ``torch.save(model, ...)``;
      * a dict holding the model or its weights under ``'model'``,
        ``'state_dict'`` or ``'model_state_dict'``;
      * a bare state_dict.

    Weights-only payloads are loaded into a timm ``vit_base_patch16_224``
    whose class count is inferred from the weights.

    Args:
        checkpoint: Object returned by the checkpoint loader.
        device: Device the model is moved to (``DEVICE_DEFAULT`` when falsy).

    Returns:
        The model, moved to ``device`` and switched to ``eval()`` mode.
    """
    device = device or DEVICE_DEFAULT
    if isinstance(checkpoint, dict):
        # Unless a known wrapper key is present, assume the dict itself is a state_dict.
        payload = checkpoint
        for key in ('model', 'state_dict', 'model_state_dict'):
            if key in checkpoint:
                payload = checkpoint[key]
                break
        # 'model' frequently stores a state_dict rather than a module; only
        # dict payloads need a freshly constructed architecture.
        model = _vit_from_state_dict(payload) if isinstance(payload, dict) else payload
    else:
        # Complete model saved via torch.save(model, ...)
        model = checkpoint
    model = model.to(device)
    model.eval()
    return model
def load_model_and_labels(
    model_path: str,
    labels_file: Optional[str] = None,
    device: Optional[torch.device] = None,
) -> Tuple[torch.nn.Module, Optional[Dict[int, str]], Optional[str]]:
    """
    ** Main entry point **
    Load the model and, when available, its class names.

    Class names read from ``labels_file`` take precedence over names embedded
    in the checkpoint.

    Returns: (model, class_names, labels_source) where labels_source is
        "file", "checkpoint" or None when no class names are available.
    """
    device = device or DEVICE_DEFAULT
    checkpoint = load_checkpoint(model_path, device=device)
    names_from_checkpoint = extract_class_names(checkpoint)
    names_from_file = load_class_names_from_file(labels_file)
    if names_from_file:
        class_names, source = names_from_file, 'file'
    elif names_from_checkpoint:
        class_names, source = names_from_checkpoint, 'checkpoint'
    else:
        # Neither source produced usable names; hand back whatever the
        # checkpoint lookup returned (None or an empty mapping).
        class_names, source = names_from_checkpoint, None
    model = build_model_from_checkpoint(checkpoint, device=device)
    return model, class_names, source

utils/preprocessing.py ADDED Viewed

	@@ -0,0 +1,31 @@

+from typing import Optional, Union
+from PIL import Image
+from torchvision import transforms
+import torch
def get_default_transform() -> transforms.Compose:
    """Return the default preprocessing pipeline for ImageNet-style models.

    Steps, in order: resize to 256, center-crop to 224, convert to tensor and
    normalise each channel with the mean/std constants below.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    steps = [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
    return transforms.Compose(steps)
def preprocess_image(
    image: Union[str, Image.Image],
    transform: Optional[transforms.Compose] = None,
) -> torch.Tensor:
    """Load and transform an image, returning a batched tensor.

    Args:
        image: Path to an image file (``str`` or ``os.PathLike``) or an
            already loaded PIL image.
        transform: Transform applied to the RGB image; the default pipeline
            from ``get_default_transform()`` is used when omitted.

    Returns:
        The transformed image with a leading batch dimension added via
        ``unsqueeze(0)``.

    Raises:
        ValueError: If ``image`` is neither a path nor a PIL image.
    """
    import os

    transform = transform or get_default_transform()
    if isinstance(image, (str, os.PathLike)):
        # Image.open keeps the file handle open lazily; convert() forces the
        # pixel data to load, after which the handle can be closed safely.
        with Image.open(image) as opened:
            img = opened.convert('RGB')
    elif isinstance(image, Image.Image):
        img = image.convert('RGB')
    else:
        raise ValueError("Imagem inválida: informe caminho ou PIL.Image")
    return transform(img).unsqueeze(0)

utils/visualization.py ADDED Viewed

	@@ -0,0 +1,256 @@

+import torch
+import numpy as np
+from PIL import Image
+import matplotlib.pyplot as plt
+import matplotlib.cm as cm
+from torchvision.models.feature_extraction import create_feature_extractor
+from typing import Dict, Tuple
def extract_attention_maps(model, image: torch.Tensor) -> list:
    """
    Extract the attention maps of every ViT block using forward hooks.

    A hook on each ``block.attn`` module takes the module input and recomputes
    the attention weights from ``module.qkv`` and ``module.num_heads``
    (softmax of the scaled ``q @ k^T``), so it does not rely on the module
    exposing its internal attention matrix.

    Args:
        model: ViT-like model exposing ``blocks``, each with an ``attn`` module.
            The model is switched to ``eval()`` mode.
        image: Input batch fed to ``model`` (typically [1, 3, 224, 224]).
    Returns:
        attentions: list with one CPU tensor per hooked block, shaped
            [batch, heads, tokens, tokens]
    Raises:
        ValueError: If the model has no ``blocks``, no hook could be
            registered, or no attention was captured.
    """
    if not hasattr(model, 'blocks'):
        raise ValueError("Modelo não tem atributo 'blocks'. Não é um ViT compatível.")

    attentions = []

    def capture_attention(module, inputs, output):
        # Skip attention modules that lack the pieces needed for the recomputation.
        if not (hasattr(module, 'qkv') and hasattr(module, 'num_heads')):
            return
        x = inputs[0]  # Input of the attention module
        B, N, C = x.shape
        head_dim = C // module.num_heads
        # Compute Q and K (V is not needed for the attention weights)
        qkv = module.qkv(x).reshape(B, N, 3, module.num_heads, head_dim).permute(2, 0, 3, 1, 4)
        q, k, _ = qkv.unbind(0)
        attn = ((q @ k.transpose(-2, -1)) * head_dim ** -0.5).softmax(dim=-1)
        # Store on the CPU so the maps do not pile up on the GPU
        attentions.append(attn.detach().cpu())

    hooks = [
        block.attn.register_forward_hook(capture_attention)
        for block in model.blocks
        if hasattr(block, 'attn')
    ]
    if not hooks:
        raise ValueError("Não foi possível registrar hooks. Verifique a arquitetura do modelo.")

    model.eval()
    try:
        with torch.inference_mode():
            model(image)
    finally:
        # Always detach the hooks, even when the forward pass fails; otherwise
        # they would stay on the model and fire on every later call.
        for handle in hooks:
            handle.remove()

    if not attentions:
        raise ValueError(
            f"Nenhuma atenção capturada após registrar {len(hooks)} hooks. "
            f"A arquitetura do modelo pode não ser compatível."
        )
    return attentions
def attention_rollout(attentions: list,
                     discard_ratio: float = 0.9,
                     head_fusion: str = 'max') -> np.ndarray:
    """
    Compute Attention Rollout following the reference implementation.
    Reference: https://github.com/jacobgil/vit-explain

    For every layer the heads are fused, the weakest fused attentions are
    zeroed, the identity is added (residual connection), the matrix is
    normalised and multiplied into the running product. The attention from
    token 0 (CLS) of the first batch item to the remaining tokens is returned
    as a square grid.

    Args:
        attentions: Non-empty list of tensors [batch, heads, tokens, tokens]
        discard_ratio: Fraction of the weakest attentions to discard (default: 0.9)
        head_fusion: How to fuse the heads - 'mean', 'max' or 'min'
    Returns:
        mask: numpy array [grid_size, grid_size] scaled so its maximum is 1
            (left as all zeros when every value is zero)
    Raises:
        ValueError: If ``attentions`` is empty or ``head_fusion`` is unknown.
    """
    fusers = {
        'mean': lambda t: t.mean(dim=1),
        'max': lambda t: t.max(dim=1)[0],
        'min': lambda t: t.min(dim=1)[0],
    }
    if head_fusion not in fusers:
        raise ValueError("head_fusion deve ser 'mean', 'max' ou 'min'")
    if not attentions:
        raise ValueError("Lista de atenções vazia: é necessário ao menos um mapa de atenção.")
    fuse = fusers[head_fusion]

    # Keep the identity on the same device as the inputs so that maps that
    # were not moved to the CPU still work.
    identity = torch.eye(attentions[0].size(-1), device=attentions[0].device)
    result = identity
    with torch.no_grad():
        for attention in attentions:
            fused = fuse(attention)
            # Drop the weakest attentions; flat index 0 (CLS -> CLS) is never dropped.
            flat = fused.view(fused.size(0), -1)
            _, indices = flat.topk(int(flat.size(-1) * discard_ratio), -1, False)
            indices = indices[indices != 0]
            flat[0, indices] = 0
            # Add the identity and normalise
            a = (fused + 1.0 * identity) / 2
            # NOTE(review): kept without keepdim to match the reference
            # implementation; broadcasting then divides column j by the sum of
            # row j rather than normalising each row - confirm this is intended.
            a = a / a.sum(dim=-1)
            # Recursive rollout
            result = torch.matmul(a, result)

    # Total attention between the class token and the image patches
    mask = result[0, 0, 1:]
    width = int(mask.size(-1) ** 0.5)
    mask = mask.reshape(width, width).cpu().numpy()
    # Scale to a maximum of 1; skip when everything is zero to avoid 0/0 -> NaN.
    peak = mask.max()
    if peak > 0:
        mask = mask / peak
    return mask
def create_attention_overlay(original_image: Image.Image,
                            attention_mask: np.ndarray,
                            alpha: float = 0.5,
                            colormap: str = 'jet') -> Image.Image:
    """
    Blend an attention heatmap over the original image using OpenCV.

    Args:
        original_image: Original PIL image (converted to RGB before blending)
        attention_mask: Attention mask [H, W], expected in [0, 1]; values
            outside that range are clipped and NaNs are treated as 0
        alpha: Weight of the original image in the blend; the heatmap gets
            ``1 - alpha`` (e.g. 0.7 = 70% image, 30% heatmap)
        colormap: Name of an OpenCV colormap (``cv2.COLORMAP_<NAME>``), e.g.
            'jet' or 'viridis'; unknown names fall back to 'jet'
    Returns:
        RGB PIL image, same size as ``original_image``, with the overlay applied
    """
    import cv2

    # Force three channels so grayscale/RGBA inputs blend with the RGB heatmap.
    img_np = np.asarray(original_image.convert('RGB'), dtype=np.float32) / 255.0
    h, w = img_np.shape[:2]

    # Sanitise the mask so the uint8 conversion below cannot wrap around.
    mask = np.clip(np.nan_to_num(np.asarray(attention_mask, dtype=np.float32)), 0.0, 1.0)
    # Resize the mask to the image size
    mask_resized = cv2.resize(mask, (w, h))

    colormap_id = getattr(cv2, f'COLORMAP_{str(colormap).upper()}', cv2.COLORMAP_JET)
    heatmap = cv2.applyColorMap(np.uint8(255 * mask_resized), colormap_id)
    # OpenCV returns BGR; convert to RGB before mixing with the PIL image data.
    heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0

    # Blend: alpha * original image + (1 - alpha) * heatmap
    overlay = np.clip(alpha * img_np + (1 - alpha) * heatmap, 0, 1)
    return Image.fromarray((overlay * 255).astype(np.uint8))
def extract_attention_for_iterations(
    model,
    iteration_tensors: list,
    discard_ratio: float = 0.9,
    head_fusion: str = 'max'
) -> list:
    """
    Compute one attention-rollout mask per PGD attack iteration.

    Each tensor is passed through ``extract_attention_maps`` and the resulting
    per-layer maps are reduced with ``attention_rollout``.

    Args:
        model: ViT model
        iteration_tensors: List with one input tensor per iteration
            (presumably normalised [1, 3, 224, 224] batches - confirm against the caller)
        discard_ratio: Fraction of weak attentions to discard
        head_fusion: How to fuse the heads ('mean', 'max', 'min')
    Returns:
        List of attention masks, in the same order as ``iteration_tensors``
    """
    return [
        attention_rollout(
            extract_attention_maps(model, step_tensor),
            discard_ratio=discard_ratio,
            head_fusion=head_fusion,
        )
        for step_tensor in iteration_tensors
    ]
def create_iteration_attention_overlays(
    iteration_images: list,
    attention_masks: list,
    alpha: float = 0.7
) -> list:
    """
    Build one attention overlay per attack iteration.

    Images and masks are paired positionally with ``zip`` (extra items in the
    longer list are ignored). Every overlay is resized to 224x224 with LANCZOS
    resampling; overlays in mode RGBA, LA or P are pasted onto a white RGB
    background. No JPEG encoding is performed here.

    Args:
        iteration_images: List of PIL images (one per iteration)
        attention_masks: List of attention masks, one per image
        alpha: Blend weight forwarded to ``create_attention_overlay``
    Returns:
        List of PIL images with the heatmaps applied
    """
    target_size = (224, 224)
    modes_with_alpha_or_palette = ('RGBA', 'LA', 'P')
    results = []
    for frame, heat in zip(iteration_images, attention_masks):
        blended = create_attention_overlay(frame, heat, alpha=alpha)
        blended = blended.resize(target_size, Image.LANCZOS)
        if blended.mode not in modes_with_alpha_or_palette:
            results.append(blended)
            continue
        # Flatten onto white; palette images are expanded to RGBA first.
        canvas = Image.new('RGB', blended.size, (255, 255, 255))
        if blended.mode == 'P':
            blended = blended.convert('RGBA')
        alpha_band = blended.split()[-1] if blended.mode == 'RGBA' else None
        canvas.paste(blended, mask=alpha_band)
        results.append(canvas)
    return results