Update app.py
app.py CHANGED
@@ -154,8 +154,6 @@ allowed_tags = list(tags.keys())
 for idx, tag in enumerate(allowed_tags):
     allowed_tags[idx] = tag.replace("_", " ")
 
-
-
 @spaces.GPU(duration=5)
 def run_classifier(image: Image.Image, threshold):
     img = image.convert('RGBA')
@@ -186,9 +184,6 @@ def cam_inference(img, threshold, alpha, evt: gr.SelectData):
 
     gradients = {}
     activations = {}
-    cam = None
-    target_tag_index = None
-
 
     def hook_forward(module, input, output):
         activations['value'] = output
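For context: `cam_inference` stores the features of `model.norm` with a forward hook and its incoming gradient with a full backward hook (registered in the next hunk). A minimal, self-contained sketch of that hook pattern, not taken from the commit, with `nn.LayerNorm` standing in for `model.norm`:

```python
# Sketch only: capture a module's output and its gradient via hooks,
# the same pattern cam_inference uses on model.norm.
import torch
import torch.nn as nn

activations, gradients = {}, {}

def hook_forward(module, inputs, output):
    activations['value'] = output           # saved on every forward pass

def hook_backward(module, grad_input, grad_output):
    gradients['value'] = grad_output[0]     # gradient w.r.t. the module's output

layer = nn.LayerNorm(8)                     # stand-in for model.norm
h_fwd = layer.register_forward_hook(hook_forward)
h_bwd = layer.register_full_backward_hook(hook_backward)

x = torch.randn(1, 4, 8, requires_grad=True)
layer(x).sum().backward()                   # populates both dicts

h_fwd.remove()
h_bwd.remove()
print(activations['value'].shape, gradients['value'].shape)
```

Removing the handles after the backward pass, as the commit does, keeps the hooks from firing on later inferences.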
@@ -200,29 +195,24 @@ def cam_inference(img, threshold, alpha, evt: gr.SelectData):
     handle_forward = model.norm.register_forward_hook(hook_forward)
     handle_backward = model.norm.register_full_backward_hook(hook_backward)
 
-    probits = model(tensor)[0]
+    probits = model(tensor)[0]
 
     model.zero_grad()
-
-    target_score.backward(retain_graph=True)
-
-    grads = gradients.get('value')
-    acts = activations.get('value')
+    probits[target_tag_index].backward(retain_graph=True)
 
-
-
-
-
-
-
-
-
-
+    with torch.no_grad():
+        patch_grads = gradients.get('value')
+        patch_acts = activations.get('value')
+
+        weights = torch.mean(patch_grads, dim=1).squeeze(0)
+
+        cam_1d = torch.einsum('pe,e->p', patch_acts.squeeze(0), weights)
+        cam_1d = torch.relu(cam_1d)
+
+        cam = cam_1d.reshape(27, 27).detach().cpu().numpy()
 
     handle_forward.remove()
     handle_backward.remove()
-    gradients = {}
-    activations = {}
 
     return create_cam_visualization_pil(img, cam, alpha=alpha, vis_threshold=threshold), cam
 
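The added block is a Grad-CAM-style computation: the gradient of the selected tag's score with respect to the captured patch features is averaged over the patch dimension to give per-channel weights, a weighted sum over channels scores each patch, ReLU keeps only positive evidence, and the scores are reshaped to the 27×27 patch grid. A standalone sketch of that step on dummy tensors (the `(1, 729, 1024)` shapes and the function name are assumptions, not read from the model):

```python
# Sketch of the Grad-CAM step added above, run on dummy tensors.
import torch

def gradcam_from_hooks(patch_acts: torch.Tensor, patch_grads: torch.Tensor, grid: int = 27):
    # patch_acts, patch_grads: (1, num_patches, embed_dim), as captured by the hooks
    weights = torch.mean(patch_grads, dim=1).squeeze(0)               # (embed_dim,) channel weights
    cam_1d = torch.einsum('pe,e->p', patch_acts.squeeze(0), weights)  # (num_patches,) per-patch score
    cam_1d = torch.relu(cam_1d)                                       # keep positive evidence only
    return cam_1d.reshape(grid, grid).detach().cpu().numpy()          # (grid, grid) heat map

acts = torch.randn(1, 27 * 27, 1024)
grads = torch.randn(1, 27 * 27, 1024)
print(gradcam_from_hooks(acts, grads).shape)  # (27, 27)
```

The `reshape(27, 27)` in the commit implies 729 patch tokens coming out of `model.norm`; the grid size is taken from the commit, not derived here.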
@@ -245,26 +235,30 @@ def create_cam_visualization_pil(image_pil, cam, alpha=0.6, vis_threshold=0.2):
     size = max(w, h)
 
     # Normalize CAM to [0, 1]
-
+    cam -= cam.min()
+    cam /= cam.max()
 
     # Create heatmap using matplotlib colormap
     colormap = cm.get_cmap('inferno')
-
-    cam_alpha = (cam_norm >= vis_threshold).astype(np.float32) * alpha # Alpha mask
+    cam_rgb = colormap(cam)[:, :, :3] # RGB
 
-
+    # Create alpha channel
+    cam_alpha = (cam >= vis_threshold).astype(np.float32) * alpha # Alpha mask
+    cam_rgba = np.dstack((cam_rgb, cam_alpha)) # Shape: (H, W, 4)
 
     # Resize CAM to match image
-
+    cam_pil = Image.fromarray((cam_rgba * 255).astype(np.uint8), mode="RGBA")
+    cam_pil = cam_pil.resize((216,216), resample=Image.Resampling.NEAREST)
 
-
+    # Model uses padded image as input, this matches attention map to input image aspect ratio
+    cam_pil = cam_pil.resize((size, size), resample=Image.Resampling.BICUBIC)
+    cam_pil = transforms.CenterCrop((h, w))(cam_pil)
 
     # Composite over original
     composite = Image.alpha_composite(image_pil, cam_image)
 
     return composite
 
-
 with gr.Blocks(css=".output-class { display: none; }") as demo:
     gr.Markdown("""
     ## Joint Tagger Project: JTP-PILOT² Demo **BETA**
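The rewritten `create_cam_visualization_pil` normalizes the CAM to [0, 1], maps it through the inferno colormap, thresholds it into an alpha mask, and resizes the 27×27 map back to the padded square before center-cropping to the source aspect ratio. A self-contained sketch of the same pipeline; the function name, the epsilon guard on the division, and the manual crop in place of `transforms.CenterCrop` are mine, and the commit's intermediate nearest-neighbor upsample to 216×216 is omitted:

```python
# Sketch: normalize -> colormap -> alpha mask -> resize -> center crop -> composite.
import numpy as np
from PIL import Image
from matplotlib import colormaps

def overlay_cam(image_pil: Image.Image, cam: np.ndarray, alpha=0.6, vis_threshold=0.2):
    image_pil = image_pil.convert("RGBA")
    w, h = image_pil.size
    size = max(w, h)                                  # model sees a square, padded input

    cam = cam - cam.min()
    cam = cam / (cam.max() + 1e-8)                    # normalize to [0, 1]

    cmap = colormaps["inferno"]                       # same map the commit gets via cm.get_cmap('inferno')
    cam_rgb = cmap(cam)[:, :, :3]                     # (H, W, 3) colors
    cam_alpha = (cam >= vis_threshold).astype(np.float32) * alpha
    cam_rgba = np.dstack((cam_rgb, cam_alpha))        # (H, W, 4)

    cam_pil = Image.fromarray((cam_rgba * 255).astype(np.uint8), mode="RGBA")
    cam_pil = cam_pil.resize((size, size), resample=Image.Resampling.BICUBIC)
    left, top = (size - w) // 2, (size - h) // 2
    cam_pil = cam_pil.crop((left, top, left + w, top + h))  # center crop back to (w, h)
    return Image.alpha_composite(image_pil, cam_pil)

demo_img = overlay_cam(Image.new("RGBA", (640, 480), "white"), np.random.rand(27, 27))
print(demo_img.size)  # (640, 480)
```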
@@ -280,10 +274,20 @@ with gr.Blocks(css=".output-class { display: none; }") as demo:
     sorted_tag_score_state = gr.State(value={}) # stash a copy of the input image
     cam_state = gr.State()
     with gr.Row():
-
+        custom_css = """
+        .inferno-slider input[type=range] {
+            background: linear-gradient(to right,
+                #000004, #1b0c41, #4a0c6b, #781c6d,
+                #a52c60, #cf4446, #ed6925, #fb9b06,
+                #f7d13d, #fcffa4
+            ) !important;
+            background-size: 100% 100% !important;
+        }
+        """
+        with gr.Column(css=custom_css):
             image_input = gr.Image(label="Source", sources=['upload'], type='pil', height=512, show_label=False)
             threshold_slider = gr.Slider(minimum=0.00, maximum=1.00, step=0.01, value=0.20, label="Tag Threshold")
-            cam_slider = gr.Slider(minimum=0.00, maximum=1.00, step=0.01, value=0.
+            cam_slider = gr.Slider(minimum=0.00, maximum=1.00, step=0.01, value=0.40, label="CAM Threshold", elem_classes="inferno-slider")
             alpha_slider = gr.Slider(minimum=0.00, maximum=1.00, step=0.01, value=0.60, label="CAM Alpha")
         with gr.Column():
             tag_string = gr.Textbox(label="Tag String")
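The new CSS paints the CAM-threshold slider track with the inferno gradient so the control matches the heat map's color scale. A minimal sketch of wiring that class to a slider, assuming the CSS is passed at the `gr.Blocks` level (the usual place Gradio accepts custom CSS) rather than to `gr.Column` as the commit does:

```python
# Sketch only: inferno-gradient track for the CAM threshold slider.
# Assumption: css is supplied to gr.Blocks and targeted via elem_classes.
import gradio as gr

inferno_css = """
.inferno-slider input[type=range] {
    background: linear-gradient(to right,
        #000004, #1b0c41, #4a0c6b, #781c6d,
        #a52c60, #cf4446, #ed6925, #fb9b06,
        #f7d13d, #fcffa4
    ) !important;
    background-size: 100% 100% !important;
}
"""

with gr.Blocks(css=inferno_css) as demo:
    cam_slider = gr.Slider(minimum=0.00, maximum=1.00, step=0.01, value=0.40,
                           label="CAM Threshold", elem_classes="inferno-slider")

if __name__ == "__main__":
    demo.launch()
```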