Spaces:

griddev
/

project_02_DS

Sleeping

App Files Community

griddev committed 28 days ago

Commit

64b98e5

verified ·

1 Parent(s): ce25d0a

Deploy Streamlit Space app

Browse files

Files changed (2) hide show

app.py +6 -4
models/attention_flow.py +7 -2

app.py CHANGED Viewed

@@ -782,7 +782,7 @@ def render_caption_card(model_name, caption, weight_src, num_beams, length_penal
 # ─────────────────────────────────────────────────────────────────────────────
 tab_caption, tab_compare, tab_attention, tab_results = st.tabs([
-    "🖼️  Caption", "🔀  Compare All Models", "🧠  Attention Explorer", "📊  Experiment Results"
 ])
@@ -998,11 +998,12 @@ with tab_compare:
 # ═══════════════════════════════════════════════════════════════════════════
-# Tab 3 — Attention Explorer (Task 2)
 # ═══════════════════════════════════════════════════════════════════════════
 with tab_attention:
-    st.markdown("### 🧠 BLIP Attention Explorer")
     st.caption(
         "Step-by-step cross-attention analysis with rollout across decoder layers, "
         "2x5 heatmap grid, IoU grounding score, and caption-length summary."
@@ -1049,10 +1050,11 @@ with tab_attention:
             )
         max_attn_steps = st.slider(
-            "Caption Steps to Analyze",
             min_value=3,
             max_value=12,
             value=9,
             key="attn_steps",
         )
         run_iou = st.toggle(

 # ─────────────────────────────────────────────────────────────────────────────
 tab_caption, tab_compare, tab_attention, tab_results = st.tabs([
+    "🖼️  Caption", "🔀  Compare All Models", "🧭  Word Focus Map", "📊  Experiment Results"
 ])
 # ═══════════════════════════════════════════════════════════════════════════
+# Tab 3 — Word Focus Map (Task 2)
 # ═══════════════════════════════════════════════════════════════════════════
 with tab_attention:
+    st.markdown("### 🧭 Word Focus Map")
+    st.markdown("`Task: Attention Weight Visualization & Cross-Attention Rollout for Caption Generation`")
     st.caption(
         "Step-by-step cross-attention analysis with rollout across decoder layers, "
         "2x5 heatmap grid, IoU grounding score, and caption-length summary."
             )
         max_attn_steps = st.slider(
+            "How many words to trace",
             min_value=3,
             max_value=12,
             value=9,
+            help="One step = one word position in the generated/custom text (word 1, word 2, ...).",
             key="attn_steps",
         )
         run_iou = st.toggle(

models/attention_flow.py CHANGED Viewed

@@ -80,7 +80,7 @@ def _normalize1d(tensor: torch.Tensor) -> torch.Tensor:
 def compute_attention_flow(
     extractor: FlowExtractor,
-    num_image_tokens: int = 197,
     residual_weight: float = 0.05,
     out_resolution: int = 224,
 ) -> np.ndarray:
@@ -93,6 +93,12 @@ def compute_attention_flow(
     if not valid_cams:
         return np.zeros((out_resolution, out_resolution), dtype=np.float32)
     uniform = torch.ones(num_image_tokens, device=valid_cams[0].device) / num_image_tokens
     rollout = _normalize1d(valid_cams[0])
     for cam in valid_cams[1:]:
@@ -325,4 +331,3 @@ def summarize_caption_alignment(results: List[dict], caption_length: int) -> dic
         return {"caption_length": caption_length, "mean_alignment_iou": 0.0}
     mean_iou = float(np.mean([item["iou"] for item in results]))
     return {"caption_length": caption_length, "mean_alignment_iou": mean_iou}

 def compute_attention_flow(
     extractor: FlowExtractor,
+    num_image_tokens: int | None = None,
     residual_weight: float = 0.05,
     out_resolution: int = 224,
 ) -> np.ndarray:
     if not valid_cams:
         return np.zeros((out_resolution, out_resolution), dtype=np.float32)
+    if num_image_tokens is None:
+        num_image_tokens = int(valid_cams[0].numel())
+    valid_cams = [cam for cam in valid_cams if int(cam.numel()) == int(num_image_tokens)]
+    if not valid_cams:
+        return np.zeros((out_resolution, out_resolution), dtype=np.float32)
     uniform = torch.ones(num_image_tokens, device=valid_cams[0].device) / num_image_tokens
     rollout = _normalize1d(valid_cams[0])
     for cam in valid_cams[1:]:
         return {"caption_length": caption_length, "mean_alignment_iou": 0.0}
     mean_iou = float(np.mean([item["iou"] for item in results]))
     return {"caption_length": caption_length, "mean_alignment_iou": mean_iou}