dinov3-viz-sat493m

Running on Zero

App Files Files Community

pszemraj commited on Sep 5

Commit

252fe2c

verified ·

1 Parent(s): 43e8613

refactor to satellite-based dinoV3

Browse files

note this was done by claude, potentially overkill

Files changed (1) hide show

app.py +286 -52

app.py CHANGED Viewed

@@ -1,74 +1,167 @@
 import numpy as np
 import torch
 import torch.nn.functional as F
 from PIL import Image, ImageOps
-import matplotlib.cm as cm
-import gradio as gr
 from transformers import AutoImageProcessor, AutoModel
-import spaces
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODEL_MAP = {
-    "DINOv3 ViT+ Huge/16 (841M)": "facebook/dinov3-vith16plus-pretrain-lvd1689m",
-    "DINOv2 ViT Large/16 (303M)": "facebook/dinov3-vitl16-pretrain-lvd1689m",
-    "DINOv2 ConvNeXt Small (49.5M)": "facebook/dinov3-convnext-small-pretrain-lvd1689m",
 }
 DEFAULT_NAME = list(MODEL_MAP.keys())[0]
 processor = None
 model = None
 def load_model(name):
     global processor, model
-    model_id = MODEL_MAP[name]
-    processor = AutoImageProcessor.from_pretrained(model_id)
-    model = AutoModel.from_pretrained(model_id, torch_dtype=torch.float32).to(DEVICE).eval()
-    return f"Loaded: {name} → {model_id}"
 load_model(DEFAULT_NAME)
-@spaces.GPU()
 def _extract_grid(img):
     with torch.inference_mode():
-        pv = processor(images=img, return_tensors="pt").pixel_values.to(DEVICE)
         out = model(pixel_values=pv)
         last = out.last_hidden_state[0].to(torch.float32)
         num_reg = getattr(model.config, "num_register_tokens", 0)
         p = model.config.patch_size
         _, _, Ht, Wt = pv.shape
         gh, gw = Ht // p, Wt // p
-        feats = last[1 + num_reg:, :].reshape(gh, gw, -1).cpu()
     return feats, gh, gw
 def _overlay(orig, heat01, alpha=0.55, box=None):
     H, W = orig.height, orig.width
-    heat = Image.fromarray((heat01 * 255).astype(np.uint8)).resize((W, H))
-    rgba = (cm.get_cmap("inferno")(np.asarray(heat) / 255.0) * 255).astype(np.uint8)
-    ov = Image.fromarray(rgba, "RGBA"); ov.putalpha(int(alpha * 255))
     base = orig.copy().convert("RGBA")
     out = Image.alpha_composite(base, ov)
     if box:
         from PIL import ImageDraw
-        ImageDraw.Draw(out, "RGBA").rectangle(box, outline=(255, 255, 255, 220), width=2)
     return out
 def prepare(img):
     if img is None:
         return None
     base = ImageOps.exif_transpose(img.convert("RGB"))
     feats, gh, gw = _extract_grid(base)
     return {"orig": base, "feats": feats, "gh": gh, "gw": gw}
-def click(state, opacity, img_value, evt: gr.SelectData):
-    # If state wasn't prepared (e.g., Example selection), build it now
     if state is None and img_value is not None:
         state = prepare(img_value)
     if not state or evt.index is None:
-        # Just show whatever is currently in the image component
-        return img_value, state
     base, feats, gh, gw = state["orig"], state["feats"], state["gh"], state["gw"]
@@ -85,52 +178,193 @@ def click(state, opacity, img_value, evt: gr.SelectData):
     smin, smax = float(sims.min()), float(sims.max())
     heat01 = (sims - smin) / (smax - smin + 1e-12)
     box = (int(i * px_x), int(j * px_y), int((i + 1) * px_x), int((j + 1) * px_y))
-    overlay = _overlay(base, heat01, alpha=opacity, box=box)
-    return overlay, state
 def reset():
-    return None, None
-with gr.Blocks() as demo:
-    gr.Markdown("## DINOv3: patch similarity visualizer")
-    gr.Markdown("This is an app where you can upload an image, click on an object in the image and get the most similar patches to it according to DINOv3, revealing the way DINOv3 segments objects through features natively.")
-    gr.Markdown("There's multiple model options you can pick from the dropdown.")
-    gr.Markdown("Please click Reset before you want to upload another image, as this app keeps features of the images.")
-    with gr.Column():
-        with gr.Row(scale=1):
             model_choice = gr.Dropdown(
-                choices=list(MODEL_MAP.keys()), value=DEFAULT_NAME, label="Model"
             )
-            status = gr.Textbox(label="Status", value=f"Loaded: {DEFAULT_NAME}", interactive=False)
-            opacity = gr.Slider(0.0, 1.0, 0.55, step=0.05, label="Opacity for the Map")
-        with gr.Row(scale=1):
-            img = gr.Image(type="pil", label="Image", interactive=True, height=750, width=750)
-    state = gr.State()
-    model_choice.change(load_model, inputs=model_choice, outputs=status)
-    img.upload(prepare, inputs=img, outputs=state)
-    img.select(click, inputs=[state, opacity, img], outputs=[img, state], show_progress="minimal")
-    gr.Button("Reset").click(reset, outputs=[img, state])
     gr.Examples(
-                examples=[["flowers.PNG"], ["kedis.JPG"], ["kyoto.jpg"], ["nara.JPG"]],
-                inputs=img,
-                fn=prepare,
-                outputs=[img,state],
-                label="Try an example image and then click on the objects.",
-                examples_per_page=4,
-                cache_examples=False,
-                  )
 if __name__ == "__main__":
-    demo.launch()

+import gc
+from pathlib import Path
+import gradio as gr
+import matplotlib.cm as cm
 import numpy as np
+import spaces
 import torch
 import torch.nn.functional as F
 from PIL import Image, ImageOps
 from transformers import AutoImageProcessor, AutoModel
+# Device configuration with memory management
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODEL_MAP = {
+    "DINOv3 ViT-L/16 Satellite": "facebook/dinov3-vitl16-pretrain-sat493m",
+    "DINOv3 ViT-L/16 LVD (General Web)": "facebook/dinov3-vitl16-pretrain-lvd1689m",
+    "⚠️ DINOv3 ViT-7B/16 Satellite": "facebook/dinov3-vit7b16-pretrain-sat493m",
 }
 DEFAULT_NAME = list(MODEL_MAP.keys())[0]
+# Global model state
 processor = None
 model = None
+def cleanup_memory():
+    """Aggressive memory cleanup for model switching"""
+    global processor, model
+    if model is not None:
+        del model
+        model = None
+    if processor is not None:
+        del processor
+        processor = None
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
 def load_model(name):
+    """Load model with proper memory management and dtype handling"""
     global processor, model
+    try:
+        # Clean up existing model
+        cleanup_memory()
+        model_id = MODEL_MAP[name]
+        # Load with auto dtype for optimal performance
+        processor = AutoImageProcessor.from_pretrained(model_id)
+        # Determine optimal dtype based on model size and hardware
+        if "7b" in model_id.lower() and torch.cuda.is_available():
+            # For 7B model, use bfloat16 if available for memory efficiency
+            dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+        else:
+            dtype = torch.float32
+        model = AutoModel.from_pretrained(
+            model_id,
+            torch_dtype=dtype,
+            device_map="auto" if torch.cuda.is_available() else None,
+        )
+        if DEVICE == "cuda" and not hasattr(model, "device_map"):
+            model = model.to(DEVICE)
+        model.eval()
+        # Get model info
+        param_count = sum(p.numel() for p in model.parameters()) / 1e9
+        dtype_str = str(dtype).split(".")[-1]
+        return f"✅ Loaded: {name} | {param_count:.1f}B params | {dtype_str} | {DEVICE.upper()}"
+    except Exception as e:
+        cleanup_memory()
+        return f"❌ Failed to load {name}: {str(e)}"
+# Initialize default model
 load_model(DEFAULT_NAME)
+@spaces.GPU(duration=60)
 def _extract_grid(img):
+    """Extract feature grid from image"""
     with torch.inference_mode():
+        pv = processor(images=img, return_tensors="pt").pixel_values
+        if DEVICE == "cuda":
+            pv = pv.to(DEVICE)
         out = model(pixel_values=pv)
         last = out.last_hidden_state[0].to(torch.float32)
         num_reg = getattr(model.config, "num_register_tokens", 0)
         p = model.config.patch_size
         _, _, Ht, Wt = pv.shape
         gh, gw = Ht // p, Wt // p
+        feats = last[1 + num_reg :, :].reshape(gh, gw, -1).cpu()
     return feats, gh, gw
 def _overlay(orig, heat01, alpha=0.55, box=None):
+    """Create heatmap overlay with improved visualization"""
     H, W = orig.height, orig.width
+    heat = Image.fromarray((heat01 * 255).astype(np.uint8)).resize(
+        (W, H), resample=Image.LANCZOS
+    )
+    # Use a better colormap for satellite imagery
+    rgba = (cm.get_cmap("turbo")(np.asarray(heat) / 255.0) * 255).astype(np.uint8)
+    ov = Image.fromarray(rgba, "RGBA")
+    ov.putalpha(int(alpha * 255))
     base = orig.copy().convert("RGBA")
     out = Image.alpha_composite(base, ov)
     if box:
         from PIL import ImageDraw
+        draw = ImageDraw.Draw(out, "RGBA")
+        # Enhanced box visualization
+        draw.rectangle(box, outline=(255, 255, 255, 255), width=3)
+        draw.rectangle(
+            (box[0] - 1, box[1] - 1, box[2] + 1, box[3] + 1),
+            outline=(0, 0, 0, 200),
+            width=1,
+        )
     return out
 def prepare(img):
+    """Prepare image and extract features"""
     if img is None:
         return None
     base = ImageOps.exif_transpose(img.convert("RGB"))
     feats, gh, gw = _extract_grid(base)
     return {"orig": base, "feats": feats, "gh": gh, "gw": gw}
+def click(state, opacity, colormap, img_value, evt: gr.SelectData):
+    """Handle click events for similarity visualization"""
+    # If state wasn't prepared, build it now
     if state is None and img_value is not None:
         state = prepare(img_value)
     if not state or evt.index is None:
+        return img_value, state, None
     base, feats, gh, gw = state["orig"], state["feats"], state["gh"], state["gw"]
     smin, smax = float(sims.min()), float(sims.max())
     heat01 = (sims - smin) / (smax - smin + 1e-12)
+    # Update colormap dynamically
+    cm_func = cm.get_cmap(colormap.lower())
+    rgba = (cm_func(heat01) * 255).astype(np.uint8)
+    ov = Image.fromarray(rgba, "RGBA")
+    ov.putalpha(int(opacity * 255))
+    base_rgba = base.copy().convert("RGBA")
     box = (int(i * px_x), int(j * px_y), int((i + 1) * px_x), int((j + 1) * px_y))
+    out = Image.alpha_composite(base_rgba, ov)
+    if box:
+        from PIL import ImageDraw
+        draw = ImageDraw.Draw(out, "RGBA")
+        draw.rectangle(box, outline=(255, 255, 255, 255), width=3)
+        draw.rectangle(
+            (box[0] - 1, box[1] - 1, box[2] + 1, box[3] + 1),
+            outline=(0, 0, 0, 200),
+            width=1,
+        )
+    # Stats for info panel
+    stats = f"""📊 **Similarity Statistics**
+- Min: {smin:.3f}
+- Max: {smax:.3f}
+- Range: {smax - smin:.3f}
+- Patch: ({i}, {j})
+- Grid: {gw}×{gh}"""
+    return out, state, stats
 def reset():
+    """Reset the interface"""
+    return None, None, None
+# Build the interface
+with gr.Blocks(
+    theme=gr.themes.Soft(
+        primary_hue="blue",
+        secondary_hue="gray",
+        neutral_hue="gray",
+        font=gr.themes.GoogleFont("Inter"),
+    ),
+    css="""
+    .container {max-width: 1200px; margin: auto;}
+    .header {text-align: center; padding: 20px;}
+    .info-box {
+        background: rgba(0,0,0,0.03);
+        border-radius: 8px;
+        padding: 12px;
+        margin: 10px 0;
+        border-left: 4px solid #2563eb;
+    }
+    """,
+) as demo:
+    gr.HTML(
+        """
+    <div class="header">
+        <h1>🛰️ DINOv3 Satellite Vision: Interactive Patch Similarity</h1>
+        <p style="font-size: 1.1em; color: #666;">
+            Explore how DINOv3 models trained on satellite imagery understand visual patterns
+        </p>
+    </div>
+    """
+    )
+    with gr.Row():
+        with gr.Column(scale=3):
+            gr.Markdown(
+                """
+            ### How it works
+            1. **Select a model** - Satellite-pretrained models are optimized for aerial/satellite imagery
+            2. **Upload or select an image** - Works best with satellite, aerial, or outdoor scenes
+            3. **Click any region** - See how similar other patches are to your selection
+            4. **Adjust visualization** - Fine-tune opacity and colormap for clarity
+            """
+            )
+        with gr.Column(scale=2):
+            gr.HTML(
+                """
+            <div class="info-box">
+                <b>💡 Model Info:</b><br>
+                • <b>Satellite models</b>: Trained on 493M satellite images<br>
+                • <b>LVD model</b>: Trained on 1.7B diverse images<br>
+                • <b>7B model</b>: Massive capacity, slower but more nuanced
+            </div>
+            """
+            )
+    with gr.Row():
+        with gr.Column(scale=1):
             model_choice = gr.Dropdown(
+                choices=list(MODEL_MAP.keys()),
+                value=DEFAULT_NAME,
+                label="🤖 Model Selection",
+                info="Satellite models excel at geographic and structural patterns",
             )
+            status = gr.Textbox(
+                label="📡 Model Status",
+                value=f"Ready: {DEFAULT_NAME}",
+                interactive=False,
+                lines=1,
+            )
+            with gr.Row():
+                opacity = gr.Slider(
+                    0.2,
+                    0.9,
+                    0.55,
+                    step=0.05,
+                    label="🎨 Heatmap Opacity",
+                    info="Balance between image and similarity map",
+                )
+                colormap = gr.Dropdown(
+                    choices=["Turbo", "Inferno", "Viridis", "Plasma", "Magma", "Jet"],
+                    value="Turbo",
+                    label="🌈 Colormap",
+                    info="Different maps for different contrasts",
+                )
+            info_panel = gr.Markdown(value=None, label="Statistics", visible=True)
+            with gr.Row():
+                reset_btn = gr.Button("🔄 Reset", variant="secondary", scale=1)
+                clear_btn = gr.ClearButton(value="🗑️ Clear All", scale=1)
+        with gr.Column(scale=2):
+            img = gr.Image(
+                type="pil",
+                label="Interactive Canvas (Click to explore)",
+                interactive=True,
+                height=600,
+                show_download_button=True,
+                show_share_button=False,
+            )
+    state = gr.State()
+    # Examples focused on satellite-relevant imagery
     gr.Examples(
+        examples=[
+            [_filepath.name]
+            for _filepath in Path.cwd().iterdir()
+            if _filepath.suffix.lower() in [".jpg", ".png", ".webp"]
+        ],
+        inputs=img,
+        fn=prepare,
+        outputs=[state],
+        label="Example Images",
+        examples_per_page=6,
+        cache_examples=False,
+    )
+    # Event handlers
+    model_choice.change(
+        load_model, inputs=model_choice, outputs=status, show_progress="full"
+    )
+    img.upload(prepare, inputs=img, outputs=state, show_progress="minimal")
+    img.select(
+        click,
+        inputs=[state, opacity, colormap, img],
+        outputs=[img, state, info_panel],
+        show_progress="minimal",
+    )
+    reset_btn.click(reset, outputs=[img, state, info_panel], show_progress=False)
+    clear_btn.add([img, state, info_panel])
+    gr.Markdown(
+        """
+    ---
+    <div style="text-align: center; color: #666; font-size: 0.9em;">
+        <b>Performance Notes:</b> Satellite models are optimized for geographic patterns, land use classification,
+        and structural analysis. The 7B model provides exceptional detail but requires significant compute.
+        <br><br>
+        Built with DINOv3 | Optimized for satellite and aerial imagery analysis
+    </div>
+    """
+    )
 if __name__ == "__main__":
+    demo.launch(share=False, show_error=True)