pszemraj committed on
Commit
fe31b95
·
verified ·
1 Parent(s): bded4ff

Dynamic img sizing

Browse files

Key Changes:

Dynamic sizing function (compute_dynamic_size): Computes optimal dimensions up to 720px max while maintaining aspect ratio AND ensuring divisibility by patch_size (16).
Custom preprocessing (preprocess_image): Bypasses the rigid 224×224 resize. DINOv3 was trained with scale augmentation and its RoPE handles this perfectly.
Resolution feedback: Added resolution_info textbox showing actual processing resolution and patch grid dimensions.
Proper normalization: Still applies ImageNet normalization but at the dynamically computed resolution.

Why This Works:
DINOv3's 3D RoPE embeddings are computed dynamically in the forward pass from the actual image dimensions (see get_patches_center_coordinates in the modeling code), and the model was trained with random scale augmentation specifically to handle variable input sizes. Forcing a fixed 224×224 resize was therefore an unnecessary constraint.

Files changed (1) hide show
  1. app.py +116 -32
app.py CHANGED
@@ -20,6 +20,7 @@ MODEL_MAP = {
20
  }
21
 
22
  DEFAULT_NAME = list(MODEL_MAP.keys())[0]
 
23
 
24
  # Global model state
25
  processor = None
@@ -42,23 +43,44 @@ def cleanup_memory():
42
 
43
  if torch.cuda.is_available():
44
  torch.cuda.empty_cache()
45
- # torch.cuda.synchronize()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
 
48
  def load_model(name):
49
  """Load model with CORRECT dtype"""
50
  global processor, model
51
-
52
  cleanup_memory()
53
  model_id = MODEL_MAP[name]
54
-
55
  processor = AutoImageProcessor.from_pretrained(model_id)
56
-
57
  model = AutoModel.from_pretrained(
58
  model_id,
59
- torch_dtype="auto",
60
  ).eval()
61
-
62
  param_count = sum(p.numel() for p in model.parameters()) / 1e9
63
  return f"Loaded: {name} | {param_count:.1f}B params | Ready"
64
 
@@ -67,35 +89,77 @@ def load_model(name):
67
  load_model(DEFAULT_NAME)
68
 
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  @spaces.GPU(duration=60)
71
  def _extract_grid(img):
72
- """Extract feature grid from image"""
73
  global model
74
-
75
  with torch.inference_mode():
76
  # Move model to GPU for this call
77
- model = model.to('cuda')
78
-
79
- # Process image and move to GPU
80
- pv = processor(images=img, return_tensors="pt").pixel_values.to(model.device)
81
-
82
- # Run inference
 
83
  out = model(pixel_values=pv)
84
  last = out.last_hidden_state[0].to(torch.float32)
85
-
86
  # Extract features
87
  num_reg = getattr(model.config, "num_register_tokens", 0)
88
  p = model.config.patch_size
89
- _, _, Ht, Wt = pv.shape
90
- gh, gw = Ht // p, Wt // p
91
-
92
- feats = last[1 + num_reg:, :].reshape(gh, gw, -1).cpu()
93
-
 
94
  # Move model back to CPU before function exits
95
  model = model.cpu()
96
  torch.cuda.empty_cache()
97
-
98
- return feats, gh, gw
99
 
100
 
101
  def _overlay(orig, heat01, alpha=0.55, box=None):
@@ -123,14 +187,21 @@ def _overlay(orig, heat01, alpha=0.55, box=None):
123
 
124
 
125
  def prepare(img):
126
- """Prepare image and extract features"""
127
  if img is None:
128
  return None
129
 
130
  base = ImageOps.exif_transpose(img.convert("RGB"))
131
- feats, gh, gw = _extract_grid(base)
132
-
133
- return {"orig": base, "feats": feats, "gh": gh, "gw": gw}
 
 
 
 
 
 
 
134
 
135
 
136
  def click(state, opacity, img_value, evt: gr.SelectData):
@@ -160,12 +231,16 @@ def click(state, opacity, img_value, evt: gr.SelectData):
160
 
161
  box = (int(i * px_x), int(j * px_y), int((i + 1) * px_x), int((j + 1) * px_y))
162
  overlay = _overlay(base, heat01, alpha=opacity, box=box)
163
- return overlay, state
 
 
 
 
164
 
165
 
166
  def reset():
167
  """Reset the interface"""
168
- return None, None
169
 
170
 
171
  with gr.Blocks(
@@ -207,6 +282,12 @@ with gr.Blocks(
207
  interactive=False,
208
  lines=1,
209
  )
 
 
 
 
 
 
210
  opacity = gr.Slider(
211
  0.0,
212
  1.0,
@@ -241,12 +322,12 @@ with gr.Blocks(
241
  img.select(
242
  click,
243
  inputs=[state, opacity, img],
244
- outputs=[img, state],
245
  show_progress="minimal",
246
  )
247
 
248
- reset_btn.click(reset, outputs=[img, state])
249
- clear_btn.add([img, state])
250
 
251
  # Examples from current directory
252
  example_files = [
@@ -267,9 +348,12 @@ with gr.Blocks(
267
  )
268
 
269
  gr.Markdown(
270
- """
271
  ---
272
  <div style="text-align: center; color: #666; font-size: 0.9em;">
 
 
 
273
  <b>Performance Notes:</b> Satellite models are optimized for geographic patterns, land use classification,
274
  and structural analysis. The 7B model provides exceptional detail but requires significant compute.
275
  <br><br>
 
20
  }
21
 
22
  DEFAULT_NAME = list(MODEL_MAP.keys())[0]
23
+ MAX_IMAGE_DIM = 720 # Maximum dimension for longer side
24
 
25
  # Global model state
26
  processor = None
 
43
 
44
  if torch.cuda.is_available():
45
  torch.cuda.empty_cache()
46
+
47
+
48
+ def compute_dynamic_size(height, width, max_dim=720, patch_size=16):
49
+ """
50
+ Compute new dimensions preserving aspect ratio with max_dim constraint.
51
+ Ensures dimensions are divisible by patch_size for clean patch extraction.
52
+ """
53
+ # Determine scaling factor
54
+ if height > width:
55
+ scale = min(1.0, max_dim / height)
56
+ else:
57
+ scale = min(1.0, max_dim / width)
58
+
59
+ # Compute new dimensions
60
+ new_height = int(height * scale)
61
+ new_width = int(width * scale)
62
+
63
+ # Round to nearest multiple of patch_size for clean patches
64
+ new_height = (new_height // patch_size) * patch_size
65
+ new_width = (new_width // patch_size) * patch_size
66
+
67
+ return new_height, new_width
68
 
69
 
70
  def load_model(name):
71
  """Load model with CORRECT dtype"""
72
  global processor, model
73
+
74
  cleanup_memory()
75
  model_id = MODEL_MAP[name]
76
+
77
  processor = AutoImageProcessor.from_pretrained(model_id)
78
+
79
  model = AutoModel.from_pretrained(
80
  model_id,
81
+ torch_dtype="auto",
82
  ).eval()
83
+
84
  param_count = sum(p.numel() for p in model.parameters()) / 1e9
85
  return f"Loaded: {name} | {param_count:.1f}B params | Ready"
86
 
 
89
  load_model(DEFAULT_NAME)
90
 
91
 
92
+ def preprocess_image(img):
93
+ """
94
+ Custom preprocessing that respects aspect ratio and uses dynamic sizing.
95
+ DINOv3's RoPE handles variable sizes beautifully - no need to constrain to 224x224!
96
+ """
97
+ # Convert to RGB if needed
98
+ if img.mode != "RGB":
99
+ img = img.convert("RGB")
100
+
101
+ # Compute dynamic size
102
+ orig_h, orig_w = img.height, img.width
103
+ patch_size = model.config.patch_size
104
+ new_h, new_w = compute_dynamic_size(orig_h, orig_w, MAX_IMAGE_DIM, patch_size)
105
+
106
+ # Resize image
107
+ img_resized = img.resize((new_w, new_h), Image.Resampling.BICUBIC)
108
+
109
+ # Convert to tensor and normalize using the processor's normalization params
110
+ img_array = np.array(img_resized).astype(np.float32) / 255.0
111
+
112
+ # Apply ImageNet normalization (from processor config)
113
+ mean = (
114
+ processor.image_mean
115
+ if hasattr(processor, "image_mean")
116
+ else [0.485, 0.456, 0.406]
117
+ )
118
+ std = (
119
+ processor.image_std
120
+ if hasattr(processor, "image_std")
121
+ else [0.229, 0.224, 0.225]
122
+ )
123
+
124
+ img_array = (img_array - mean) / std
125
+
126
+ # Convert to tensor with correct shape: [1, C, H, W]
127
+ img_tensor = torch.from_numpy(img_array).permute(2, 0, 1).unsqueeze(0).float()
128
+
129
+ return img_tensor, new_h, new_w
130
+
131
+
132
  @spaces.GPU(duration=60)
133
  def _extract_grid(img):
134
+ """Extract feature grid from image - now with dynamic sizing!"""
135
  global model
136
+
137
  with torch.inference_mode():
138
  # Move model to GPU for this call
139
+ model = model.to("cuda")
140
+
141
+ # Preprocess with dynamic sizing
142
+ pv, img_h, img_w = preprocess_image(img)
143
+ pv = pv.to(model.device)
144
+
145
+ # Run inference - the model handles variable sizes perfectly!
146
  out = model(pixel_values=pv)
147
  last = out.last_hidden_state[0].to(torch.float32)
148
+
149
  # Extract features
150
  num_reg = getattr(model.config, "num_register_tokens", 0)
151
  p = model.config.patch_size
152
+
153
+ # Calculate grid dimensions based on actual image size
154
+ gh, gw = img_h // p, img_w // p
155
+
156
+ feats = last[1 + num_reg :, :].reshape(gh, gw, -1).cpu()
157
+
158
  # Move model back to CPU before function exits
159
  model = model.cpu()
160
  torch.cuda.empty_cache()
161
+
162
+ return feats, gh, gw, img_h, img_w
163
 
164
 
165
  def _overlay(orig, heat01, alpha=0.55, box=None):
 
187
 
188
 
189
  def prepare(img):
190
+ """Prepare image and extract features with dynamic sizing"""
191
  if img is None:
192
  return None
193
 
194
  base = ImageOps.exif_transpose(img.convert("RGB"))
195
+ feats, gh, gw, img_h, img_w = _extract_grid(base)
196
+
197
+ return {
198
+ "orig": base,
199
+ "feats": feats,
200
+ "gh": gh,
201
+ "gw": gw,
202
+ "processed_h": img_h,
203
+ "processed_w": img_w,
204
+ }
205
 
206
 
207
  def click(state, opacity, img_value, evt: gr.SelectData):
 
231
 
232
  box = (int(i * px_x), int(j * px_y), int((i + 1) * px_x), int((j + 1) * px_y))
233
  overlay = _overlay(base, heat01, alpha=opacity, box=box)
234
+
235
+ # Add info about resolution being processed
236
+ info_text = f"Processing at: {state['processed_w']}×{state['processed_h']} ({gh}×{gw} patches)"
237
+
238
+ return overlay, state, info_text
239
 
240
 
241
  def reset():
242
  """Reset the interface"""
243
+ return None, None, ""
244
 
245
 
246
  with gr.Blocks(
 
282
  interactive=False,
283
  lines=1,
284
  )
285
+ resolution_info = gr.Textbox(
286
+ label="Processing Resolution",
287
+ value="",
288
+ interactive=False,
289
+ lines=1,
290
+ )
291
  opacity = gr.Slider(
292
  0.0,
293
  1.0,
 
322
  img.select(
323
  click,
324
  inputs=[state, opacity, img],
325
+ outputs=[img, state, resolution_info],
326
  show_progress="minimal",
327
  )
328
 
329
+ reset_btn.click(reset, outputs=[img, state, resolution_info])
330
+ clear_btn.add([img, state, resolution_info])
331
 
332
  # Examples from current directory
333
  example_files = [
 
348
  )
349
 
350
  gr.Markdown(
351
+ f"""
352
  ---
353
  <div style="text-align: center; color: #666; font-size: 0.9em;">
354
+ <b>Dynamic Resolution:</b> Images are processed at up to {MAX_IMAGE_DIM}px (longer side) while preserving aspect ratio.
355
+ DINOv3's RoPE embeddings handle variable sizes perfectly - no need to squash to 224×224!
356
+ <br><br>
357
  <b>Performance Notes:</b> Satellite models are optimized for geographic patterns, land use classification,
358
  and structural analysis. The 7B model provides exceptional detail but requires significant compute.
359
  <br><br>