Spaces:

Deagin
/

RoofSegmentation2

Runtime error

Deagin Claude Opus 4.6 committed on Mar 9

Commit

8266ce5

1 Parent(s): dda24a3

Fix: Free SigLIP2 text encoder after caching embeddings (~7.5GB saved)

The SigLIP2 text encoder was consuming ~7.5GB RAM on top of the 1.6GB
RADIO backbone, exceeding the HF Spaces free tier memory limit.

Changes:
- Pre-compute text embeddings for 9 fixed prompts at startup
- Free the text encoder from RAM after caching (reclaims ~7.5GB)
- Only load siglip2-g adaptor (dino_v3_7b and sam3 were unused)
- Use cached embeddings during inference instead of re-encoding
- Remove unused get_dino_feature_map import from pipeline

Runtime memory: ~1.6GB model + ~1KB cached embeddings vs ~9GB before.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show

pipeline.py +1 -1
radio_backbone.py +88 -73

pipeline.py CHANGED Viewed

@@ -12,7 +12,7 @@ from PIL import Image
 from google_solar import geocode_address, fetch_geotiff, parse_geotiff, parse_building_mask, parse_dsm
 from building import isolate_primary_building, crop_to_building, recalculate_bounds
 from ransac_planes import preprocess_dsm, dsm_to_point_cloud, fit_planes, planes_to_label_map, build_plane_info
-from radio_backbone import zero_shot_segment, get_dino_feature_map, get_roof_mask, move_to
 from fusion import fuse_segmentations, split_disconnected_regions, merge_small_fragments
 from geo_export import labels_to_geojson

 from google_solar import geocode_address, fetch_geotiff, parse_geotiff, parse_building_mask, parse_dsm
 from building import isolate_primary_building, crop_to_building, recalculate_bounds
 from ransac_planes import preprocess_dsm, dsm_to_point_cloud, fit_planes, planes_to_label_map, build_plane_info
+from radio_backbone import zero_shot_segment, get_roof_mask, move_to
 from fusion import fuse_segmentations, split_disconnected_regions, merge_small_fragments
 from geo_export import labels_to_geojson

radio_backbone.py CHANGED Viewed

@@ -1,10 +1,15 @@
 """NVIDIA C-RADIOv4-H unified vision backbone.
 Distills DINOv3-7B + SAM3 + SigLIP2 into a single 631M-param encoder.
-Provides dense semantic features, instance segmentation features,
-and zero-shot text-prompted segmentation via adaptor heads.
 """
 import numpy as np
 import torch
 import torch.nn.functional as F
@@ -28,13 +33,17 @@ NON_ROOF_PROMPTS = [
     "shadow",
 ]
-# Module-level model cache
 _model = None
 _device = None
 def load_model(device: str = "cuda", vitdet_window_size: int = 8):
-    """Load C-RADIOv4-H with all three adaptor heads.
     Args:
         device: 'cuda' or 'cpu'. For ZeroGPU, load to 'cpu' at startup
@@ -46,7 +55,7 @@ def load_model(device: str = "cuda", vitdet_window_size: int = 8):
     Returns:
         The loaded model.
     """
-    global _model, _device
     if _model is not None:
         return _model
@@ -54,7 +63,7 @@ def load_model(device: str = "cuda", vitdet_window_size: int = 8):
     kwargs = {
         "version": "c-radio_v4-h",
-        "adaptor_names": ["dino_v3_7b", "sam3", "siglip2-g"],
         "progress": True,
         "skip_validation": True,
     }
@@ -66,10 +75,74 @@ def load_model(device: str = "cuda", vitdet_window_size: int = 8):
     _model.to(device)
     _device = device
-    print(f"C-RADIOv4-H loaded on {device}")
     return _model
 def get_model():
     """Get the cached model, loading if necessary."""
     global _model
@@ -80,10 +153,11 @@ def get_model():
 def move_to(device: str):
     """Move the model to a different device (for ZeroGPU)."""
-    global _model, _device
     if _model is not None and _device != device:
         _model.to(device)
         _device = device
 def prepare_image(
@@ -122,28 +196,6 @@ def prepare_image(
     return x, original_size, snapped
-def extract_features(
-    pixel_values: torch.Tensor,
-    model=None,
-    device: str = "cuda",
-) -> dict:
-    """Run C-RADIOv4-H and extract all adaptor features.
-    Returns:
-        Dict with keys 'backbone', 'dino_v3_7b', 'sam3', 'siglip2-g'.
-        Each value is (summary_tensor, spatial_features_tensor).
-    """
-    if model is None:
-        model = get_model()
-    pixel_values = pixel_values.to(device)
-    with torch.no_grad(), torch.autocast(device, dtype=torch.bfloat16):
-        vis_output = model(pixel_values)
-    return vis_output
 def zero_shot_segment(
     image: np.ndarray | Image.Image,
     roof_prompts: list[str] = ROOF_PROMPTS,
@@ -154,11 +206,11 @@ def zero_shot_segment(
     """Zero-shot roof segmentation via RADSeg approach.
     Uses SigLIP2 adaptor to create dense language-aligned patch features,
-    then computes cosine similarity against text prompts.
     Args:
         image: RGB image.
-        roof_prompts: Text labels for roof types.
         non_roof_prompts: Text labels for non-roof classes.
         model: C-RADIOv4-H model.
         device: Compute device.
@@ -167,6 +219,8 @@ def zero_shot_segment(
         (score_map: H x W x C float, seg_map: H x W int, all_labels: list[str])
         where seg_map[y,x] is the index into all_labels.
     """
     if model is None:
         model = get_model()
@@ -181,11 +235,8 @@ def zero_shot_segment(
     # Get SigLIP2-aligned spatial features
     sig2_summary, sig2_features = vis_output["siglip2-g"]
-    # Encode text prompts
-    sig2_adaptor = model.adaptors["siglip2-g"]
-    text_input = sig2_adaptor.tokenizer(all_labels).to(device)
-    with torch.no_grad():
-        text_embeddings = sig2_adaptor.encode_text(text_input, normalize=True)
     # Cosine similarity: (1, T, D) vs (C, D) -> (1, T, C)
     dense_features = F.normalize(sig2_features.float(), dim=-1)
@@ -209,42 +260,6 @@ def zero_shot_segment(
     return score_map_np, seg_map, all_labels
-def get_dino_feature_map(
-    image: np.ndarray | Image.Image,
-    model=None,
-    device: str = "cuda",
-) -> np.ndarray:
-    """Extract DINOv3-aligned spatial features as a 2D feature map.
-    Returns:
-        Feature map (H, W, D) upsampled to original image size.
-    """
-    if model is None:
-        model = get_model()
-    pixel_values, original_size, snapped_size = prepare_image(image, model)
-    pixel_values = pixel_values.to(device)
-    with torch.no_grad(), torch.autocast(device, dtype=torch.bfloat16):
-        vis_output = model(pixel_values)
-    _, dino_features = vis_output["dino_v3_7b"]
-    h_patches = snapped_size[0] // PATCH_SIZE
-    w_patches = snapped_size[1] // PATCH_SIZE
-    # (1, T, D) -> (1, D, H_p, W_p)
-    feat_2d = rearrange(dino_features, "b (h w) d -> b d h w", h=h_patches, w=w_patches)
-    # Upsample to original size
-    feat_2d = F.interpolate(
-        feat_2d.float(), size=original_size, mode="bilinear", align_corners=False
-    )
-    # (1, D, H, W) -> (H, W, D)
-    return feat_2d[0].permute(1, 2, 0).cpu().numpy()
 def get_roof_mask(seg_map: np.ndarray, num_roof_classes: int = 4) -> np.ndarray:
     """Extract binary roof mask from segmentation map.

 """NVIDIA C-RADIOv4-H unified vision backbone.
 Distills DINOv3-7B + SAM3 + SigLIP2 into a single 631M-param encoder.
+Uses the SigLIP2 adaptor head for zero-shot text-prompted roof segmentation.
+Memory optimization: the SigLIP2 text encoder (~7.5GB) is loaded once to
+pre-compute text embeddings for our fixed prompt set, then freed from RAM.
+Only the vision backbone + adaptor projection head are kept (~1.6GB).
 """
+import gc
 import numpy as np
 import torch
 import torch.nn.functional as F
     "shadow",
 ]
+# Module-level caches
 _model = None
 _device = None
+_cached_text_embeddings = None  # Pre-computed for ROOF_PROMPTS + NON_ROOF_PROMPTS
 def load_model(device: str = "cuda", vitdet_window_size: int = 8):
+    """Load C-RADIOv4-H with siglip2-g adaptor.
+    Pre-computes text embeddings for fixed prompts, then frees the
+    SigLIP2 text encoder to reclaim ~7.5GB of RAM.
     Args:
         device: 'cuda' or 'cpu'. For ZeroGPU, load to 'cpu' at startup
     Returns:
         The loaded model.
     """
+    global _model, _device, _cached_text_embeddings
     if _model is not None:
         return _model
     kwargs = {
         "version": "c-radio_v4-h",
+        "adaptor_names": ["siglip2-g"],
         "progress": True,
         "skip_validation": True,
     }
     _model.to(device)
     _device = device
+    # --- Pre-compute text embeddings, then free the text encoder ---
+    all_labels = ROOF_PROMPTS + NON_ROOF_PROMPTS
+    print(f"Caching text embeddings for {len(all_labels)} prompts...")
+    sig2_adaptor = _model.adaptors["siglip2-g"]
+    text_input = sig2_adaptor.tokenizer(all_labels).to(device)
+    with torch.no_grad():
+        _cached_text_embeddings = sig2_adaptor.encode_text(
+            text_input, normalize=True
+        ).cpu().clone()
+    # Free the heavy SigLIP2 text encoder (~7.5GB)
+    _free_text_encoder(sig2_adaptor)
+    print(f"C-RADIOv4-H loaded on {device} (text encoder freed, embeddings cached)")
     return _model
def _free_text_encoder(adaptor, min_bytes: int = 100_000_000) -> int:
    """Delete large sub-modules from the SigLIP2 adaptor to free RAM.

    After pre-computing text embeddings, we no longer need the text
    encoder or any other attribute of the adaptor whose parameters
    exceed ``min_bytes``.

    Two passes are made: first over the adaptor's instance ``__dict__``
    (objects stored as plain attributes), then over its registered
    ``nn.Module`` children. Deletion is best-effort: a failure to delete
    one attribute is reported and the remaining ones are still processed.

    Args:
        adaptor: The adaptor object whose heavy sub-modules should be
            removed. It is mutated in place.
        min_bytes: Only attributes whose parameters total strictly more
            than this many bytes are deleted. Defaults to 100MB.

    Returns:
        Total number of parameter bytes that were successfully removed.
    """

    def _param_bytes(obj) -> int:
        """Return the parameter storage of *obj* in bytes, 0 if not measurable."""
        params = getattr(obj, "parameters", None)
        if not callable(params):
            return 0
        try:
            return sum(p.numel() * p.element_size() for p in params())
        except (TypeError, AttributeError):
            # e.g. an unbound `parameters` method on a class object, or
            # items that are not tensors; treat as "nothing to free".
            return 0

    def _drop(name: str, size: int, label: str) -> int:
        """Delete ``adaptor.<name>``; return *size* on success, else 0."""
        print(f"  Freeing {label} ({size / 1e9:.1f} GB)")
        try:
            delattr(adaptor, name)
        except Exception as err:
            # Freeing memory is an optimization, so never abort model
            # loading over it, but do not hide the failure either.
            print(f"  Warning: could not delete {label}: {err}")
            return 0
        return size

    freed = 0

    # Pass 1: objects held as plain instance attributes.
    for name in list(vars(adaptor)):
        size = _param_bytes(getattr(adaptor, name, None))
        if size > min_bytes:
            freed += _drop(name, size, f"adaptor.{name}")

    # Pass 2: registered child modules (re-read after pass 1 deletions).
    named_children = getattr(adaptor, "named_children", None)
    children = list(named_children()) if callable(named_children) else []
    for name, module in children:
        size = _param_bytes(module)
        if size > min_bytes:
            freed += _drop(name, size, f"adaptor child '{name}'")

    # Release the now-unreferenced storage back to the allocator(s).
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    if freed > 0:
        print(f"  Total freed: {freed / 1e9:.1f} GB")
    else:
        print("  Warning: could not identify text encoder to free")
    return freed
 def get_model():
     """Get the cached model, loading if necessary."""
     global _model
 def move_to(device: str):
     """Move the model to a different device (for ZeroGPU)."""
+    global _model, _device, _cached_text_embeddings
     if _model is not None and _device != device:
         _model.to(device)
         _device = device
+    # Text embeddings stay on CPU; moved to device in zero_shot_segment
 def prepare_image(
     return x, original_size, snapped
 def zero_shot_segment(
     image: np.ndarray | Image.Image,
     roof_prompts: list[str] = ROOF_PROMPTS,
     """Zero-shot roof segmentation via RADSeg approach.
     Uses SigLIP2 adaptor to create dense language-aligned patch features,
+    then computes cosine similarity against pre-computed text embeddings.
     Args:
         image: RGB image.
+        roof_prompts: Text labels for roof types (must match startup prompts).
         non_roof_prompts: Text labels for non-roof classes.
         model: C-RADIOv4-H model.
         device: Compute device.
         (score_map: H x W x C float, seg_map: H x W int, all_labels: list[str])
         where seg_map[y,x] is the index into all_labels.
     """
+    global _cached_text_embeddings
     if model is None:
         model = get_model()
     # Get SigLIP2-aligned spatial features
     sig2_summary, sig2_features = vis_output["siglip2-g"]
+    # Use pre-computed text embeddings (cached at startup)
+    text_embeddings = _cached_text_embeddings.to(device)
     # Cosine similarity: (1, T, D) vs (C, D) -> (1, T, C)
     dense_features = F.normalize(sig2_features.float(), dim=-1)
     return score_map_np, seg_map, all_labels
 def get_roof_mask(seg_map: np.ndarray, num_roof_classes: int = 4) -> np.ndarray:
     """Extract binary roof mask from segmentation map.