Deagin Claude Opus 4.6 committed on
Commit
8266ce5
·
1 Parent(s): dda24a3

Fix: Free SigLIP2 text encoder after caching embeddings (~7.5GB saved)

Browse files

The SigLIP2 text encoder was consuming ~7.5GB RAM on top of the 1.6GB
RADIO backbone, exceeding the HF Spaces free tier memory limit.

Changes:
- Pre-compute text embeddings for 9 fixed prompts at startup
- Free the text encoder from RAM after caching (reclaims ~7.5GB)
- Only load siglip2-g adaptor (dino_v3_7b and sam3 were unused)
- Use cached embeddings during inference instead of re-encoding
- Remove unused get_dino_feature_map import from pipeline

Runtime memory: ~1.6GB model + ~1KB cached embeddings vs ~9GB before.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. pipeline.py +1 -1
  2. radio_backbone.py +88 -73
pipeline.py CHANGED
@@ -12,7 +12,7 @@ from PIL import Image
12
  from google_solar import geocode_address, fetch_geotiff, parse_geotiff, parse_building_mask, parse_dsm
13
  from building import isolate_primary_building, crop_to_building, recalculate_bounds
14
  from ransac_planes import preprocess_dsm, dsm_to_point_cloud, fit_planes, planes_to_label_map, build_plane_info
15
- from radio_backbone import zero_shot_segment, get_dino_feature_map, get_roof_mask, move_to
16
  from fusion import fuse_segmentations, split_disconnected_regions, merge_small_fragments
17
  from geo_export import labels_to_geojson
18
 
 
12
  from google_solar import geocode_address, fetch_geotiff, parse_geotiff, parse_building_mask, parse_dsm
13
  from building import isolate_primary_building, crop_to_building, recalculate_bounds
14
  from ransac_planes import preprocess_dsm, dsm_to_point_cloud, fit_planes, planes_to_label_map, build_plane_info
15
+ from radio_backbone import zero_shot_segment, get_roof_mask, move_to
16
  from fusion import fuse_segmentations, split_disconnected_regions, merge_small_fragments
17
  from geo_export import labels_to_geojson
18
 
radio_backbone.py CHANGED
@@ -1,10 +1,15 @@
1
  """NVIDIA C-RADIOv4-H unified vision backbone.
2
 
3
  Distills DINOv3-7B + SAM3 + SigLIP2 into a single 631M-param encoder.
4
- Provides dense semantic features, instance segmentation features,
5
- and zero-shot text-prompted segmentation via adaptor heads.
 
 
 
6
  """
7
 
 
 
8
  import numpy as np
9
  import torch
10
  import torch.nn.functional as F
@@ -28,13 +33,17 @@ NON_ROOF_PROMPTS = [
28
  "shadow",
29
  ]
30
 
31
- # Module-level model cache
32
  _model = None
33
  _device = None
 
34
 
35
 
36
  def load_model(device: str = "cuda", vitdet_window_size: int = 8):
37
- """Load C-RADIOv4-H with all three adaptor heads.
 
 
 
38
 
39
  Args:
40
  device: 'cuda' or 'cpu'. For ZeroGPU, load to 'cpu' at startup
@@ -46,7 +55,7 @@ def load_model(device: str = "cuda", vitdet_window_size: int = 8):
46
  Returns:
47
  The loaded model.
48
  """
49
- global _model, _device
50
  if _model is not None:
51
  return _model
52
 
@@ -54,7 +63,7 @@ def load_model(device: str = "cuda", vitdet_window_size: int = 8):
54
 
55
  kwargs = {
56
  "version": "c-radio_v4-h",
57
- "adaptor_names": ["dino_v3_7b", "sam3", "siglip2-g"],
58
  "progress": True,
59
  "skip_validation": True,
60
  }
@@ -66,10 +75,74 @@ def load_model(device: str = "cuda", vitdet_window_size: int = 8):
66
  _model.to(device)
67
  _device = device
68
 
69
- print(f"C-RADIOv4-H loaded on {device}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  return _model
71
 
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  def get_model():
74
  """Get the cached model, loading if necessary."""
75
  global _model
@@ -80,10 +153,11 @@ def get_model():
80
 
81
  def move_to(device: str):
82
  """Move the model to a different device (for ZeroGPU)."""
83
- global _model, _device
84
  if _model is not None and _device != device:
85
  _model.to(device)
86
  _device = device
 
87
 
88
 
89
  def prepare_image(
@@ -122,28 +196,6 @@ def prepare_image(
122
  return x, original_size, snapped
123
 
124
 
125
- def extract_features(
126
- pixel_values: torch.Tensor,
127
- model=None,
128
- device: str = "cuda",
129
- ) -> dict:
130
- """Run C-RADIOv4-H and extract all adaptor features.
131
-
132
- Returns:
133
- Dict with keys 'backbone', 'dino_v3_7b', 'sam3', 'siglip2-g'.
134
- Each value is (summary_tensor, spatial_features_tensor).
135
- """
136
- if model is None:
137
- model = get_model()
138
-
139
- pixel_values = pixel_values.to(device)
140
-
141
- with torch.no_grad(), torch.autocast(device, dtype=torch.bfloat16):
142
- vis_output = model(pixel_values)
143
-
144
- return vis_output
145
-
146
-
147
  def zero_shot_segment(
148
  image: np.ndarray | Image.Image,
149
  roof_prompts: list[str] = ROOF_PROMPTS,
@@ -154,11 +206,11 @@ def zero_shot_segment(
154
  """Zero-shot roof segmentation via RADSeg approach.
155
 
156
  Uses SigLIP2 adaptor to create dense language-aligned patch features,
157
- then computes cosine similarity against text prompts.
158
 
159
  Args:
160
  image: RGB image.
161
- roof_prompts: Text labels for roof types.
162
  non_roof_prompts: Text labels for non-roof classes.
163
  model: C-RADIOv4-H model.
164
  device: Compute device.
@@ -167,6 +219,8 @@ def zero_shot_segment(
167
  (score_map: H x W x C float, seg_map: H x W int, all_labels: list[str])
168
  where seg_map[y,x] is the index into all_labels.
169
  """
 
 
170
  if model is None:
171
  model = get_model()
172
 
@@ -181,11 +235,8 @@ def zero_shot_segment(
181
  # Get SigLIP2-aligned spatial features
182
  sig2_summary, sig2_features = vis_output["siglip2-g"]
183
 
184
- # Encode text prompts
185
- sig2_adaptor = model.adaptors["siglip2-g"]
186
- text_input = sig2_adaptor.tokenizer(all_labels).to(device)
187
- with torch.no_grad():
188
- text_embeddings = sig2_adaptor.encode_text(text_input, normalize=True)
189
 
190
  # Cosine similarity: (1, T, D) vs (C, D) -> (1, T, C)
191
  dense_features = F.normalize(sig2_features.float(), dim=-1)
@@ -209,42 +260,6 @@ def zero_shot_segment(
209
  return score_map_np, seg_map, all_labels
210
 
211
 
212
- def get_dino_feature_map(
213
- image: np.ndarray | Image.Image,
214
- model=None,
215
- device: str = "cuda",
216
- ) -> np.ndarray:
217
- """Extract DINOv3-aligned spatial features as a 2D feature map.
218
-
219
- Returns:
220
- Feature map (H, W, D) upsampled to original image size.
221
- """
222
- if model is None:
223
- model = get_model()
224
-
225
- pixel_values, original_size, snapped_size = prepare_image(image, model)
226
- pixel_values = pixel_values.to(device)
227
-
228
- with torch.no_grad(), torch.autocast(device, dtype=torch.bfloat16):
229
- vis_output = model(pixel_values)
230
-
231
- _, dino_features = vis_output["dino_v3_7b"]
232
-
233
- h_patches = snapped_size[0] // PATCH_SIZE
234
- w_patches = snapped_size[1] // PATCH_SIZE
235
-
236
- # (1, T, D) -> (1, D, H_p, W_p)
237
- feat_2d = rearrange(dino_features, "b (h w) d -> b d h w", h=h_patches, w=w_patches)
238
-
239
- # Upsample to original size
240
- feat_2d = F.interpolate(
241
- feat_2d.float(), size=original_size, mode="bilinear", align_corners=False
242
- )
243
-
244
- # (1, D, H, W) -> (H, W, D)
245
- return feat_2d[0].permute(1, 2, 0).cpu().numpy()
246
-
247
-
248
  def get_roof_mask(seg_map: np.ndarray, num_roof_classes: int = 4) -> np.ndarray:
249
  """Extract binary roof mask from segmentation map.
250
 
 
1
  """NVIDIA C-RADIOv4-H unified vision backbone.
2
 
3
  Distills DINOv3-7B + SAM3 + SigLIP2 into a single 631M-param encoder.
4
+ Uses the SigLIP2 adaptor head for zero-shot text-prompted roof segmentation.
5
+
6
+ Memory optimization: the SigLIP2 text encoder (~7.5GB) is loaded once to
7
+ pre-compute text embeddings for our fixed prompt set, then freed from RAM.
8
+ Only the vision backbone + adaptor projection head are kept (~1.6GB).
9
  """
10
 
11
+ import gc
12
+
13
  import numpy as np
14
  import torch
15
  import torch.nn.functional as F
 
33
  "shadow",
34
  ]
35
 
36
+ # Module-level caches
37
  _model = None
38
  _device = None
39
+ _cached_text_embeddings = None # Pre-computed for ROOF_PROMPTS + NON_ROOF_PROMPTS
40
 
41
 
42
  def load_model(device: str = "cuda", vitdet_window_size: int = 8):
43
+ """Load C-RADIOv4-H with siglip2-g adaptor.
44
+
45
+ Pre-computes text embeddings for fixed prompts, then frees the
46
+ SigLIP2 text encoder to reclaim ~7.5GB of RAM.
47
 
48
  Args:
49
  device: 'cuda' or 'cpu'. For ZeroGPU, load to 'cpu' at startup
 
55
  Returns:
56
  The loaded model.
57
  """
58
+ global _model, _device, _cached_text_embeddings
59
  if _model is not None:
60
  return _model
61
 
 
63
 
64
  kwargs = {
65
  "version": "c-radio_v4-h",
66
+ "adaptor_names": ["siglip2-g"],
67
  "progress": True,
68
  "skip_validation": True,
69
  }
 
75
  _model.to(device)
76
  _device = device
77
 
78
+ # --- Pre-compute text embeddings, then free the text encoder ---
79
+ all_labels = ROOF_PROMPTS + NON_ROOF_PROMPTS
80
+ print(f"Caching text embeddings for {len(all_labels)} prompts...")
81
+
82
+ sig2_adaptor = _model.adaptors["siglip2-g"]
83
+ text_input = sig2_adaptor.tokenizer(all_labels).to(device)
84
+ with torch.no_grad():
85
+ _cached_text_embeddings = sig2_adaptor.encode_text(
86
+ text_input, normalize=True
87
+ ).cpu().clone()
88
+
89
+ # Free the heavy SigLIP2 text encoder (~7.5GB)
90
+ _free_text_encoder(sig2_adaptor)
91
+
92
+ print(f"C-RADIOv4-H loaded on {device} (text encoder freed, embeddings cached)")
93
  return _model
94
 
95
 
96
+ def _free_text_encoder(adaptor):
97
+ """Delete large sub-modules from the SigLIP2 adaptor to free RAM.
98
+
99
+ After pre-computing text embeddings, we no longer need the text
100
+ encoder, tokenizer model weights, or any module > 100MB.
101
+ """
102
+ freed = 0
103
+
104
+ # Check all direct children of the adaptor
105
+ for name in list(vars(adaptor).keys()):
106
+ obj = getattr(adaptor, name, None)
107
+ if obj is None:
108
+ continue
109
+ if hasattr(obj, "parameters"):
110
+ param_bytes = sum(
111
+ p.numel() * p.element_size() for p in obj.parameters()
112
+ )
113
+ if param_bytes > 100_000_000: # > 100MB
114
+ size_gb = param_bytes / 1e9
115
+ print(f" Freeing adaptor.{name} ({size_gb:.1f} GB)")
116
+ try:
117
+ delattr(adaptor, name)
118
+ freed += param_bytes
119
+ except Exception:
120
+ pass
121
+
122
+ # Also check nn.Module named children
123
+ for name, module in list(adaptor.named_children()):
124
+ param_bytes = sum(
125
+ p.numel() * p.element_size() for p in module.parameters()
126
+ )
127
+ if param_bytes > 100_000_000:
128
+ size_gb = param_bytes / 1e9
129
+ print(f" Freeing adaptor child '{name}' ({size_gb:.1f} GB)")
130
+ try:
131
+ delattr(adaptor, name)
132
+ freed += param_bytes
133
+ except Exception:
134
+ pass
135
+
136
+ gc.collect()
137
+ if torch.cuda.is_available():
138
+ torch.cuda.empty_cache()
139
+
140
+ if freed > 0:
141
+ print(f" Total freed: {freed / 1e9:.1f} GB")
142
+ else:
143
+ print(" Warning: could not identify text encoder to free")
144
+
145
+
146
  def get_model():
147
  """Get the cached model, loading if necessary."""
148
  global _model
 
153
 
154
  def move_to(device: str):
155
  """Move the model to a different device (for ZeroGPU)."""
156
+ global _model, _device, _cached_text_embeddings
157
  if _model is not None and _device != device:
158
  _model.to(device)
159
  _device = device
160
+ # Text embeddings stay on CPU; moved to device in zero_shot_segment
161
 
162
 
163
  def prepare_image(
 
196
  return x, original_size, snapped
197
 
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  def zero_shot_segment(
200
  image: np.ndarray | Image.Image,
201
  roof_prompts: list[str] = ROOF_PROMPTS,
 
206
  """Zero-shot roof segmentation via RADSeg approach.
207
 
208
  Uses SigLIP2 adaptor to create dense language-aligned patch features,
209
+ then computes cosine similarity against pre-computed text embeddings.
210
 
211
  Args:
212
  image: RGB image.
213
+ roof_prompts: Text labels for roof types (must match startup prompts).
214
  non_roof_prompts: Text labels for non-roof classes.
215
  model: C-RADIOv4-H model.
216
  device: Compute device.
 
219
  (score_map: H x W x C float, seg_map: H x W int, all_labels: list[str])
220
  where seg_map[y,x] is the index into all_labels.
221
  """
222
+ global _cached_text_embeddings
223
+
224
  if model is None:
225
  model = get_model()
226
 
 
235
  # Get SigLIP2-aligned spatial features
236
  sig2_summary, sig2_features = vis_output["siglip2-g"]
237
 
238
+ # Use pre-computed text embeddings (cached at startup)
239
+ text_embeddings = _cached_text_embeddings.to(device)
 
 
 
240
 
241
  # Cosine similarity: (1, T, D) vs (C, D) -> (1, T, C)
242
  dense_features = F.normalize(sig2_features.float(), dim=-1)
 
260
  return score_map_np, seg_map, all_labels
261
 
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  def get_roof_mask(seg_map: np.ndarray, num_roof_classes: int = 4) -> np.ndarray:
264
  """Extract binary roof mask from segmentation map.
265