aryaaan12
/

T-REN

@@ -89,15 +89,10 @@ class TRENModel(PreTrainedModel):
         # RegionEncoder: the trained T-REN head. HF saves/loads these weights.
         self.region_encoder = RegionEncoder(cfg)
-        # Dense grid of point prompts covering the full image at patch stride.
-        res = config.image_resolution
-        ps = config.patch_size
-        coords = np.linspace(1, res - 2, res // ps, dtype=int)
-        grid_points = torch.tensor([(y, x) for y in coords for x in coords])
-        # Store grid_points and lazy backbone refs without registering them as
-        # nn.Module submodules (so they are excluded from HF save/load).
-        object.__setattr__(self, "_grid_points", grid_points)
         object.__setattr__(self, "_image_encoder", None)
         object.__setattr__(self, "_text_encoder", None)
@@ -159,6 +154,15 @@ class TRENModel(PreTrainedModel):
             )
         device = pixel_values.device
         prompts = [self._grid_points.to(device) for _ in range(pixel_values.shape[0])]
         with torch.no_grad():

         # RegionEncoder: the trained T-REN head. HF saves/loads these weights.
         self.region_encoder = RegionEncoder(cfg)
+        # Lazy placeholders — not registered as nn.Module submodules so they
+        # are excluded from HF save/load. _grid_points is computed on first
+        # forward() call to avoid meta-device issues during from_pretrained().
+        object.__setattr__(self, "_grid_points", None)
         object.__setattr__(self, "_image_encoder", None)
         object.__setattr__(self, "_text_encoder", None)
             )
         device = pixel_values.device
+        # Build grid on first call (avoids meta-device issues during from_pretrained).
+        if self._grid_points is None:
+            res = self.config.image_resolution
+            ps = self.config.patch_size
+            coords = np.linspace(1, res - 2, res // ps, dtype=int)
+            object.__setattr__(self, "_grid_points",
+                               torch.tensor([(y, x) for y in coords for x in coords]))
         prompts = [self._grid_points.to(device) for _ in range(pixel_values.shape[0])]
         with torch.no_grad():