chore: cleanup for HF Spaces (ignore pth)
Browse files- .gitignore +1 -1
- MaskClustering/graph/construction.py +164 -0
- MaskClustering/graph/iterative_clustering.py +44 -0
- MaskClustering/graph/node.py +49 -0
- mvp.py +45 -26
.gitignore
CHANGED
|
@@ -153,4 +153,4 @@ temp/
|
|
| 153 |
**/*.glb
|
| 154 |
**/*.bin
|
| 155 |
data/
|
| 156 |
-
**/*.pth
|
|
|
|
| 153 |
**/*.glb
|
| 154 |
**/*.bin
|
| 155 |
data/
|
| 156 |
+
**/*.pth
|
MaskClustering/graph/construction.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import numpy as np
|
| 3 |
+
from tqdm import tqdm
|
| 4 |
+
from utils.mask_backprojection import frame_backprojection
|
| 5 |
+
from graph.node import Node
|
| 6 |
+
|
| 7 |
+
def mask_graph_construction(args, scene_points, frame_list, dataset):
    """Build the mask graph for a scene.

    Pipeline:
      1. Build the point-in-mask matrix (speeds up the later view-consensus
         rate computation).
      2. For every mask, find the frames it is visible in and the masks that
         contain it, flagging undersegmented masks along the way.
      3. Wrap each surviving mask into a graph node.

    Returns:
        (nodes, observer_num_thresholds, mask_point_clouds, point_frame_matrix)
    """
    if args.debug:
        print('start building point in mask matrix')

    (boundary_points, point_in_mask_matrix, mask_point_clouds,
     point_frame_matrix, global_frame_mask_list) = build_point_in_mask_matrix(
        args, scene_points, frame_list, dataset)

    visible_frames, contained_masks, undersegment_mask_ids = process_masks(
        frame_list, global_frame_mask_list, point_in_mask_matrix,
        boundary_points, mask_point_clouds, args)

    observer_num_thresholds = get_observer_num_thresholds(visible_frames)

    nodes = init_nodes(global_frame_mask_list, visible_frames, contained_masks,
                       undersegment_mask_ids, mask_point_clouds)

    return nodes, observer_num_thresholds, mask_point_clouds, point_frame_matrix
|
| 21 |
+
|
| 22 |
+
def build_point_in_mask_matrix(args, scene_points, frame_list, dataset):
    """Precompute which mask (if any) covers each scene point in each frame.

    Trades space for time so the view-consensus computation can be vectorized.

    Returns:
        boundary_points: set of point ids covered by more than one mask within
            a single frame (mask-boundary points); these are excluded from the
            view-consensus computation.
        point_in_mask_matrix: (num_points, num_frames) uint16 matrix. Entry
            [i, j] == k means point i lies in mask k of frame j; 0 means the
            point is in no mask there (mask ids start from 1).
        mask_point_clouds: dict mapping '<frame_id>_<mask_id>' to the set of
            point ids belonging to that mask.
        point_frame_matrix: (num_points, num_frames) bool visibility matrix.
        global_frame_mask_list: list of (frame_id, mask_id) tuples for every
            mask in the whole sequence.
    """
    num_points = len(scene_points)
    num_frames = len(frame_list)

    scene_points = torch.tensor(scene_points).float().cuda()
    boundary_points = set()
    point_in_mask_matrix = np.zeros((num_points, num_frames), dtype=np.uint16)
    point_frame_matrix = np.zeros((num_points, num_frames), dtype=bool)
    global_frame_mask_list = []
    mask_point_clouds = {}

    frames = enumerate(frame_list)
    if args.debug:
        frames = tqdm(frames, total=num_frames)

    for frame_idx, frame_id in frames:
        mask_dict, visible_point_ids = frame_backprojection(dataset, scene_points, frame_id)
        if len(visible_point_ids) == 0:
            continue
        point_frame_matrix[visible_point_ids, frame_idx] = True

        claimed_points = set()   # points already assigned to an earlier mask in this frame
        frame_boundary = set()   # points claimed by more than one mask
        for mask_id, mask_points in mask_dict.items():
            frame_boundary.update(mask_points.intersection(claimed_points))
            mask_point_clouds[f'{frame_id}_{mask_id}'] = mask_points
            point_in_mask_matrix[list(mask_points), frame_idx] = mask_id
            claimed_points.update(mask_points)
            global_frame_mask_list.append((frame_id, mask_id))

        # Boundary points are ambiguous; drop their mask assignment for this frame.
        point_in_mask_matrix[list(frame_boundary), frame_idx] = 0
        boundary_points.update(frame_boundary)

    return boundary_points, point_in_mask_matrix, mask_point_clouds, point_frame_matrix, global_frame_mask_list
|
| 62 |
+
|
| 63 |
+
def init_nodes(global_frame_mask_list, mask_project_on_all_frames, contained_masks, undersegment_mask_ids, mask_point_clouds):
    """Create one leaf Node per mask, skipping masks flagged as undersegmented."""
    nodes = []
    for global_mask_id, (frame_id, mask_id) in enumerate(global_frame_mask_list):
        if global_mask_id in undersegment_mask_ids:
            continue
        node = Node(
            [(frame_id, mask_id)],                       # singleton cluster
            mask_project_on_all_frames[global_mask_id],  # frames where the mask is visible
            contained_masks[global_mask_id],             # masks that contain this mask
            mask_point_clouds[f'{frame_id}_{mask_id}'],  # 3D point ids of the mask
            (0, len(nodes)),                             # (iteration 0, node index) debug info
            None,                                        # leaf node: no merged sons
        )
        nodes.append(node)
    return nodes
|
| 76 |
+
|
| 77 |
+
def get_observer_num_thresholds(visible_frames):
    """Compute the observer-number threshold schedule for iterative clustering.

    Thresholds are the percentiles (95% down to 0%, step 5) of the pairwise
    observer counts, i.e. the number of frames in which two masks co-occur.

    Args:
        visible_frames: (mask_num, frame_num) 0/1 float tensor.

    Returns:
        List of observer-count thresholds, one per clustering iteration.
        Empty when there are no co-observations at all.
    """
    observer_num_matrix = torch.matmul(visible_frames, visible_frames.transpose(0, 1))
    observer_num_list = observer_num_matrix.flatten()
    observer_num_list = observer_num_list[observer_num_list > 0].cpu().numpy()
    # Guard: np.percentile raises on an empty array (e.g. when there are no
    # masks at all); an empty schedule simply means no clustering iterations.
    if observer_num_list.size == 0:
        return []
    observer_num_thresholds = []
    for percentile in range(95, -5, -5):
        observer_num = np.percentile(observer_num_list, percentile)
        if observer_num <= 1:
            # Once the threshold degenerates to <=1, stop for the lower half of
            # the schedule; for the upper half clamp to 1 to keep iterating.
            if percentile < 50:
                break
            else:
                observer_num = 1
        observer_num_thresholds.append(observer_num)
    return observer_num_thresholds
|
| 94 |
+
|
| 95 |
+
def process_one_mask(point_in_mask_matrix, boundary_points, mask_point_cloud, frame_list, global_frame_mask_list, args):
    """For one mask, find the frames it is visible in and the masks containing it.

    Returns:
        (valid, visible_frame, contained_mask). `valid` is False when the mask
        looks undersegmented, i.e. it is split across several masks in too
        large a fraction of the frames where it is visible.
    """
    visible_frame = torch.zeros(len(frame_list))
    contained_mask = torch.zeros(len(global_frame_mask_list))

    # Drop ambiguous boundary points before counting.
    core_points = mask_point_cloud - boundary_points
    per_frame_mask_ids = point_in_mask_matrix[list(core_points), :]

    # Frames in which at least one core point carries a nonzero mask id.
    candidate_frames = np.where(np.sum(per_frame_mask_ids, axis=0) > 0)[0]

    split_num = 0
    visible_num = 0
    for frame_idx in candidate_frames:
        counts = np.bincount(per_frame_mask_ids[:, frame_idx])
        total = np.sum(counts)
        invisible_ratio = counts[0] / total  # id 0 marks points invisible in this frame
        # Skip frames where most of the mask is missing (unless many points remain).
        if 1 - invisible_ratio < args.mask_visible_threshold and (total - counts[0]) < 500:
            continue
        visible_num += 1
        counts[0] = 0
        dominant_mask_id = np.argmax(counts)
        contained_ratio = counts[dominant_mask_id] / np.sum(counts)
        if contained_ratio > args.contained_threshold:
            visible_frame[frame_idx] = 1
            owner_idx = global_frame_mask_list.index((frame_list[frame_idx], dominant_mask_id))
            contained_mask[owner_idx] = 1
        else:
            split_num += 1  # the mask is split into several masks in this frame

    if visible_num == 0 or split_num / visible_num > args.undersegment_filter_threshold:
        return False, visible_frame, contained_mask
    else:
        return True, visible_frame, contained_mask
|
| 131 |
+
|
| 132 |
+
def process_masks(frame_list, global_frame_mask_list, point_in_mask_matrix, boundary_points, mask_point_clouds, args):
    """For every mask, compute visibility/containment and flag undersegmented masks.

    Returns:
        visible_frames: (mask_num, frame_num) cuda tensor.
        contained_masks: (mask_num, mask_num) cuda tensor.
        undersegment_mask_ids: global ids of masks judged undersegmented.
    """
    if args.debug:
        print('start processing masks')
    visible_frames = []
    contained_masks = []
    undersegment_mask_ids = []

    masks = tqdm(global_frame_mask_list) if args.debug else global_frame_mask_list
    for frame_id, mask_id in masks:
        valid, visible_frame, contained_mask = process_one_mask(
            point_in_mask_matrix, boundary_points,
            mask_point_clouds[f'{frame_id}_{mask_id}'],
            frame_list, global_frame_mask_list, args)
        visible_frames.append(visible_frame)
        contained_masks.append(contained_mask)
        if not valid:
            undersegment_mask_ids.append(global_frame_mask_list.index((frame_id, mask_id)))

    if not visible_frames:
        # No masks at all: return empty tensors with the correct second dimension.
        return (torch.zeros(0, len(frame_list)).cuda(),
                torch.zeros(0, len(global_frame_mask_list)).cuda(),
                undersegment_mask_ids)

    visible_frames = torch.stack(visible_frames, dim=0).cuda()    # (mask_num, frame_num)
    contained_masks = torch.stack(contained_masks, dim=0).cuda()  # (mask_num, mask_num)

    # Undo the effect of undersegmented observer masks so two separate objects
    # spanned by such a mask are not merged through it.
    for global_mask_id in undersegment_mask_ids:
        frame_id, _ = global_frame_mask_list[global_mask_id]
        frame_idx = frame_list.index(frame_id)
        supporters = torch.where(contained_masks[:, global_mask_id])[0]
        contained_masks[:, global_mask_id] = False
        visible_frames[supporters, frame_idx] = False

    return visible_frames, contained_masks, undersegment_mask_ids
|
MaskClustering/graph/iterative_clustering.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import networkx as nx
|
| 2 |
+
from graph.node import Node
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
def cluster_into_new_nodes(iteration, old_nodes, graph):
    """Merge each connected component of `graph` into a single new Node."""
    new_nodes = []
    for component in nx.connected_components(graph):
        members = [old_nodes[idx] for idx in component]
        new_nodes.append(Node.create_node_from_list(members, (iteration, len(new_nodes))))
    return new_nodes
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def update_graph(nodes, observer_num_threshold, connect_threshold):
    """Recompute view-consensus rates between nodes and return the new graph.

    Two nodes are connected when their view-consensus rate reaches
    `connect_threshold` and they share at least `observer_num_threshold`
    observing frames.
    """
    node_visible_frames = torch.stack([node.visible_frame for node in nodes], dim=0)
    node_contained_masks = torch.stack([node.contained_mask for node in nodes], dim=0)

    # observer_nums[i, j]: number of frames in which nodes i and j both appear.
    observer_nums = torch.matmul(node_visible_frames, node_visible_frames.transpose(0, 1))
    # supporter_nums[i, j]: number of frames supporting the merge of i and j.
    supporter_nums = torch.matmul(node_contained_masks, node_contained_masks.transpose(0, 1))

    view_concensus_rate = supporter_nums / (observer_nums + 1e-7)

    # Never connect a node to itself, nor pairs with too few shared observers.
    disconnect = torch.eye(len(nodes), dtype=bool).cuda() | (observer_nums < observer_num_threshold)

    adjacency = (view_concensus_rate >= connect_threshold) & ~disconnect
    return nx.from_numpy_array(adjacency.cpu().numpy())
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def iterative_clustering(nodes, observer_num_thresholds, connect_threshold, debug):
    """Repeatedly rebuild the consensus graph and merge its connected components.

    One pass per threshold: later iterations require fewer shared observers,
    so clusters grow progressively coarser.
    """
    if debug:
        print('====> Start iterative clustering')
    for iterate_id, observer_num_threshold in enumerate(observer_num_thresholds):
        if debug:
            print(f'Iterate {iterate_id}: observer_num', observer_num_threshold, ', number of nodes', len(nodes))
        consensus_graph = update_graph(nodes, observer_num_threshold, connect_threshold)
        nodes = cluster_into_new_nodes(iterate_id + 1, nodes, consensus_graph)
    return nodes
|
MaskClustering/graph/node.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import open3d as o3d
|
| 3 |
+
|
| 4 |
+
class Node:
    """A cluster of 2D masks treated as one candidate 3D object."""

    def __init__(self, mask_list, visible_frame, contained_mask, point_ids, node_info, son_node_info):
        """
        Args:
            mask_list: (frame_id, mask_id) pairs belonging to this cluster.
            visible_frame: one-hot vector; 1 where the node appears in a frame.
            contained_mask: one-hot vector; 1 where the node is contained by a mask.
            point_ids: ids of the node's 3D points.
            node_info: (iteration, index-in-iteration) pair, for debugging.
            son_node_info: node_info values merged into this node, for debugging.
        """
        self.mask_list = mask_list
        self.visible_frame = visible_frame
        self.contained_mask = contained_mask
        self.point_ids = point_ids
        self.node_info = node_info
        self.son_node_info = son_node_info

    @staticmethod
    def create_node_from_list(node_list, node_info):
        """Merge several nodes into one: union of masks, frames, containers, points."""
        merged_masks = []
        merged_points = set()
        son_infos = set()
        visible = torch.zeros(len(node_list[0].visible_frame), dtype=bool).cuda()
        contained = torch.zeros(len(node_list[0].contained_mask), dtype=bool).cuda()
        for member in node_list:
            merged_masks += member.mask_list
            visible = visible | member.visible_frame.bool()
            contained = contained | member.contained_mask.bool()
            merged_points = merged_points.union(member.point_ids)
            son_infos.add(member.node_info)
        return Node(merged_masks, visible.float(), contained.float(), merged_points, node_info, son_infos)

    def get_point_cloud(self, scene_points):
        """
        Return:
            pcld: open3d.geometry.PointCloud with this node's points.
            point_ids: list of int, the node's 3D point ids (same order as pcld).
        """
        point_ids = list(self.point_ids)
        cloud = o3d.geometry.PointCloud()
        cloud.points = o3d.utility.Vector3dVector(scene_points[point_ids])
        return cloud, point_ids
|
mvp.py
CHANGED
|
@@ -52,23 +52,45 @@ _METRIC3D_MODEL = None
|
|
| 52 |
_CLIP_MODEL = None
|
| 53 |
|
| 54 |
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
| 56 |
"""
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
| 60 |
"""
|
| 61 |
if os.path.exists(dst_path) and os.path.getsize(dst_path) > 0:
|
| 62 |
return dst_path
|
| 63 |
|
| 64 |
-
if gdown is None:
|
| 65 |
-
raise RuntimeError("Не найден пакет gdown. Добавь gdown в requirements.txt для загрузки весов из Google Drive.")
|
| 66 |
-
|
| 67 |
os.makedirs(os.path.dirname(dst_path), exist_ok=True)
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
return dst_path
|
| 73 |
|
| 74 |
|
|
@@ -83,13 +105,13 @@ def _init_models():
|
|
| 83 |
|
| 84 |
if _VGGT_MODEL is None:
|
| 85 |
print("Initializing and loading VGGT model...")
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
m.eval()
|
| 94 |
_VGGT_MODEL = m.to(device)
|
| 95 |
|
|
@@ -118,15 +140,9 @@ cropformer_name = "Mask2Former_hornet_3x_576d0b.pth"
|
|
| 118 |
def check_weights():
|
| 119 |
if not os.path.exists(os.path.join(MK_PATH, cropformer_name)):
|
| 120 |
print(f"Downloading {cropformer_name}...")
|
| 121 |
-
# Prefer HF cache over `wget` for Spaces compatibility.
|
| 122 |
-
cached = hf_hub_download(
|
| 123 |
-
repo_id="qqlu1992/Adobe_EntitySeg",
|
| 124 |
-
repo_type="dataset",
|
| 125 |
-
filename="CropFormer_model/Entity_Segmentation/Mask2Former_hornet_3x/Mask2Former_hornet_3x_576d0b.pth",
|
| 126 |
-
)
|
| 127 |
os.makedirs(MK_PATH, exist_ok=True)
|
| 128 |
dst = os.path.join(MK_PATH, cropformer_name)
|
| 129 |
-
|
| 130 |
print(f"Downloaded {cropformer_name}...")
|
| 131 |
else:
|
| 132 |
print(f"{cropformer_name} already exists...")
|
|
@@ -222,7 +238,7 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
|
|
| 222 |
|
| 223 |
# Scale alignment: scale = median(Depths_VGGT / Depths_Metric3D)
|
| 224 |
# We need to make sure we use valid depths (e.g. > 0) to avoid numerical issues
|
| 225 |
-
vggt_depth = predictions["depth"] # (B, H, W, 1) or similar
|
| 226 |
metric_depth = predictions["metric3d_depth"] # (B, 1, H, W) presumably
|
| 227 |
|
| 228 |
# Ensure shapes match for broadcasting or direct division
|
|
@@ -251,6 +267,9 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
|
|
| 251 |
valid_mask = (metric_depth > 1e-6) & (vggt_depth > 1e-6)
|
| 252 |
|
| 253 |
if valid_mask.sum() > 0:
|
|
|
|
|
|
|
|
|
|
| 254 |
ratio = metric_depth[valid_mask] / vggt_depth[valid_mask]
|
| 255 |
scale_factor = torch.median(ratio)
|
| 256 |
print(f"Computed scale factor (VGGT / Metric3D): {scale_factor.item():.4f}")
|
|
|
|
| 52 |
_CLIP_MODEL = None
|
| 53 |
|
| 54 |
|
| 55 |
+
# Google Drive file id of the CropFormer checkpoint (user-provided mirror).
_MASK2FORMER_GDRIVE_FILE_ID = "10G7s6bVMwN__bcrR2fBal3goo69Y5Do4"


def _ensure_mask2former_weights(dst_path: str) -> str:
    """
    Ensure Mask2Former/CropFormer weights exist at dst_path.
    Priority:
    1) Use existing file (if present)
    2) Download from Google Drive (user-provided link / file id)
    3) Fallback: download from HF dataset (qqlu1992/Adobe_EntitySeg)

    Returns dst_path once a non-empty weights file is in place.
    """
    # Reuse an already-downloaded, non-empty file.
    if os.path.exists(dst_path) and os.path.getsize(dst_path) > 0:
        return dst_path

    os.makedirs(os.path.dirname(dst_path), exist_ok=True)

    # Allow user override via local path
    # NOTE(review): this env-var override runs before the Google Drive step,
    # i.e. ahead of priority 2 listed in the docstring.
    override_path = os.environ.get("MASK2FORMER_WEIGHTS_PATH")
    if override_path and os.path.exists(override_path) and os.path.getsize(override_path) > 0:
        shutil.copyfile(override_path, dst_path)
        return dst_path

    # 2) Google Drive
    if gdown is not None:
        url = f"https://drive.google.com/uc?id={_MASK2FORMER_GDRIVE_FILE_ID}"
        out = gdown.download(url, dst_path, quiet=False)
        # gdown returns None on failure; also verify a non-empty file landed.
        if out is not None and os.path.exists(dst_path) and os.path.getsize(dst_path) > 0:
            return dst_path
        print("Warning: gdown download failed for Mask2Former weights; falling back to HF dataset...")
    else:
        print("Warning: gdown is not available; falling back to HF dataset for Mask2Former weights...")

    # 3) HF fallback
    # hf_hub_download returns a path inside the HF cache; copy it to dst_path.
    cached = hf_hub_download(
        repo_id="qqlu1992/Adobe_EntitySeg",
        repo_type="dataset",
        filename="CropFormer_model/Entity_Segmentation/Mask2Former_hornet_3x/Mask2Former_hornet_3x_576d0b.pth",
    )
    shutil.copyfile(cached, dst_path)
    return dst_path
|
| 95 |
|
| 96 |
|
|
|
|
| 105 |
|
| 106 |
if _VGGT_MODEL is None:
|
| 107 |
print("Initializing and loading VGGT model...")
|
| 108 |
+
# Prefer Hugging Face weights for VGGT
|
| 109 |
+
try:
|
| 110 |
+
m = VGGT.from_pretrained("facebook/VGGT-1B")
|
| 111 |
+
except Exception:
|
| 112 |
+
m = VGGT()
|
| 113 |
+
_URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
|
| 114 |
+
m.load_state_dict(torch.hub.load_state_dict_from_url(_URL))
|
| 115 |
m.eval()
|
| 116 |
_VGGT_MODEL = m.to(device)
|
| 117 |
|
|
|
|
| 140 |
def check_weights():
    """Download the CropFormer checkpoint into MK_PATH if it is not already there."""
    if not os.path.exists(os.path.join(MK_PATH, cropformer_name)):
        print(f"Downloading {cropformer_name}...")
        os.makedirs(MK_PATH, exist_ok=True)
        dst = os.path.join(MK_PATH, cropformer_name)
        # Delegates to the local-override / Google Drive / HF fallback chain.
        _ensure_mask2former_weights(dst)
        print(f"Downloaded {cropformer_name}...")
    else:
        print(f"{cropformer_name} already exists...")
|
|
|
|
| 238 |
|
| 239 |
# Scale alignment: scale = median(Depths_VGGT / Depths_Metric3D)
|
| 240 |
# We need to make sure we use valid depths (e.g. > 0) to avoid numerical issues
|
| 241 |
+
vggt_depth = predictions["depth"][0] # (B, H, W, 1) or similar
|
| 242 |
metric_depth = predictions["metric3d_depth"] # (B, 1, H, W) presumably
|
| 243 |
|
| 244 |
# Ensure shapes match for broadcasting or direct division
|
|
|
|
| 267 |
valid_mask = (metric_depth > 1e-6) & (vggt_depth > 1e-6)
|
| 268 |
|
| 269 |
if valid_mask.sum() > 0:
|
| 270 |
+
print(f"Valid mask shape: {valid_mask.shape}")
|
| 271 |
+
print(f"Metric depth shape: {metric_depth.shape}")
|
| 272 |
+
print(f"VGGT depth shape: {vggt_depth.shape}")
|
| 273 |
ratio = metric_depth[valid_mask] / vggt_depth[valid_mask]
|
| 274 |
scale_factor = torch.median(ratio)
|
| 275 |
print(f"Computed scale factor (VGGT / Metric3D): {scale_factor.item():.4f}")
|