Spaces:

yifehuang97
/

CountEx

Sleeping

App Files Files Community

yifehuang97 committed on Nov 25, 2025

Commit

5a0ba26

1 Parent(s): 64a43aa

(feat) semantic post processing

Browse files

Files changed (2) hide show

app.py +50 -69
utils.py +32 -0

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import torch
 from PIL import Image, ImageDraw
 from transformers import GroundingDinoProcessor
 from hf_model import CountEX
-from utils import post_process_grounded_object_detection
 # Global variables for model and processor
 model = None
@@ -75,25 +75,27 @@ def filter_points_by_negative(points, neg_points, image_size, pixel_threshold=5)
     return filtered_points, filtered_indices
 def discriminative_point_suppression(
     points,
     neg_points,
-    pos_queries,
-    neg_queries,
     image_size,
     pixel_threshold=5,
-    similarity_threshold=0.5,
-    mode="and"
 ):
     """
     Discriminative Point Suppression (DPS):
-    Suppress positive predictions that are both spatially close to
-    AND semantically similar with negative predictions.
-    Motivation: Spatial proximity alone may cause false suppression when
-    positive and negative queries represent different semantic concepts.
-    By jointly verifying spatial AND semantic alignment, we ensure
-    suppression only occurs for true conflicts.
     Args:
         points: List of [x, y] positive points (normalized, 0-1)
@@ -102,13 +104,12 @@ def discriminative_point_suppression(
         neg_queries: (M, D) query embeddings for negative predictions
         image_size: (width, height) in pixels
         pixel_threshold: spatial distance threshold in pixels
-        similarity_threshold: cosine similarity threshold for semantic match
-        mode: "and" for hard joint condition, "weighted" for soft combination
     Returns:
         filtered_points: points after suppression
         filtered_indices: indices of kept points
-        suppression_info: dict with detailed suppression decisions (for analysis)
     """
     if not neg_points or not points:
         return points, list(range(len(points))), {}
@@ -116,74 +117,53 @@ def discriminative_point_suppression(
     width, height = image_size
     N, M = len(points), len(neg_points)
-    # === Spatial Distance ===
     points_arr = np.array(points) * np.array([width, height])  # (N, 2)
     neg_points_arr = np.array(neg_points) * np.array([width, height])  # (M, 2)
     spatial_dist = np.linalg.norm(
         points_arr[:, None, :] - neg_points_arr[None, :, :], axis=-1
     )  # (N, M)
-    # === Query Similarity (Cosine) ===
     # Normalize queries
     pos_q = pos_queries / (np.linalg.norm(pos_queries, axis=-1, keepdims=True) + 1e-8)
     neg_q = neg_queries / (np.linalg.norm(neg_queries, axis=-1, keepdims=True) + 1e-8)
-    query_sim = np.dot(pos_q, neg_q.T)  # (N, M), range [-1, 1]
-    # === Joint Suppression Decision ===
-    if mode == "and":
-        # Hard condition: suppress only if BOTH spatially close AND semantically similar
-        spatial_close = spatial_dist < pixel_threshold  # (N, M)
-        semantic_similar = query_sim > similarity_threshold  # (N, M)
-        # A positive is suppressed if ANY negative satisfies both conditions
-        should_suppress = (spatial_close & semantic_similar).any(axis=1)  # (N,)
-    elif mode == "weighted":
-        # Soft combination: weighted score
-        # Convert distance to proximity score (0-1, higher = closer)
-        spatial_proximity = np.exp(-spatial_dist / pixel_threshold)  # (N, M)
-        # Normalize similarity to [0, 1]
-        semantic_score = (query_sim + 1) / 2  # (N, M)
-        # Combined suppression score
-        suppression_score = spatial_proximity * semantic_score  # (N, M)
-        max_suppression = suppression_score.max(axis=1)  # (N,)
-        should_suppress = max_suppression > similarity_threshold
-    else:
-        raise ValueError(f"Unknown mode: {mode}")
     # === Filter ===
     keep_mask = ~should_suppress
     filtered_points = np.array(points)[keep_mask].tolist()
     filtered_indices = np.where(keep_mask)[0].tolist()
-    # === Suppression Info (for analysis/visualization) ===
     suppression_info = {
-        "spatial_dist": spatial_dist,
-        "query_similarity": query_sim,
         "suppressed_indices": np.where(should_suppress)[0].tolist(),
-        "suppressed_reasons": []
     }
-    # Record why each point was suppressed
-    for i in np.where(should_suppress)[0]:
-        if mode == "and":
-            matching_negs = np.where(spatial_close[i] & semantic_similar[i])[0]
-        else:
-            matching_negs = [suppression_score[i].argmax()]
-        suppression_info["suppressed_reasons"].append({
-            "pos_idx": int(i),
-            "matched_neg_idx": matching_negs.tolist() if isinstance(matching_negs, np.ndarray) else matching_negs,
-            "min_spatial_dist": float(spatial_dist[i].min()),
-            "max_query_sim": float(query_sim[i].max())
-        })
     return filtered_points, filtered_indices, suppression_info
 def count_objects(image, pos_caption, neg_caption, box_threshold, point_radius, point_color):
@@ -259,7 +239,13 @@ def count_objects(image, pos_caption, neg_caption, box_threshold, point_radius,
     outputs["pred_logits"] = outputs["logits"]
     threshold = box_threshold if box_threshold > 0 else model.box_threshold
-    results = post_process_grounded_object_detection(outputs, box_threshold=threshold)[0]
     boxes = results["boxes"]
     boxes = [box.tolist() for box in boxes]
@@ -273,17 +259,13 @@ def count_objects(image, pos_caption, neg_caption, box_threshold, point_radius,
         neg_outputs["pred_points"] = outputs["neg_pred_boxes"][:, :, :2]
         neg_outputs["pred_logits"] = outputs["neg_logits"]
-        neg_results = post_process_grounded_object_detection(neg_outputs, box_threshold=threshold)[0]
         neg_boxes = neg_results["boxes"]
         neg_boxes = [box.tolist() for box in neg_boxes]
         neg_points = [[box[0], box[1]] for box in neg_boxes]
-    pos_queries = outputs["pos_queries"].squeeze(0).float()
-    neg_queries = outputs["neg_queries"].squeeze(0).float()
-    pos_queries = pos_queries[-1].squeeze(0)
-    neg_queries = neg_queries[-1].squeeze(0)
-    pos_queries = pos_queries.cpu().numpy()
-    neg_queries = neg_queries.cpu().numpy()
     img_size = image.size
     # filtered_points, kept_indices = filter_points_by_negative(
@@ -299,8 +281,7 @@ def count_objects(image, pos_caption, neg_caption, box_threshold, point_radius,
         neg_queries,
         image_size=img_size,
         pixel_threshold=5,
-        similarity_threshold=0.5,
-        mode="and"
     )
     filtered_boxes = [boxes[i] for i in kept_indices]

 from PIL import Image, ImageDraw
 from transformers import GroundingDinoProcessor
 from hf_model import CountEX
+from utils import post_process_grounded_object_detection, post_process_grounded_object_detection_with_queries
 # Global variables for model and processor
 model = None
     return filtered_points, filtered_indices
import numpy as np


def discriminative_point_suppression(
    points,
    neg_points,
    pos_queries,      # (N, D) numpy array
    neg_queries,      # (M, D) numpy array
    image_size,
    pixel_threshold=5,
    similarity_threshold=0.3,
):
    """
    Discriminative Point Suppression (DPS).

    Step 1: find the spatially closest negative point for each positive point.
    Step 2: if that distance is < pixel_threshold, look at the cosine
            similarity between the positive query and the query of that
            nearest negative.
    Step 3: suppress the positive only if the similarity is also
            > similarity_threshold.

    Only the single nearest negative is considered for each positive; a
    positive is kept whenever either the spatial or the semantic condition
    fails for that pair.

    Args:
        points: sequence of N [x, y] positive points (normalized, 0-1)
        neg_points: sequence of M [x, y] negative points (normalized, 0-1)
        pos_queries: (N, D) query embeddings, row i belongs to points[i]
        neg_queries: (M, D) query embeddings, row j belongs to neg_points[j]
        image_size: (width, height) in pixels
        pixel_threshold: spatial distance threshold in pixels (strict <)
        similarity_threshold: cosine similarity threshold (strict >)

    Returns:
        filtered_points: points after suppression (list of [x, y])
        filtered_indices: indices into `points` of the kept points
        suppression_info: dict with per-positive diagnostics
            (nearest_neg_idx, nearest_neg_dist, query_similarity,
            spatially_close, semantically_similar) plus suppressed_indices.
            Empty dict when there was nothing to compare.

    Raises:
        ValueError: if pos_queries / neg_queries are not 2-D or their row
            count does not match the number of points / neg_points.
    """
    # Use len() rather than truthiness so numpy arrays are accepted as well
    # as lists (``not ndarray`` raises for arrays with more than one element).
    num_points = 0 if points is None else len(points)
    num_neg = 0 if neg_points is None else len(neg_points)
    if num_points == 0 or num_neg == 0:
        # Nothing to suppress (or nothing to suppress against): keep all.
        return points, list(range(num_points)), {}

    pos_queries = np.asarray(pos_queries)
    neg_queries = np.asarray(neg_queries)
    # Enforce the documented (N, D) / (M, D) contract. Without this check a
    # (1, D) pos_queries would silently broadcast against every point and
    # extra neg_queries rows would be silently ignored.
    if pos_queries.ndim != 2 or pos_queries.shape[0] != num_points:
        raise ValueError(
            f"pos_queries must have shape ({num_points}, D) to match points, "
            f"got {pos_queries.shape}"
        )
    if neg_queries.ndim != 2 or neg_queries.shape[0] != num_neg:
        raise ValueError(
            f"neg_queries must have shape ({num_neg}, D) to match neg_points, "
            f"got {neg_queries.shape}"
        )

    width, height = image_size

    # === Step 1: Spatial Matching ===
    # Convert normalized coordinates to pixels before measuring distance.
    scale = np.array([width, height])
    points_arr = np.asarray(points) * scale          # (N, 2)
    neg_points_arr = np.asarray(neg_points) * scale  # (M, 2)

    # Pairwise Euclidean distances in pixels.
    spatial_dist = np.linalg.norm(
        points_arr[:, None, :] - neg_points_arr[None, :, :], axis=-1
    )  # (N, M)

    # Nearest negative for each positive.
    nearest_neg_idx = spatial_dist.argmin(axis=1)  # (N,)
    nearest_neg_dist = spatial_dist.min(axis=1)    # (N,)
    spatially_close = nearest_neg_dist < pixel_threshold  # (N,)

    # === Step 2: Query Similarity Check ===
    # L2-normalize; the epsilon guards against division by zero for
    # all-zero embeddings.
    pos_q = pos_queries / (np.linalg.norm(pos_queries, axis=-1, keepdims=True) + 1e-8)
    neg_q = neg_queries / (np.linalg.norm(neg_queries, axis=-1, keepdims=True) + 1e-8)

    # Cosine similarity between each positive and its nearest negative only.
    matched_neg_q = neg_q[nearest_neg_idx]            # (N, D)
    query_sim = (pos_q * matched_neg_q).sum(axis=-1)  # (N,)
    semantically_similar = query_sim > similarity_threshold  # (N,)

    # === Step 3: Joint Decision ===
    # Suppress only if BOTH conditions are met.
    should_suppress = spatially_close & semantically_similar  # (N,)

    # === Filter ===
    keep_mask = ~should_suppress
    filtered_points = np.asarray(points)[keep_mask].tolist()
    filtered_indices = np.where(keep_mask)[0].tolist()

    # === Suppression Info ===
    suppression_info = {
        "nearest_neg_idx": nearest_neg_idx.tolist(),
        "nearest_neg_dist": nearest_neg_dist.tolist(),
        "query_similarity": query_sim.tolist(),
        "spatially_close": spatially_close.tolist(),
        "semantically_similar": semantically_similar.tolist(),
        "suppressed_indices": np.where(should_suppress)[0].tolist(),
    }
    return filtered_points, filtered_indices, suppression_info
 def count_objects(image, pos_caption, neg_caption, box_threshold, point_radius, point_color):
     outputs["pred_logits"] = outputs["logits"]
     threshold = box_threshold if box_threshold > 0 else model.box_threshold
+    pos_queries = outputs["pos_queries"].squeeze(0).float()
+    neg_queries = outputs["neg_queries"].squeeze(0).float()
+    pos_queries = pos_queries[-1].squeeze(0)
+    neg_queries = neg_queries[-1].squeeze(0)
+    pos_queries = pos_queries.cpu().numpy()
+    neg_queries = neg_queries.cpu().numpy()
+    results = post_process_grounded_object_detection_with_queries(outputs, pos_queries, box_threshold=threshold)[0]
     boxes = results["boxes"]
     boxes = [box.tolist() for box in boxes]
         neg_outputs["pred_points"] = outputs["neg_pred_boxes"][:, :, :2]
         neg_outputs["pred_logits"] = outputs["neg_logits"]
+        neg_results = post_process_grounded_object_detection_with_queries(neg_outputs, neg_queries, box_threshold=threshold)[0]
         neg_boxes = neg_results["boxes"]
         neg_boxes = [box.tolist() for box in neg_boxes]
         neg_points = [[box[0], box[1]] for box in neg_boxes]
+    pos_queries = results["queries"]
+    neg_queries = neg_results["queries"]
     img_size = image.size
     # filtered_points, kept_indices = filter_points_by_negative(
         neg_queries,
         image_size=img_size,
         pixel_threshold=5,
+        similarity_threshold=0.25,
     )
     filtered_boxes = [boxes[i] for i in kept_indices]

utils.py CHANGED Viewed

@@ -45,6 +45,38 @@ def post_process_grounded_object_detection(
     return results
 class collator:
     def __init__(self, processor=None, use_negative=True):

     return results
def post_process_grounded_object_detection_with_queries(
    outputs,
    queries,
    box_threshold: float = 0.4,
):
    """
    Post-process grounded object detection outputs and keep the query
    embedding that belongs to every surviving prediction.

    For each image in the batch, a prediction is kept when the maximum
    sigmoid probability over the last logits dimension is strictly greater
    than ``box_threshold``.

    Args:
        outputs: object exposing ``logits`` and ``pred_boxes`` attributes,
            both indexed as (batch, num_queries, ...).
        queries: per-prediction embeddings, indexed as
            (batch, num_queries, D); a torch tensor or a numpy-style array.
            An un-batched (num_queries, D) input is accepted when the batch
            size is 1. ``None`` skips query handling.
        box_threshold: score threshold for keeping a prediction.

    Returns:
        A list with one dict per image containing ``"scores"`` and
        ``"boxes"`` for the kept predictions, plus ``"queries"``
        (num_kept, D) when ``queries`` is not None.

    Raises:
        ValueError: if ``queries`` does not line up with the logits'
            batch size / number of queries.
    """
    logits, boxes = outputs.logits, outputs.pred_boxes
    batch_size = len(logits)
    num_queries = logits.shape[1]

    if queries is not None:
        # Convenience for single-image callers that already squeezed the
        # batch axis away: treat (num_queries, D) as a batch of one.
        if len(queries.shape) == 2 and batch_size == 1 and queries.shape[0] == num_queries:
            queries = queries[None]
        if (
            len(queries.shape) < 2
            or queries.shape[0] != batch_size
            or queries.shape[1] != num_queries
        ):
            raise ValueError(
                "logits and queries must agree on batch size and number of "
                f"queries: logits {tuple(logits.shape)}, queries {tuple(queries.shape)}"
            )

    probs = torch.sigmoid(logits)         # (batch_size, num_queries, C)
    scores = torch.max(probs, dim=-1)[0]  # (batch_size, num_queries)

    results = []
    for idx, (image_scores, image_boxes) in enumerate(zip(scores, boxes)):
        keep = image_scores > box_threshold
        result = {"scores": image_scores[keep], "boxes": image_boxes[keep]}

        # Save the matching query embeddings. The same mask is applied to
        # scores, boxes and queries, so their lengths always agree.
        if queries is not None:
            image_queries = queries[idx]
            if isinstance(image_queries, torch.Tensor):
                result["queries"] = image_queries[keep.to(image_queries.device)]
            else:
                # numpy-style array: index with a numpy boolean mask
                result["queries"] = image_queries[keep.detach().cpu().numpy()]

        results.append(result)
    return results
 class collator:
     def __init__(self, processor=None, use_negative=True):