Spaces:

TiH0
/

findit-backend

Running

App Files Community

TiH0 committed 24 days ago

Commit

3c5eb2e

verified ·

1 Parent(s): a2c796e

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -11

app.py CHANGED Viewed

@@ -1738,14 +1738,18 @@ def _yolo_world_find(image: Image.Image, text_query: str, threshold: float = 0.0
     detections.sort(key=lambda d: d["score"], reverse=True)
     return detections
-_ref_image_query_cache: dict = {}  # md5(ref_img) → yolo query, computed once
 def _yolo_world_find_by_image(frame: Image.Image, query_img: Image.Image, threshold: float = 0.01):
     """
-    Reference-image mode: caption the query image ONCE with Florence-2 (cached),
-    extract the core noun, use it as YOLO query on every frame.
     """
-    import torch, hashlib, io as _io
     buf = _io.BytesIO()
     query_img.save(buf, format="JPEG", quality=60)
@@ -1754,13 +1758,11 @@ def _yolo_world_find_by_image(frame: Image.Image, query_img: Image.Image, thresh
     if img_hash not in _ref_image_query_cache:
         try:
             proc, model = _load_florence()
-            q = query_img.copy()
-            q.thumbnail((256, 256))
             inputs = proc(text="<MORE_DETAILED_CAPTION>", images=q, return_tensors="pt")
             with torch.no_grad():
                 ids = model.generate(
-                    input_ids=inputs["input_ids"],
-                    pixel_values=inputs["pixel_values"],
                     max_new_tokens=30, num_beams=1, do_sample=False,
                 )
             caption = proc.batch_decode(ids, skip_special_tokens=True)[0].strip()
@@ -1769,9 +1771,39 @@ def _yolo_world_find_by_image(frame: Image.Image, query_img: Image.Image, thresh
         except Exception as e:
             print(f"[ref-image error] {e}")
             query = "object"
-        _ref_image_query_cache[img_hash] = query
-    return _yolo_world_find(frame, _ref_image_query_cache[img_hash], threshold)
 # ── F-C: store SigLIP embedding on upload ────────────────────────────────────
@@ -1946,7 +1978,7 @@ async def find_in_frame(
             detections = _yolo_world_find_by_image(frame_img, ref_img)
             # Get the actual noun Florence derived (cached after first call)
             buf = _refio.BytesIO(); ref_img.save(buf, format="JPEG", quality=60)
-            yolo_query = _ref_image_query_cache.get(hashlib.md5(buf.getvalue()).hexdigest(), "?")
         else:
             return {"found": False, "box": None, "label": "", "confidence": 0.0}
         print(f"[camera] {W}x{H} → {len(detections)} detections, top={round(detections[0]['score'],2) if detections else 'none'}")

     detections.sort(key=lambda d: d["score"], reverse=True)
     return detections
+_ref_image_query_cache: dict = {}   # md5 → {"query": str, "embedding": list}
+_CARD_LIKE_NOUNS = {"card","id","badge","pass","ticket","document","license","permit","certificate"}
 def _yolo_world_find_by_image(frame: Image.Image, query_img: Image.Image, threshold: float = 0.01):
     """
+    Hybrid reference-image finder:
+    1. Caption the ref image once with Florence (cached by md5).
+    2. Extract core noun.
+    3a. If noun is a flat/card-like item YOLO can't detect → SigLIP sliding-window similarity.
+    3b. Otherwise → YOLO-World with the noun.
     """
+    import torch, hashlib, io as _io, numpy as np
     buf = _io.BytesIO()
     query_img.save(buf, format="JPEG", quality=60)
     if img_hash not in _ref_image_query_cache:
         try:
             proc, model = _load_florence()
+            q = query_img.copy(); q.thumbnail((256, 256))
             inputs = proc(text="<MORE_DETAILED_CAPTION>", images=q, return_tensors="pt")
             with torch.no_grad():
                 ids = model.generate(
+                    input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"],
                     max_new_tokens=30, num_beams=1, do_sample=False,
                 )
             caption = proc.batch_decode(ids, skip_special_tokens=True)[0].strip()
         except Exception as e:
             print(f"[ref-image error] {e}")
             query = "object"
+        # Pre-compute SigLIP embedding of the ref image (used for sliding window)
+        ref_buf = _io.BytesIO()
+        query_img.save(ref_buf, format="JPEG")
+        ref_emb = _siglip_embed_image(ref_buf.getvalue())
+        _ref_image_query_cache[img_hash] = {"query": query, "embedding": ref_emb}
+    cached    = _ref_image_query_cache[img_hash]
+    query     = cached["query"]
+    ref_emb   = cached["embedding"]
+    # ── Card/flat items: SigLIP sliding window ─────────────────────────────
+    if query in _CARD_LIKE_NOUNS or query == "object":
+        W, H = frame.size
+        best_score, best_box = 0.0, None
+        # Try 3 scales × sliding windows
+        for scale in [0.25, 0.40, 0.60]:
+            ww, wh = max(60, int(W * scale)), max(40, int(H * scale))
+            step_x, step_y = max(20, ww // 3), max(20, wh // 3)
+            for x in range(0, W - ww + 1, step_x):
+                for y in range(0, H - wh + 1, step_y):
+                    patch = frame.crop((x, y, x + ww, y + wh))
+                    pb    = _io.BytesIO(); patch.save(pb, format="JPEG", quality=70)
+                    sim   = _cosine(ref_emb, _siglip_embed_image(pb.getvalue()))
+                    if sim > best_score:
+                        best_score, best_box = sim, [x, y, x + ww, y + wh]
+        print(f"[ref-image sliding] best_sim={round(best_score,3)}")
+        if best_score > 0.70 and best_box:
+            return [{"score": float(best_score), "box": best_box}]
+        return []
+    # ── Normal objects: YOLO-World ──────────────────────────────────────────
+    return _yolo_world_find(frame, query, threshold)
 # ── F-C: store SigLIP embedding on upload ────────────────────────────────────
             detections = _yolo_world_find_by_image(frame_img, ref_img)
             # Get the actual noun Florence derived (cached after first call)
             buf = _refio.BytesIO(); ref_img.save(buf, format="JPEG", quality=60)
+            yolo_query = (_ref_image_query_cache.get(hashlib.md5(buf.getvalue()).hexdigest()) or {}).get("query", "?")
         else:
             return {"found": False, "box": None, "label": "", "confidence": 0.0}
         print(f"[camera] {W}x{H} → {len(detections)} detections, top={round(detections[0]['score'],2) if detections else 'none'}")