TiH0 committed on
Commit
3c5eb2e
·
verified ·
1 Parent(s): a2c796e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -11
app.py CHANGED
@@ -1738,14 +1738,18 @@ def _yolo_world_find(image: Image.Image, text_query: str, threshold: float = 0.0
1738
  detections.sort(key=lambda d: d["score"], reverse=True)
1739
  return detections
1740
 
1741
- _ref_image_query_cache: dict = {} # md5(ref_img) → yolo query, computed once
 
1742
 
1743
  def _yolo_world_find_by_image(frame: Image.Image, query_img: Image.Image, threshold: float = 0.01):
1744
  """
1745
- Reference-image mode: caption the query image ONCE with Florence-2 (cached),
1746
- extract the core noun, use it as YOLO query on every frame.
 
 
 
1747
  """
1748
- import torch, hashlib, io as _io
1749
 
1750
  buf = _io.BytesIO()
1751
  query_img.save(buf, format="JPEG", quality=60)
@@ -1754,13 +1758,11 @@ def _yolo_world_find_by_image(frame: Image.Image, query_img: Image.Image, thresh
1754
  if img_hash not in _ref_image_query_cache:
1755
  try:
1756
  proc, model = _load_florence()
1757
- q = query_img.copy()
1758
- q.thumbnail((256, 256))
1759
  inputs = proc(text="<MORE_DETAILED_CAPTION>", images=q, return_tensors="pt")
1760
  with torch.no_grad():
1761
  ids = model.generate(
1762
- input_ids=inputs["input_ids"],
1763
- pixel_values=inputs["pixel_values"],
1764
  max_new_tokens=30, num_beams=1, do_sample=False,
1765
  )
1766
  caption = proc.batch_decode(ids, skip_special_tokens=True)[0].strip()
@@ -1769,9 +1771,39 @@ def _yolo_world_find_by_image(frame: Image.Image, query_img: Image.Image, thresh
1769
  except Exception as e:
1770
  print(f"[ref-image error] {e}")
1771
  query = "object"
1772
- _ref_image_query_cache[img_hash] = query
1773
 
1774
- return _yolo_world_find(frame, _ref_image_query_cache[img_hash], threshold)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1775
 
1776
 
1777
  # ── F-C: store SigLIP embedding on upload ────────────────────────────────────
@@ -1946,7 +1978,7 @@ async def find_in_frame(
1946
  detections = _yolo_world_find_by_image(frame_img, ref_img)
1947
  # Get the actual noun Florence derived (cached after first call)
1948
  buf = _refio.BytesIO(); ref_img.save(buf, format="JPEG", quality=60)
1949
- yolo_query = _ref_image_query_cache.get(hashlib.md5(buf.getvalue()).hexdigest(), "?")
1950
  else:
1951
  return {"found": False, "box": None, "label": "", "confidence": 0.0}
1952
  print(f"[camera] {W}x{H} β†’ {len(detections)} detections, top={round(detections[0]['score'],2) if detections else 'none'}")
 
1738
  detections.sort(key=lambda d: d["score"], reverse=True)
1739
  return detections
1740
 
1741
+ _ref_image_query_cache: dict = {} # md5 → {"query": str, "embedding": list}
1742
+ _CARD_LIKE_NOUNS = {"card","id","badge","pass","ticket","document","license","permit","certificate"}
1743
 
1744
  def _yolo_world_find_by_image(frame: Image.Image, query_img: Image.Image, threshold: float = 0.01):
1745
  """
1746
+ Hybrid reference-image finder:
1747
+ 1. Caption the ref image once with Florence (cached by md5).
1748
+ 2. Extract core noun.
1749
+ 3a. If noun is a flat/card-like item YOLO can't detect → SigLIP sliding-window similarity.
1750
+ 3b. Otherwise → YOLO-World with the noun.
1751
  """
1752
+ import torch, hashlib, io as _io, numpy as np
1753
 
1754
  buf = _io.BytesIO()
1755
  query_img.save(buf, format="JPEG", quality=60)
 
1758
  if img_hash not in _ref_image_query_cache:
1759
  try:
1760
  proc, model = _load_florence()
1761
+ q = query_img.copy(); q.thumbnail((256, 256))
 
1762
  inputs = proc(text="<MORE_DETAILED_CAPTION>", images=q, return_tensors="pt")
1763
  with torch.no_grad():
1764
  ids = model.generate(
1765
+ input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"],
 
1766
  max_new_tokens=30, num_beams=1, do_sample=False,
1767
  )
1768
  caption = proc.batch_decode(ids, skip_special_tokens=True)[0].strip()
 
1771
  except Exception as e:
1772
  print(f"[ref-image error] {e}")
1773
  query = "object"
 
1774
 
1775
+ # Pre-compute SigLIP embedding of the ref image (used for sliding window)
1776
+ ref_buf = _io.BytesIO()
1777
+ query_img.save(ref_buf, format="JPEG")
1778
+ ref_emb = _siglip_embed_image(ref_buf.getvalue())
1779
+ _ref_image_query_cache[img_hash] = {"query": query, "embedding": ref_emb}
1780
+
1781
+ cached = _ref_image_query_cache[img_hash]
1782
+ query = cached["query"]
1783
+ ref_emb = cached["embedding"]
1784
+
1785
+ # ── Card/flat items: SigLIP sliding window ─────────────────────────────
1786
+ if query in _CARD_LIKE_NOUNS or query == "object":
1787
+ W, H = frame.size
1788
+ best_score, best_box = 0.0, None
1789
+ # Try 3 scales Γ— sliding windows
1790
+ for scale in [0.25, 0.40, 0.60]:
1791
+ ww, wh = max(60, int(W * scale)), max(40, int(H * scale))
1792
+ step_x, step_y = max(20, ww // 3), max(20, wh // 3)
1793
+ for x in range(0, W - ww + 1, step_x):
1794
+ for y in range(0, H - wh + 1, step_y):
1795
+ patch = frame.crop((x, y, x + ww, y + wh))
1796
+ pb = _io.BytesIO(); patch.save(pb, format="JPEG", quality=70)
1797
+ sim = _cosine(ref_emb, _siglip_embed_image(pb.getvalue()))
1798
+ if sim > best_score:
1799
+ best_score, best_box = sim, [x, y, x + ww, y + wh]
1800
+ print(f"[ref-image sliding] best_sim={round(best_score,3)}")
1801
+ if best_score > 0.70 and best_box:
1802
+ return [{"score": float(best_score), "box": best_box}]
1803
+ return []
1804
+
1805
+ # ── Normal objects: YOLO-World ──────────────────────────────────────────
1806
+ return _yolo_world_find(frame, query, threshold)
1807
 
1808
 
1809
  # ── F-C: store SigLIP embedding on upload ────────────────────────────────────
 
1978
  detections = _yolo_world_find_by_image(frame_img, ref_img)
1979
  # Get the actual noun Florence derived (cached after first call)
1980
  buf = _refio.BytesIO(); ref_img.save(buf, format="JPEG", quality=60)
1981
+ yolo_query = (_ref_image_query_cache.get(hashlib.md5(buf.getvalue()).hexdigest()) or {}).get("query", "?")
1982
  else:
1983
  return {"found": False, "box": None, "label": "", "confidence": 0.0}
1984
  print(f"[camera] {W}x{H} β†’ {len(detections)} detections, top={round(detections[0]['score'],2) if detections else 'none'}")