Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1738,14 +1738,18 @@ def _yolo_world_find(image: Image.Image, text_query: str, threshold: float = 0.0
|
|
| 1738 |
detections.sort(key=lambda d: d["score"], reverse=True)
|
| 1739 |
return detections
|
| 1740 |
|
| 1741 |
-
_ref_image_query_cache: dict = {}
|
|
|
|
| 1742 |
|
| 1743 |
def _yolo_world_find_by_image(frame: Image.Image, query_img: Image.Image, threshold: float = 0.01):
|
| 1744 |
"""
|
| 1745 |
-
|
| 1746 |
-
|
|
|
|
|
|
|
|
|
|
| 1747 |
"""
|
| 1748 |
-
import torch, hashlib, io as _io
|
| 1749 |
|
| 1750 |
buf = _io.BytesIO()
|
| 1751 |
query_img.save(buf, format="JPEG", quality=60)
|
|
@@ -1754,13 +1758,11 @@ def _yolo_world_find_by_image(frame: Image.Image, query_img: Image.Image, thresh
|
|
| 1754 |
if img_hash not in _ref_image_query_cache:
|
| 1755 |
try:
|
| 1756 |
proc, model = _load_florence()
|
| 1757 |
-
q = query_img.copy()
|
| 1758 |
-
q.thumbnail((256, 256))
|
| 1759 |
inputs = proc(text="<MORE_DETAILED_CAPTION>", images=q, return_tensors="pt")
|
| 1760 |
with torch.no_grad():
|
| 1761 |
ids = model.generate(
|
| 1762 |
-
input_ids=inputs["input_ids"],
|
| 1763 |
-
pixel_values=inputs["pixel_values"],
|
| 1764 |
max_new_tokens=30, num_beams=1, do_sample=False,
|
| 1765 |
)
|
| 1766 |
caption = proc.batch_decode(ids, skip_special_tokens=True)[0].strip()
|
|
@@ -1769,9 +1771,39 @@ def _yolo_world_find_by_image(frame: Image.Image, query_img: Image.Image, thresh
|
|
| 1769 |
except Exception as e:
|
| 1770 |
print(f"[ref-image error] {e}")
|
| 1771 |
query = "object"
|
| 1772 |
-
_ref_image_query_cache[img_hash] = query
|
| 1773 |
|
| 1774 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1775 |
|
| 1776 |
|
| 1777 |
# ── F-C: store SigLIP embedding on upload ────────────────────────────────────
|
|
@@ -1946,7 +1978,7 @@ async def find_in_frame(
|
|
| 1946 |
detections = _yolo_world_find_by_image(frame_img, ref_img)
|
| 1947 |
# Get the actual noun Florence derived (cached after first call)
|
| 1948 |
buf = _refio.BytesIO(); ref_img.save(buf, format="JPEG", quality=60)
|
| 1949 |
-
yolo_query = _ref_image_query_cache.get(hashlib.md5(buf.getvalue()).hexdigest(), "?")
|
| 1950 |
else:
|
| 1951 |
return {"found": False, "box": None, "label": "", "confidence": 0.0}
|
| 1952 |
print(f"[camera] {W}x{H} β {len(detections)} detections, top={round(detections[0]['score'],2) if detections else 'none'}")
|
|
|
|
| 1738 |
detections.sort(key=lambda d: d["score"], reverse=True)
|
| 1739 |
return detections
|
| 1740 |
|
| 1741 |
+
_ref_image_query_cache: dict = {} # md5 → {"query": str, "embedding": list}
|
| 1742 |
+
_CARD_LIKE_NOUNS = {"card","id","badge","pass","ticket","document","license","permit","certificate"}
|
| 1743 |
|
| 1744 |
def _yolo_world_find_by_image(frame: Image.Image, query_img: Image.Image, threshold: float = 0.01):
|
| 1745 |
"""
|
| 1746 |
+
Hybrid reference-image finder:
|
| 1747 |
+
1. Caption the ref image once with Florence (cached by md5).
|
| 1748 |
+
2. Extract core noun.
|
| 1749 |
+
3a. If noun is a flat/card-like item YOLO can't detect → SigLIP sliding-window similarity.
|
| 1750 |
+
3b. Otherwise → YOLO-World with the noun.
|
| 1751 |
"""
|
| 1752 |
+
import torch, hashlib, io as _io, numpy as np
|
| 1753 |
|
| 1754 |
buf = _io.BytesIO()
|
| 1755 |
query_img.save(buf, format="JPEG", quality=60)
|
|
|
|
| 1758 |
if img_hash not in _ref_image_query_cache:
|
| 1759 |
try:
|
| 1760 |
proc, model = _load_florence()
|
| 1761 |
+
q = query_img.copy(); q.thumbnail((256, 256))
|
|
|
|
| 1762 |
inputs = proc(text="<MORE_DETAILED_CAPTION>", images=q, return_tensors="pt")
|
| 1763 |
with torch.no_grad():
|
| 1764 |
ids = model.generate(
|
| 1765 |
+
input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"],
|
|
|
|
| 1766 |
max_new_tokens=30, num_beams=1, do_sample=False,
|
| 1767 |
)
|
| 1768 |
caption = proc.batch_decode(ids, skip_special_tokens=True)[0].strip()
|
|
|
|
| 1771 |
except Exception as e:
|
| 1772 |
print(f"[ref-image error] {e}")
|
| 1773 |
query = "object"
|
|
|
|
| 1774 |
|
| 1775 |
+
# Pre-compute SigLIP embedding of the ref image (used for sliding window)
|
| 1776 |
+
ref_buf = _io.BytesIO()
|
| 1777 |
+
query_img.save(ref_buf, format="JPEG")
|
| 1778 |
+
ref_emb = _siglip_embed_image(ref_buf.getvalue())
|
| 1779 |
+
_ref_image_query_cache[img_hash] = {"query": query, "embedding": ref_emb}
|
| 1780 |
+
|
| 1781 |
+
cached = _ref_image_query_cache[img_hash]
|
| 1782 |
+
query = cached["query"]
|
| 1783 |
+
ref_emb = cached["embedding"]
|
| 1784 |
+
|
| 1785 |
+
# ── Card/flat items: SigLIP sliding window ─────────────────────────────
|
| 1786 |
+
if query in _CARD_LIKE_NOUNS or query == "object":
|
| 1787 |
+
W, H = frame.size
|
| 1788 |
+
best_score, best_box = 0.0, None
|
| 1789 |
+
# Try 3 scales × sliding windows
|
| 1790 |
+
for scale in [0.25, 0.40, 0.60]:
|
| 1791 |
+
ww, wh = max(60, int(W * scale)), max(40, int(H * scale))
|
| 1792 |
+
step_x, step_y = max(20, ww // 3), max(20, wh // 3)
|
| 1793 |
+
for x in range(0, W - ww + 1, step_x):
|
| 1794 |
+
for y in range(0, H - wh + 1, step_y):
|
| 1795 |
+
patch = frame.crop((x, y, x + ww, y + wh))
|
| 1796 |
+
pb = _io.BytesIO(); patch.save(pb, format="JPEG", quality=70)
|
| 1797 |
+
sim = _cosine(ref_emb, _siglip_embed_image(pb.getvalue()))
|
| 1798 |
+
if sim > best_score:
|
| 1799 |
+
best_score, best_box = sim, [x, y, x + ww, y + wh]
|
| 1800 |
+
print(f"[ref-image sliding] best_sim={round(best_score,3)}")
|
| 1801 |
+
if best_score > 0.70 and best_box:
|
| 1802 |
+
return [{"score": float(best_score), "box": best_box}]
|
| 1803 |
+
return []
|
| 1804 |
+
|
| 1805 |
+
# ── Normal objects: YOLO-World ──────────────────────────────────────────
|
| 1806 |
+
return _yolo_world_find(frame, query, threshold)
|
| 1807 |
|
| 1808 |
|
| 1809 |
# ── F-C: store SigLIP embedding on upload ────────────────────────────────────
|
|
|
|
| 1978 |
detections = _yolo_world_find_by_image(frame_img, ref_img)
|
| 1979 |
# Get the actual noun Florence derived (cached after first call)
|
| 1980 |
buf = _refio.BytesIO(); ref_img.save(buf, format="JPEG", quality=60)
|
| 1981 |
+
yolo_query = (_ref_image_query_cache.get(hashlib.md5(buf.getvalue()).hexdigest()) or {}).get("query", "?")
|
| 1982 |
else:
|
| 1983 |
return {"found": False, "box": None, "label": "", "confidence": 0.0}
|
| 1984 |
print(f"[camera] {W}x{H} β {len(detections)} detections, top={round(detections[0]['score'],2) if detections else 'none'}")
|