#!/usr/bin/env python3
"""
query_vision_video.py
======================
Query HF-native video_index using a text prompt.
Each row is one embedded frame at a specific second.
Adjacent high-scoring frames are merged into contiguous time ranges.
Usage:
python query_vision_video.py "rhino running in the wild"
python query_vision_video.py "person waving" --top 10 --min-score 0.15
"""
import sys
import json
from pathlib import Path
from config import DEFAULT_PROJECT, EMBED_MODEL, EMBED_DIM
from vector_store import get_store
from embedding import embed_text
from search import _merge_video_hits, _fmt as fmt
# Default number of raw frame hits requested from the vector store per query.
TOP_K = 30
# Default minimum score a frame hit needs to be kept by search_video.
MIN_SCORE = 0.15 # Adjusted for HF-native CLIP/Qwen scores
# Passed as `gap` to _merge_video_hits when frame hits are merged into time
# ranges (presumably seconds, per the name and the "s" suffix printed next to it).
MERGE_GAP_SEC = 10
def search_video(
    query: str,
    top_k: int = TOP_K,
    min_score: float = MIN_SCORE,
    merge_gap: float = MERGE_GAP_SEC,
) -> None:
    """Search the video index with a text query and print the matching ranges.

    The query is embedded with ``embed_text`` and the ``video_index`` store of
    ``DEFAULT_PROJECT`` is searched for the ``top_k`` best frame hits.  Hits
    whose ``score`` is below ``min_score`` are dropped (a hit without a
    ``score`` key counts as 0) and the remainder is merged into time ranges by
    ``_merge_video_hits``.  A table and a JSON document describing the ranges
    are printed to stdout.

    Args:
        query: Text prompt to search for.
        top_k: Number of raw frame hits requested from the store.
        min_score: Minimum score a hit needs in order to be kept.
        merge_gap: Forwarded as ``gap`` to ``_merge_video_hits``; defaults to
            ``MERGE_GAP_SEC`` so existing callers behave as before.

    Returns:
        None.  Everything is reported on stdout; the function returns early
        when the store yields no results or no hit reaches ``min_score``.
    """
    # NOTE(review): the glyphs below (—, ✓, →, ⚠, ─) replace mis-decoded "β"
    # placeholders in the user-facing strings; confirm they match the
    # characters originally intended.
    banner_rule = "=" * 60
    table_rule = "─" * 62

    print(f"\n{banner_rule}")
    print(" ARIA Vision — Video Intelligence Search (HF-Native)")
    print(banner_rule)
    print(f' Query : "{query}"')
    print(f" Model : {EMBED_MODEL} ({EMBED_DIM}d)")
    print(f" Min score : {min_score} | Merge gap: {merge_gap}s | Fetch top: {top_k}")
    print()

    print(" [1/3] Embedding query...", end=" ", flush=True)
    qvec = embed_text(query)
    print("✓")

    print(" [2/3] Searching video_index...", end=" ", flush=True)
    store = get_store(DEFAULT_PROJECT, "video_index")
    raw_results = store.search(qvec, top_k=top_k)
    if not raw_results:
        print("no results.\n → Run ingest_sample_vision.py first.")
        return
    print(f"✓ ({len(raw_results)} raw frames returned)")

    hits = [r for r in raw_results if r.get("score", 0) >= min_score]
    if not hits:
        # Show the best raw scores so the caller can pick a workable threshold.
        best_scores = sorted(
            (r.get("score", 0) for r in raw_results), reverse=True
        )[:3]
        print(f"\n ⚠ No frames above score threshold ({min_score}).")
        print(f" Top 3 raw scores: {[round(score, 4) for score in best_scores]}")
        return

    print(f" [3/3] Merging {len(hits)} hits into time ranges...")
    spans = _merge_video_hits(hits, gap=merge_gap)

    print()
    print(f" {table_rule}")
    print(f" {'#':<4} {'Video':<24} {'Time Range':<16} {'Duration':<9} {'Frames':<7} {'Score'}")
    print(f" {table_rule}")
    for rank, span in enumerate(spans, start=1):
        duration = span["end_sec"] - span["start_sec"]
        print(
            f" {rank:<4} {span['video_name'][:23]:<24} "
            f"{fmt(span['start_sec'])} → {fmt(span['end_sec']):<9} "
            f"{duration:4.0f}s "
            f"{span['frames']:<7} "
            f"{span['peak_score']:.4f}"
        )
    print(f" {table_rule}")

    output = {
        "mode": "Video Intelligence",
        "query": query,
        "matches": [
            {
                "video_name": span["video_name"],
                "video_path": span.get("video_path", ""),
                "start": fmt(span["start_sec"]),
                "end": fmt(span["end_sec"]),
                "start_seconds": span["start_sec"],
                "end_seconds": span["end_sec"],
                "score": round(span["peak_score"], 4),
                "frames_matched": span["frames"],
            }
            for span in spans
        ],
    }
    print()
    print(" JSON Response:")
    print(f" {json.dumps(output, indent=2)}")
def main() -> None:
    """Parse the command line and run ``search_video``.

    Positional tokens are joined with single spaces to form the query.
    ``--top N`` and ``--min-score X`` each consume the token that follows
    them, so an option's value is never mistaken for a query word.  Any other
    token starting with ``--`` is ignored.  When an option is repeated, the
    last occurrence wins.

    Exits with status 1 when no query is given, when an option is missing its
    value, or when a value cannot be converted to a number.
    """
    converters = {"--top": int, "--min-score": float}
    overrides = {}
    query_words = []

    # Walk the arguments once so option values can be consumed together with
    # the option that introduces them.
    tokens = iter(sys.argv[1:])
    for token in tokens:
        convert = converters.get(token)
        if convert is None:
            if not token.startswith("--"):
                query_words.append(token)
            continue
        raw_value = next(tokens, None)
        if raw_value is None:
            print(f"Error: {token} requires a value.", file=sys.stderr)
            sys.exit(1)
        try:
            overrides[token] = convert(raw_value)
        except ValueError:
            print(f"Error: invalid value for {token}: {raw_value!r}", file=sys.stderr)
            sys.exit(1)

    if not query_words:
        print('Usage: python query_vision_video.py "your query"')
        sys.exit(1)

    search_video(
        " ".join(query_words),
        top_k=overrides.get("--top", TOP_K),
        min_score=overrides.get("--min-score", MIN_SCORE),
    )


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|