| |
| """ |
| query_vision_video.py |
| ====================== |
| Query HF-native video_index using a text prompt. |
| Each row is one embedded frame at a specific second. |
| Adjacent high-scoring frames are merged into contiguous time ranges. |
| |
| Usage: |
| python query_vision_video.py "rhino running in the wild" |
| python query_vision_video.py "person waving" --top 10 --min-score 0.15 |
| """ |
|
|
| import sys |
| import json |
| from pathlib import Path |
|
|
| from config import DEFAULT_PROJECT, EMBED_MODEL, EMBED_DIM |
| from vector_store import get_store |
| from embedding import embed_text |
| from search import _merge_video_hits, _fmt as fmt |
|
|
# Default number of raw frame hits requested from the vector store
# (the default for `top_k` in search_video, overridable with --top).
TOP_K = 30
# Default minimum "score" a frame hit must reach to be kept
# (the default for `min_score` in search_video, overridable with --min-score).
MIN_SCORE = 0.15
# Passed as `gap` to _merge_video_hits. Presumably the largest distance, in
# seconds, between two hits that are still merged into one time range --
# TODO confirm against search._merge_video_hits.
MERGE_GAP_SEC = 10
|
|
def search_video(query: str, top_k: int = TOP_K, min_score: float = MIN_SCORE):
    """Run a text query against the video frame index and print the matches.

    Steps, each reported on stdout as it happens:

    1. Embed ``query`` with ``embed_text``.
    2. Ask the ``"video_index"`` store of ``DEFAULT_PROJECT`` for the
       ``top_k`` closest rows.
    3. Keep rows whose ``"score"`` (treated as 0 when absent) is at least
       ``min_score`` and merge them with
       ``_merge_video_hits(hits, gap=MERGE_GAP_SEC)``.

    The merged spans are printed as a table and then as a JSON document.
    Each span is read through the keys ``video_name``, ``start_sec``,
    ``end_sec``, ``frames``, ``peak_score`` and, optionally, ``video_path``.

    Args:
        query: Free-text search prompt.
        top_k: Number of raw rows to request from the store.
        min_score: Rows scoring below this value are discarded.

    Returns:
        None. All results are written to stdout. The function returns early
        when the store yields nothing or when no row passes ``min_score``.
    """
    banner = "=" * 60
    print("\n" + banner)
    print(" ARIA Vision β Video Intelligence Search (HF-Native)")
    print(banner)
    print(f' Query : "{query}"')
    print(f" Model : {EMBED_MODEL} ({EMBED_DIM}d)")
    print(f" Min score : {min_score} | Merge gap: {MERGE_GAP_SEC}s | Fetch top: {top_k}")
    print()

    # Step 1: turn the prompt into a query vector.
    print(" [1/3] Embedding query...", end=" ", flush=True)
    query_vector = embed_text(query)
    print("β")

    # Step 2: nearest-frame lookup in the project's video index.
    print(" [2/3] Searching video_index...", end=" ", flush=True)
    index = get_store(DEFAULT_PROJECT, "video_index")
    raw_results = index.search(query_vector, top_k=top_k)
    if not raw_results:
        print("no results.\n β Run ingest_sample_vision.py first.")
        return
    print(f"β ({len(raw_results)} raw frames returned)")

    def score_of(row):
        # A row without a "score" key counts as scoring 0.
        return row.get("score", 0)

    hits = []
    for row in raw_results:
        if score_of(row) >= min_score:
            hits.append(row)

    if not hits:
        # Show the best raw scores so the caller can pick a workable threshold.
        best_three = sorted(raw_results, key=lambda row: -score_of(row))[:3]
        best_scores = [round(score_of(row), 4) for row in best_three]
        print(f"\n β No frames above score threshold ({min_score}).")
        print(f" Top 3 raw scores: {best_scores}")
        return

    # Step 3: collapse the surviving frame hits into time ranges.
    print(f" [3/3] Merging {len(hits)} hits into time ranges...")
    spans = _merge_video_hits(hits, gap=MERGE_GAP_SEC)

    rule = " " + "β" * 62
    header = " {:<4} {:<24} {:<16} {:<9} {:<7} {}".format(
        "#", "Video", "Time Range", "Duration", "Frames", "Score"
    )
    print()
    print(rule)
    print(header)
    print(rule)

    for rank, span in enumerate(spans, start=1):
        duration = span["end_sec"] - span["start_sec"]
        name_cell = span["video_name"][:23]  # truncate so the column stays aligned
        time_cell = f"{fmt(span['start_sec'])} β {fmt(span['end_sec']):<9}"
        print(
            f" {rank:<4} {name_cell:<24} {time_cell} "
            f"{duration:4.0f}s {span['frames']:<7} {span['peak_score']:.4f}"
        )

    print(rule)

    # Same spans again, as a machine-readable payload.
    matches = []
    for span in spans:
        matches.append(
            {
                "video_name": span["video_name"],
                "video_path": span.get("video_path", ""),
                "start": fmt(span["start_sec"]),
                "end": fmt(span["end_sec"]),
                "start_seconds": span["start_sec"],
                "end_seconds": span["end_sec"],
                "score": round(span["peak_score"], 4),
                "frames_matched": span["frames"],
            }
        )
    output = {"mode": "Video Intelligence", "query": query, "matches": matches}

    print()
    print(" JSON Response:")
    print(" " + json.dumps(output, indent=2))
|
|
def main():
    """Command-line entry point: parse ``sys.argv`` and call ``search_video``.

    Usage::

        python query_vision_video.py "your query" [--top N] [--min-score X]

    Every argument that is neither a flag nor a flag's value is joined with
    single spaces to form the query. ``--top`` (int) overrides ``TOP_K`` and
    ``--min-score`` (float) overrides ``MIN_SCORE``. If a flag is repeated,
    its first occurrence wins. Unrecognised ``--flags`` are ignored.

    Exits with status 1 when no query is given, when a flag is missing its
    value, or when a flag's value is not a valid number.
    """
    converters = {"--top": int, "--min-score": float}
    settings = {}
    query_words = []

    tokens = iter(sys.argv[1:])
    for token in tokens:
        convert = converters.get(token)
        if convert is not None:
            # Consume the flag's value here so it can never leak into the
            # query text (e.g. "person waving 10 0.15").
            raw = next(tokens, None)
            if raw is None:
                sys.exit(f"Error: {token} requires a value")
            try:
                value = convert(raw)
            except ValueError:
                sys.exit(f"Error: invalid value for {token}: {raw!r}")
            settings.setdefault(token, value)
        elif not token.startswith("--"):
            query_words.append(token)

    if not query_words:
        print('Usage: python query_vision_video.py "your query"')
        sys.exit(1)

    search_video(
        " ".join(query_words),
        top_k=settings.get("--top", TOP_K),
        min_score=settings.get("--min-score", MIN_SCORE),
    )
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|