Adding files
Browse files- .env.example +35 -0
- .gitattributes +2 -0
- .gitignore +6 -0
- README.md +8 -8
- __pycache__/config.cpython-312.pyc +0 -0
- __pycache__/embedding.cpython-312.pyc +0 -0
- __pycache__/ingest.cpython-312.pyc +0 -0
- __pycache__/search.cpython-312.pyc +0 -0
- __pycache__/seed_data.cpython-312.pyc +0 -0
- __pycache__/vector_store.cpython-312.pyc +0 -0
- app.py +405 -0
- assests/Architecture.svg +3 -0
- assests/GPU_Compute.png +0 -0
- assests/architecture.png +0 -0
- assests/dataFlow.png +0 -0
- assests/data_flow.svg +3 -0
- assests/gpu_compute_tiers.svg +3 -0
- assests/rockit_logo.png +3 -0
- config.py +96 -0
- data/images/.gitkeep +2 -0
- data/indexes/.gitkeep +2 -0
- data/projects/default/images/car.jpg +0 -0
- data/projects/default/images/dog.jpg +0 -0
- data/projects/default/images/mountain.jpg +0 -0
- data/projects/default/indexes/image_index.npz +3 -0
- data/projects/default/indexes/image_index_meta.json +1 -0
- data/projects/default/indexes/video_index.npz +3 -0
- data/projects/default/indexes/video_index_meta.json +1 -0
- data/projects/default/videos/sample.mp4 +3 -0
- data/videos/.gitkeep +2 -0
- embedding.py +245 -0
- ingest.py +340 -0
- ingest_sample_vision.py +254 -0
- query_vision_image.py +91 -0
- query_vision_video.py +111 -0
- requirements.txt +9 -0
- search.py +123 -0
- seed_data.py +70 -0
- test_store.py +50 -0
- vector_store.py +420 -0
.env.example
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ─── ROCKIT Vision Intelligence ───
|
| 2 |
+
#
|
| 3 |
+
# On HF Spaces: set these as Secrets in Space Settings.
|
| 4 |
+
# Locally: copy to .env and edit.
|
| 5 |
+
|
| 6 |
+
# HF token (for dataset persistence + Inference API)
|
| 7 |
+
HF_TOKEN=hf_your_token_here
|
| 8 |
+
|
| 9 |
+
# Persistent dataset repo (optional)
|
| 10 |
+
# HF_DATASET_REPO=your-username/aria-index
|
| 11 |
+
|
| 12 |
+
# GPU mode
|
| 13 |
+
USE_GPU=false
|
| 14 |
+
|
| 15 |
+
# Embedding model (auto-selected if not set)
|
| 16 |
+
# GPU: Qwen/Qwen3-VL-Embedding-2B (2048d)
|
| 17 |
+
# GPU: Qwen/Qwen3-VL-Embedding-8B (4096d)
|
| 18 |
+
# CPU: openai/clip-vit-large-patch14 (768d)
|
| 19 |
+
# EMBED_MODEL=Qwen/Qwen3-VL-Embedding-2B
|
| 20 |
+
# EMBED_DIM=2048
|
| 21 |
+
|
| 22 |
+
# LLM for result interpretation
|
| 23 |
+
# LLM_MODEL=Qwen/Qwen3-35B-A3B
|
| 24 |
+
# LLM_FALLBACK=Qwen/Qwen3-1.7B
|
| 25 |
+
|
| 26 |
+
# Video frame interval (seconds)
|
| 27 |
+
FRAME_EVERY_SEC=5
|
| 28 |
+
|
| 29 |
+
# Auto-seed on first launch
|
| 30 |
+
AUTO_SEED=true
|
| 31 |
+
SEED_DATASET=nlphuji/flickr30k
|
| 32 |
+
SEED_SPLIT=test[:200]
|
| 33 |
+
|
| 34 |
+
# Default project name
|
| 35 |
+
DEFAULT_PROJECT=default
|
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assests/rockit_logo.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
data/projects/default/videos/sample.mp4 filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
data/
|
| 3 |
+
__pycache__/
|
| 4 |
+
*.pyc
|
| 5 |
+
.DS_Store
|
| 6 |
+
.ipynb_checkpoints/
|
README.md
CHANGED
|
@@ -12,7 +12,7 @@ license: apache-2.0
|
|
| 12 |
|
| 13 |
<div align="center">
|
| 14 |
|
| 15 |
-
#
|
| 16 |
|
| 17 |
### GPU-Accelerated Multimodal Search Engine
|
| 18 |
|
|
@@ -28,7 +28,7 @@ license: apache-2.0
|
|
| 28 |
|
| 29 |
## What Is This?
|
| 30 |
|
| 31 |
-
|
| 32 |
|
| 33 |
The core idea is simple:
|
| 34 |
|
|
@@ -60,7 +60,7 @@ projects/
|
|
| 60 |
```
|
| 61 |
|
| 62 |
### Native Multimodal Embedding (No Captioning)
|
| 63 |
-
Unlike caption-then-embed pipelines,
|
| 64 |
|
| 65 |
| Tier | Model | Dim | Use Case |
|
| 66 |
|------|-------|-----|----------|
|
|
@@ -69,7 +69,7 @@ Unlike caption-then-embed pipelines, ARIA uses **true vision-language embedding
|
|
| 69 |
| CPU fallback | `openai/clip-vit-large-patch14` | 768 | Free-tier HF Spaces, dev |
|
| 70 |
|
| 71 |
### CAGRA Graph Index (hipVS)
|
| 72 |
-
The CAGRA graph index is the fastest known ANN algorithm for GPU-resident data.
|
| 73 |
|
| 74 |
### NVMe → VRAM Async Hot-Swap
|
| 75 |
Indexes live in three tiers of memory. When a project is queried, its index is **asynchronously copied from NVMe into VRAM** via pinned-memory DMA, without blocking other projects. When VRAM fills up, least-recently-used indexes are evicted back to NVMe — not deleted.
|
|
@@ -89,7 +89,7 @@ Indexes live in three tiers of memory. When a project is queried, its index is *
|
|
| 89 |
This design lets you run **dozens of projects** on a single GPU by keeping only the active ones hot. Full VRAM capacity is utilized.
|
| 90 |
|
| 91 |
### LLM-Interpreted Results
|
| 92 |
-
Raw vector search returns `(id, score)` tuples. Before showing results to the user,
|
| 93 |
|
| 94 |
| Tier | Model | Notes |
|
| 95 |
|------|-------|-------|
|
|
@@ -111,7 +111,7 @@ Raw vector search returns `(id, score)` tuples. Before showing results to the us
|
|
| 111 |
|
| 112 |
## GPU Compute Tiers
|
| 113 |
|
| 114 |
-
|
| 115 |
|
| 116 |

|
| 117 |
|
|
@@ -231,7 +231,7 @@ Each project is an isolated workspace with its own sources, embeddings, and CAGR
|
|
| 231 |
Upload images or videos. For videos, ffmpeg extracts one representative frame every N seconds. Every image and frame is embedded directly by the vision-language model (Qwen3-VL or CLIP) — no captioning, no text intermediary.
|
| 232 |
|
| 233 |
### 3. CAGRA Build
|
| 234 |
-
After every insert, the CAGRA graph index is **fully rebuilt** from the updated vector set. This is intentional:
|
| 235 |
|
| 236 |
### 4. Search
|
| 237 |
When you search, the query text is embedded by the same model. The CAGRA index is loaded into VRAM (if not already hot) via async pinned-memory DMA, and searched in microseconds. Results are post-processed: video frame hits are merged into time ranges, and the full result set is sent to the LLM for a human-friendly summary.
|
|
@@ -261,5 +261,5 @@ Apache 2.0
|
|
| 261 |
---
|
| 262 |
|
| 263 |
<div align="center">
|
| 264 |
-
<i>Built for the AMD Hackathon β
|
| 265 |
</div>
|
|
|
|
| 12 |
|
| 13 |
<div align="center">
|
| 14 |
|
| 15 |
+
# ROCKIT Vision Intelligence
|
| 16 |
|
| 17 |
### GPU-Accelerated Multimodal Search Engine
|
| 18 |
|
|
|
|
| 28 |
|
| 29 |
## What Is This?
|
| 30 |
|
| 31 |
+
ROCKIT Vision Intelligence is an **open-source, self-hosted multimodal search engine** that lets you create isolated projects, ingest visual media (images, videos), and query them with natural language. It is built for the **AMD Hackathon** and designed to showcase GPU-accelerated approximate nearest-neighbor (ANN) search using the **hipVS CAGRA** graph index on AMD ROCm hardware.
|
| 32 |
|
| 33 |
The core idea is simple:
|
| 34 |
|
|
|
|
| 60 |
```
|
| 61 |
|
| 62 |
### Native Multimodal Embedding (No Captioning)
|
| 63 |
+
Unlike caption-then-embed pipelines, ROCKIT uses **true vision-language embedding models** that encode images, video frames, and text queries into the **same vector space** directly. No intermediate captioning step — no information loss.
|
| 64 |
|
| 65 |
| Tier | Model | Dim | Use Case |
|
| 66 |
|------|-------|-----|----------|
|
|
|
|
| 69 |
| CPU fallback | `openai/clip-vit-large-patch14` | 768 | Free-tier HF Spaces, dev |
|
| 70 |
|
| 71 |
### CAGRA Graph Index (hipVS)
|
| 72 |
+
The CAGRA graph index is the fastest known ANN algorithm for GPU-resident data. ROCKIT rebuilds the CAGRA graph on every insert because this project is **optimized for inference and query speed**, not ingestion throughput. A 100K-vector CAGRA rebuild takes ~2 seconds on an MI250X — negligible compared to the embedding cost.
|
| 73 |
|
| 74 |
### NVMe → VRAM Async Hot-Swap
|
| 75 |
Indexes live in three tiers of memory. When a project is queried, its index is **asynchronously copied from NVMe into VRAM** via pinned-memory DMA, without blocking other projects. When VRAM fills up, least-recently-used indexes are evicted back to NVMe — not deleted.
|
|
|
|
| 89 |
This design lets you run **dozens of projects** on a single GPU by keeping only the active ones hot. Full VRAM capacity is utilized.
|
| 90 |
|
| 91 |
### LLM-Interpreted Results
|
| 92 |
+
Raw vector search returns `(id, score)` tuples. Before showing results to the user, ROCKIT passes them through an LLM that interprets the matches, merges adjacent video timestamps into time ranges, and generates a human-readable summary.
|
| 93 |
|
| 94 |
| Tier | Model | Notes |
|
| 95 |
|------|-------|-------|
|
|
|
|
| 111 |
|
| 112 |
## GPU Compute Tiers
|
| 113 |
|
| 114 |
+
ROCKIT automatically detects available hardware and selects the best backend:
|
| 115 |
|
| 116 |

|
| 117 |
|
|
|
|
| 231 |
Upload images or videos. For videos, ffmpeg extracts one representative frame every N seconds. Every image and frame is embedded directly by the vision-language model (Qwen3-VL or CLIP) — no captioning, no text intermediary.
|
| 232 |
|
| 233 |
### 3. CAGRA Build
|
| 234 |
+
After every insert, the CAGRA graph index is **fully rebuilt** from the updated vector set. This is intentional: ROCKIT is optimized for query speed, not ingestion throughput. A 100K rebuild takes ~2s on MI250X. The built graph is immediately serialized to NVMe.
|
| 235 |
|
| 236 |
### 4. Search
|
| 237 |
When you search, the query text is embedded by the same model. The CAGRA index is loaded into VRAM (if not already hot) via async pinned-memory DMA, and searched in microseconds. Results are post-processed: video frame hits are merged into time ranges, and the full result set is sent to the LLM for a human-friendly summary.
|
|
|
|
| 261 |
---
|
| 262 |
|
| 263 |
<div align="center">
|
| 264 |
+
<i>Built for the AMD Hackathon — ROCKIT Vision Intelligence Platform</i>
|
| 265 |
</div>
|
__pycache__/config.cpython-312.pyc
ADDED
|
Binary file (4.31 kB). View file
|
|
|
__pycache__/embedding.cpython-312.pyc
ADDED
|
Binary file (10.1 kB). View file
|
|
|
__pycache__/ingest.cpython-312.pyc
ADDED
|
Binary file (14.8 kB). View file
|
|
|
__pycache__/search.cpython-312.pyc
ADDED
|
Binary file (5.32 kB). View file
|
|
|
__pycache__/seed_data.cpython-312.pyc
ADDED
|
Binary file (3.17 kB). View file
|
|
|
__pycache__/vector_store.cpython-312.pyc
ADDED
|
Binary file (24.3 kB). View file
|
|
|
app.py
ADDED
|
@@ -0,0 +1,405 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
HF_Space_hipVS/app.py
|
| 4 |
+
=====================
|
| 5 |
+
ROCKIT Vision Intelligence — Hugging Face Space
|
| 6 |
+
|
| 7 |
+
GPU-accelerated multimodal search engine.
|
| 8 |
+
- Embedding: Qwen3-VL-Embedding (GPU) / CLIP (CPU)
|
| 9 |
+
- Search: CAGRA (hipVS) -> PyTorch -> NumPy
|
| 10 |
+
- UI: Premium Gradio Demo
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import logging
|
| 14 |
+
import sys
|
| 15 |
+
import os
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
import gradio as gr
|
| 18 |
+
|
| 19 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 20 |
+
|
| 21 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(message)s")
|
| 22 |
+
logger = logging.getLogger("rockit-vision")
|
| 23 |
+
|
| 24 |
+
from config import (
|
| 25 |
+
USE_GPU, EMBED_MODEL, EMBED_DIM, LLM_MODEL, LLM_FALLBACK,
|
| 26 |
+
FRAME_EVERY_SEC, HF_TOKEN, HF_DATASET_REPO, AUTO_SEED,
|
| 27 |
+
DEFAULT_PROJECT, DATA_DIR
|
| 28 |
+
)
|
| 29 |
+
from vector_store import get_store, list_projects
|
| 30 |
+
from ingest import (
|
| 31 |
+
ingest_images, ingest_videos,
|
| 32 |
+
ingest_single_image, ingest_single_video,
|
| 33 |
+
HAS_FFMPEG,
|
| 34 |
+
)
|
| 35 |
+
from search import search_images, search_videos
|
| 36 |
+
import seed_data
|
| 37 |
+
|
| 38 |
+
# ββ Helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 39 |
+
|
| 40 |
+
def get_system_info(project: str = DEFAULT_PROJECT) -> str:
    """Render the Markdown status panel for one project.

    The report contains two tables: the hardware/model configuration
    (taken from the module-level config constants) and per-index statistics
    (vector count and whether the index is currently held in VRAM).

    Args:
        project: Name of the project whose image and video stores are inspected.

    Returns:
        A Markdown string with one table row per line.
    """
    image_index = get_store(project, "image_index")
    video_index = get_store(project, "video_index")

    def _tier(store) -> str:
        # Label shown in the "Location" column of the index table.
        return 'VRAM (Hot)' if store.in_vram else 'NVMe (Cold)'

    gpu_status = 'π Enabled' if USE_GPU else 'π’ Disabled (CPU)'

    hardware_rows = [
        f"### Project Context: `{project}`\n",
        "| Hardware & Models | Status |",
        "|:---|:---|",
        f"| **GPU Acceleration** | {gpu_status} |",
        f"| **Search Backend** | {image_index.mode} |",
        f"| **Vision Model** | `{EMBED_MODEL.split('/')[-1]}` ({EMBED_DIM}d) |",
        f"| **Reasoning LLM** | `{LLM_MODEL.split('/')[-1]}` |",
        f"| **Media Engine** | {'ffmpeg detected' if HAS_FFMPEG else 'ffmpeg MISSING'} |",
    ]
    index_rows = [
        # The leading newline separates the second table from the first.
        "\n| Index Stats | Count | Location |",
        "|:---|:---|:---|",
        f"| Images | {image_index.count} | {_tier(image_index)} |",
        f"| Video Frames | {video_index.count} | {_tier(video_index)} |",
    ]
    return "\n".join(hardware_rows + index_rows)
|
| 57 |
+
|
| 58 |
+
def get_projects_list() -> list[str]:
    """Return the known project names, with the default project always present.

    When the default project is not among the names returned by
    ``list_projects()``, it is inserted at the front of that list.
    """
    names = list_projects()
    if DEFAULT_PROJECT in names:
        return names
    names.insert(0, DEFAULT_PROJECT)
    return names
|
| 63 |
+
|
| 64 |
+
# ββ Callbacks ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 65 |
+
|
| 66 |
+
def handle_image_upload(files, project, progress=gr.Progress()):
    """Embed and index each uploaded image into ``project``.

    Args:
        files: Uploaded file paths from the image upload widget (may be empty).
        project: Name of the target project.
        progress: Gradio progress tracker, updated once per file.

    Returns:
        A ``(log_text, system_info_markdown)`` tuple; the log holds one
        ingest message per file, newline-separated.
    """
    if not files:
        return "No files uploaded.", get_system_info(project)

    total = len(files)
    messages = []
    for position, file_path in enumerate(files, start=1):
        progress(position / total, desc=f"Embedding {Path(file_path).name}...")
        _ok, message = ingest_single_image(file_path, project=project)
        messages.append(message)

    status_log = "\n".join(messages)
    return status_log, get_system_info(project)
|
| 75 |
+
|
| 76 |
+
def handle_video_upload(files, project, progress=gr.Progress()):
    """Ingest each uploaded video into ``project``.

    Args:
        files: Uploaded file paths from the video upload widget (may be empty).
        project: Name of the target project.
        progress: Gradio progress tracker, forwarded to the ingest routine.

    Returns:
        A ``(log_text, system_info_markdown)`` tuple; the log holds one
        ingest message per video, newline-separated.
    """
    if not files:
        return "No files uploaded.", get_system_info(project)

    outcomes = (
        ingest_single_video(video_path, project=project, progress_callback=progress)
        for video_path in files
    )
    # Each outcome is a (count, message) pair; only the message is shown.
    messages = [message for _count, message in outcomes]
    return "\n".join(messages), get_system_info(project)
|
| 84 |
+
|
| 85 |
+
def handle_batch_ingest(project, progress=gr.Progress()):
    """Run the folder-level image and video ingest for ``project``.

    Images are ingested first, then videos. The per-item logs returned by
    the ingest routines are not shown; only the totals are reported.

    Returns:
        A ``(summary_text, system_info_markdown)`` tuple.
    """
    image_total, _image_log = ingest_images(project=project, progress_callback=progress)
    frame_total, _video_log = ingest_videos(project=project, progress_callback=progress)

    heading = "=== Batch Ingest Results ===\n\n"
    outcome = (
        f"Successfully indexed {image_total} images and {frame_total} "
        f"video frames into project '{project}'."
    )
    return heading + outcome, get_system_info(project)
|
| 93 |
+
|
| 94 |
+
def handle_seed(project, progress=gr.Progress()):
    """Seed ``project`` with demo data and return the seeding log.

    Returns:
        A ``(seed_log, system_info_markdown)`` tuple.
    """
    _seeded, seed_log = seed_data.run(project=project, progress_callback=progress)
    refreshed_status = get_system_info(project)
    return seed_log, refreshed_status
|
| 97 |
+
|
| 98 |
+
def handle_clear(project):
    """Clear both the image and the video index of ``project``.

    Returns:
        A ``(confirmation_text, system_info_markdown)`` tuple.
    """
    for index_name in ("image_index", "video_index"):
        get_store(project, index_name).clear()

    confirmation = f"All indexes cleared for project '{project}'."
    return confirmation, get_system_info(project)
|
| 102 |
+
|
| 103 |
+
def handle_search(query, mode, top_k, project):
    """Run a text query against a project's image or video index.

    Args:
        query: Natural-language query text. ``None``, empty, or
            whitespace-only input is rejected with a prompt message instead
            of being searched.
        mode: ``"Image Search"`` selects the image index; any other value
            selects the video index.
        top_k: Maximum number of results; coerced with ``int()`` because the
            slider may deliver a float.
        project: Name of the project to search.

    Returns:
        A ``(summary, gallery_items, store_info)`` tuple. ``gallery_items``
        is a list of ``(file_path, caption)`` pairs containing only hits
        whose file exists on disk.
    """
    # Guard against None as well as blank text: calling .strip() on None
    # would raise AttributeError before any message could be shown.
    if not query or not query.strip():
        return "Please enter a search query.", [], ""

    limit = int(top_k)
    gallery_items = []

    if mode == "Image Search":
        result = search_images(query, project=project, top_k=limit)
        for hit in result["results"]:
            path = hit.get("file_path", "")
            if not (path and os.path.exists(path)):
                continue  # hit points at a file that is no longer on disk
            name = hit.get("file_name", "Unknown")
            score = hit.get("score", 0)
            gallery_items.append((path, f"{name} (Score: {score:.3f})"))
    else:
        result = search_videos(query, project=project, top_k=limit)
        for match in result["matches"]:
            path = match.get("representative_frame", "")
            if not (path and os.path.exists(path)):
                continue  # representative frame is missing on disk
            name = match.get("video_name", "Unknown")
            time_range = f"{match['start']} - {match['end']}"
            score = match.get("score", 0)
            gallery_items.append((path, f"{name} @ {time_range} (Score: {score:.3f})"))

    return result["llm_summary"], gallery_items, result["store_info"]
|
| 133 |
+
|
| 134 |
+
def handle_create_project(name):
    """Create a new project from a user-supplied name.

    The name is trimmed, lower-cased, and spaces are replaced by ``-``.
    Because the value comes straight from a UI textbox and is handed to
    ``get_project_dir``, names containing anything other than letters,
    digits, ``-`` or ``_`` are rejected so that path separators or ``..``
    can never reach it.
    NOTE(review): ``get_project_dir`` is assumed to map the name to a
    directory on disk (and create it) — confirm in config.py.

    Args:
        name: Raw project name typed by the user (may be ``None`` or blank).

    Returns:
        A ``(status_message, dropdown_update)`` tuple. On success the
        dropdown update carries the refreshed project list with the new
        project selected; on failure it is a no-op update.
    """
    if not name or not name.strip():
        return "Enter a project name.", gr.update()

    name = name.strip().lower().replace(" ", "-")
    if not all(ch.isalnum() or ch in "-_" for ch in name):
        return (
            "Invalid project name: use only letters, digits, '-' and '_'.",
            gr.update(),
        )

    from config import get_project_dir
    get_project_dir(name)
    return f"Project '{name}' created.", gr.update(choices=get_projects_list(), value=name)
|
| 141 |
+
|
| 142 |
+
# ββ CSS ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 143 |
+
|
| 144 |
+
CSS = """
|
| 145 |
+
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700;800&display=swap');
|
| 146 |
+
|
| 147 |
+
body { font-family: 'Inter', sans-serif !important; }
|
| 148 |
+
|
| 149 |
+
.gradio-container {
|
| 150 |
+
max-width: 1300px !important;
|
| 151 |
+
margin: 0 auto !important;
|
| 152 |
+
background-color: #050505 !important;
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
.main-header {
|
| 156 |
+
text-align: center;
|
| 157 |
+
background: linear-gradient(135deg, #0f0f1b 0%, #1a1a2e 100%);
|
| 158 |
+
padding: 3rem 2rem;
|
| 159 |
+
border-radius: 24px;
|
| 160 |
+
margin-bottom: 2rem;
|
| 161 |
+
border: 1px solid rgba(255,255,255,0.05);
|
| 162 |
+
box-shadow: 0 10px 30px rgba(0,0,0,0.5);
|
| 163 |
+
display: flex;
|
| 164 |
+
flex-direction: column;
|
| 165 |
+
align-items: center;
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
.logo-container img {
|
| 169 |
+
max-width: 120px;
|
| 170 |
+
margin-bottom: 1.5rem;
|
| 171 |
+
filter: drop-shadow(0 0 15px rgba(233, 69, 96, 0.4));
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
.main-header h1 {
|
| 175 |
+
background: linear-gradient(90deg, #e94560, #a033ff, #4cc9f0);
|
| 176 |
+
-webkit-background-clip: text;
|
| 177 |
+
-webkit-text-fill-color: transparent;
|
| 178 |
+
font-size: 3.2rem !important;
|
| 179 |
+
font-weight: 800 !important;
|
| 180 |
+
margin: 0;
|
| 181 |
+
letter-spacing: -1px;
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
.main-header p.subtitle {
|
| 185 |
+
color: #94a3b8;
|
| 186 |
+
font-size: 1.1rem;
|
| 187 |
+
margin-top: 0.5rem;
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
.card {
|
| 191 |
+
background: #11111b !important;
|
| 192 |
+
border: 1px solid rgba(255,255,255,0.08) !important;
|
| 193 |
+
border-radius: 16px !important;
|
| 194 |
+
padding: 1rem !important;
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
#search-btn {
|
| 198 |
+
background: linear-gradient(135deg, #e94560 0%, #533483 100%) !important;
|
| 199 |
+
border: none !important;
|
| 200 |
+
font-weight: 700 !important;
|
| 201 |
+
color: white !important;
|
| 202 |
+
transition: all 0.3s ease;
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
#search-btn:hover {
|
| 206 |
+
transform: translateY(-2px);
|
| 207 |
+
box-shadow: 0 5px 15px rgba(233, 69, 96, 0.4);
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
.stat-box {
|
| 211 |
+
background: rgba(255,255,255,0.03);
|
| 212 |
+
border-radius: 12px;
|
| 213 |
+
padding: 1rem;
|
| 214 |
+
border: 1px solid rgba(255,255,255,0.05);
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
.gallery-container {
|
| 218 |
+
background: #0a0a0f !important;
|
| 219 |
+
border-radius: 12px !important;
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
footer { display: none !important; }
|
| 223 |
+
"""
|
| 224 |
+
|
| 225 |
+
# ββ Build UI βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 226 |
+
|
| 227 |
+
def build_ui():
    """Assemble the Gradio Blocks application and wire up its callbacks.

    Layout: a header banner, a left column (project selection, project
    creation, system status) and a right column with three tabs (search,
    media ingestion, and an explanatory "How It Works" page). Optional
    images are only added when the asset file exists on disk.

    The header container uses ``gr.Column`` rather than ``gr.Div``: Gradio
    does not provide a ``Div`` layout component, so the previous call raised
    ``AttributeError`` while the UI was being built.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    logo_path = "assests/rockit_logo.png"
    arch_path = "assests/Architecture.svg"
    flow_path = "assests/data_flow.svg"
    gpu_path = "assests/gpu_compute_tiers.svg"

    with gr.Blocks(
        title="ROCKIT Vision Intelligence",
        theme=gr.themes.Default(
            primary_hue="rose",
            secondary_hue="indigo",
            neutral_hue="slate",
        ),
        css=CSS,
    ) as app:

        # Header banner; styled through the ".main-header" CSS rules.
        with gr.Column(elem_classes="main-header"):
            if os.path.exists(logo_path):
                gr.Image(logo_path, show_label=False, container=False, width=100, elem_classes="logo-container")
            gr.HTML("<h1>ROCKIT Vision Intelligence</h1>")
            gr.Markdown("GPU-Accelerated Multimodal Search Platform", elem_classes="subtitle")

        with gr.Row():
            # Left column: project management and system status.
            with gr.Column(scale=3):
                with gr.Group(elem_classes="card"):
                    gr.Markdown("### ποΈ Project Selection")
                    with gr.Row():
                        project_select = gr.Dropdown(
                            choices=get_projects_list(),
                            value=DEFAULT_PROJECT,
                            label="Active Workspace",
                            scale=4,
                            interactive=True,
                        )
                        refresh_btn = gr.Button("π", scale=1)

                    with gr.Accordion("Create New Project", open=False):
                        new_project_name = gr.Textbox(label="Project ID", placeholder="e.g. security-cam")
                        create_btn = gr.Button("Initialize Project", variant="secondary")
                        create_status = gr.Markdown()

                with gr.Group(elem_classes="card", visible=True):
                    gr.Markdown("### βοΈ System Status")
                    system_info = gr.Markdown(value=get_system_info())

            # Right column: the main work area.
            with gr.Column(scale=7):
                with gr.Tabs():

                    # Tab 1: Search
                    with gr.Tab("π Search"):
                        with gr.Group(elem_classes="card"):
                            with gr.Row():
                                with gr.Column(scale=4):
                                    query_input = gr.Textbox(
                                        label="Natural Language Query",
                                        placeholder='Try "a cat sitting on a laptop" or "someone running in a park"',
                                        lines=2,
                                    )
                                with gr.Column(scale=1):
                                    search_mode = gr.Radio(["Image Search", "Video Intelligence"], value="Image Search", label="Search Mode")
                                    top_k = gr.Slider(1, 50, value=12, step=1, label="Results Count")

                            search_btn = gr.Button("Execute Semantic Search", variant="primary", elem_id="search-btn", size="lg")

                        gr.Markdown("### π€ AI Interpretation")
                        search_summary = gr.Markdown("*Results will appear here...*", elem_classes="card")

                        gr.Markdown("### πΌοΈ Visual Matches")
                        result_gallery = gr.Gallery(
                            label="Retrieved Media",
                            columns=[3, 4],
                            rows=[2],
                            object_fit="contain",
                            height="auto",
                            elem_classes="gallery-container"
                        )

                        with gr.Accordion("Technical Details", open=False):
                            store_info = gr.Textbox(label="Vector Store Engine", interactive=False)

                    # Tab 2: Upload
                    with gr.Tab("π€ Ingest Media"):
                        with gr.Row():
                            with gr.Column():
                                with gr.Group(elem_classes="card"):
                                    gr.Markdown("#### πΌοΈ Image Ingestion")
                                    img_upload = gr.File(label="Select Images", file_types=["image"], file_count="multiple")
                                    img_btn = gr.Button("Embed & Index Images")
                                    img_log = gr.Textbox(label="Status", lines=4, interactive=False)

                            with gr.Column():
                                with gr.Group(elem_classes="card"):
                                    gr.Markdown("#### π₯ Video Intelligence")
                                    vid_upload = gr.File(label="Select Videos", file_types=["video"], file_count="multiple")
                                    vid_btn = gr.Button("Extract & Index Frames")
                                    vid_log = gr.Textbox(label="Status", lines=4, interactive=False)

                        with gr.Group(elem_classes="card"):
                            gr.Markdown("#### β‘ Batch Operations")
                            with gr.Row():
                                seed_btn = gr.Button("Seed Demo Data", variant="secondary")
                                batch_btn = gr.Button("Re-index Folder", variant="secondary")
                                clear_btn = gr.Button("Purge All Indexes", variant="stop")
                            action_log = gr.Markdown()

                    # Tab 3: Workflow
                    with gr.Tab("π§ How It Works"):
                        gr.Markdown("""
### Direct Multimodal Embedding
ROCKIT doesn't use captioning models. It uses **Vision-Language Models (VLM)** to encode visual features
directly into the same vector space as text. This preserves subtle details that text captions often lose.
""")

                        with gr.Row():
                            with gr.Column():
                                gr.Markdown("#### 1. System Architecture")
                                if os.path.exists(arch_path):
                                    gr.Image(arch_path, show_label=False)
                            with gr.Column():
                                gr.Markdown("#### 2. Query Flow")
                                if os.path.exists(flow_path):
                                    gr.Image(flow_path, show_label=False)

                        gr.Markdown("---")

                        with gr.Row():
                            with gr.Column():
                                gr.Markdown("#### 3. GPU Acceleration Tiers")
                                if os.path.exists(gpu_path):
                                    gr.Image(gpu_path, show_label=False)
                            with gr.Column():
                                gr.Markdown("""
#### Hot/Cold Memory Management
To support dozens of projects on a single GPU, ROCKIT implements an **NVMe-to-VRAM Async Swap**.

- **Cold Store (NVMe):** Indexes are serialized as `.cagra` files.
- **Hot Cache (VRAM):** Active projects are copied into VRAM using pinned-memory DMA.
- **LRU Eviction:** Least recently used indexes are purged from VRAM to make room for new ones.
""")

        # Event Bindings
        project_select.change(fn=get_system_info, inputs=[project_select], outputs=[system_info])
        refresh_btn.click(fn=lambda: gr.update(choices=get_projects_list()), outputs=[project_select])

        create_btn.click(
            fn=handle_create_project,
            inputs=[new_project_name],
            outputs=[create_status, project_select],
        )

        # The search button and pressing Enter in the query box run the same search.
        search_btn.click(
            fn=handle_search,
            inputs=[query_input, search_mode, top_k, project_select],
            outputs=[search_summary, result_gallery, store_info]
        )
        query_input.submit(
            fn=handle_search,
            inputs=[query_input, search_mode, top_k, project_select],
            outputs=[search_summary, result_gallery, store_info]
        )

        img_btn.click(fn=handle_image_upload, inputs=[img_upload, project_select], outputs=[img_log, system_info])
        vid_btn.click(fn=handle_video_upload, inputs=[vid_upload, project_select], outputs=[vid_log, system_info])
        seed_btn.click(fn=handle_seed, inputs=[project_select], outputs=[action_log, system_info])
        batch_btn.click(fn=handle_batch_ingest, inputs=[project_select], outputs=[action_log, system_info])
        clear_btn.click(fn=handle_clear, inputs=[project_select], outputs=[action_log, system_info])

    return app
|
| 395 |
+
|
| 396 |
+
if __name__ == "__main__":
    # Optionally populate the default project before the UI starts.
    if seed_data.is_needed():
        logger.info("Auto-seeding default project from HF Dataset...")
        try:
            seed_data.run()
        except Exception as e:
            # Seeding is best-effort: the app must still start without demo
            # data. logger.exception keeps the traceback so the failure can
            # be diagnosed from the logs.
            logger.exception("Auto-seeding failed: %s", e)

    app = build_ui()
    app.launch(server_name="0.0.0.0", server_port=7860, share=False)
|
assests/Architecture.svg
ADDED
|
|
assests/GPU_Compute.png
ADDED
|
assests/architecture.png
ADDED
|
assests/dataFlow.png
ADDED
|
assests/data_flow.svg
ADDED
|
|
assests/gpu_compute_tiers.svg
ADDED
|
|
assests/rockit_logo.png
ADDED
|
Git LFS Details
|
config.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HF_Space_hipVS/config.py
|
| 2 |
+
# ========================
|
| 3 |
+
# Environment-aware configuration.
|
| 4 |
+
# Auto-scales model selection by hardware tier.
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import logging
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
# ── Core Flags ──────────────────────────────────────────────────────────────
|
| 13 |
+
|
| 14 |
+
# Boolean flags are true only for the spellings "true", "1" or "yes"
# (case-insensitive); anything else, including unset, is false.
USE_GPU = os.environ.get("USE_GPU", "false").lower() in ("true", "1", "yes")
HF_TOKEN = os.environ.get("HF_TOKEN", "")
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "")

# ── Device ──────────────────────────────────────────────────────────────────

# Both values are plain strings derived from USE_GPU.
DEVICE = "cuda" if USE_GPU else "cpu"
TORCH_DTYPE = "float16" if USE_GPU else "float32"

# ── Embedding Model (multimodal — images + text, NO captioning) ─────────────
#
# GPU: Qwen3-VL-Embedding-2B (2048d) or Qwen3-VL-Embedding-8B (4096d)
# CPU: CLIP ViT-L/14 (768d) — lightweight, runs on free HF Spaces
#
# EMBED_MODEL / EMBED_DIM can each be overridden through the environment;
# only the defaults depend on USE_GPU.
if USE_GPU:
    EMBED_MODEL = os.environ.get("EMBED_MODEL", "Qwen/Qwen3-VL-Embedding-2B")
    EMBED_DIM = int(os.environ.get("EMBED_DIM", "2048"))
else:
    EMBED_MODEL = os.environ.get("EMBED_MODEL", "openai/clip-vit-large-patch14")
    EMBED_DIM = int(os.environ.get("EMBED_DIM", "768"))

# ── LLM (search result interpretation) ──────────────────────────────────────
#
# Primary: Qwen3-35B-A3B (MoE: 35B total, 3B active — fast + smart)
# Fallback: Qwen3-1.7B (dense, runs on anything)
#
LLM_MODEL = os.environ.get("LLM_MODEL", "Qwen/Qwen3-35B-A3B")
LLM_FALLBACK = os.environ.get("LLM_FALLBACK", "Qwen/Qwen3-1.7B")

# ── Video Frame Extraction ──────────────────────────────────────────────────

# Seconds between sampled frames. Clamped to at least 1: the sampling loop
# advances its timestamp by this amount until it passes the video duration,
# so a zero or negative interval would never terminate.
FRAME_EVERY_SEC = max(1, int(os.environ.get("FRAME_EVERY_SEC", "5")))

# ── Data Directories ────────────────────────────────────────────────────────

# DATA_DIR defaults to a "data" folder next to this file.
DATA_DIR = Path(os.environ.get("DATA_DIR", str(Path(__file__).parent / "data")))
PROJECTS_DIR = DATA_DIR / "projects"
DEFAULT_PROJECT = os.environ.get("DEFAULT_PROJECT", "default")
SWAP_PATH = Path(os.environ.get("SWAP_PATH", str(DATA_DIR / "indexes")))

# Ensure base directories exist (import-time side effect).
for d in (PROJECTS_DIR, SWAP_PATH):
    d.mkdir(parents=True, exist_ok=True)
|
| 57 |
+
|
| 58 |
+
# ── Per-project directories ─────────────────────────────────────────────────
|
| 59 |
+
|
| 60 |
+
def get_project_dir(project: str = DEFAULT_PROJECT) -> Path:
    """Return ``PROJECTS_DIR / project``, creating its standard sub-folders.

    The ``images``, ``videos`` and ``indexes`` sub-directories are created
    (together with any missing parents) on every call; existing ones are
    left untouched.
    """
    root = PROJECTS_DIR / project
    for folder in ("images", "videos", "indexes"):
        root.joinpath(folder).mkdir(parents=True, exist_ok=True)
    return root


# Make sure the default project's folders exist as soon as config is imported.
get_project_dir(DEFAULT_PROJECT)
|
| 69 |
+
|
| 70 |
+
# ── Seeding ─────────────────────────────────────────────────────────────────
|
| 71 |
+
|
| 72 |
+
# Dataset id and split expression used for seeding; both overridable via env.
SEED_DATASET = os.getenv("SEED_DATASET", "nlphuji/flickr30k")
SEED_SPLIT = os.getenv("SEED_SPLIT", "test[:200]")
# True only for "true", "1" or "yes" (case-insensitive); defaults to enabled.
AUTO_SEED = os.getenv("AUTO_SEED", "true").lower() in ("true", "1", "yes")

# ── File Extensions ─────────────────────────────────────────────────────────

# Lower-case suffixes, leading dot included.
IMAGE_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".webp",
    ".gif",
    ".bmp",
}
VIDEO_EXTENSIONS = {
    ".mp4",
    ".mov",
    ".avi",
    ".mkv",
    ".webm",
}
|
| 80 |
+
|
| 81 |
+
# ── Startup Log ─────────────────────────────────────────────────────────────
|
| 82 |
+
|
| 83 |
+
# One-off configuration banner emitted when the module is imported.
# The HF token itself is never logged, only whether one is configured.
logger.info("=" * 55)
logger.info(" ARIA Vision Intelligence")
logger.info("=" * 55)
logger.info(" USE_GPU : %s", USE_GPU)
logger.info(" DEVICE : %s", DEVICE)
logger.info(" EMBED_MODEL : %s", EMBED_MODEL)
logger.info(" EMBED_DIM : %s", EMBED_DIM)
logger.info(" LLM_MODEL : %s", LLM_MODEL)
logger.info(" LLM_FALLBACK : %s", LLM_FALLBACK)
logger.info(" SWAP_PATH : %s", SWAP_PATH)
logger.info(" HF_TOKEN : %s", "set" if HF_TOKEN else "NOT SET")
logger.info(" HF_DATASET : %s", HF_DATASET_REPO or "local only")
logger.info(" AUTO_SEED : %s", AUTO_SEED)
logger.info("=" * 55)
|
data/images/.gitkeep
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This directory stores uploaded images for embedding.
|
| 2 |
+
# Place your image files here (.jpg, .png, .webp, etc.)
|
data/indexes/.gitkeep
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Persisted vector indexes are stored here as .npz files.
|
| 2 |
+
# This directory is auto-created by the vector_store module.
|
data/projects/default/images/car.jpg
ADDED
|
data/projects/default/images/dog.jpg
ADDED
|
data/projects/default/images/mountain.jpg
ADDED
|
data/projects/default/indexes/image_index.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7497eb8fd578e2e4d7c14cf0b26e70b9f0bc6c05b62b00bc4685590cd5ad7503
|
| 3 |
+
size 29169
|
data/projects/default/indexes/image_index_meta.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[{"file_name": "mountain_sunset.jpg", "file_size": "245.3KB", "resolution": "1920x1080", "file_path": "/data/images/mountain_sunset.jpg"}, {"file_name": "dog_park.jpg", "file_size": "189.7KB", "resolution": "1280x720", "file_path": "/data/images/dog_park.jpg"}, {"file_name": "red_car.jpg", "file_size": "312.1KB", "resolution": "1920x1080", "file_path": "/data/images/red_car.jpg"}, {"file_name": "ocean_waves.jpg", "file_size": "276.4KB", "resolution": "2560x1440", "file_path": "/data/images/ocean_waves.jpg"}, {"file_name": "city_night.jpg", "file_size": "198.2KB", "resolution": "1920x1080", "file_path": "/data/images/city_night.jpg"}, {"file_name": "cat_windowsill.jpg", "file_size": "145.6KB", "resolution": "1280x960", "file_path": "/data/images/cat_windowsill.jpg"}, {"file_name": "forest_trail.jpg", "file_size": "334.8KB", "resolution": "2560x1440", "file_path": "/data/images/forest_trail.jpg"}, {"file_name": "beach_sunset.jpg", "file_size": "267.9KB", "resolution": "1920x1080", "file_path": "/data/images/beach_sunset.jpg"}, {"file_name": "snow_mountain.jpg", "file_size": "289.3KB", "resolution": "3840x2160", "file_path": "/data/images/snow_mountain.jpg"}, {"file_name": "flower_garden.jpg", "file_size": "203.5KB", "resolution": "1600x1200", "file_path": "/data/images/flower_garden.jpg"}]
|
data/projects/default/indexes/video_index.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43c04bb0e533e610d5d5fc4a9d1a8823cf504e846e3ed3ed465608c0a550bf79
|
| 3 |
+
size 57673
|
data/projects/default/indexes/video_index_meta.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[{"video_path": "/data/videos/nature_doc.mp4", "video_name": "nature_doc.mp4", "timestamp_sec": 0.5, "timestamp_label": "00:00", "duration_total": 120.0}, {"video_path": "/data/videos/nature_doc.mp4", "video_name": "nature_doc.mp4", "timestamp_sec": 5.0, "timestamp_label": "00:05", "duration_total": 120.0}, {"video_path": "/data/videos/nature_doc.mp4", "video_name": "nature_doc.mp4", "timestamp_sec": 10.0, "timestamp_label": "00:10", "duration_total": 120.0}, {"video_path": "/data/videos/nature_doc.mp4", "video_name": "nature_doc.mp4", "timestamp_sec": 15.0, "timestamp_label": "00:15", "duration_total": 120.0}, {"video_path": "/data/videos/nature_doc.mp4", "video_name": "nature_doc.mp4", "timestamp_sec": 20.0, "timestamp_label": "00:20", "duration_total": 120.0}, {"video_path": "/data/videos/nature_doc.mp4", "video_name": "nature_doc.mp4", "timestamp_sec": 25.0, "timestamp_label": "00:25", "duration_total": 120.0}, {"video_path": "/data/videos/nature_doc.mp4", "video_name": "nature_doc.mp4", "timestamp_sec": 30.0, "timestamp_label": "00:30", "duration_total": 120.0}, {"video_path": "/data/videos/nature_doc.mp4", "video_name": "nature_doc.mp4", "timestamp_sec": 35.0, "timestamp_label": "00:35", "duration_total": 120.0}, {"video_path": "/data/videos/nature_doc.mp4", "video_name": "nature_doc.mp4", "timestamp_sec": 40.0, "timestamp_label": "00:40", "duration_total": 120.0}, {"video_path": "/data/videos/nature_doc.mp4", "video_name": "nature_doc.mp4", "timestamp_sec": 45.0, "timestamp_label": "00:45", "duration_total": 120.0}, {"video_path": "/data/videos/big_buck_bunny.mp4", "video_name": "big_buck_bunny.mp4", "timestamp_sec": 0.5, "timestamp_label": "00:00", "duration_total": 60.0}, {"video_path": "/data/videos/big_buck_bunny.mp4", "video_name": "big_buck_bunny.mp4", "timestamp_sec": 5.0, "timestamp_label": "00:05", "duration_total": 60.0}, {"video_path": "/data/videos/big_buck_bunny.mp4", "video_name": "big_buck_bunny.mp4", "timestamp_sec": 10.0, "timestamp_label": 
"00:10", "duration_total": 60.0}, {"video_path": "/data/videos/big_buck_bunny.mp4", "video_name": "big_buck_bunny.mp4", "timestamp_sec": 15.0, "timestamp_label": "00:15", "duration_total": 60.0}, {"video_path": "/data/videos/big_buck_bunny.mp4", "video_name": "big_buck_bunny.mp4", "timestamp_sec": 20.0, "timestamp_label": "00:20", "duration_total": 60.0}, {"video_path": "/data/videos/big_buck_bunny.mp4", "video_name": "big_buck_bunny.mp4", "timestamp_sec": 25.0, "timestamp_label": "00:25", "duration_total": 60.0}, {"video_path": "/data/videos/big_buck_bunny.mp4", "video_name": "big_buck_bunny.mp4", "timestamp_sec": 30.0, "timestamp_label": "00:30", "duration_total": 60.0}, {"video_path": "/data/videos/big_buck_bunny.mp4", "video_name": "big_buck_bunny.mp4", "timestamp_sec": 35.0, "timestamp_label": "00:35", "duration_total": 60.0}, {"video_path": "/data/videos/big_buck_bunny.mp4", "video_name": "big_buck_bunny.mp4", "timestamp_sec": 40.0, "timestamp_label": "00:40", "duration_total": 60.0}, {"video_path": "/data/videos/big_buck_bunny.mp4", "video_name": "big_buck_bunny.mp4", "timestamp_sec": 45.0, "timestamp_label": "00:45", "duration_total": 60.0}]
|
data/projects/default/videos/sample.mp4
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3bb938fb70049e3e45f533b37ccae995ae96516e04c2f35b0c1142e47b2a39c1
|
| 3 |
+
size 788493
|
data/videos/.gitkeep
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This directory stores uploaded videos for frame extraction and embedding.
|
| 2 |
+
# Place your video files here (.mp4, .mov, .avi, etc.)
|
embedding.py
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HF_Space_hipVS/embedding.py
|
| 2 |
+
# ============================
|
| 3 |
+
# Multimodal embedding + LLM calls.
|
| 4 |
+
#
|
| 5 |
+
# Embedding strategy: NO CAPTIONING.
|
| 6 |
+
# GPU: Qwen3-VL-Embedding (2B or 8B) — encodes images AND text into same space
|
| 7 |
+
# CPU: CLIP ViT-L/14 — same idea, lighter weight
|
| 8 |
+
#
|
| 9 |
+
# LLM strategy:
|
| 10 |
+
# Primary: Qwen3-35B-A3B (local or HF Inference API)
|
| 11 |
+
# Fallback: Qwen3-1.7B or HF Inference API
|
| 12 |
+
|
| 13 |
+
import logging
|
| 14 |
+
import io
|
| 15 |
+
import numpy as np
|
| 16 |
+
from PIL import Image as PILImage
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
# ── Lazy-loaded model singletons ────────────────────────────────────────────
|
| 21 |
+
|
| 22 |
+
# Module-level cache filled in by _load_embed_model() on first use.
# _embed_tokenizer is declared here but not assigned by the loader code
# visible in this file.
_embed_model = _embed_processor = _embed_tokenizer = None
# True when the loaded model is a CLIP checkpoint.
_is_clip = False
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _load_embed_model():
    """
    Lazy-init the multimodal embedding model; a no-op once it is loaded.

    The loader is chosen from the configured model name:
      * name contains "clip" -> CLIPModel + CLIPProcessor
      * anything else        -> AutoModel + AutoProcessor with
                                trust_remote_code=True (float16 weights when
                                USE_GPU is set, float32 otherwise)

    The module-level singletons are published only after BOTH the model and
    its processor have loaded. If either load raises, the globals stay
    untouched and the next call retries, instead of leaving a model without
    a processor behind the "already loaded" guard.

    Raises whatever the underlying ``from_pretrained`` / ``.to`` calls raise.
    """
    global _embed_model, _embed_processor, _embed_tokenizer, _is_clip
    if _embed_model is not None:
        return

    import torch
    from config import EMBED_MODEL, DEVICE, USE_GPU

    if "clip" in EMBED_MODEL.lower():
        # ── CLIP path ───────────────────────────────────────────────────
        from transformers import CLIPModel, CLIPProcessor

        logger.info(f"Loading CLIP model: {EMBED_MODEL} on {DEVICE}")
        model = CLIPModel.from_pretrained(EMBED_MODEL).to(DEVICE)
        processor = CLIPProcessor.from_pretrained(EMBED_MODEL)
        is_clip = True
        loaded_message = "CLIP model loaded"
    else:
        # ── Qwen3-VL-Embedding path ─────────────────────────────────────
        from transformers import AutoModel, AutoProcessor

        dtype = torch.float16 if USE_GPU else torch.float32
        logger.info(f"Loading Qwen3-VL-Embedding: {EMBED_MODEL} on {DEVICE}")
        # NOTE: trust_remote_code=True executes Python shipped in the model
        # repository, so EMBED_MODEL must point at a trusted repo.
        model = AutoModel.from_pretrained(
            EMBED_MODEL,
            torch_dtype=dtype,
            trust_remote_code=True,
        ).to(DEVICE)
        processor = AutoProcessor.from_pretrained(
            EMBED_MODEL,
            trust_remote_code=True,
        )
        is_clip = False
        loaded_message = "Qwen3-VL-Embedding model loaded"

    model.eval()

    # Publish the fully initialised state. _embed_model goes last because it
    # is the value the early-return guard checks.
    _embed_processor = processor
    _is_clip = is_clip
    _embed_model = model
    logger.info(loaded_message)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# ── Text Embedding ──────────────────────────────────────────────────────────
|
| 76 |
+
|
| 77 |
+
def embed_text(text: str) -> np.ndarray:
    """
    Embed one text string with the lazily loaded embedding model.

    CLIP models go through ``get_text_features``; any other model is called
    directly and its ``pooler_output`` is used when present, otherwise the
    first position of ``last_hidden_state``.
    NOTE(review): confirm first-position pooling is what the configured
    non-CLIP model expects.

    Returns the L2-normalised vector as a float32 numpy array (an all-zero
    vector is returned unchanged).
    """
    import torch
    from config import DEVICE

    _load_embed_model()

    with torch.no_grad():
        # Both branches feed the model the same single-item batch.
        batch = _embed_processor(
            text=[text],
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(DEVICE)

        if _is_clip:
            pooled = _embed_model.get_text_features(**batch)
        else:
            result = _embed_model(**batch)
            pooled = getattr(result, "pooler_output", None)
            if pooled is None:
                pooled = result.last_hidden_state[:, 0, :]

    flat = pooled.squeeze(0).cpu().float().numpy()
    # L2 normalise; skip the division for a zero vector.
    length = np.linalg.norm(flat)
    if length > 0:
        return flat / length
    return flat
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def embed_texts(texts: list[str]) -> np.ndarray:
    """
    Embed several texts in one forward pass.

    Uses the same model selection and pooling rule as the single-text path.
    Returns an (N, D) array whose rows are L2-normalised; rows with zero
    norm are divided by 1 and therefore left as they are.
    """
    import torch
    from config import DEVICE

    _load_embed_model()

    with torch.no_grad():
        batch = _embed_processor(
            text=texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(DEVICE)

        if _is_clip:
            pooled = _embed_model.get_text_features(**batch)
        else:
            result = _embed_model(**batch)
            pooled = getattr(result, "pooler_output", None)
            if pooled is None:
                pooled = result.last_hidden_state[:, 0, :]

    matrix = pooled.cpu().float().numpy()
    row_norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Swap zero norms for 1 so the division below never divides by zero.
    safe_norms = np.where(row_norms == 0, 1, row_norms)
    return matrix / safe_norms
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
# ── Image Embedding (direct, no captioning) ─────────────────────────────────
|
| 136 |
+
|
| 137 |
+
def embed_image(image: PILImage.Image) -> np.ndarray:
    """
    Embed a PIL image with the lazily loaded embedding model.

    The image is converted to RGB first when it is in any other mode. CLIP
    models go through ``get_image_features``; any other model is called
    directly and its ``pooler_output`` is used when present, otherwise the
    first position of ``last_hidden_state``.
    NOTE(review): confirm the non-CLIP processor accepts image-only input
    and that first-position pooling suits the configured model.

    Returns the L2-normalised vector as a float32 numpy array (an all-zero
    vector is returned unchanged).
    """
    import torch
    from config import DEVICE

    _load_embed_model()

    rgb = image if image.mode == "RGB" else image.convert("RGB")

    with torch.no_grad():
        # Same processor call for both model families.
        batch = _embed_processor(images=rgb, return_tensors="pt").to(DEVICE)

        if _is_clip:
            pooled = _embed_model.get_image_features(**batch)
        else:
            result = _embed_model(**batch)
            pooled = getattr(result, "pooler_output", None)
            if pooled is None:
                pooled = result.last_hidden_state[:, 0, :]

    flat = pooled.squeeze(0).cpu().float().numpy()
    length = np.linalg.norm(flat)
    if length > 0:
        return flat / length
    return flat
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def embed_image_bytes(data: bytes, mime_type: str = "image/jpeg") -> np.ndarray:
    """Decode raw image bytes with PIL and embed the result.

    ``mime_type`` is accepted for interface compatibility but is not used;
    PIL works out the format from the bytes themselves.
    """
    buffer = io.BytesIO(data)
    decoded = PILImage.open(buffer)
    return embed_image(decoded)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# ── LLM Summarization ───────────────────────────────────────────────────────
|
| 178 |
+
|
| 179 |
+
def llm_summarize(query: str, search_results: list[dict], mode: str = "image") -> str:
    """
    Turn raw search hits into a short, human-friendly markdown summary.

    An empty result list short-circuits to a "no results" message.
    Otherwise a prompt is built from the hits (video layout when
    ``mode == "video"``, image layout for anything else) and sent through
    ``huggingface_hub.InferenceClient.text_generation``, first to LLM_MODEL
    and then to LLM_FALLBACK. The first non-blank reply wins. If both
    attempts fail or come back blank, a plain listing of the hits is
    returned instead.
    """
    from config import LLM_MODEL, LLM_FALLBACK, HF_TOKEN

    if not search_results:
        return f'No results found for "{query}". Try uploading more media or using different search terms.'

    def _video_line(hit: dict) -> str:
        return (
            f" - Video: {hit.get('video_name', '?')}, "
            f"Time: {hit.get('timestamp_label', '?')} ({hit.get('timestamp_sec', 0):.1f}s), "
            f"Score: {hit.get('score', 0):.4f}"
        )

    def _image_line(hit: dict) -> str:
        return (
            f" - Image: {hit.get('file_name', '?')}, "
            f"Score: {hit.get('score', 0):.4f}"
        )

    # Per-mode prompt pieces: one text line per hit plus the system-style
    # instruction that precedes them.
    if mode == "video":
        results_text = "\n".join(map(_video_line, search_results))
        instruction = (
            "You are a vision search assistant. Summarize the video search results below. "
            "Highlight the most relevant moments and time ranges. Be concise. Use markdown."
        )
    else:
        results_text = "\n".join(map(_image_line, search_results))
        instruction = (
            "You are a vision search assistant. Summarize the image search results below. "
            "Highlight the most relevant matches. Be concise. Use markdown."
        )

    hit_count = len(search_results)
    prompt = "".join(
        [
            f"{instruction}\n\n",
            f'User query: "{query}"\n\n',
            f"Search results ({hit_count} matches):\n{results_text}\n\n",
            "Summary:",
        ]
    )

    # Primary model first, then the fallback; any failure (including a
    # missing huggingface_hub install) is logged and the next one is tried.
    for model_id in (LLM_MODEL, LLM_FALLBACK):
        try:
            from huggingface_hub import InferenceClient

            client = InferenceClient(model=model_id, token=HF_TOKEN or None)
            reply = client.text_generation(
                prompt,
                max_new_tokens=300,
                temperature=0.7,
                do_sample=True,
            )
            cleaned = reply.strip() if reply else ""
            if cleaned:
                return cleaned
        except Exception as e:
            logger.warning(f"LLM {model_id} failed: {e}")

    # Plain text fallback
    return (
        f'**Found {hit_count} results for "{query}"**\n\n'
        "_(LLM summary unavailable)_\n\n"
        f"```\n{results_text}\n```"
    )
|
ingest.py
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HF_Space_hipVS/ingest.py
|
| 2 |
+
# =========================
|
| 3 |
+
# Ingestion pipeline — embeds images/frames DIRECTLY with Qwen3-VL or CLIP.
|
| 4 |
+
# No captioning step. The vision-language model encodes images and text
|
| 5 |
+
# into the same vector space natively.
|
| 6 |
+
#
|
| 7 |
+
# CAGRA is rebuilt on every insert (optimized for query, not ingestion).
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
import os
|
| 11 |
+
import shutil
|
| 12 |
+
import subprocess
|
| 13 |
+
import tempfile
|
| 14 |
+
import time
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
from PIL import Image as PILImage
|
| 18 |
+
|
| 19 |
+
from config import (
|
| 20 |
+
EMBED_DIM,
|
| 21 |
+
FRAME_EVERY_SEC,
|
| 22 |
+
IMAGE_EXTENSIONS,
|
| 23 |
+
VIDEO_EXTENSIONS,
|
| 24 |
+
get_project_dir,
|
| 25 |
+
DEFAULT_PROJECT,
|
| 26 |
+
)
|
| 27 |
+
from embedding import embed_image, embed_image_bytes
|
| 28 |
+
from vector_store import get_store
|
| 29 |
+
|
| 30 |
+
logger = logging.getLogger(__name__)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# ── Helpers ─────────────────────────────────────────────────────────────────
|
| 34 |
+
|
| 35 |
+
def fmt_time(seconds: float) -> str:
    """Format a duration in seconds as zero-padded ``MM:SS``.

    The fractional part is truncated. Minutes are not wrapped into hours,
    so one hour renders as ``60:00``.
    """
    whole = int(seconds)
    minutes = whole // 60
    remainder = whole % 60
    return f"{minutes:02d}:{remainder:02d}"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def check_ffmpeg() -> bool:
    """Return True when both ``ffprobe`` and ``ffmpeg`` can be executed.

    Video ingestion needs both tools: ``ffprobe`` to read the duration and
    ``ffmpeg`` to extract frames, so each one is probed with ``-version``.
    A tool counts as unavailable when it cannot be started (missing, not
    executable), does not answer within 5 seconds, or exits non-zero.
    """
    for tool in ("ffprobe", "ffmpeg"):
        try:
            probe = subprocess.run([tool, "-version"], capture_output=True, timeout=5)
        except (OSError, subprocess.TimeoutExpired):
            # OSError covers FileNotFoundError as well as permission errors.
            return False
        if probe.returncode != 0:
            return False
    return True


# Evaluated once at import time; the ingestion code checks this flag.
HAS_FFMPEG = check_ffmpeg()
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def get_duration(video_path: str) -> float:
    """Return the container duration of ``video_path`` in seconds.

    Runs ``ffprobe`` (30 s timeout) and parses the single number it prints.
    Any failure, whether ffprobe is missing, times out or prints something
    that is not a float, is logged as a warning and reported as ``0.0``.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        video_path,
    ]
    try:
        probe = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=30)
        seconds = float(probe.stdout.strip())
    except Exception as e:
        logger.warning(f"ffprobe error: {e}")
        return 0.0
    return seconds
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def extract_frame(video_path: str, timestamp_sec: float, out_path: str) -> bool:
    """Grab one frame of ``video_path`` at ``timestamp_sec`` into ``out_path``.

    ffmpeg seeks to the timestamp, writes a single frame scaled to 640 px
    wide (height follows the aspect ratio) and overwrites any existing
    output file.

    Returns True only when ffmpeg exits with status 0 and a non-empty file
    exists at ``out_path``. A missing ffmpeg binary or a run that exceeds
    the 30 s timeout is logged and reported as False, so callers can skip
    the frame instead of aborting the whole video.
    """
    cmd = [
        "ffmpeg", "-y",
        "-ss", f"{timestamp_sec:.3f}",
        "-i", video_path,
        "-frames:v", "1",
        "-q:v", "2",
        "-vf", "scale=640:-1",
        out_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, timeout=30)
    except (OSError, subprocess.TimeoutExpired) as e:
        # Same logger object as this module's ``logger``.
        logging.getLogger(__name__).warning(
            f"ffmpeg frame extraction failed at {timestamp_sec:.3f}s: {e}"
        )
        return False
    return (
        result.returncode == 0
        and os.path.exists(out_path)
        and os.path.getsize(out_path) > 0
    )
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def get_image_meta(path: Path) -> dict:
    """Collect display metadata for the image file at ``path``.

    Returns a dict with ``file_path`` (resolved absolute path),
    ``file_name``, ``file_size`` (kilobytes, one decimal, suffixed "KB")
    and ``resolution`` ("WIDTHxHEIGHT", or "unknown" when the file cannot
    be opened as an image).
    """
    size_kb = round(path.stat().st_size / 1024, 1)

    try:
        with PILImage.open(path) as img:
            resolution = f"{img.width}x{img.height}"
    except Exception:
        # Unreadable or non-image file: keep the entry, drop the dimensions.
        resolution = "unknown"

    meta = {}
    meta["file_path"] = str(path.resolve())
    meta["file_name"] = path.name
    meta["file_size"] = f"{size_kb}KB"
    meta["resolution"] = resolution
    return meta
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# ── Image Ingestion ─────────────────────────────────────────────────────────
|
| 97 |
+
|
| 98 |
+
def ingest_images(project: str = DEFAULT_PROJECT, progress_callback=None) -> tuple[int, str]:
    """Re-index the images in a project's ``images/`` directory.

    Files whose lower-cased suffix is in IMAGE_EXTENSIONS are taken in
    sorted order, capped at the first 200. The project's ``image_index``
    store is cleared before embedding starts, so the index afterwards
    holds only the images embedded by this call.

    Args:
        project: Project name; defaults to DEFAULT_PROJECT.
        progress_callback: Optional callable invoked after every file as
            ``progress_callback(fraction_done, desc=...)``.

    Returns:
        ``(count, log)``: the number of images embedded and a
        newline-joined, per-file log. A file that fails to embed is
        recorded as FAILED in the log and skipped.
    """
    import numpy as np

    proj_dir = get_project_dir(project)
    image_dir = proj_dir / "images"
    store = get_store(project, "image_index")

    files = sorted(
        f for f in image_dir.iterdir()
        if f.suffix.lower() in IMAGE_EXTENSIONS
    )[:200]

    if not files:
        return 0, f"No images found in {image_dir}"

    store.clear()
    log = [f"[{project}] Found {len(files)} images\n"]

    all_vectors = []
    all_ids = []
    all_meta = []

    for i, p in enumerate(files):
        meta = get_image_meta(p)
        try:
            # The context manager closes the file handle once the image is
            # embedded; a bare open() leaked one handle per image.
            with PILImage.open(p) as img:
                vec = embed_image(img)  # direct multimodal embed, no captioning
            all_vectors.append(vec)
            all_ids.append(meta["file_name"])
            all_meta.append(meta)
            log.append(f" [{i+1}/{len(files)}] {p.name} ({meta['resolution']})")
        except Exception as e:
            log.append(f" [{i+1}/{len(files)}] {p.name}: FAILED ({e})")

        if progress_callback:
            progress_callback((i + 1) / len(files), desc=f"Embedding {p.name}...")

    if all_vectors:
        vectors = np.stack(all_vectors)
        # Single bulk insert; per the original note the index is rebuilt
        # inside add().
        store.add(vectors, all_ids, all_meta)

    log.append(f"\n{len(all_vectors)} images indexed ({store.mode})")
    return len(all_vectors), "\n".join(log)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def ingest_single_image(file_path: str, project: str = DEFAULT_PROJECT) -> tuple[bool, str]:
    """Copy one uploaded image into the project and add it to the index.

    The file is copied to ``<project>/images/<name>``, embedded, and handed
    to the ``image_index`` store via ``append_and_rebuild``.

    Args:
        file_path: Path of the uploaded image.
        project: Project name; defaults to DEFAULT_PROJECT.

    Returns:
        ``(True, "Indexed: ...")`` on success, ``(False, "Failed: ...")``
        when opening, embedding or indexing the image raises.

    Raises:
        OSError: if the source file cannot be copied into the project.
    """
    path = Path(file_path)
    proj_dir = get_project_dir(project)
    dest = proj_dir / "images" / path.name

    # An upload that already lives in the project's images folder is used
    # in place; copying a file onto itself raises shutil.SameFileError.
    if path.resolve() != dest.resolve():
        shutil.copy2(str(path), str(dest))

    store = get_store(project, "image_index")
    meta = get_image_meta(dest)

    try:
        # Close the file handle as soon as the embedding is computed.
        with PILImage.open(dest) as img:
            vec = embed_image(img)
        store.append_and_rebuild(vec, meta["file_name"], meta)
        return True, f"Indexed: {path.name} ({meta['resolution']})"
    except Exception as e:
        return False, f"Failed: {path.name} -- {e}"
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def ingest_image_from_pil(
    image: PILImage.Image,
    file_name: str,
    extra_meta: dict | None = None,
    project: str = DEFAULT_PROJECT,
) -> tuple[bool, str]:
    """Embed an in-memory PIL image and append it to the project's index.

    The image is saved to ``<project>/images/<file_name>`` only when no file
    with that name exists yet. The vector is added with ``store.append``;
    per the original note, the caller (seed_data) triggers the index
    rebuild once at the end.

    ``extra_meta`` entries are merged over the default ``file_name`` /
    ``file_path`` metadata.

    Returns ``(True, file_name)`` on success, or ``(False, error_text)``
    when saving, embedding or appending raises.
    """
    target = get_project_dir(project) / "images" / file_name
    index = get_store(project, "image_index")

    try:
        if not target.exists():
            image.save(str(target))

        embedding = embed_image(image)
        record = {
            "file_name": file_name,
            "file_path": str(target.resolve()),
            **(extra_meta or {}),
        }
        index.append(embedding, file_name, record)
    except Exception as e:
        return False, str(e)
    return True, file_name
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
# ── Video Ingestion ─────────────────────────────────────────────────────────
|
| 190 |
+
|
| 191 |
+
def ingest_videos(project: str = DEFAULT_PROJECT, progress_callback=None) -> tuple[int, str]:
    """Re-index every video found in a project's ``videos/`` directory.

    The existing ``video_index`` store is cleared, then each video is sampled
    at 0.5s, every ``FRAME_EVERY_SEC`` seconds, and one second before the end.
    Each extracted frame is copied to ``videos/frames/``, embedded, and
    appended to the store; the GPU index is rebuilt once at the end.

    Args:
        project: Project whose videos are ingested.
        progress_callback: Optional callable invoked after every sampled
            timestamp as ``progress_callback(fraction, desc=...)``.

    Returns:
        ``(frames_indexed, log_text)``.
    """
    if not HAS_FFMPEG:
        return 0, "ffmpeg not found -- install ffmpeg for video ingestion."

    proj_dir = get_project_dir(project)
    video_dir = proj_dir / "videos"
    store = get_store(project, "video_index")

    frames_root = proj_dir / "videos" / "frames"
    frames_root.mkdir(parents=True, exist_ok=True)

    files = sorted(
        f for f in video_dir.iterdir()
        if f.suffix.lower() in VIDEO_EXTENSIONS
    )
    if not files:
        return 0, f"No videos found in {video_dir}"

    store.clear()
    log = [f"[{project}] Found {len(files)} video(s) -- frame interval: {FRAME_EVERY_SEC}s\n"]
    total = 0

    for video_path in files:
        video_str = str(video_path.resolve())
        duration = get_duration(video_str)
        if duration <= 0:
            log.append(f" Skipping {video_path.name} (duration unreadable)")
            continue

        # A set de-duplicates the fixed start/end samples against the
        # regular interval samples.
        candidates = {0.5, round(max(0, duration - 1.0), 2)}
        t = float(FRAME_EVERY_SEC)
        while t < duration:
            candidates.add(round(t, 2))
            t += FRAME_EVERY_SEC
        timestamps = sorted(candidates)

        log.append(f" {video_path.name} ({duration:.1f}s -> {len(timestamps)} frames)")

        indexed = 0  # frames of this video that actually reached the store
        with tempfile.TemporaryDirectory() as tmp_dir:
            for idx, ts in enumerate(timestamps):
                frame_path = os.path.join(tmp_dir, f"frame_{idx:05d}.jpg")

                if extract_frame(video_str, ts, frame_path):
                    try:
                        with open(frame_path, "rb") as f:
                            frame_data = f.read()

                        # Save frame permanently
                        perm_frame_path = frames_root / f"{video_path.name}_{ts:.2f}.jpg"
                        shutil.copy2(frame_path, str(perm_frame_path))

                        vec = embed_image_bytes(frame_data)
                        frame_meta = {
                            "video_path": video_str,
                            "video_name": video_path.name,
                            "frame_path": str(perm_frame_path.resolve()),
                            "timestamp_sec": ts,
                            "timestamp_label": fmt_time(ts),
                            "duration_total": round(duration, 2),
                        }
                        store.append(vec, f"{video_path.name}@{ts}", frame_meta)
                        indexed += 1
                        # Short pause kept from the original implementation
                        # (presumably throttling the embedding backend).
                        time.sleep(0.05)
                    except Exception as e:
                        log.append(f" ts={fmt_time(ts)}: FAILED ({e})")
                else:
                    log.append(f" ts={fmt_time(ts)}: FAILED (frame extraction)")

                # Report progress for every timestamp, including failed ones,
                # so the caller's progress bar always reaches completion.
                if progress_callback:
                    progress_callback(
                        (idx + 1) / len(timestamps),
                        desc=f"{video_path.name} frame {idx+1}/{len(timestamps)}",
                    )

        total += indexed
        log.append(f" Done ({indexed}/{len(timestamps)} frames)")

    # Rebuild CAGRA once for all videos
    if store.has_data():
        store.rebuild_gpu_index()
        store._persist()

    log.append(f"\n{total} video frames indexed ({store.mode})")
    return total, "\n".join(log)
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
def ingest_single_video(file_path: str, project: str = DEFAULT_PROJECT, progress_callback=None) -> tuple[int, str]:
    """Copy one uploaded video into the project and index its sampled frames.

    Frames are sampled at 0.5s, every ``FRAME_EVERY_SEC`` seconds, and one
    second before the end (the same schedule ``ingest_videos`` uses). Each
    extracted frame is copied to ``videos/frames/``, embedded, and appended to
    the ``video_index`` store; the GPU index is rebuilt once afterwards.

    Args:
        file_path: Path of the uploaded video on disk.
        project: Project that receives the video.
        progress_callback: Optional callable invoked after every sampled
            timestamp as ``progress_callback(fraction)``.

    Returns:
        ``(frames_indexed, message)``.
    """
    path = Path(file_path)
    proj_dir = get_project_dir(project)
    dest = proj_dir / "videos" / path.name
    shutil.copy2(str(path), str(dest))

    if not HAS_FFMPEG:
        return 0, "ffmpeg not found"

    store = get_store(project, "video_index")
    video_str = str(dest.resolve())
    duration = get_duration(video_str)
    if duration <= 0:
        return 0, f"Could not read duration for {path.name}"

    frames_root = proj_dir / "videos" / "frames"
    frames_root.mkdir(parents=True, exist_ok=True)

    # Same schedule as the batch path, so a video yields the same frames
    # whichever ingestion route it takes.
    candidates = {0.5, round(max(0, duration - 1.0), 2)}
    t = float(FRAME_EVERY_SEC)
    while t < duration:
        candidates.add(round(t, 2))
        t += FRAME_EVERY_SEC
    timestamps = sorted(candidates)
    count = 0

    with tempfile.TemporaryDirectory() as tmp_dir:
        for idx, ts in enumerate(timestamps):
            frame_path = os.path.join(tmp_dir, f"frame_{idx:05d}.jpg")

            if extract_frame(video_str, ts, frame_path):
                try:
                    with open(frame_path, "rb") as f:
                        frame_data = f.read()

                    # Save frame permanently
                    perm_frame_path = frames_root / f"{path.name}_{ts:.2f}.jpg"
                    shutil.copy2(frame_path, str(perm_frame_path))

                    vec = embed_image_bytes(frame_data)
                    frame_meta = {
                        "video_path": video_str,
                        "video_name": path.name,
                        "frame_path": str(perm_frame_path.resolve()),
                        "timestamp_sec": ts,
                        "timestamp_label": fmt_time(ts),
                        "duration_total": round(duration, 2),
                    }
                    store.append(vec, f"{path.name}@{ts}", frame_meta)
                    count += 1
                except Exception as e:
                    logger.error(f"Frame embed error: {e}")
            else:
                logger.warning(f"Frame extraction failed: {path.name} at {ts:.2f}s")

            # Report progress for every timestamp, including failed ones,
            # so the caller's progress bar always reaches completion.
            if progress_callback:
                progress_callback((idx + 1) / len(timestamps))

    # Rebuild CAGRA after all frames
    if store.has_data():
        store.rebuild_gpu_index()
        store._persist()

    return count, f"{count} frames indexed for {path.name} ({duration:.1f}s)"
|
ingest_sample_vision.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
ingest_sample_vision.py
|
| 4 |
+
========================
|
| 5 |
+
Populates the index with synthetic sample data (NO model download needed).
|
| 6 |
+
Uses random embeddings seeded by text hashes so that similar words produce
|
| 7 |
+
similar vectors — good enough to demonstrate the full search pipeline.
|
| 8 |
+
|
| 9 |
+
After ingestion, runs a sample query and prints results in the same
|
| 10 |
+
format as the original SurrealDB-based scripts.
|
| 11 |
+
|
| 12 |
+
Usage:
|
| 13 |
+
python ingest_sample_vision.py
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import hashlib
|
| 17 |
+
import json
|
| 18 |
+
import numpy as np
|
| 19 |
+
from config import DEFAULT_PROJECT, EMBED_DIM
|
| 20 |
+
from vector_store import get_store
|
| 21 |
+
|
| 22 |
+
# -- Synthetic embedding (no model needed) ------------------------------------
|
| 23 |
+
|
| 24 |
+
def fake_embed(text: str, dim: int = EMBED_DIM) -> np.ndarray:
    """Return a deterministic, unit-length pseudo-embedding for ``text``.

    The vector is Gaussian noise seeded by the MD5 of the whole text, plus
    half-weight noise seeded by the MD5 of each lower-cased whitespace token.
    Texts that share words therefore share noise components and end up closer
    to each other than unrelated texts.
    """

    def _seeded_noise(token: str) -> np.ndarray:
        # MD5 is used only as a stable hash to derive a 31-bit RNG seed.
        seed = int(hashlib.md5(token.encode()).hexdigest(), 16) % 2**31
        return np.random.RandomState(seed).randn(dim).astype(np.float32)

    vec = _seeded_noise(text)
    for word in text.lower().split():
        vec += _seeded_noise(word) * 0.5

    length = np.linalg.norm(vec)
    if length > 0:
        vec /= length
    return vec
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# -- Sample Data --------------------------------------------------------------
|
| 47 |
+
|
| 48 |
+
SAMPLE_IMAGES = [
|
| 49 |
+
{"file_name": "mountain_sunset.jpg", "file_size": "245.3KB", "resolution": "1920x1080", "description": "a majestic mountain with sunset colors"},
|
| 50 |
+
{"file_name": "dog_park.jpg", "file_size": "189.7KB", "resolution": "1280x720", "description": "a dog playing in the park"},
|
| 51 |
+
{"file_name": "red_car.jpg", "file_size": "312.1KB", "resolution": "1920x1080", "description": "a red sports car on a highway"},
|
| 52 |
+
{"file_name": "ocean_waves.jpg", "file_size": "276.4KB", "resolution": "2560x1440", "description": "ocean waves crashing on rocks"},
|
| 53 |
+
{"file_name": "city_night.jpg", "file_size": "198.2KB", "resolution": "1920x1080", "description": "city skyline at night with lights"},
|
| 54 |
+
{"file_name": "cat_windowsill.jpg", "file_size": "145.6KB", "resolution": "1280x960", "description": "a cat sitting on a windowsill"},
|
| 55 |
+
{"file_name": "forest_trail.jpg", "file_size": "334.8KB", "resolution": "2560x1440", "description": "a forest trail with tall trees and sunlight"},
|
| 56 |
+
{"file_name": "beach_sunset.jpg", "file_size": "267.9KB", "resolution": "1920x1080", "description": "golden sunset over a sandy beach"},
|
| 57 |
+
{"file_name": "snow_mountain.jpg", "file_size": "289.3KB", "resolution": "3840x2160", "description": "snow covered mountain peak under blue sky"},
|
| 58 |
+
{"file_name": "flower_garden.jpg", "file_size": "203.5KB", "resolution": "1600x1200", "description": "colorful flowers in a garden"},
|
| 59 |
+
]
|
| 60 |
+
|
| 61 |
+
SAMPLE_VIDEO_FRAMES = [
|
| 62 |
+
{"video_name": "nature_doc.mp4", "video_path": "/data/videos/nature_doc.mp4", "duration_total": 120.0, "frames": [
|
| 63 |
+
(0.5, "a wide shot of african savanna"),
|
| 64 |
+
(5.0, "a rhino walking through grass"),
|
| 65 |
+
(10.0, "close up of a rhino face"),
|
| 66 |
+
(15.0, "birds flying over the savanna"),
|
| 67 |
+
(20.0, "a zebra herd drinking water"),
|
| 68 |
+
(25.0, "sunset over the savanna landscape"),
|
| 69 |
+
(30.0, "a lion resting under a tree"),
|
| 70 |
+
(35.0, "elephants crossing a river"),
|
| 71 |
+
(40.0, "aerial view of the grasslands"),
|
| 72 |
+
(45.0, "a cheetah running at full speed"),
|
| 73 |
+
]},
|
| 74 |
+
{"video_name": "big_buck_bunny.mp4", "video_path": "/data/videos/big_buck_bunny.mp4", "duration_total": 60.0, "frames": [
|
| 75 |
+
(0.5, "animated forest scene with butterflies"),
|
| 76 |
+
(5.0, "a big bunny sitting in a meadow"),
|
| 77 |
+
(10.0, "the bunny stretching and yawning"),
|
| 78 |
+
(15.0, "small animals annoying the bunny"),
|
| 79 |
+
(20.0, "the bunny looking angry"),
|
| 80 |
+
(25.0, "the bunny chasing small creatures"),
|
| 81 |
+
(30.0, "a bird flying through the forest"),
|
| 82 |
+
(35.0, "the bunny setting up a trap"),
|
| 83 |
+
(40.0, "an explosion of fruit"),
|
| 84 |
+
(45.0, "the bunny laughing happily"),
|
| 85 |
+
]},
|
| 86 |
+
]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# -- Helpers ------------------------------------------------------------------
|
| 90 |
+
|
| 91 |
+
def fmt(seconds: float) -> str:
    """Format a duration in seconds as zero-padded ``MM:SS`` (fraction dropped)."""
    whole = int(seconds)
    minutes = whole // 60
    remainder = whole % 60
    return f"{minutes:02d}:{remainder:02d}"
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# -- Main ---------------------------------------------------------------------
|
| 97 |
+
|
| 98 |
+
def main():
    """Seed the default project's indexes with synthetic data and run demo queries.

    Steps: clear the image and video stores, index ``SAMPLE_IMAGES`` in one
    batch, append every frame of ``SAMPLE_VIDEO_FRAMES``, then print one image
    search and one merged video search as a table followed by JSON.
    All embeddings come from ``fake_embed``.
    """
    rule = "=" * 60

    print(f"\n{rule}")
    print(" ARIA Vision - Sample Ingestion (Synthetic Embeddings)")
    print(rule)
    print(f" Embed dim: {EMBED_DIM}")
    print(f" Project : {DEFAULT_PROJECT}")
    print()

    # -- 1. Clear old indexes ---------------------------------------------
    print("[1/4] Clearing old indexes...")
    img_store = get_store(DEFAULT_PROJECT, "image_index")
    vid_store = get_store(DEFAULT_PROJECT, "video_index")
    img_store.clear()
    vid_store.clear()
    print(" Done.\n")

    # -- 2. Ingest sample images ------------------------------------------
    print("[2/4] Ingesting sample images...")
    img_vecs = []
    img_ids = []
    img_meta = []

    for img in SAMPLE_IMAGES:
        vec = fake_embed(img["description"])
        img_vecs.append(vec)
        img_ids.append(img["file_name"])
        img_meta.append({
            "file_name": img["file_name"],
            "file_size": img["file_size"],
            "resolution": img["resolution"],
            "file_path": f"/data/images/{img['file_name']}",
        })
        print(f" OK {img['file_name']} ({img['resolution']})")

    img_store.add(np.stack(img_vecs), img_ids, img_meta)
    print(f" {len(img_ids)} images indexed -> {img_store}\n")

    # -- 3. Ingest sample video frames ------------------------------------
    print("[3/4] Ingesting sample video frames...")
    total_frames = 0

    for video in SAMPLE_VIDEO_FRAMES:
        print(f" {video['video_name']} ({video['duration_total']:.0f}s -> {len(video['frames'])} frames)")
        for ts, desc in video["frames"]:
            vec = fake_embed(desc)
            frame_meta = {
                "video_path": video["video_path"],
                "video_name": video["video_name"],
                "timestamp_sec": ts,
                "timestamp_label": fmt(ts),
                "duration_total": video["duration_total"],
            }
            vid_store.append(vec, f"{video['video_name']}@{ts}", frame_meta)
            total_frames += 1

    # Rebuild CAGRA once after all frames
    vid_store.rebuild_gpu_index()
    vid_store._persist()
    print(f" {total_frames} video frames indexed -> {vid_store}\n")

    # -- 4. Run sample queries --------------------------------------------
    print("[4/4] Running sample queries...\n")

    # --- Image query ---
    query = "a majestic mountain"
    print(rule)
    print(" ARIA Vision - Image Search")
    print(rule)
    print(f" Query: \"{query}\"")
    print()

    qvec = fake_embed(query)
    results = img_store.search(qvec, top_k=5)

    print(f" {'-'*56}")
    print(f" {'Rank':<6} {'File':<25} {'Size':<10} {'Resolution':<12} {'Score':<8}")
    print(f" {'-'*56}")
    for i, r in enumerate(results):
        print(f" {i+1:<6} {r.get('file_name','?'):<25} "
              f"{r.get('file_size','?'):<10} "
              f"{r.get('resolution','?'):<12} "
              f"{r.get('score',0):.4f}")
    print(f" {'-'*56}")

    output_img = {
        "mode": "Image",
        "query": query,
        "results": [
            {
                "file_path": r.get("file_path", ""),
                "file_name": r.get("file_name", ""),
                "file_size": r.get("file_size", ""),
                "resolution": r.get("resolution", ""),
                "score": round(r.get("score", 0), 4),
            }
            for r in results
        ],
    }
    print("\n JSON Response:")
    print(f" {json.dumps(output_img, indent=2)}")

    # --- Video query ---
    query2 = "a big bunny"
    print(f"\n{rule}")
    print(" ARIA Vision - Video Intelligence Search")
    print(rule)
    print(f" Query: \"{query2}\"")
    print()

    qvec2 = fake_embed(query2)
    vid_results = vid_store.search(qvec2, top_k=10)

    # Merge into time ranges. Imported here rather than at module top, as in
    # the original (presumably to keep the search module out of the import
    # path until it is needed -- TODO confirm).
    from search import _merge_video_hits
    spans = _merge_video_hits(vid_results, gap=10.0)

    print(f" {'-'*62}")
    print(f" {'#':<4} {'Video':<24} {'Time Range':<16} {'Duration':<9} {'Frames':<7} {'Score'}")
    print(f" {'-'*62}")
    for i, s in enumerate(spans):
        dur = s["end_sec"] - s["start_sec"]
        print(f" {i+1:<4} {s['video_name'][:23]:<24} "
              f"{fmt(s['start_sec'])} -> {fmt(s['end_sec']):<9} "
              f"{dur:4.0f}s "
              f"{s['frames']:<7} "
              f"{s['peak_score']:.4f}")
    print(f" {'-'*62}")

    output_vid = {
        "mode": "Video Intelligence",
        "query": query2,
        "matches": [
            {
                "video_name": s["video_name"],
                "video_path": s.get("video_path", ""),
                "start": fmt(s["start_sec"]),
                "end": fmt(s["end_sec"]),
                "start_seconds": s["start_sec"],
                "end_seconds": s["end_sec"],
                "score": round(s["peak_score"], 4),
                "frames_matched": s["frames"],
            }
            for s in spans
        ],
    }
    print("\n JSON Response:")
    print(f" {json.dumps(output_vid, indent=2)}")

    print(f"\n{rule}")
    print(f" OK Done - {len(img_ids)} images + {total_frames} video frames indexed")
    print(f" Store: {img_store}")
    print(f" Store: {vid_store}")
    print(f"{rule}\n")
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
if __name__ == "__main__":
|
| 254 |
+
main()
|
query_vision_image.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
query_vision_image.py
|
| 4 |
+
======================
|
| 5 |
+
Query the HF-native image_index using a text prompt.
|
| 6 |
+
Embeds the text with CLIP / Qwen3-VL, then performs cosine
|
| 7 |
+
similarity search against stored image embeddings.
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python query_vision_image.py "sunset over water"
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import sys
|
| 14 |
+
import json
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
from config import DEFAULT_PROJECT, EMBED_MODEL, EMBED_DIM
|
| 18 |
+
from vector_store import get_store
|
| 19 |
+
from embedding import embed_text
|
| 20 |
+
|
| 21 |
+
TOP_K = 5
|
| 22 |
+
MIN_SCORE = 0.15 # Adjusted for HF-native CLIP/Qwen scores
|
| 23 |
+
|
| 24 |
+
def search_images(query: str):
    """Embed ``query``, search the default project's image index, and print results.

    Hits scoring below ``MIN_SCORE`` are dropped. Output is a fixed-width
    table followed by the same rows as a JSON document. Nothing is returned.
    Status marks and table rules are plain ASCII, matching the output of
    ingest_sample_vision.py.
    """
    print(f"\n{'='*60}")
    print(" ARIA Vision - Image Search (HF-Native)")
    print(f"{'='*60}")
    print(f" Query: \"{query}\"")
    print(f" Model: {EMBED_MODEL} ({EMBED_DIM}d)")
    print()

    print(" [1/3] Embedding query text...", end=" ", flush=True)
    query_vector = embed_text(query)
    print("OK")

    print(" [2/3] Searching image_index...", end=" ", flush=True)
    store = get_store(DEFAULT_PROJECT, "image_index")
    raw_results = store.search(query_vector, top_k=TOP_K)

    if not raw_results:
        print("no results.")
        print("\n [!] No images found. Did you run ingest_sample_vision.py first?")
        return

    rows = [r for r in raw_results if r.get("score", 0) >= MIN_SCORE]
    print(f"OK ({len(rows)} matches)")

    print("\n [3/3] Results:")
    print(f" {'-'*56}")
    print(f" {'Rank':<6} {'File':<25} {'Size':<10} {'Resolution':<12} {'Score':<8}")
    print(f" {'-'*56}")

    for i, row in enumerate(rows):
        # Fall back to the basename of file_path when no file_name is stored.
        file_name = row.get("file_name", Path(row.get("file_path", "?")).name)
        print(
            f" {i+1:<6} {file_name[:24]:<25} "
            f"{row.get('file_size', '?'):<10} "
            f"{row.get('resolution', '?'):<12} "
            f"{row.get('score', 0):.4f}"
        )

    print(f" {'-'*56}")

    output = {
        "mode": "Image",
        "query": query,
        "results": [
            {
                "file_path": r.get("file_path", ""),
                "file_name": r.get("file_name", ""),
                "file_size": r.get("file_size", ""),
                "resolution": r.get("resolution", ""),
                "score": round(r.get("score", 0), 4),
            }
            for r in rows
        ],
    }
    print("\n JSON Response:")
    print(f" {json.dumps(output, indent=2)}")
    print()
|
| 81 |
+
|
| 82 |
+
def main():
    """CLI entry point: treat all arguments as one query and run the image search.

    Prints a usage line and exits with status 1 when no argument is given.
    """
    words = sys.argv[1:]
    if not words:
        print("Usage: python query_vision_image.py \"your search query\"")
        sys.exit(1)

    search_images(" ".join(words))
|
| 89 |
+
|
| 90 |
+
if __name__ == "__main__":
|
| 91 |
+
main()
|
query_vision_video.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
query_vision_video.py
|
| 4 |
+
======================
|
| 5 |
+
Query HF-native video_index using a text prompt.
|
| 6 |
+
Each row is one embedded frame at a specific second.
|
| 7 |
+
Adjacent high-scoring frames are merged into contiguous time ranges.
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python query_vision_video.py "rhino running in the wild"
|
| 11 |
+
python query_vision_video.py "person waving" --top 10 --min-score 0.15
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import sys
|
| 15 |
+
import json
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
from config import DEFAULT_PROJECT, EMBED_MODEL, EMBED_DIM
|
| 19 |
+
from vector_store import get_store
|
| 20 |
+
from embedding import embed_text
|
| 21 |
+
from search import _merge_video_hits, _fmt as fmt
|
| 22 |
+
|
| 23 |
+
TOP_K = 30
|
| 24 |
+
MIN_SCORE = 0.15 # Adjusted for HF-native CLIP/Qwen scores
|
| 25 |
+
MERGE_GAP_SEC = 10
|
| 26 |
+
|
| 27 |
+
def search_video(query: str, top_k: int = TOP_K, min_score: float = MIN_SCORE):
    """Embed ``query``, search the default project's video index, and print time ranges.

    Frame hits scoring below ``min_score`` are dropped; the rest are merged
    into contiguous ranges with ``_merge_video_hits`` using ``MERGE_GAP_SEC``.
    Output is a fixed-width table followed by a JSON document. Nothing is
    returned. Status marks, arrows, and table rules are plain ASCII, matching
    the output of ingest_sample_vision.py.

    Args:
        query: Free-text search prompt.
        top_k: Number of raw frame hits requested from the store.
        min_score: Minimum score a frame needs to be kept.
    """
    print(f"\n{'='*60}")
    print(" ARIA Vision - Video Intelligence Search (HF-Native)")
    print(f"{'='*60}")
    print(f" Query : \"{query}\"")
    print(f" Model : {EMBED_MODEL} ({EMBED_DIM}d)")
    print(f" Min score : {min_score} | Merge gap: {MERGE_GAP_SEC}s | Fetch top: {top_k}")
    print()

    print(" [1/3] Embedding query...", end=" ", flush=True)
    qvec = embed_text(query)
    print("OK")

    print(" [2/3] Searching video_index...", end=" ", flush=True)
    store = get_store(DEFAULT_PROJECT, "video_index")
    raw_results = store.search(qvec, top_k=top_k)

    if not raw_results:
        print("no results.\n [!] Run ingest_sample_vision.py first.")
        return

    print(f"OK ({len(raw_results)} raw frames returned)")

    hits = [r for r in raw_results if r.get("score", 0) >= min_score]
    if not hits:
        # Show the best raw scores so the user can pick a workable threshold.
        top3 = sorted(raw_results, key=lambda r: -r.get("score", 0))[:3]
        print(f"\n [!] No frames above score threshold ({min_score}).")
        print(f" Top 3 raw scores: {[round(r.get('score',0),4) for r in top3]}")
        return

    print(f" [3/3] Merging {len(hits)} hits into time ranges...")
    spans = _merge_video_hits(hits, gap=MERGE_GAP_SEC)

    print()
    print(f" {'-'*62}")
    print(f" {'#':<4} {'Video':<24} {'Time Range':<16} {'Duration':<9} {'Frames':<7} {'Score'}")
    print(f" {'-'*62}")

    for i, s in enumerate(spans):
        dur = s["end_sec"] - s["start_sec"]
        print(
            f" {i+1:<4} {s['video_name'][:23]:<24} "
            f"{fmt(s['start_sec'])} -> {fmt(s['end_sec']):<9} "
            f"{dur:4.0f}s "
            f"{s['frames']:<7} "
            f"{s['peak_score']:.4f}"
        )

    print(f" {'-'*62}")

    output = {
        "mode": "Video Intelligence",
        "query": query,
        "matches": [
            {
                "video_name": s["video_name"],
                "video_path": s.get("video_path", ""),
                "start": fmt(s["start_sec"]),
                "end": fmt(s["end_sec"]),
                "start_seconds": s["start_sec"],
                "end_seconds": s["end_sec"],
                "score": round(s["peak_score"], 4),
                "frames_matched": s["frames"],
            }
            for s in spans
        ],
    }

    print()
    print(" JSON Response:")
    print(f" {json.dumps(output, indent=2)}")
|
| 98 |
+
|
| 99 |
+
def main():
    """CLI entry point: parse the query plus ``--top`` / ``--min-score`` and search.

    Accepts both ``--top 10`` and ``--top=10`` (likewise ``--min-score``).
    The value following a recognised flag is consumed as that flag's value
    and never becomes part of the query text. Any other ``--`` token is
    ignored. Prints usage and exits with status 1 when the query is empty,
    a flag has no value, or a value is not numeric.
    """
    usage = 'Usage: python query_vision_video.py "your query"'
    raw_options = {"--top": None, "--min-score": None}
    words = []

    tokens = iter(sys.argv[1:])
    for token in tokens:
        flag, has_inline, inline = token.partition("=")
        if flag in raw_options:
            # Take the value from "--flag=value" or from the next token.
            value = inline if has_inline else next(tokens, None)
            if value is None:
                print(f"Missing value for {flag}")
                print(usage)
                sys.exit(1)
            raw_options[flag] = value
        elif not token.startswith("--"):
            words.append(token)

    if not words:
        print(usage)
        sys.exit(1)

    try:
        top = TOP_K if raw_options["--top"] is None else int(raw_options["--top"])
        msc = (
            MIN_SCORE
            if raw_options["--min-score"] is None
            else float(raw_options["--min-score"])
        )
    except ValueError as exc:
        print(f"Invalid option value: {exc}")
        print(usage)
        sys.exit(1)

    search_video(" ".join(words), top_k=top, min_score=msc)
|
| 109 |
+
|
| 110 |
+
if __name__ == "__main__":
|
| 111 |
+
main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.0
|
| 2 |
+
transformers>=4.40
|
| 3 |
+
datasets>=2.19
|
| 4 |
+
huggingface_hub>=0.23
|
| 5 |
+
Pillow>=10.0
|
| 6 |
+
numpy>=1.24
|
| 7 |
+
torch
|
| 8 |
+
accelerate>=0.30
|
| 9 |
+
python-dotenv>=1.0
|
search.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HF_Space_hipVS/search.py
|
| 2 |
+
# =========================
|
| 3 |
+
# Search — embed the query, search the project's vector store, then have the LLM interpret the results.
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
from embedding import embed_text, llm_summarize
|
| 7 |
+
from vector_store import get_store
|
| 8 |
+
from config import DEFAULT_PROJECT
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _fmt(seconds: float) -> str:
|
| 14 |
+
m, s = divmod(int(seconds), 60)
|
| 15 |
+
return f"{m:02d}:{s:02d}"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _merge_video_hits(hits: list[dict], gap: float = 10.0) -> list[dict]:
|
| 19 |
+
"""Merge adjacent frame-level hits into time ranges."""
|
| 20 |
+
if not hits:
|
| 21 |
+
return []
|
| 22 |
+
by_video: dict[str, list[dict]] = {}
|
| 23 |
+
for h in hits:
|
| 24 |
+
by_video.setdefault(h.get("video_name", "?"), []).append(h)
|
| 25 |
+
|
| 26 |
+
merged = []
|
| 27 |
+
for video_name, frames in by_video.items():
|
| 28 |
+
frames.sort(key=lambda x: x.get("timestamp_sec", 0))
|
| 29 |
+
cur = {
|
| 30 |
+
"video_name": video_name,
|
| 31 |
+
"video_path": frames[0].get("video_path", ""),
|
| 32 |
+
"start_sec": frames[0].get("timestamp_sec", 0),
|
| 33 |
+
"end_sec": frames[0].get("timestamp_sec", 0),
|
| 34 |
+
"peak_score": frames[0].get("score", 0),
|
| 35 |
+
"frames": 1,
|
| 36 |
+
}
|
| 37 |
+
for f in frames[1:]:
|
| 38 |
+
ts = f.get("timestamp_sec", 0)
|
| 39 |
+
if ts <= cur["end_sec"] + gap:
|
| 40 |
+
cur["end_sec"] = ts
|
| 41 |
+
cur["peak_score"] = max(cur["peak_score"], f.get("score", 0))
|
| 42 |
+
cur["frames"] += 1
|
| 43 |
+
else:
|
| 44 |
+
merged.append(cur)
|
| 45 |
+
cur = {
|
| 46 |
+
"video_name": video_name,
|
| 47 |
+
"video_path": f.get("video_path", ""),
|
| 48 |
+
"start_sec": ts,
|
| 49 |
+
"end_sec": ts,
|
| 50 |
+
"peak_score": f.get("score", 0),
|
| 51 |
+
"frames": 1,
|
| 52 |
+
}
|
| 53 |
+
merged.append(cur)
|
| 54 |
+
|
| 55 |
+
return sorted(merged, key=lambda x: -x["peak_score"])
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def search_images(query: str, project: str = DEFAULT_PROJECT, top_k: int = 10, min_score: float = 0.15) -> dict:
    """Search a project's image index with a text query.

    Returns a dict with ``query``, ``results`` (hits scoring at least
    ``min_score``), ``llm_summary`` and ``store_info``. When the index is
    empty, no embedding or search is attempted and the summary tells the user
    to upload images.
    """
    store = get_store(project, "image_index")

    if store.count == 0:
        hits = []
        summary = f"No images indexed in project '{project}'. Upload images first."
    else:
        query_vec = embed_text(query)
        candidates = store.search(query_vec, top_k=top_k)
        hits = [hit for hit in candidates if hit.get("score", 0) >= min_score]
        summary = llm_summarize(query, hits, mode="image")

    return {
        "query": query,
        "results": hits,
        "llm_summary": summary,
        "store_info": str(store),
    }
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def _representative_frame(span: dict, hits: list[dict]) -> str:
    """Return the ``frame_path`` of the best-scoring hit that lies inside *span*.

    A hit belongs to the span when its ``video_name`` matches and its
    ``timestamp_sec`` falls within ``[start_sec, end_sec]``. Returns ``""``
    when no such hit carries a non-empty ``frame_path``.

    NOTE(review): assumes frame hits expose ``video_name``, ``timestamp_sec``
    and ``frame_path`` in their metadata -- confirm against the ingest code.
    """
    best_path = ""
    best_score = float("-inf")
    for hit in hits:
        if hit.get("video_name") != span["video_name"]:
            continue
        if not span["start_sec"] <= hit.get("timestamp_sec", 0) <= span["end_sec"]:
            continue
        path = hit.get("frame_path")
        score = hit.get("score", 0)
        if path and score > best_score:
            best_path, best_score = path, score
    return best_path


def search_videos(query: str, project: str = DEFAULT_PROJECT, top_k: int = 30, min_score: float = 0.15) -> dict:
    """Run a text-to-video search and merge frame hits into time spans.

    The query is embedded with ``embed_text``; the ``top_k`` best frame hits
    from the project's ``video_index`` store are filtered by ``min_score`` and
    merged into contiguous spans by ``_merge_video_hits``. A compact view of
    the spans is passed to ``llm_summarize`` in ``"video"`` mode.

    Returns:
        A dict with the keys ``query``, ``matches``, ``llm_summary`` and
        ``store_info``. Each match carries ``id`` (1-based rank), ``video_name``,
        formatted ``start``/``end``, raw ``start_seconds``/``end_seconds``,
        the rounded peak ``score``, the number of merged ``frames`` and a
        ``representative_frame`` path (``""`` when none is known). If the
        index is empty, ``matches`` is empty and ``llm_summary`` holds an
        "upload first" hint.
    """
    store = get_store(project, "video_index")
    if store.count == 0:
        return {
            "query": query, "matches": [],
            "llm_summary": f"No videos indexed in project '{project}'. Upload videos first.",
            "store_info": str(store),
        }

    query_vec = embed_text(query)
    raw = store.search(query_vec, top_k=top_k)
    filtered = [r for r in raw if r.get("score", 0) >= min_score]
    spans = _merge_video_hits(filtered)

    result_for_llm = [
        {
            "video_name": s["video_name"],
            "timestamp_sec": s["start_sec"],
            "timestamp_label": f"{_fmt(s['start_sec'])} - {_fmt(s['end_sec'])}",
            "score": s["peak_score"],
        }
        for s in spans
    ]
    summary = llm_summarize(query, result_for_llm, mode="video")

    return {
        "query": query,
        "matches": [
            {
                "id": i + 1,
                "video_name": s["video_name"],
                "start": _fmt(s["start_sec"]),
                "end": _fmt(s["end_sec"]),
                "start_seconds": s["start_sec"],
                "end_seconds": s["end_sec"],
                "score": round(s["peak_score"], 4),
                "frames": s["frames"],
                # The merged span dicts carry no "frame_path" key, so the
                # original lookup always produced "". Prefer a span-provided
                # value if one ever appears, otherwise derive it from the
                # best-scoring frame hit inside the span.
                "representative_frame": s.get("frame_path") or _representative_frame(s, filtered),
            }
            for i, s in enumerate(spans)
        ],
        "llm_summary": summary,
        "store_info": str(store),
    }
|
seed_data.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HF_Space_hipVS/seed_data.py
|
| 2 |
+
# =============================
|
| 3 |
+
# Auto-seed from a HF Dataset so the Space launches with content indexed.
|
| 4 |
+
# Called on first launch if AUTO_SEED=true and the default project is empty.
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
from config import SEED_DATASET, SEED_SPLIT, HF_TOKEN, DEFAULT_PROJECT
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def run(project: str = DEFAULT_PROJECT, progress_callback=None) -> tuple[int, str]:
    """Seed a project with images from a HF dataset.

    Loads ``SEED_DATASET`` / ``SEED_SPLIT``, ingests every item that has an
    ``image`` field through ``ingest_image_from_pil`` and, once all items are
    processed, rebuilds the GPU index and persists the store a single time.

    Args:
        project: Name of the project to seed.
        progress_callback: Optional callable invoked after each item as
            ``progress_callback(fraction, desc=...)``.

    Returns:
        ``(count, log_text)`` where ``count`` is the number of images that were
        ingested successfully. If the dataset cannot be loaded the result is
        ``(0, error_message)``.
    """
    from datasets import load_dataset
    from ingest import ingest_image_from_pil
    from vector_store import get_store

    log = [f"Seeding project '{project}' from {SEED_DATASET} [{SEED_SPLIT}]\n"]

    try:
        ds = load_dataset(SEED_DATASET, split=SEED_SPLIT, token=HF_TOKEN or None)
        total = len(ds)
        log.append(f"Loaded {total} items")
    except Exception as e:
        msg = f"Failed to load dataset: {e}"
        logger.error(msg)
        return 0, msg

    count = 0

    for i, item in enumerate(ds):
        image = item.get("image")
        if image is None:
            continue

        # Fall back to a generated name when the field is missing *or* None.
        filename = item.get("filename") or f"seed_{i:05d}.jpg"
        extra = {"source": SEED_DATASET}

        # Grab any available caption as metadata (not used for embedding).
        # Empty lists and None values are skipped so the next candidate key
        # gets a chance, instead of raising IndexError or storing "None".
        for key in ("caption", "sentences", "text"):
            val = item.get(key)
            if isinstance(val, list):
                if not val:
                    continue
                extra["caption_hint"] = val[0]
                break
            if val is not None:
                extra["caption_hint"] = str(val)
                break

        ok, _ = ingest_image_from_pil(image, filename, extra, project=project)
        if ok:
            count += 1
            # Keep the log short: first three items, then every 50th.
            if count <= 3 or count % 50 == 0:
                log.append(f"  [{count}/{total}] {filename}")

        if progress_callback:
            progress_callback((i + 1) / total, desc=f"Seeding {i+1}/{total}...")

    # Rebuild CAGRA once after all images
    store = get_store(project, "image_index")
    if store.has_data():
        store.rebuild_gpu_index()
        store._persist()

    log.append(f"\nSeeding complete: {count} images indexed")
    log.append(f"Store: {store}")
    return count, "\n".join(log)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def is_needed() -> bool:
    """Return True when auto-seeding is enabled and the default project is empty.

    ``AUTO_SEED`` is checked first so that the vector store is not created
    (and possibly loaded from disk or the Hub) when seeding is switched off.
    """
    from config import AUTO_SEED

    if not AUTO_SEED:
        return False

    from vector_store import get_store

    return not get_store(DEFAULT_PROJECT, "image_index").has_data()
|
test_store.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Smoke test for multi-project vector store."""
|
| 2 |
+
import sys

import numpy as np

# Make the Space's modules importable when the script is run from the repo root.
sys.path.insert(0, "HF_Space_hipVS")

from vector_store import get_store, list_projects, VectorStore

# Test multi-project isolation
DIM = 768  # CLIP dim for CPU

store_a = get_store("project-alpha", "image_index")
store_b = get_store("project-beta", "image_index")

# Everything after store creation runs under try/finally so the throw-away
# index files are removed even when an assertion fails.
try:
    vecs_a = np.random.randn(30, DIM).astype(np.float32)
    vecs_b = np.random.randn(50, DIM).astype(np.float32)

    store_a.add(vecs_a, [f"alpha_{i}" for i in range(30)])
    store_b.add(vecs_b, [f"beta_{i}" for i in range(50)])

    print(f"Store A: {store_a}")
    print(f"Store B: {store_b}")

    # Search in A should not return B's vectors
    query = np.random.randn(DIM).astype(np.float32)
    results_a = store_a.search(query, top_k=3)
    results_b = store_b.search(query, top_k=3)

    print(f"Search A: {[r['id'] for r in results_a]}")
    print(f"Search B: {[r['id'] for r in results_b]}")

    # Verify isolation
    assert all("alpha" in r["id"] for r in results_a), "Project A returned non-alpha results!"
    assert all("beta" in r["id"] for r in results_b), "Project B returned non-beta results!"

    # Test append_and_rebuild
    store_a.append_and_rebuild(np.random.randn(DIM).astype(np.float32), "alpha_new", {"test": True})
    print(f"After append_and_rebuild: {store_a}")

    # Test registry caching: the same project/index pair must return the
    # very same object, now holding the appended vector.
    store_c = get_store("project-alpha", "image_index")  # should be cached
    assert store_c.count == 31
    print(f"Cached store same ref: {store_c is store_a}")
    assert store_c is store_a, "get_store did not return the cached instance!"

    # List projects
    print(f"Projects: {list_projects()}")
finally:
    # Cleanup
    store_a.clear()
    store_b.clear()

print("All tests passed")
|
vector_store.py
ADDED
|
@@ -0,0 +1,420 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HF_Space_hipVS/vector_store.py
|
| 2 |
+
# ================================
|
| 3 |
+
# Multi-project vector store with 3-tier GPU acceleration.
|
| 4 |
+
#
|
| 5 |
+
# Key design:
|
| 6 |
+
# - Each project gets its own VectorStore instances (image_index, video_index)
|
| 7 |
+
#   - CAGRA is rebuilt after inserts via add() / append_and_rebuild(); append() defers the rebuild to the caller (optimized for query, not ingestion)
|
| 8 |
+
# - Indexes swap between NVMe and VRAM via async pinned-memory DMA
|
| 9 |
+
# - Multiple projects coexist by LRU-evicting cold indexes from VRAM
|
| 10 |
+
#
|
| 11 |
+
# Tiers:
|
| 12 |
+
#   1. CAGRA graph (hipVS / cuVS) — ANN search in ~50us
|
| 13 |
+
#   2. PyTorch flat tensor (hipBLAS matmul) — brute-force GPU
|
| 14 |
+
#   3. NumPy CPU cosine similarity — works everywhere
|
| 15 |
+
|
| 16 |
+
import json
|
| 17 |
+
import logging
|
| 18 |
+
import threading
|
| 19 |
+
import numpy as np
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
|
| 22 |
+
from config import USE_GPU, HF_TOKEN, HF_DATASET_REPO, SWAP_PATH, get_project_dir
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
# ── GPU Backend Detection ────────────────────────────────────────────────────
|
| 27 |
+
|
| 28 |
+
# Backend flags consulted by VectorStore. Exactly one tier ends up active:
# CAGRA (cuvs) if importable, otherwise a PyTorch GPU tensor, otherwise NumPy.
_HIPVS_AVAILABLE = False
_TORCH_CUDA_AVAILABLE = False
_cagra = None

if USE_GPU:
    # Tier 1: CAGRA through the cuvs Python package.
    try:
        from cuvs.neighbors import cagra as _cagra_mod
    except ImportError as exc:
        # Not an error -- we simply fall through to the next tier.
        logger.debug("cuvs not importable, CAGRA tier disabled: %s", exc)
    else:
        _cagra = _cagra_mod
        _HIPVS_AVAILABLE = True
        logger.info("Tier 1: hipVS (cuvs) -- CAGRA index enabled")

    # Tier 2: flat tensor search on the GPU via PyTorch.
    if not _HIPVS_AVAILABLE:
        try:
            import torch
        except ImportError as exc:
            logger.debug("torch not importable, GPU tensor tier disabled: %s", exc)
        else:
            if torch.cuda.is_available():
                _TORCH_CUDA_AVAILABLE = True
                props = torch.cuda.get_device_properties(0)
                # torch.version.hip is set only on ROCm builds; this is more
                # reliable than guessing the vendor from the device name.
                backend = "ROCm" if getattr(torch.version, "hip", None) else "CUDA"
                logger.info(f"Tier 2: PyTorch {backend} -- flat GPU search ({props.name})")

if not _HIPVS_AVAILABLE and not _TORCH_CUDA_AVAILABLE:
    logger.info("Tier 3: NumPy CPU vector search")
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# ── HF Dataset Persistence ───────────────────────────────────────────────────
|
| 58 |
+
|
| 59 |
+
def _hf_save(name: str, ids: list[str], vectors: np.ndarray, metadata: list[dict]):
    """Push an index snapshot to the private HF dataset ``{HF_DATASET_REPO}-{name}``.

    Each row holds the vector id, the vector as a plain list and the metadata
    dict serialized as a JSON string. Does nothing unless both
    ``HF_DATASET_REPO`` and ``HF_TOKEN`` are set. The push is best effort: any
    failure is logged as a warning and not raised.
    """
    if not (HF_DATASET_REPO and HF_TOKEN):
        return

    try:
        from datasets import Dataset

        rows = []
        for pos, vec_id in enumerate(ids):
            rows.append(
                {
                    "id": vec_id,
                    "vector": vectors[pos].tolist(),
                    "metadata": json.dumps(metadata[pos]),
                }
            )
        snapshot = Dataset.from_list(rows)
        target_repo = f"{HF_DATASET_REPO}-{name}"
        snapshot.push_to_hub(target_repo, token=HF_TOKEN, private=True)
        logger.info(f"[{name}] Pushed {len(rows)} vectors to HF Dataset")
    except Exception as e:
        logger.warning(f"[{name}] HF push failed: {e}")
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _hf_load(name: str):
    """Load the ``train`` split of the HF dataset ``{HF_DATASET_REPO}-{name}``.

    Returns the loaded dataset, or ``None`` when persistence is not configured
    (``HF_DATASET_REPO`` / ``HF_TOKEN`` unset) or the dataset cannot be loaded.
    A missing repo is normal for a brand-new index, so the failure is reported
    at INFO level rather than raised -- but it is no longer swallowed silently,
    which previously hid authentication and network problems.
    """
    if not HF_DATASET_REPO or not HF_TOKEN:
        return None
    try:
        from datasets import load_dataset
        repo = f"{HF_DATASET_REPO}-{name}"
        ds = load_dataset(repo, token=HF_TOKEN, split="train")
        logger.info(f"[{name}] Loaded {len(ds)} vectors from HF Dataset")
        return ds
    except Exception as e:
        logger.info(f"[{name}] No HF Dataset loaded ({type(e).__name__}: {e})")
        return None
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# ── VectorStore ──────────────────────────────────────────────────────────────
|
| 90 |
+
|
| 91 |
+
class VectorStore:
|
| 92 |
+
"""
|
| 93 |
+
GPU-backed vector store with NVMe swap and CAGRA rebuild-on-insert.
|
| 94 |
+
|
| 95 |
+
Lifecycle:
|
| 96 |
+
1. add(vectors, ids, meta) β bulk add + CAGRA rebuild + persist
|
| 97 |
+
2. append(vector, id, meta) β single add, NO rebuild (caller decides)
|
| 98 |
+
3. append_and_rebuild(v, id, meta) β single add + CAGRA rebuild + persist
|
| 99 |
+
4. search(query, top_k) β search (auto-loads from NVMe if needed)
|
| 100 |
+
5. evict() β free VRAM, keep NVMe
|
| 101 |
+
6. restore() β NVMe -> VRAM (async, pinned DMA)
|
| 102 |
+
"""
|
| 103 |
+
|
| 104 |
+
def __init__(self, name: str, index_dir: Path | None = None):
|
| 105 |
+
self.name = name
|
| 106 |
+
self._index_dir = index_dir or SWAP_PATH
|
| 107 |
+
self._index_dir.mkdir(parents=True, exist_ok=True)
|
| 108 |
+
|
| 109 |
+
self._vectors: np.ndarray | None = None
|
| 110 |
+
self._ids: list[str] = []
|
| 111 |
+
self._metadata: list[dict] = []
|
| 112 |
+
|
| 113 |
+
# GPU state
|
| 114 |
+
self._gpu_index = None # CAGRA index object
|
| 115 |
+
self._gpu_vecs = None # torch tensor (flat fallback)
|
| 116 |
+
self._in_vram = False
|
| 117 |
+
|
| 118 |
+
# File paths
|
| 119 |
+
self._npz_file = self._index_dir / f"{name}.npz"
|
| 120 |
+
self._meta_file = self._index_dir / f"{name}_meta.json"
|
| 121 |
+
self._cagra_file = self._index_dir / f"{name}.cagra"
|
| 122 |
+
|
| 123 |
+
# Load from disk on init
|
| 124 |
+
if self._npz_file.exists():
|
| 125 |
+
self._load_from_disk()
|
| 126 |
+
else:
|
| 127 |
+
self._load_from_hf()
|
| 128 |
+
|
| 129 |
+
# ββ Add / Append βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 130 |
+
|
| 131 |
+
def add(self, vectors: np.ndarray, ids: list[str], metadata: list[dict] | None = None):
|
| 132 |
+
"""Bulk add vectors + rebuild CAGRA + persist."""
|
| 133 |
+
if len(vectors) == 0:
|
| 134 |
+
return
|
| 135 |
+
self._vectors = vectors.astype(np.float32)
|
| 136 |
+
self._ids = list(ids)
|
| 137 |
+
self._metadata = metadata or [{} for _ in ids]
|
| 138 |
+
self._normalize()
|
| 139 |
+
self.rebuild_gpu_index()
|
| 140 |
+
self._persist()
|
| 141 |
+
logger.info(f"[{self.name}] Indexed {len(ids)} vectors (mode={self.mode})")
|
| 142 |
+
|
| 143 |
+
def append(self, vector: np.ndarray, vid: str, meta: dict | None = None):
|
| 144 |
+
"""Append one vector. NO CAGRA rebuild (batch callers rebuild at end)."""
|
| 145 |
+
vector = vector.astype(np.float32).reshape(1, -1)
|
| 146 |
+
norm = np.linalg.norm(vector)
|
| 147 |
+
if norm > 0:
|
| 148 |
+
vector = vector / norm
|
| 149 |
+
|
| 150 |
+
if self._vectors is not None and len(self._vectors) > 0:
|
| 151 |
+
self._vectors = np.vstack([self._vectors, vector])
|
| 152 |
+
else:
|
| 153 |
+
self._vectors = vector
|
| 154 |
+
|
| 155 |
+
self._ids.append(vid)
|
| 156 |
+
self._metadata.append(meta or {})
|
| 157 |
+
self._in_vram = False # invalidate GPU index
|
| 158 |
+
|
| 159 |
+
def append_and_rebuild(self, vector: np.ndarray, vid: str, meta: dict | None = None):
|
| 160 |
+
"""Append one vector + rebuild CAGRA + persist."""
|
| 161 |
+
self.append(vector, vid, meta)
|
| 162 |
+
self.rebuild_gpu_index()
|
| 163 |
+
self._persist()
|
| 164 |
+
|
| 165 |
+
# ββ Search βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 166 |
+
|
| 167 |
+
def search(self, query: np.ndarray, top_k: int = 10) -> list[dict]:
|
| 168 |
+
"""
|
| 169 |
+
Cosine similarity search. Auto-restores from NVMe if not in VRAM.
|
| 170 |
+
Returns list of dicts: {id, score, ...metadata}
|
| 171 |
+
"""
|
| 172 |
+
if self._vectors is None or len(self._vectors) == 0:
|
| 173 |
+
return []
|
| 174 |
+
|
| 175 |
+
query = query.astype(np.float32)
|
| 176 |
+
norm = np.linalg.norm(query)
|
| 177 |
+
if norm > 0:
|
| 178 |
+
query = query / norm
|
| 179 |
+
|
| 180 |
+
# Auto-load GPU index if needed
|
| 181 |
+
if ((_HIPVS_AVAILABLE or _TORCH_CUDA_AVAILABLE) and not self._in_vram):
|
| 182 |
+
self.rebuild_gpu_index()
|
| 183 |
+
|
| 184 |
+
if _HIPVS_AVAILABLE and self._gpu_index is not None:
|
| 185 |
+
return self._search_cagra(query, top_k)
|
| 186 |
+
elif _TORCH_CUDA_AVAILABLE and self._in_vram:
|
| 187 |
+
return self._search_torch(query, top_k)
|
| 188 |
+
return self._search_numpy(query, top_k)
|
| 189 |
+
|
| 190 |
+
def _search_numpy(self, query: np.ndarray, top_k: int) -> list[dict]:
|
| 191 |
+
scores = self._vectors @ query
|
| 192 |
+
k = min(top_k, len(self._ids))
|
| 193 |
+
if len(scores) > top_k:
|
| 194 |
+
idx = np.argpartition(scores, -k)[-k:]
|
| 195 |
+
idx = idx[np.argsort(scores[idx])[::-1]]
|
| 196 |
+
else:
|
| 197 |
+
idx = np.argsort(scores)[::-1][:k]
|
| 198 |
+
return [{"id": self._ids[i], "score": float(scores[i]), **self._metadata[i]} for i in idx]
|
| 199 |
+
|
| 200 |
+
def _search_cagra(self, query: np.ndarray, top_k: int) -> list[dict]:
|
| 201 |
+
import cupy as cp
|
| 202 |
+
q = cp.asarray(query.reshape(1, -1))
|
| 203 |
+
search_params = _cagra.SearchParams()
|
| 204 |
+
distances, indices = _cagra.search(search_params, self._gpu_index, q, top_k)
|
| 205 |
+
results = []
|
| 206 |
+
for idx, dist in zip(indices[0].get().tolist(), distances[0].get().tolist()):
|
| 207 |
+
if 0 <= idx < len(self._ids):
|
| 208 |
+
results.append({"id": self._ids[idx], "score": -float(dist), **self._metadata[idx]})
|
| 209 |
+
return results
|
| 210 |
+
|
| 211 |
+
def _search_torch(self, query: np.ndarray, top_k: int) -> list[dict]:
|
| 212 |
+
import torch
|
| 213 |
+
q = torch.from_numpy(query).to(self._gpu_vecs.device, dtype=self._gpu_vecs.dtype).unsqueeze(0)
|
| 214 |
+
scores = (q @ self._gpu_vecs.T).squeeze(0)
|
| 215 |
+
k = min(top_k, len(self._ids))
|
| 216 |
+
top_scores, top_idx = torch.topk(scores, k=k)
|
| 217 |
+
return [
|
| 218 |
+
{"id": self._ids[i], "score": float(s), **self._metadata[i]}
|
| 219 |
+
for i, s in zip(top_idx.cpu().tolist(), top_scores.cpu().tolist())
|
| 220 |
+
]
|
| 221 |
+
|
| 222 |
+
# ββ GPU Index Build (CAGRA rebuilt on every insert) ββββββββββββββββββββββ
|
| 223 |
+
|
| 224 |
+
def rebuild_gpu_index(self):
|
| 225 |
+
"""Build/rebuild the GPU index from current vectors."""
|
| 226 |
+
if self._vectors is None or len(self._vectors) == 0:
|
| 227 |
+
return
|
| 228 |
+
if _HIPVS_AVAILABLE:
|
| 229 |
+
self._build_cagra()
|
| 230 |
+
elif _TORCH_CUDA_AVAILABLE:
|
| 231 |
+
self._build_torch()
|
| 232 |
+
|
| 233 |
+
def _build_cagra(self):
|
| 234 |
+
import cupy as cp
|
| 235 |
+
d_vecs = cp.asarray(self._vectors)
|
| 236 |
+
params = _cagra.IndexParams()
|
| 237 |
+
params.metric = "sqeuclidean"
|
| 238 |
+
params.graph_degree = 64
|
| 239 |
+
params.intermediate_graph_degree = 128
|
| 240 |
+
params.build_algo = "IVF_PQ"
|
| 241 |
+
logger.info(f"[{self.name}] Building CAGRA ({self._vectors.shape}) ...")
|
| 242 |
+
self._gpu_index = _cagra.build(params, d_vecs)
|
| 243 |
+
# Serialize to NVMe for fast restore after eviction
|
| 244 |
+
_cagra.serialize(str(self._cagra_file), self._gpu_index)
|
| 245 |
+
self._in_vram = True
|
| 246 |
+
logger.info(f"[{self.name}] CAGRA built + serialized")
|
| 247 |
+
|
| 248 |
+
def _build_torch(self):
|
| 249 |
+
import torch
|
| 250 |
+
self._gpu_vecs = torch.from_numpy(self._vectors).cuda().half()
|
| 251 |
+
self._in_vram = True
|
| 252 |
+
|
| 253 |
+
# ββ NVMe <-> VRAM Swap βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 254 |
+
|
| 255 |
+
def evict(self):
|
| 256 |
+
"""Free VRAM. NVMe files stay intact for fast restore()."""
|
| 257 |
+
if not self._in_vram:
|
| 258 |
+
return
|
| 259 |
+
self._gpu_index = None
|
| 260 |
+
self._gpu_vecs = None
|
| 261 |
+
if _HIPVS_AVAILABLE or _TORCH_CUDA_AVAILABLE:
|
| 262 |
+
import torch
|
| 263 |
+
torch.cuda.empty_cache()
|
| 264 |
+
self._in_vram = False
|
| 265 |
+
logger.info(f"[{self.name}] Evicted from VRAM")
|
| 266 |
+
|
| 267 |
+
def restore(self):
|
| 268 |
+
"""
|
| 269 |
+
Restore index from NVMe to VRAM via async pinned-memory copy.
|
| 270 |
+
Does NOT re-embed or re-read source files.
|
| 271 |
+
"""
|
| 272 |
+
if self._in_vram:
|
| 273 |
+
return
|
| 274 |
+
|
| 275 |
+
if _HIPVS_AVAILABLE and self._cagra_file.exists():
|
| 276 |
+
logger.info(f"[{self.name}] Restoring CAGRA from NVMe (async) ...")
|
| 277 |
+
self._gpu_index = _cagra.deserialize(str(self._cagra_file))
|
| 278 |
+
self._in_vram = True
|
| 279 |
+
logger.info(f"[{self.name}] CAGRA restored to VRAM")
|
| 280 |
+
elif _TORCH_CUDA_AVAILABLE and self._vectors is not None:
|
| 281 |
+
import torch
|
| 282 |
+
# Pinned memory -> VRAM (async DMA copy)
|
| 283 |
+
pinned = torch.from_numpy(self._vectors).pin_memory()
|
| 284 |
+
self._gpu_vecs = pinned.to("cuda", non_blocking=True, dtype=torch.float16)
|
| 285 |
+
self._in_vram = True
|
| 286 |
+
logger.info(f"[{self.name}] Flat tensor restored to VRAM (async)")
|
| 287 |
+
|
| 288 |
+
# Load IDs if needed
|
| 289 |
+
if not self._ids and self._npz_file.exists():
|
| 290 |
+
data = np.load(self._npz_file, allow_pickle=True)
|
| 291 |
+
self._ids = data["ids"].tolist()
|
| 292 |
+
if self._meta_file.exists():
|
| 293 |
+
with open(self._meta_file, "r") as f:
|
| 294 |
+
self._metadata = json.load(f)
|
| 295 |
+
|
| 296 |
+
# ββ Persistence ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 297 |
+
|
| 298 |
+
def _persist(self):
|
| 299 |
+
self._save_to_disk()
|
| 300 |
+
if HF_DATASET_REPO and HF_TOKEN:
|
| 301 |
+
_hf_save(self.name, self._ids, self._vectors, self._metadata)
|
| 302 |
+
|
| 303 |
+
def _save_to_disk(self):
|
| 304 |
+
if self._vectors is None:
|
| 305 |
+
return
|
| 306 |
+
np.savez_compressed(self._npz_file, vectors=self._vectors, ids=np.array(self._ids, dtype=object))
|
| 307 |
+
with open(self._meta_file, "w") as f:
|
| 308 |
+
json.dump(self._metadata, f)
|
| 309 |
+
|
| 310 |
+
def _load_from_disk(self):
|
| 311 |
+
try:
|
| 312 |
+
data = np.load(self._npz_file, allow_pickle=True)
|
| 313 |
+
self._vectors = data["vectors"].astype(np.float32)
|
| 314 |
+
self._ids = data["ids"].tolist()
|
| 315 |
+
if self._meta_file.exists():
|
| 316 |
+
with open(self._meta_file, "r") as f:
|
| 317 |
+
self._metadata = json.load(f)
|
| 318 |
+
else:
|
| 319 |
+
self._metadata = [{} for _ in self._ids]
|
| 320 |
+
logger.info(f"[{self.name}] Loaded {len(self._ids)} vectors from disk")
|
| 321 |
+
except Exception as e:
|
| 322 |
+
logger.error(f"[{self.name}] Disk load failed: {e}")
|
| 323 |
+
|
| 324 |
+
def _load_from_hf(self):
|
| 325 |
+
ds = _hf_load(self.name)
|
| 326 |
+
if ds is None or len(ds) == 0:
|
| 327 |
+
return
|
| 328 |
+
try:
|
| 329 |
+
self._ids = ds["id"]
|
| 330 |
+
self._vectors = np.array(ds["vector"], dtype=np.float32)
|
| 331 |
+
self._metadata = [json.loads(m) for m in ds["metadata"]]
|
| 332 |
+
self._save_to_disk()
|
| 333 |
+
except Exception as e:
|
| 334 |
+
logger.error(f"[{self.name}] HF load failed: {e}")
|
| 335 |
+
|
| 336 |
+
def _normalize(self):
|
| 337 |
+
if self._vectors is None:
|
| 338 |
+
return
|
| 339 |
+
norms = np.linalg.norm(self._vectors, axis=1, keepdims=True)
|
| 340 |
+
norms = np.where(norms == 0, 1, norms)
|
| 341 |
+
self._vectors = self._vectors / norms
|
| 342 |
+
|
| 343 |
+
# ββ Utilities ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 344 |
+
|
| 345 |
+
def clear(self):
|
| 346 |
+
self._vectors = None
|
| 347 |
+
self._ids = []
|
| 348 |
+
self._metadata = []
|
| 349 |
+
self._gpu_index = None
|
| 350 |
+
self._gpu_vecs = None
|
| 351 |
+
self._in_vram = False
|
| 352 |
+
for f in (self._npz_file, self._meta_file, self._cagra_file):
|
| 353 |
+
if f.exists():
|
| 354 |
+
f.unlink()
|
| 355 |
+
|
| 356 |
+
def has_data(self) -> bool:
|
| 357 |
+
return self._vectors is not None and len(self._ids) > 0
|
| 358 |
+
|
| 359 |
+
@property
|
| 360 |
+
def count(self) -> int:
|
| 361 |
+
return len(self._ids) if self._ids else 0
|
| 362 |
+
|
| 363 |
+
@property
|
| 364 |
+
def in_vram(self) -> bool:
|
| 365 |
+
return self._in_vram
|
| 366 |
+
|
| 367 |
+
@property
|
| 368 |
+
def mode(self) -> str:
|
| 369 |
+
if _HIPVS_AVAILABLE:
|
| 370 |
+
return "CAGRA (hipVS GPU)"
|
| 371 |
+
elif _TORCH_CUDA_AVAILABLE:
|
| 372 |
+
return "Flat Tensor (GPU)"
|
| 373 |
+
return "NumPy (CPU)"
|
| 374 |
+
|
| 375 |
+
def __len__(self):
|
| 376 |
+
return self.count
|
| 377 |
+
|
| 378 |
+
def __repr__(self):
|
| 379 |
+
vram = "VRAM" if self._in_vram else "NVMe"
|
| 380 |
+
return f"VectorStore('{self.name}', n={self.count}, {self.mode}, {vram})"
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
# ── Multi-Project Store Registry ─────────────────────────────────────────────
|
| 384 |
+
|
| 385 |
+
# Process-wide cache of stores keyed by "<project>/<index_name>", guarded by _lock.
_stores: dict[str, VectorStore] = {}
_lock = threading.Lock()


def get_store(project: str, index_name: str) -> VectorStore:
    """
    Return the VectorStore for a project + index pair, creating it on first use.

    Instances are cached in the module-level registry, so repeated calls with
    the same arguments return the same object. New stores keep their files in
    the project's "indexes" sub-directory.
    """
    cache_key = f"{project}/{index_name}"
    with _lock:
        try:
            return _stores[cache_key]
        except KeyError:
            pass

        index_root = get_project_dir(project) / "indexes"
        _stores[cache_key] = VectorStore(index_name, index_dir=index_root)
        logger.info(f"Store created: {_stores[cache_key]}")
        return _stores[cache_key]
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
def list_projects() -> list[str]:
    """Return the names of all sub-directories of ``PROJECTS_DIR``, in sorted order.

    Every directory counts as a project, whether or not it contains index
    files. An empty list is returned when ``PROJECTS_DIR`` does not exist.
    """
    from config import PROJECTS_DIR

    if not PROJECTS_DIR.exists():
        return []
    return [entry.name for entry in sorted(PROJECTS_DIR.iterdir()) if entry.is_dir()]
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
def evict_all():
    """Evict every cached store that is currently marked as resident in VRAM.

    The registry lock is held for the whole sweep.
    """
    with _lock:
        resident = [cached for cached in _stores.values() if cached.in_vram]
        for cached in resident:
            cached.evict()
|