Spaces:
Runtime error
Runtime error
Commit
·
fca155a
0
Parent(s):
first commit
Browse files- .gitignore +27 -0
- README.md +42 -0
- check_environment.py +30 -0
- packages.txt +3 -0
- requirements.txt +21 -0
- scripts/download_model.py +51 -0
- src/app.py +259 -0
- src/config/settings.py +29 -0
- src/core/graph.py +179 -0
- src/core/orchestrator.py +25 -0
- src/interfaces/base.py +47 -0
- src/main.py +60 -0
- src/memory/manager.py +75 -0
- src/memory/vector_index.py +86 -0
- src/perception/engine.py +156 -0
- src/perception/scout.py +72 -0
- src/utils/video.py +72 -0
- tests/test_graph.py +27 -0
- tests/test_memory.py +59 -0
- tests/test_orchestrator.py +51 -0
- tests/test_perception.py +78 -0
- tests/verify_pipeline.py +82 -0
.gitignore
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Documentation & Tasks (Excluded)
|
| 2 |
+
ACTION_PLAN_TASK_*.md
|
| 3 |
+
ARCHITECTURE_*.md
|
| 4 |
+
PROGRESS_CHECKPOINT.md
|
| 5 |
+
PROJECT_OVERVIEW.md
|
| 6 |
+
RESTART_CONTEXT.md
|
| 7 |
+
TECHNICAL_GUIDELINES.md
|
| 8 |
+
project.txt
|
| 9 |
+
|
| 10 |
+
# Python
|
| 11 |
+
__pycache__/
|
| 12 |
+
*.pyc
|
| 13 |
+
venv/
|
| 14 |
+
.pytest_cache/
|
| 15 |
+
|
| 16 |
+
# Data & Models (Generated at runtime or too large)
|
| 17 |
+
models/*.gguf
|
| 18 |
+
data/*.mp4
|
| 19 |
+
data/*.idx
|
| 20 |
+
data/*.json
|
| 21 |
+
data/temp*
|
| 22 |
+
data/metadata/
|
| 23 |
+
|
| 24 |
+
# Environment & Config
|
| 25 |
+
.env
|
| 26 |
+
.streamlit/secrets.toml
|
| 27 |
+
.gemini/
|
README.md
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Visual Scout AI
|
| 3 |
+
emoji: 🦅
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: 1.30.0
|
| 8 |
+
app_file: src/app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
models:
|
| 11 |
+
- Qwen/Qwen2-VL-2B-Instruct
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# Visual Scout: Agentic Video Understanding
|
| 15 |
+
|
| 16 |
+
An agentic AI system that watches videos, builds a semantic index, and answers natural language questions using **Qwen2-VL**.
|
| 17 |
+
|
| 18 |
+
## 🚀 How to Run Locally
|
| 19 |
+
|
| 20 |
+
1. **Install Dependencies:**
|
| 21 |
+
```bash
|
| 22 |
+
pip install -r requirements.txt
|
| 23 |
+
```
|
| 24 |
+
|
| 25 |
+
2. **Download Model:**
|
| 26 |
+
```bash
|
| 27 |
+
python scripts/download_model.py
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
3. **Run App:**
|
| 31 |
+
```bash
|
| 32 |
+
streamlit run src/app.py
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
## ☁️ Deployment (Hugging Face Spaces)
|
| 36 |
+
|
| 37 |
+
This repository is configured for immediate deployment on Hugging Face Spaces.
|
| 38 |
+
|
| 39 |
+
1. Create a new Space on [Hugging Face](https://huggingface.co/spaces).
|
| 40 |
+
2. Select **Streamlit** as the SDK.
|
| 41 |
+
3. Connect this Git repository.
|
| 42 |
+
4. The system will automatically build using `requirements.txt`.
|
check_environment.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import shutil
|
| 4 |
+
|
| 5 |
+
def check_gpu():
    """Probe for an NVIDIA driver by locating and running nvidia-smi.

    Prints one status line per outcome; returns None. The tool's own output
    (GPU table) is inherited onto stdout when it runs.
    """
    import subprocess  # local import: keeps the module's top-level imports unchanged

    print("\n--- GPU Check (via nvidia-smi) ---")
    if shutil.which("nvidia-smi"):
        # subprocess.run exposes the real exit code; os.system's return value
        # is a platform-encoded wait status on POSIX, not the exit code itself.
        ret = subprocess.run(["nvidia-smi"]).returncode
        if ret == 0:
            print("✅ NVIDIA Driver detected.")
        else:
            print("⚠️ nvidia-smi found but returned error.")
    else:
        print("❌ nvidia-smi not found. CUDA might not be in PATH.")
| 15 |
+
|
| 16 |
+
def check_llama_cuda():
    """Verify llama-cpp-python is importable; exit with status 1 if not."""
    print("\n--- Llama.cpp CUDA Check ---")
    try:
        # The import itself is the check; `Llama` is otherwise unused here.
        from llama_cpp import Llama
        print("✅ llama-cpp-python is installed.")
        # Report where the package was resolved from (useful to spot a wrong venv).
        print(f"Llama.cpp package location: {sys.modules['llama_cpp'].__file__}")
    except ImportError:
        print("❌ llama-cpp-python is NOT installed.")
        sys.exit(1)
|
| 25 |
+
|
| 26 |
+
if __name__ == "__main__":
    # Run both environment probes, reporting the interpreter version first.
    print(f"Python Version: {sys.version}")
    check_gpu()
    check_llama_cuda()  # exits with status 1 if llama-cpp-python is missing
    print("\nEnvironment check complete.")
|
packages.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ffmpeg
|
| 2 |
+
libsm6
|
| 3 |
+
libxext6
|
requirements.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core AI & Inference
|
| 2 |
+
llama-cpp-python>=0.2.82 # The engine for Qwen2-VL
|
| 3 |
+
numpy>=1.24.0 # Array manipulation
|
| 4 |
+
opencv-python-headless>=4.8.0 # Video processing (headless for server/CLI environments)
|
| 5 |
+
|
| 6 |
+
# Utility & CLI
|
| 7 |
+
rich>=13.0.0 # Beautiful terminal output
|
| 8 |
+
pydantic>=2.0.0 # Data validation and settings management
|
| 9 |
+
Pillow>=10.0.0 # Image handling
|
| 10 |
+
|
| 11 |
+
# Development & Testing
|
| 12 |
+
pytest>=7.0.0 # Testing framework
|
| 13 |
+
black>=23.0.0 # Code formatter (for dev use)
|
| 14 |
+
huggingface_hub>=0.19.0
|
| 15 |
+
langgraph>=0.0.10
|
| 16 |
+
langchain>=0.1.0
|
| 17 |
+
langchain-core>=0.1.0
|
| 18 |
+
streamlit>=1.30.0
|
| 19 |
+
sentence-transformers>=2.2.2
|
| 20 |
+
scikit-learn>=1.3.0
|
| 21 |
+
decord>=0.6.0
|
scripts/download_model.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from huggingface_hub import list_repo_files, hf_hub_download
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
REPO_ID = "bartowski/Qwen2-VL-2B-Instruct-GGUF"
|
| 5 |
+
MODEL_DIR = "models"
|
| 6 |
+
|
| 7 |
+
def list_files():
    """Return every file name published in the Hugging Face model repo."""
    print(f"Inspecting repo: {REPO_ID}")
    return list_repo_files(repo_id=REPO_ID)
|
| 11 |
+
|
| 12 |
+
def download_file(filename):
    """Download one file from the model repo into MODEL_DIR.

    Args:
        filename: Repo-relative name of the file to fetch.

    The original status messages printed the literal placeholder "(unknown)"
    instead of the file being downloaded; both now report ``filename``.
    """
    print(f"Downloading {filename}...")
    hf_hub_download(
        repo_id=REPO_ID,
        filename=filename,
        local_dir=MODEL_DIR,
        # NOTE: deprecated and ignored by recent huggingface_hub releases;
        # kept for compatibility with older installs.
        local_dir_use_symlinks=False
    )
    print(f"✅ Saved to {MODEL_DIR}/{filename}")
|
| 21 |
+
|
| 22 |
+
if __name__ == "__main__":
    # Ensure the destination directory exists before any download starts.
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    files = list_files()

    # 1. Find the best Quantization (Q4_K_M is the sweet spot)
    target_quant = "q4_k_m.gguf"
    model_file = next((f for f in files if target_quant in f.lower()), None)

    if model_file:
        print(f"Found model: {model_file}")
        # Skip the (large) download if the file is already on disk.
        if not os.path.exists(os.path.join(MODEL_DIR, model_file)):
            download_file(model_file)
        else:
            print("✅ Model already exists.")
    else:
        print("❌ Could not find Q4_K_M model file.")

    # 2. Look for mmproj (Vision Adapter) if it exists separately
    # Qwen2-VL GGUFs usually embed it, but let's check for 'mmproj' just in case.
    mmproj_file = next((f for f in files if "mmproj" in f.lower()), None)
    if mmproj_file:
        print(f"Found projector: {mmproj_file}")
        if not os.path.exists(os.path.join(MODEL_DIR, mmproj_file)):
            download_file(mmproj_file)
        else:
            print("✅ Projector already exists.")
    else:
        print("ℹ️ No separate mmproj file found (likely embedded or not needed for this repo).")
|
src/app.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import shutil
|
| 3 |
+
import time
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
import cv2
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from huggingface_hub import hf_hub_download
|
| 9 |
+
|
| 10 |
+
# Add project root to python path
|
| 11 |
+
sys.path.append(os.getcwd())
|
| 12 |
+
|
| 13 |
+
# Internal Modules
|
| 14 |
+
from src.perception.engine import Qwen2PerceptionEngine
|
| 15 |
+
from src.perception.scout import VisualScout
|
| 16 |
+
from src.memory.manager import SimpleMemoryManager
|
| 17 |
+
from src.memory.vector_index import VectorIndex
|
| 18 |
+
from src.core.orchestrator import VideoAgent
|
| 19 |
+
from src.config.settings import settings
|
| 20 |
+
from src.utils.video import extract_frames_decord
|
| 21 |
+
|
| 22 |
+
# --- PAGE CONFIGURATION ---
|
| 23 |
+
st.set_page_config(
|
| 24 |
+
page_title="Visual Scout AI",
|
| 25 |
+
page_icon="🦅",
|
| 26 |
+
layout="wide",
|
| 27 |
+
initial_sidebar_state="expanded"
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
# --- SYSTEM SETUP ---
|
| 31 |
+
|
| 32 |
+
def ensure_models_exist():
    """
    Checks if the AI models are present.
    If not (first run or cloud deploy), it downloads them automatically.
    """
    REPO_ID = "bartowski/Qwen2-VL-2B-Instruct-GGUF"
    MODEL_FILENAME = "Qwen2-VL-2B-Instruct-Q4_K_M.gguf"
    VISION_ADAPTER_FILENAME = "Qwen2-VL-2B-Instruct-f16-mmproj.gguf"

    if not settings.paths.models_dir.exists():
        settings.paths.models_dir.mkdir(parents=True)

    model_path = settings.paths.models_dir / MODEL_FILENAME
    adapter_path = settings.paths.models_dir / VISION_ADAPTER_FILENAME

    # If either file is missing, trigger download
    if not model_path.exists() or not adapter_path.exists():
        with st.spinner("📥 Performing First-Time Setup: Downloading AI Models..."):
            if not model_path.exists():
                st.toast("Downloading Main Model (1.5GB)...")
                hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME, local_dir=settings.paths.models_dir)

            if not adapter_path.exists():
                st.toast("Downloading Vision Adapter...")
                try:
                    hf_hub_download(repo_id=REPO_ID, filename=VISION_ADAPTER_FILENAME, local_dir=settings.paths.models_dir)
                except Exception:
                    # Deliberate best-effort: some GGUF repos embed the vision
                    # projector in the main model file, so a missing adapter
                    # download is not necessarily fatal.
                    st.warning("Could not download specific adapter. Trying to proceed...")

        st.success("Models Ready!")
|
| 62 |
+
|
| 63 |
+
@st.cache_resource
def initialize_system():
    """
    Loads the heavy AI models once and caches them.

    Returns:
        Tuple of (perception_engine, visual_scout, memory_manager, video_agent).
        Cached by Streamlit across reruns via @st.cache_resource.
    """
    ensure_models_exist()

    print("🚀 System Startup: Initializing AI Engines...")

    # 1. The Analyst (High Intelligence, GPU)
    perception_engine = Qwen2PerceptionEngine()
    try:
        perception_engine.load_model(settings.paths.model_path)
    except Exception as error:
        # Surface the failure in the UI and halt this script run entirely —
        # nothing downstream works without the perception model.
        st.error(f"Critical Error Loading AI: {error}")
        st.stop()

    # 2. The Scout (Fast Search, CPU)
    visual_scout = VisualScout()

    # 3. The Memory Manager
    memory_manager = SimpleMemoryManager(storage_dir=settings.paths.data_dir / "metadata")

    # 4. The Agent Orchestrator
    video_agent = VideoAgent(perception_engine, memory_manager)

    return perception_engine, visual_scout, memory_manager, video_agent
|
| 90 |
+
|
| 91 |
+
# Load the system
perception_engine, visual_scout, memory_manager, video_agent = initialize_system()

# --- SIDEBAR ---
st.sidebar.title("1. Upload Video")
uploaded_file = st.sidebar.file_uploader("Select a video file", type=["mp4", "avi", "mov"])

# --- SESSION STATE INITIALIZATION ---
if "active_video_id" not in st.session_state:
    st.session_state.active_video_id = None
if "is_video_processed" not in st.session_state:
    st.session_state.is_video_processed = False
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
if "visual_memory" not in st.session_state:
    st.session_state.visual_memory = None
if "text_memory" not in st.session_state:
    st.session_state.text_memory = None

# --- MAIN UI ---
st.title("🦅 Visual Scout")
st.caption("Agentic Video Understanding System")

if uploaded_file is not None:
    # Generate a simple ID from the filename
    current_video_id = uploaded_file.name.split(".")[0]

    # Detect if a new video was uploaded
    if st.session_state.active_video_id != current_video_id:
        st.session_state.active_video_id = current_video_id
        st.session_state.is_video_processed = False
        st.session_state.chat_history = []
        st.session_state.visual_memory = None
        st.session_state.text_memory = None

        # Save the file locally
        # NOTE(review): indentation inferred during reconstruction — assumed to
        # run once per newly uploaded video; confirm against original source.
        local_video_path = settings.paths.data_dir / f"{current_video_id}.mp4"
        with open(local_video_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.toast(f"Video '{current_video_id}' loaded.")

    # --- PROCESSING PIPELINE ---
    if not st.session_state.is_video_processed:
        st.divider()
        st.header("🧠 Analyzing Video Content")
        st.info("The agent is watching the video to build a semantic index. This happens once per video.")

        video_path = settings.paths.data_dir / f"{current_video_id}.mp4"

        # Initialize Memory Indices
        visual_index_path = settings.paths.data_dir / f"{current_video_id}.visual.idx"
        text_index_path = settings.paths.data_dir / f"{current_video_id}.text.idx"

        visual_memory_index = VectorIndex(visual_index_path)
        text_memory_index = VectorIndex(text_index_path)

        # Store in session
        st.session_state.visual_memory = visual_memory_index
        st.session_state.text_memory = text_memory_index

        memory_manager.initialize_storage(current_video_id)

        with st.status("🦅 Scout: Scanning video timeline...", expanded=True) as status:

            # Step 1: Extract Frames
            status.write("Extracting frames at 1 FPS...")
            raw_frames = list(extract_frames_decord(video_path, fps=1.0))

            # Step 2: Semantic Cuts
            status.write(f"Analyzing {len(raw_frames)} frames for scene changes...")
            key_events = visual_scout.detect_semantic_changes(raw_frames, sensitivity=0.90)

            # Index Visuals
            for timestamp, frame in raw_frames:
                embedding = visual_scout.embed_image(frame)
                visual_memory_index.add(timestamp, embedding)
            visual_memory_index.save()

            status.write(f"Detected {len(key_events)} key semantic events.")

            # Step 3: Deep Captioning
            progress_bar = st.progress(0)
            event_log = []

            ANALYSIS_PROMPT = """Analyze this scene.
1. Describe the main action and subject.
2. Note any text or signs visible.
3. Describe the environment."""

            for i, (timestamp, frame) in enumerate(key_events):
                status.write(f"👁️ Analyst: Describing Event {i+1} at {timestamp:.1f}s...")

                # Save temp frame for the VLM
                temp_frame_path = settings.paths.data_dir / "temp_scene.jpg"
                # Frames arrive RGB; OpenCV writes BGR, hence the conversion.
                cv2.imwrite(str(temp_frame_path), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

                description = perception_engine.analyze_frame(str(temp_frame_path), ANALYSIS_PROMPT)

                time_string = time.strftime('%M:%S', time.gmtime(timestamp))
                memory_manager.commit_event(current_video_id, time_string, description, {})
                event_log.append(f"**{time_string}**: {description}")

                # Index Text (for semantic search later)
                text_embedding = visual_scout.embed_text(description)
                text_memory_index.add(timestamp, text_embedding, extra_data={"text": description})

                progress_bar.progress((i + 1) / len(key_events))

            text_memory_index.save()

            # Step 4: Summary
            status.write("📝 Writing Global Summary...")
            full_timeline_text = "\n".join(event_log)
            summary_prompt = f"""<|im_start|>system
You are a video editor. Read the timeline below and write a concise summary of the entire video.
TIMELINE:
{full_timeline_text}
<|im_end|>
<|im_start|>assistant
"""
            summary = perception_engine.generate_text(summary_prompt, stop=["<|im_end|>"])
            memory_manager.save_summary(current_video_id, summary)

            status.update(label="✅ Analysis Complete!", state="complete", expanded=False)

        st.session_state.is_video_processed = True
        st.success("Video Index Ready.")
        st.markdown(f"**Summary:** {summary}")

        with st.expander("See Detailed Timeline"):
            for event in event_log:
                st.write(event)

        # Rerun to switch to Chat Mode cleanly
        st.rerun()

    # --- CHAT INTERFACE ---
    else:
        st.divider()

        # Display History
        for message in st.session_state.chat_history:
            with st.chat_message(message["role"]):
                st.write(message["content"])

        # Chat Input
        if user_query := st.chat_input("Ask about the video..."):

            # Add User Message
            st.session_state.chat_history.append({"role": "user", "content": user_query})
            with st.chat_message("user"):
                st.write(user_query)

            # Generate Answer
            with st.chat_message("assistant"):
                with st.spinner("Agent is thinking..."):

                    # Inject Tools/Context into the Agent
                    video_agent.context = {
                        "scout": visual_scout,
                        "vis_index": st.session_state.visual_memory,
                        "txt_index": st.session_state.text_memory
                    }

                    response_text = video_agent.ask(user_query, st.session_state.active_video_id)
                    st.write(response_text)

            # Add Assistant Message
            st.session_state.chat_history.append({"role": "assistant", "content": response_text})
|
src/config/settings.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from pydantic import BaseModel, Field
|
| 3 |
+
|
| 4 |
+
class ProjectPaths(BaseModel):
    """Defines the standard file paths for the project."""
    # Repo root: three levels up from src/config/settings.py.
    root: Path = Path(__file__).parent.parent.parent
    # models_dir/data_dir recompute the same root via default_factory so each
    # instance gets a fresh Path object.
    models_dir: Path = Field(default_factory=lambda: Path(__file__).parent.parent.parent / "models")
    data_dir: Path = Field(default_factory=lambda: Path(__file__).parent.parent.parent / "data")

    @property
    def model_path(self) -> Path:
        # Default model name
        return self.models_dir / "Qwen2-VL-2B-Instruct-Q4_K_M.gguf"
|
| 14 |
+
|
| 15 |
+
class PerceptionSettings(BaseModel):
    """Tunable parameters for the vision system."""
    frame_interval: int = 2  # Process 1 frame every X seconds
    ssim_threshold: float = 0.90  # Similarity threshold to skip frames (0.0 - 1.0)
|
| 19 |
+
|
| 20 |
+
class Config(BaseModel):
    """Global Application Configuration."""
    paths: ProjectPaths = Field(default_factory=ProjectPaths)
    perception: PerceptionSettings = Field(default_factory=PerceptionSettings)

    class Config:
        # pydantic v1-style inner config class (shares the outer class's name
        # only by coincidence); lets fields hold non-pydantic types like Path.
        arbitrary_types_allowed = True

# Singleton instance
settings = Config()
|
src/core/graph.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langgraph.graph import StateGraph, END
|
| 2 |
+
from typing import TypedDict, Annotated, Sequence, Dict, Any, List
|
| 3 |
+
from langchain_core.messages import BaseMessage
|
| 4 |
+
import operator
|
| 5 |
+
|
| 6 |
+
class AgentState(TypedDict):
    """
    Represents the 'Brain State' of the agent as it thinks.
    This dictionary is passed between all the nodes in the graph.
    """
    user_query: str  # Raw question from the user
    video_id: str  # Which indexed video the question is about
    plan: str  # Planner's chosen strategy (currently always "SEARCH")
    search_topic: str  # Topic the retriever embeds and searches for
    target_timestamps: List[float]  # Moments (seconds) for the analyst to inspect
    observations: Annotated[List[str], operator.add] # 'add' means new observations are appended, not overwritten
    final_answer: str  # Synthesizer's answer returned to the caller
    context: Dict[str, Any] # Holds references to the Search Index and Tools
|
| 19 |
+
|
| 20 |
+
def create_agent_graph(perception_engine, memory_manager):
    """
    Builds the Decision Graph (The 'Flowchart' of the AI).

    Compiles a linear LangGraph pipeline:
    planner -> retriever -> analyst -> synthesizer -> END.
    `perception_engine` is captured by the analyst/synthesizer closures;
    `memory_manager` is accepted for interface symmetry but not read here.
    """

    # --- NODE 1: PLANNER ---
    def planner_node(state: AgentState):
        query = state["user_query"]
        print(f"🤖 Planner: Receiving query -> '{query}'")
        # For now, we assume every query requires a search.
        # Future improvement: Distinguish between 'Summary' and 'Specific Search'.
        return {
            "plan": "SEARCH",
            "search_topic": query,
            "observations": []
        }

    # --- NODE 2: RETRIEVER (The Librarian) ---
    def retriever_node(state: AgentState):
        """
        Searches both Text (Metadata) and Vision (CLIP) indices.
        """
        video_id = state["video_id"]
        search_topic = state["search_topic"]

        # Unpack tools from context
        scout = state["context"]["scout"]
        visual_memory = state["context"]["vis_index"]
        text_memory = state["context"]["txt_index"]

        found_observations = []
        timestamps_to_investigate = []

        print(f"📚 Retriever: Looking up '{search_topic}'...")
        query_vector = scout.embed_text(search_topic)

        # A. Search Semantic Text Memory (Captions we generated earlier)
        if text_memory:
            # Find top 3 text matches
            text_matches = text_memory.search(query_vector, top_k=3)

            # Filter matches that are actually relevant (Score > 0.35)
            relevant_text_matches = [match for match in text_matches if match[1] > 0.35]

            if relevant_text_matches:
                print(f" ✅ Found {len(relevant_text_matches)} relevant text records.")

                # Note: ideally we'd fetch the exact text from the index metadata here.
                # For this implementation, we rely on the generic system memory or
                # we accept the timestamp and let the Analyst re-verify.
                # Let's map these timestamps to potential investigation points.
                for timestamp, score in relevant_text_matches:
                    timestamps_to_investigate.append(timestamp)
                    found_observations.append(f"Memory Hint: Something relevant might be at {timestamp:.1f}s (Confidence: {score:.2f})")

            else:
                print(" ⚠️ No strong text matches found. Switching to Visual Search.")

        # B. Visual Fallback (If text failed, or to double-check)
        # We look for frames that *look* like the user's query
        if not found_observations and visual_memory:
            visual_matches = visual_memory.search(query_vector, top_k=3)

            # Visual similarity needs a lower threshold usually
            valid_visual_matches = [match for match in visual_matches if match[1] > 0.22]

            if valid_visual_matches:
                found_timestamps = [match[0] for match in valid_visual_matches]
                print(f" 🦅 Visual Scout suggests checking times: {found_timestamps}")
                timestamps_to_investigate.extend(found_timestamps)
            else:
                found_observations.append("No direct visual matches found.")

        # Remove duplicates and sort
        unique_timestamps = sorted(list(set(timestamps_to_investigate)))

        return {
            "observations": found_observations,
            "target_timestamps": unique_timestamps
        }

    # --- NODE 3: ANALYST (The Eyes) ---
    def analyst_node(state: AgentState):
        """
        Visits the specific timestamps found by the Retriever and looks closely.
        """
        video_id = state["video_id"]
        timestamps = state["target_timestamps"]
        search_topic = state["search_topic"]
        new_findings = []

        if not timestamps:
            return {"observations": ["Analyst: I have nowhere to look."]}

        print(f"👁️ Analyst: Zooming in on {len(timestamps)} moments...")

        # We give the Vision Model a very specific task
        verification_prompt = f"Look specifically for '{search_topic}'. If you see it, describe it in detail. If not, say 'Not visible'."

        for time_point in timestamps:
            # Inspect a ~1 second window starting at each candidate moment.
            description = perception_engine.analyze_video_segment(
                video_path=f"data/{video_id}.mp4",
                start_time=time_point,
                end_time=time_point + 1.0,
                prompt=verification_prompt
            )

            log_entry = f"Visual Inspection at {time_point:.1f}s: {description}"
            new_findings.append(log_entry)
            print(f" > {log_entry}")

        return {"observations": new_findings}

    # --- NODE 4: SYNTHESIZER (The Speaker) ---
    def synthesizer_node(state: AgentState):
        """
        Compiles all observations into a final natural language answer.
        """
        user_query = state["user_query"]
        all_evidence = state["observations"]

        if not all_evidence:
            return {"final_answer": "I'm sorry, I couldn't find any information about that in the video."}

        evidence_text = "\n".join(all_evidence)

        system_prompt = f"""<|im_start|>system
You are a helpful video assistant.
Answer the user's question based strictly on the evidence below.
If the evidence contradicts itself, trust the 'Visual Inspection' over the 'Memory Hint'.

EVIDENCE COLLECTED:
{evidence_text}
<|im_end|>
<|im_start|>user
{user_query}
<|im_end|>
<|im_start|>assistant
"""
        # We use the raw text generation here for direct control
        answer = perception_engine.generate_text(system_prompt, stop=["<|im_end|>"])
        return {"final_answer": answer}

    # --- GRAPH CONSTRUCTION ---
    workflow = StateGraph(AgentState)

    # Add Nodes
    workflow.add_node("planner", planner_node)
    workflow.add_node("retriever", retriever_node)
    workflow.add_node("analyst", analyst_node)
    workflow.add_node("synthesizer", synthesizer_node)

    # Define Edges (The Flow)
    workflow.set_entry_point("planner")
    workflow.add_edge("planner", "retriever")
    workflow.add_edge("retriever", "analyst")
    workflow.add_edge("analyst", "synthesizer")
    workflow.add_edge("synthesizer", END)

    return workflow.compile()
|
src/core/orchestrator.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.interfaces.base import PerceptionEngine, MemoryManager
|
| 2 |
+
from src.core.graph import create_agent_graph
|
| 3 |
+
|
| 4 |
+
class VideoAgent:
    """Thin facade over the compiled LangGraph pipeline.

    One `ask()` call runs the full planner -> retriever -> analyst ->
    synthesizer pipeline for a single question about one indexed video.
    """

    def __init__(self, perception: "PerceptionEngine", memory: "MemoryManager"):
        self.graph = create_agent_graph(perception, memory)
        self.context = {}  # Scout/Index injection (set by the UI before ask())

    def ask(self, question: str, video_id: str) -> str:
        """
        Runs the Linear Pipeline and returns the final answer text.

        BUGFIX: the initial state keys now match AgentState in src.core.graph
        ('user_query', 'search_topic', 'target_timestamps'). The previous keys
        ('query', 'search_term', 'timestamps') were never read by any node, so
        planner_node raised KeyError on state["user_query"] at runtime.
        """
        initial_state = {
            "user_query": question,
            "video_id": video_id,
            "plan": "",
            "search_topic": "",
            "target_timestamps": [],
            "observations": [],
            "final_answer": "",
            "context": self.context
        }

        final_state = self.graph.invoke(initial_state)
        return final_state["final_answer"]
|
src/interfaces/base.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
from typing import List, Dict, Optional, Any
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
class PerceptionEngine(ABC):
    """Abstract Base Class for the Visual Perception System.

    Implementations wrap a vision-language model and expose both
    image/video analysis and plain text generation for the agent.
    """

    @abstractmethod
    def load_model(self, model_path: Path) -> None:
        """Load the model weights from *model_path* into memory."""
        pass

    @abstractmethod
    def analyze_frame(self, frame_path: str, prompt: str) -> str:
        """Describe the image file at *frame_path* according to *prompt*."""
        pass

    @abstractmethod
    def analyze_video_segment(self, video_path: Path, start_time: float, end_time: float, prompt: str) -> str:
        """Analyze the [start_time, end_time] window (seconds) of a video file."""
        pass

    @abstractmethod
    def chat(self, messages: List[Dict[str, str]]) -> str:
        """General purpose chat completion (for the Agent brain)."""
        pass

    @abstractmethod
    def generate_text(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Raw text completion for advanced prompting."""
        pass
|
| 29 |
+
|
| 30 |
+
class MemoryManager(ABC):
    """Abstract Base Class for the Knowledge Graph.

    Implementations persist per-video events and summaries and support
    lookups over them.
    """

    @abstractmethod
    def initialize_storage(self, video_id: str) -> None:
        """Create (or reset) the backing store for *video_id*."""
        pass

    @abstractmethod
    def commit_event(self, video_id: str, timestamp: str, description: str, metadata: Dict[str, Any]) -> None:
        """Append one timestamped observation to the video's timeline."""
        pass

    @abstractmethod
    def query_knowledge(self, video_id: str, query: str) -> List[Dict[str, Any]]:
        """Return stored events matching *query* (matching strategy is implementation-defined)."""
        pass

    @abstractmethod
    def get_summary(self, video_id: str) -> str:
        """Return the stored global summary for *video_id*."""
        pass
pass
|
src/main.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from rich.console import Console
|
| 4 |
+
from rich.prompt import Prompt
|
| 5 |
+
|
| 6 |
+
from src.perception.engine import Qwen2PerceptionEngine
|
| 7 |
+
from src.memory.manager import SimpleMemoryManager
|
| 8 |
+
from src.core.orchestrator import VideoAgent
|
| 9 |
+
from src.config.settings import settings
|
| 10 |
+
|
| 11 |
+
console = Console()
|
| 12 |
+
|
| 13 |
+
def main():
    """Console entry point: load models, open the test video, then answer
    user questions in an interactive loop."""
    console.rule("[bold blue]Agentic Video Understanding System[/]")

    # 1. Initialize Components (model loading is the slow, GPU-bound part)
    with console.status("[bold green]Loading AI Models... (This uses GPU)[/]"):
        perception = Qwen2PerceptionEngine()
        # Pre-load to avoid delay on first query
        try:
            perception.load_model(settings.paths.model_path)
        except Exception as e:
            console.print(f"[bold red]Critical Error:[/] Failed to load Qwen2-VL. {e}")
            console.print("Ensure you ran 'python scripts/download_model.py'")
            sys.exit(1)

    memory = SimpleMemoryManager(storage_dir=settings.paths.data_dir / "metadata")
    agent = VideoAgent(perception, memory)

    # 2. Select Video
    video_id = "test_video"
    video_path = settings.paths.data_dir / f"{video_id}.mp4"
    if not video_path.exists():
        console.print(f"[yellow]Warning:[/] Video {video_path} not found.")
        console.print("Please place a video at data/test_video.mp4")
        return

    # 3. Initialize Memory for this video
    memory.initialize_storage(video_id)
    console.print(f"[green]Video Loaded:[/] {video_id}.mp4")

    # 4. Interactive Loop
    while True:
        console.print()
        query = Prompt.ask("[bold cyan]Ask a question[/] (or 'exit')")

        if query.lower() in {"exit", "quit", "q"}:
            break

        with console.status("[bold yellow]Agent Thinking...[/]"):
            try:
                answer = agent.ask(query, video_id)
                console.print(f"[bold white]Answer:[/] {answer}")
            except Exception as e:
                console.print(f"[bold red]Error:[/] {e}")
|
| 58 |
+
|
| 59 |
+
if __name__ == "__main__":
    # Run the interactive CLI only when executed directly, not on import.
    main()
|
src/memory/manager.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.interfaces.base import MemoryManager
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import List, Dict, Any
|
| 4 |
+
import json
|
| 5 |
+
|
| 6 |
+
class SimpleMemoryManager(MemoryManager):
    """
    A lightweight file-based memory manager using JSON.
    Perfect for single-video sessions.

    Each video gets one ``<video_id>_metadata.json`` file under
    ``storage_dir`` holding its event timeline, entities and summary.
    """

    def __init__(self, storage_dir: Path):
        self.storage_dir = storage_dir
        # exist_ok avoids the TOCTOU race of an exists() check followed by mkdir().
        self.storage_dir.mkdir(parents=True, exist_ok=True)

    def _get_file_path(self, video_id: str) -> Path:
        """Location of the metadata file for *video_id*."""
        return self.storage_dir / f"{video_id}_metadata.json"

    def _load_data(self, video_id: str) -> Dict[str, Any]:
        """Read the metadata file; raises FileNotFoundError if the video was never initialized."""
        path = self._get_file_path(video_id)
        if not path.exists():
            raise FileNotFoundError(f"Metadata not found for {video_id}")
        # Explicit UTF-8 so descriptions survive on platforms whose default
        # text encoding is not UTF-8 (e.g. Windows cp1252).
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _save_data(self, video_id: str, data: Dict[str, Any]) -> None:
        """Overwrite the metadata file for *video_id* with *data*."""
        path = self._get_file_path(video_id)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def initialize_storage(self, video_id: str) -> None:
        """Sets up the storage structure for a new video.

        Overwrites any previous metadata stored under the same id.
        """
        data = {
            "video_id": video_id,
            "events": [],
            "entities": {},
            "summary": ""
        }
        self._save_data(video_id, data)

    def commit_event(self, video_id: str, timestamp: str, description: str, metadata: Dict[str, Any]) -> None:
        """Saves a new event to the timeline (load, append, rewrite)."""
        data = self._load_data(video_id)

        event = {
            "timestamp": timestamp,
            "description": description,
            "metadata": metadata
        }

        data["events"].append(event)
        self._save_data(video_id, data)

    def query_knowledge(self, video_id: str, query: str) -> List[Dict[str, Any]]:
        """Case-insensitive substring search over event descriptions (no ranking)."""
        data = self._load_data(video_id)
        query_lower = query.lower()
        return [
            event for event in data["events"]
            if query_lower in event["description"].lower()
        ]

    def get_summary(self, video_id: str) -> str:
        """Return the stored global summary ('' if none was saved yet)."""
        data = self._load_data(video_id)
        return data.get("summary", "")

    def save_summary(self, video_id: str, summary_text: str) -> None:
        """Updates the global summary for the video."""
        data = self._load_data(video_id)
        data["summary"] = summary_text
        self._save_data(video_id, data)
self._save_data(video_id, data)
|
src/memory/vector_index.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import List, Tuple, Dict, Any, Optional
|
| 4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 5 |
+
import pickle
|
| 6 |
+
|
| 7 |
+
class VectorIndex:
    """
    In-memory Vector Database.

    This acts as the 'Long Term Memory' for visual concepts.
    It maps a Timestamp (when something happened) to a Vector (what it looked like).

    Vectors are L2-normalized on insertion, so similarity search is a single
    matrix-vector product (true cosine similarity, no extra dependency).
    """

    def __init__(self, index_file_path: Path):
        self.file_path = index_file_path
        self.timestamps: List[float] = []
        self.embedding_matrix: Optional[np.ndarray] = None
        self.metadata_store: List[Dict[str, Any]] = []

        # Load existing index if available
        if self.file_path.exists():
            self.load()

    def add(self, timestamp_seconds: float, vector: np.ndarray, extra_data: Optional[Dict[str, Any]] = None):
        """Adds a new memory entry (timestamp + vector + optional metadata)."""
        self.timestamps.append(timestamp_seconds)
        self.metadata_store.append(extra_data or {})

        # Normalize the vector to length 1 so that cosine similarity later
        # reduces to a plain dot product.
        vector = np.ravel(vector)
        vector_norm = np.linalg.norm(vector)
        if vector_norm > 0:
            vector = vector / vector_norm

        if self.embedding_matrix is None:
            self.embedding_matrix = vector.reshape(1, -1)
        else:
            self.embedding_matrix = np.vstack([self.embedding_matrix, vector])

    def search(self, query_vector: np.ndarray, top_k: int = 5) -> List[Tuple[float, float]]:
        """
        Finds the moments in the video that are most similar to the query.

        Returns:
            A list of tuples: (timestamp_seconds, similarity_score),
            best match first. Empty list if the index is empty.
        """
        if self.embedding_matrix is None:
            return []

        # Normalize the query too (accepting flat or (1, d) shaped input).
        query_vector = np.ravel(query_vector)
        query_norm = np.linalg.norm(query_vector)
        if query_norm > 0:
            query_vector = query_vector / query_norm

        # Rows are already unit-length, so one matrix-vector product yields
        # the cosine similarity against ALL stored memories at once.
        # (The previous sklearn cosine_similarity call re-normalized every
        # stored row on every query — redundant work and a redundant dependency.)
        similarity_scores = self.embedding_matrix @ query_vector

        # Sort by highest score first
        best_indices = np.argsort(similarity_scores)[::-1][:top_k]

        return [
            (self.timestamps[index], float(similarity_scores[index]))
            for index in best_indices
        ]

    def save(self):
        """Persists the index to the disk using Pickle."""
        data_packet = {
            "timestamps": self.timestamps,
            "vectors": self.embedding_matrix,
            "metadata": self.metadata_store
        }
        with open(self.file_path, "wb") as f:
            pickle.dump(data_packet, f)

    def load(self):
        """Loads the index from disk.

        SECURITY NOTE: pickle.load executes arbitrary code from the file;
        only load index files this application wrote itself.
        """
        with open(self.file_path, "rb") as f:
            data_packet = pickle.load(f)
        self.timestamps = data_packet["timestamps"]
        self.embedding_matrix = data_packet["vectors"]
        self.metadata_store = data_packet.get("metadata", [])
|
src/perception/engine.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Optional, List, Dict
|
| 4 |
+
import base64
|
| 5 |
+
|
| 6 |
+
# Third-party imports
|
| 7 |
+
from llama_cpp import Llama
|
| 8 |
+
from llama_cpp.llama_chat_format import Llava15ChatHandler
|
| 9 |
+
import cv2
|
| 10 |
+
|
| 11 |
+
# Local imports
|
| 12 |
+
from src.interfaces.base import PerceptionEngine
|
| 13 |
+
from src.config.settings import settings
|
| 14 |
+
|
| 15 |
+
class Qwen2PerceptionEngine(PerceptionEngine):
    """
    The 'Eyes' of the system.

    This class wraps the Qwen2-VL (Vision-Language) model running via llama.cpp.
    It handles loading the heavy GPU weights and formatting images for the AI to 'see'.

    Parameter names deliberately match the PerceptionEngine ABC so keyword
    calls written against the interface (e.g. generate_text(p, stop=[...]))
    work against this implementation.
    """

    def __init__(self):
        # Lazy loading: stays None (no RAM/VRAM used) until first needed.
        self._vision_language_model: Optional[Llama] = None

    def _find_vision_adapter(self) -> Path:
        """
        Locates the 'mmproj' file (Multimedia Projector).
        This file acts as a translator between the Image Encoder and the Language Model.
        """
        candidates = list(settings.paths.models_dir.glob("*mmproj*.gguf"))
        if not candidates:
            raise FileNotFoundError("Critical: Could not find the vision adapter (mmproj) in models/ directory.")
        return candidates[0]

    def load_model(self, model_path: Path) -> None:
        """Loads the AI model into GPU memory (no-op if already loaded)."""
        if self._vision_language_model is not None:
            return  # Already loaded

        print(f"Loading Qwen2-VL from {model_path}...")

        try:
            # The ChatHandler takes care of the complex CLIP image processing
            vision_handler = Llava15ChatHandler(clip_model_path=str(self._find_vision_adapter()))

            self._vision_language_model = Llama(
                model_path=str(model_path),
                chat_handler=vision_handler,
                n_ctx=2048,        # Context Window (how much text/image data it can hold)
                n_gpu_layers=-1,   # -1 means "Put everything on the GPU"
                n_batch=512,
                verbose=False      # Keep logs clean
            )
            print("✅ Vision Model loaded successfully on GPU.")
        except Exception as error:
            print(f"❌ Failed to load model: {error}")
            raise

    def _convert_image_to_base64(self, local_image_path: str) -> str:
        """Reads an image file and encodes it as a string for the API."""
        with open(local_image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def analyze_frame(self, frame_path: str, prompt: str) -> str:
        """
        Main Vision Function: Looks at a single image and answers a prompt.

        BUG FIX: parameter renamed from 'user_prompt' to 'prompt' to match
        the PerceptionEngine ABC, so keyword callers don't break.
        """
        if self._vision_language_model is None:
            self.load_model(settings.paths.model_path)

        # Create the data URI that the model expects
        image_uri = f"data:image/jpeg;base64,{self._convert_image_to_base64(frame_path)}"

        # Construct the conversation history
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_uri}},
                    {"type": "text", "text": prompt}
                ]
            }
        ]

        # Ask the model
        response = self._vision_language_model.create_chat_completion(
            messages=conversation,
            max_tokens=256,   # Limit response length to avoid rambling
            temperature=0.3   # Low temperature = More factual, less creative
        )

        return response["choices"][0]["message"]["content"]

    def analyze_video_segment(self, video_path: Path, start_time: float, end_time: float, prompt: str) -> str:
        """
        Analyzes a specific time range in the video.
        Currently extracts the middle frame of that segment.
        """
        # 1. Open the video file
        video_capture = cv2.VideoCapture(str(video_path))
        fps = video_capture.get(cv2.CAP_PROP_FPS)

        # 2. Jump to the middle of the requested segment
        middle_timestamp = (start_time + end_time) / 2
        target_frame_number = int(middle_timestamp * fps)

        video_capture.set(cv2.CAP_PROP_POS_FRAMES, target_frame_number)
        success, video_frame = video_capture.read()
        video_capture.release()

        if not success:
            return "Error: Could not read video frame at this timestamp."

        # 3. Save a temporary snapshot to disk (Model reads from disk)
        temp_snapshot_path = settings.paths.data_dir / "temp_analysis_frame.jpg"
        # exist_ok avoids a race between the exists() check and mkdir()
        temp_snapshot_path.parent.mkdir(parents=True, exist_ok=True)
        cv2.imwrite(str(temp_snapshot_path), video_frame)

        # 4. Perform the analysis
        return self.analyze_frame(str(temp_snapshot_path), prompt)

    def chat(self, messages: List[Dict[str, str]], stop: Optional[List[str]] = None) -> str:
        """Standard text-only chat (for reasoning without new images).

        *stop* is a backward-compatible extension over the ABC signature.
        """
        if self._vision_language_model is None:
            self.load_model(settings.paths.model_path)

        response = self._vision_language_model.create_chat_completion(
            messages=messages,
            max_tokens=512,
            temperature=0.7,
            stop=stop
        )
        return response["choices"][0]["message"]["content"]

    def generate_text(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """
        Raw text completion.
        Useful when we want strict control over the output format (like standardizing a summary).

        BUG FIX: parameters renamed to 'prompt'/'stop' to match the ABC.
        graph.py calls generate_text(system_prompt, stop=["<|im_end|>"]),
        which raised TypeError against the old 'stop_sequences' keyword.
        """
        if self._vision_language_model is None:
            self.load_model(settings.paths.model_path)

        response = self._vision_language_model.create_completion(
            prompt=prompt,
            max_tokens=512,
            temperature=0.7,
            stop=stop
        )
        return response["choices"][0]["text"]
|
src/perception/scout.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer
|
| 2 |
+
from PIL import Image
|
| 3 |
+
import numpy as np
|
| 4 |
+
from typing import Union, List, Tuple
|
| 5 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
+
|
| 7 |
+
class VisualScout:
    """
    The 'Scout' Agent.

    Fast, lightweight semantic analysis of the video. CLIP (a Vision
    Transformer) turns frames into vectors so the content can be searched
    numerically without invoking a heavy LLM.
    """

    def __init__(self, model_name: str = "clip-ViT-B-32"):
        print(f"Initializing Visual Scout with model: {model_name}...")
        # We use CPU here to save the GPU VRAM for the main Chat Model (Qwen)
        self.embedding_model = SentenceTransformer(model_name, device="cpu")

    def embed_image(self, image_data: Union[np.ndarray, Image.Image]) -> np.ndarray:
        """Converts a single video frame into a 512-dimensional vector."""
        # OpenCV hands us numpy arrays; the model wants PIL images.
        pil_image = Image.fromarray(image_data) if isinstance(image_data, np.ndarray) else image_data
        return self.embedding_model.encode(pil_image)

    def embed_text(self, search_text: str) -> np.ndarray:
        """Converts a user's search query into a vector for comparison."""
        return self.embedding_model.encode(search_text)

    def detect_semantic_changes(self, video_frames: List[Tuple[float, np.ndarray]], sensitivity: float = 0.85) -> List[Tuple[float, np.ndarray]]:
        """
        Condense the frame list into 'Scenes'.

        Adjacent frames are compared by vector similarity; when similarity
        drops below *sensitivity*, the frame is kept as a new event.
        """
        if not video_frames:
            return []

        print(f"🦅 Scout: Analyzing {len(video_frames)} frames for scene changes...")

        # Batch-encode every frame at once — far faster than per-frame calls.
        as_pil = [Image.fromarray(pixels) for _, pixels in video_frames]
        vectors = self.embedding_model.encode(as_pil, batch_size=32, show_progress_bar=True)

        # The very first frame always starts the event list.
        key_events = [video_frames[0]]
        reference = vectors[0].reshape(1, -1)

        for position in range(1, len(vectors)):
            candidate = vectors[position].reshape(1, -1)

            # Similarity of this frame to the last kept frame (0.0 to 1.0)
            score = cosine_similarity(reference, candidate)[0][0]

            if score < sensitivity:
                stamp = video_frames[position][0]
                print(f" ✂️ New Scene detected at {stamp:.1f}s (Similarity: {score:.2f})")
                key_events.append(video_frames[position])
                reference = candidate

        print(f"🦅 Scout: Condensed video into {len(key_events)} key semantic events.")
        return key_events
|
src/utils/video.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import numpy as np
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Generator, Tuple, List
|
| 5 |
+
import decord
|
| 6 |
+
from decord import VideoReader, cpu
|
| 7 |
+
|
| 8 |
+
# Fix decord seed to avoid warnings
|
| 9 |
+
decord.bridge.set_bridge('torch')
|
| 10 |
+
|
| 11 |
+
def extract_frames_decord(video_path: Path, fps: float = 1.0) -> Generator[Tuple[float, np.ndarray], None, None]:
    """Efficiently extracts frames from a video using Decord.

    Yields (timestamp_seconds, frame) pairs sampled at roughly *fps*
    frames per second.

    Raises:
        FileNotFoundError: if *video_path* does not exist.
    """
    if not video_path.exists():
        raise FileNotFoundError(f"Video not found: {video_path}")

    vr = VideoReader(str(video_path), ctx=cpu(0))
    original_fps = vr.get_avg_fps()

    # Sample every `step`-th frame to approximate the requested fps.
    step = max(1, int(original_fps / fps))
    indices = list(range(0, len(vr), step))

    # Batch extraction keeps decord's decoder efficient.
    batch_size = 32
    for i in range(0, len(indices), batch_size):
        batch_indices = indices[i : i + batch_size]
        batch = vr.get_batch(batch_indices)

        # BUG FIX: this module sets decord's bridge to 'torch', so get_batch
        # returns a torch.Tensor, which has .numpy(), NOT .asnumpy() — the
        # old unconditional .asnumpy() call crashed with AttributeError.
        # Support both bridges.
        if hasattr(batch, "asnumpy"):
            frames = batch.asnumpy()
        else:
            frames = batch.numpy()

        for j, frame in enumerate(frames):
            idx = batch_indices[j]
            timestamp = idx / original_fps
            yield timestamp, frame
|
| 35 |
+
|
| 36 |
+
def calculate_ssim_simplified(img1: np.ndarray, img2: np.ndarray) -> float:
    """Calculates a simple structural similarity score (MSE based).

    Returns a value in (0, 1]; 1.0 means identical grayscale content.
    *img2* is resized to *img1*'s dimensions if the shapes differ.
    """
    if img1.shape != img2.shape:
        img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0]))

    g1 = cv2.cvtColor(img1, cv2.COLOR_RGB2GRAY)
    g2 = cv2.cvtColor(img2, cv2.COLOR_RGB2GRAY)

    # BUG FIX: cvtColor returns uint8 arrays; subtracting them wraps around
    # (e.g. 0 - 255 == 1 in uint8), corrupting the MSE. Promote to float
    # before taking the difference.
    diff = g1.astype(np.float64) - g2.astype(np.float64)
    mse = np.mean(diff ** 2)

    if mse == 0:
        return 1.0
    return 1.0 / (1.0 + (mse / 1000.0))
|
| 47 |
+
|
| 48 |
+
def extract_key_scenes(video_path: Path, threshold: float = 0.85) -> List[Tuple[float, np.ndarray]]:
    """
    Extracts ONLY significant scene changes (Keyframes).
    Reduces 60 frames -> 5-10 keyframes.
    """
    print("🎬 Detecting Scenes...")
    selected: List[Tuple[float, np.ndarray]] = []
    reference_frame = None

    # Scan at 1 FPS; keep a frame when it no longer resembles the last kept one.
    for stamp, frame in extract_frames_decord(video_path, fps=1.0):
        if reference_frame is None or calculate_ssim_simplified(reference_frame, frame) < threshold:
            selected.append((stamp, frame))
            reference_frame = frame

    print(f"🎬 Found {len(selected)} unique scenes.")
    return selected
|
tests/test_graph.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from unittest.mock import MagicMock
|
| 3 |
+
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
|
| 4 |
+
from src.core.graph import create_agent_graph
|
| 5 |
+
|
| 6 |
+
class TestAgentGraph:
|
| 7 |
+
def test_graph_structure(self):
|
| 8 |
+
"""Test that the graph compiles and has expected nodes."""
|
| 9 |
+
# We pass mocks for dependencies
|
| 10 |
+
mock_perception = MagicMock()
|
| 11 |
+
mock_memory = MagicMock()
|
| 12 |
+
|
| 13 |
+
app = create_agent_graph(mock_perception, mock_memory)
|
| 14 |
+
|
| 15 |
+
# Check basic graph properties (it's a CompiledGraph)
|
| 16 |
+
assert app is not None
|
| 17 |
+
# We can't easily inspect nodes on the compiled object without internal access,
|
| 18 |
+
# but successful compilation means the structure is valid.
|
| 19 |
+
|
| 20 |
+
def test_tool_node_execution(self):
|
| 21 |
+
"""
|
| 22 |
+
We can't easily mock the entire LangGraph execution loop in a unit test
|
| 23 |
+
without spinning up the full runtime.
|
| 24 |
+
Instead, we will test the 'Tools' individually here to ensure they
|
| 25 |
+
interface with the Graph correctly.
|
| 26 |
+
"""
|
| 27 |
+
pass
|
tests/test_memory.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from src.memory.manager import SimpleMemoryManager
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
class TestMemoryManager:

    def test_initialize_storage(self, tmp_path):
        """A fresh video gets an empty metadata structure on disk."""
        # pytest's tmp_path fixture gives isolated file IO per test.
        store = SimpleMemoryManager(storage_dir=tmp_path)
        vid = "test_video_01"

        store.initialize_storage(vid)

        metadata_file = tmp_path / f"{vid}_metadata.json"
        assert metadata_file.exists()

        payload = json.loads(metadata_file.read_text())
        assert payload["video_id"] == vid
        assert payload["events"] == []
        assert payload["entities"] == {}

    def test_commit_event(self, tmp_path):
        """Committed events are appended to the persisted timeline."""
        store = SimpleMemoryManager(storage_dir=tmp_path)
        vid = "test_video_01"
        store.initialize_storage(vid)

        store.commit_event(
            video_id=vid,
            timestamp="00:05",
            description="Man walks dog",
            metadata={"objects": ["man", "dog"]}
        )

        payload = json.loads((tmp_path / f"{vid}_metadata.json").read_text())
        assert len(payload["events"]) == 1
        assert payload["events"][0]["description"] == "Man walks dog"
        assert payload["events"][0]["timestamp"] == "00:05"

    def test_query_knowledge(self, tmp_path):
        """Keyword search returns only the matching events."""
        store = SimpleMemoryManager(storage_dir=tmp_path)
        vid = "test_video_01"
        store.initialize_storage(vid)

        store.commit_event(vid, "00:01", "Car drives by", {})
        store.commit_event(vid, "00:05", "Man walks dog", {})

        hits = store.query_knowledge(vid, "dog")
        assert len(hits) == 1
        assert hits[0]["description"] == "Man walks dog"
|
tests/test_orchestrator.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from unittest.mock import MagicMock
|
| 3 |
+
from src.core.orchestrator import VideoAgent
|
| 4 |
+
from src.interfaces.base import PerceptionEngine, MemoryManager
|
| 5 |
+
|
| 6 |
+
class TestVideoAgent:
    """Unit tests for the VideoAgent orchestration logic (all deps mocked)."""

    def test_agent_answers_from_memory(self):
        """If memory has the answer, it returns it without watching video."""
        # Setup
        mock_perception = MagicMock(spec=PerceptionEngine)
        mock_memory = MagicMock(spec=MemoryManager)

        # Memory returns a hit
        mock_memory.query_knowledge.return_value = [
            {"timestamp": "00:05", "description": "The dog is jumping."}
        ]

        agent = VideoAgent(perception=mock_perception, memory=mock_memory)

        # Act
        response = agent.ask("What is the dog doing?", video_id="test_vid")

        # Assert
        assert "jumping" in response
        # Should NOT call vision engine because memory had it
        mock_perception.analyze_video_segment.assert_not_called()

    def test_agent_uses_perception_if_memory_fails(self):
        """If memory is empty, the agent falls back to the vision pipeline."""
        # Setup
        mock_perception = MagicMock(spec=PerceptionEngine)
        mock_memory = MagicMock(spec=MemoryManager)

        # Memory miss
        mock_memory.query_knowledge.return_value = []

        # Vision engine returns info
        mock_perception.analyze_video_segment.return_value = "The car is red."

        agent = VideoAgent(perception=mock_perception, memory=mock_memory)

        # Act
        response = agent.ask("What color is the car?", video_id="test_vid")

        # Assert
        # BUG FIX: the original test ended in `pass` and asserted nothing, so it
        # could never fail. The exact segment-selection strategy on a memory
        # miss is implementation-defined, so we pin only the invariants:
        # the agent must have consulted memory first, and must produce an answer.
        mock_memory.query_knowledge.assert_called_once()
        assert response is not None
|
tests/test_perception.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from unittest.mock import MagicMock, patch, ANY
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import sys
|
| 5 |
+
|
| 6 |
+
# 1. Install stub modules for llama_cpp BEFORE the engine import below, so the
#    engine module binds to these mocks instead of the real (GPU-heavy) bindings.
mock_llama = MagicMock()
mock_chat_handler = MagicMock()
_chat_format_stub = MagicMock()
_chat_format_stub.Llava15ChatHandler = mock_chat_handler
sys.modules["llama_cpp"] = mock_llama
sys.modules["llama_cpp.llama_chat_format"] = _chat_format_stub
|
| 12 |
+
|
| 13 |
+
from src.perception.engine import Qwen2PerceptionEngine
|
| 14 |
+
from src.config.settings import settings
|
| 15 |
+
|
| 16 |
+
class TestPerceptionEngine:
    """Tests for Qwen2PerceptionEngine.

    Relies on the module-level stubbing of ``llama_cpp`` (``mock_llama`` /
    ``mock_chat_handler``) performed before the engine import, so no real
    model binaries are needed.
    """

    @patch("src.perception.engine.settings")
    def test_load_model_calls_llama_correctly(self, mock_settings):
        """Test that load_model initializes the Llama class with GPU settings."""
        # Setup
        engine = Qwen2PerceptionEngine()
        fake_path = Path("/tmp/fake_model.gguf")
        fake_projector = Path("/tmp/mmproj.gguf")

        # Mock the glob search for the projector
        mock_settings.paths.models_dir.glob.return_value = [fake_projector]

        # Act
        engine.load_model(fake_path)

        # Assert
        # 1. Check if it looked for the projector
        mock_settings.paths.models_dir.glob.assert_called()

        # 2. Check if ChatHandler was initialized
        mock_chat_handler.assert_called_with(clip_model_path=str(fake_projector))

        # 3. Check if Llama was instantiated with GPU layers
        mock_llama.Llama.assert_called_with(
            model_path=str(fake_path),
            chat_handler=ANY,  # The instance of chat handler
            n_ctx=2048,
            n_gpu_layers=-1,  # Important: Must be -1 for full GPU
            n_batch=512,
            verbose=False
        )

    def test_analyze_frame_structure(self):
        """Test that analyze_frame constructs the correct message format for Qwen."""
        engine = Qwen2PerceptionEngine()

        # Mock the internal Llama model
        engine._model = MagicMock()
        engine._model.create_chat_completion.return_value = {
            "choices": [{"message": {"content": "A dog on a bike"}}]
        }

        # Mock image loader to avoid file IO.
        # NOTE(review): patching the module-level ``open`` assumes the engine
        # reads the frame via a plain ``with open(...)`` — confirm in engine.py.
        with patch("src.perception.engine.open", create=True) as mock_open:
            mock_open.return_value.__enter__.return_value.read.return_value = b"fake_image_bytes"

            result = engine.analyze_frame("test.jpg", "Describe this")

        # Assert
        assert result == "A dog on a bike"

        # Verify the prompt structure
        calls = engine._model.create_chat_completion.call_args
        messages = calls.kwargs['messages']

        assert messages[0]['role'] == 'system'
        assert messages[1]['role'] == 'user'
        # Check content list (Image + Text)
        content = messages[1]['content']
        assert content[0]['type'] == 'image_url'
        assert content[1]['type'] == 'text'
        assert content[1]['text'] == 'Describe this'
|
tests/verify_pipeline.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
from unittest.mock import MagicMock
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
# Stub out heavy native/third-party dependencies before the src imports run,
# so the pipeline can be exercised without models or GPU libraries installed.
for _heavy_module in (
    "sentence_transformers",
    "llama_cpp",
    "llama_cpp.llama_chat_format",
    "decord",
):
    sys.modules[_heavy_module] = MagicMock()
|
| 10 |
+
|
| 11 |
+
# Import core modules
|
| 12 |
+
from src.core.orchestrator import VideoAgent
|
| 13 |
+
from src.memory.vector_index import VectorIndex
|
| 14 |
+
from src.interfaces.base import PerceptionEngine, MemoryManager
|
| 15 |
+
|
| 16 |
+
def test_pipeline_flow():
    """End-to-end smoke test of the agent graph with all heavy deps mocked.

    Verifies two flows:
      1. SEARCH — a targeted query embeds the text, hits the vector index,
         and analyzes the matched segment.
      2. SUMMARY — a summarization query skips the search and samples
         multiple segments instead.
    """
    print("🧪 Starting Pipeline Verification Test...")

    # 1. Setup Mocks
    mock_perception = MagicMock(spec=PerceptionEngine)
    # Synthesizer output. BUG FIX: the original used a one-item side_effect
    # list; reset_mock() does not clear side_effect, so the Summary flow's
    # second generate_text call would raise StopIteration. Use return_value.
    mock_perception.generate_text.return_value = "The man is walking."
    # Mock vision analysis
    mock_perception.analyze_video_segment.return_value = "A man walking on the street."

    mock_memory = MagicMock(spec=MemoryManager)

    # Mock Context (Scout + Index)
    mock_scout = MagicMock()
    mock_scout.embed_text.return_value = np.array([0.1, 0.2])

    mock_index = MagicMock(spec=VectorIndex)
    mock_index.timestamps = [0.0, 5.0, 10.0]  # Dummy data
    mock_index.search.return_value = [(5.0, 0.95)]  # Found match at 5s

    # 2. Initialize Agent
    # BUG FIX: the original passed the undefined name `perception` here,
    # raising NameError before any verification ran.
    agent = VideoAgent(mock_perception, mock_memory)
    agent.context = {
        "scout": mock_scout,
        "index": mock_index
    }

    # 3. Execute "Search" Flow
    print("  🔹 Testing Search Query: 'Find the man'")
    response = agent.ask("Find the man", "video_id")

    print(f"  ✅ Response: {response}")

    # Verifications
    # Planner should have chosen SEARCH (implied by calling scout)
    mock_scout.embed_text.assert_called_once()
    mock_index.search.assert_called_once()
    # Analyst should have been called for timestamp 5.0
    mock_perception.analyze_video_segment.assert_called()
    args = mock_perception.analyze_video_segment.call_args[1]
    assert args['start_time'] == 5.0

    print("  🎉 Search Pipeline Verified!")

    # 4. Execute "Summary" Flow
    print("  🔹 Testing Summary Query: 'Summarize the video'")
    # Reset mocks
    mock_perception.reset_mock()
    mock_scout.reset_mock()

    agent.ask("Summarize the video", "video_id")

    # Verifications
    # Planner should choose SUMMARY -> Skip search, go straight to sampling
    mock_scout.embed_text.assert_not_called()
    # Should scan multiple timestamps (we defined 5 evenly spaced in graph.py)
    assert mock_perception.analyze_video_segment.call_count >= 2

    print("  🎉 Summary Pipeline Verified!")
|
| 75 |
+
|
| 76 |
+
# Script entry point: run the smoke test and exit non-zero on any failure,
# so CI can use this file as a standalone integrity check.
if __name__ == "__main__":
    try:
        test_pipeline_flow()
    except Exception as exc:
        print(f"\n❌ TEST FAILED: {exc}")
        sys.exit(1)
    else:
        print("\n✅ SYSTEM INTEGRITY CONFIRMED.")
|