ashleshp commited on
Commit
fca155a
·
0 Parent(s):

first commit

Browse files
.gitignore ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Documentation & Tasks (Excluded)
2
+ ACTION_PLAN_TASK_*.md
3
+ ARCHITECTURE_*.md
4
+ PROGRESS_CHECKPOINT.md
5
+ PROJECT_OVERVIEW.md
6
+ RESTART_CONTEXT.md
7
+ TECHNICAL_GUIDELINES.md
8
+ project.txt
9
+
10
+ # Python
11
+ __pycache__/
12
+ *.pyc
13
+ venv/
14
+ .pytest_cache/
15
+
16
+ # Data & Models (Generated at runtime or too large)
17
+ models/*.gguf
18
+ data/*.mp4
19
+ data/*.idx
20
+ data/*.json
21
+ data/temp*
22
+ data/metadata/
23
+
24
+ # Environment & Config
25
+ .env
26
+ .streamlit/secrets.toml
27
+ .gemini/
README.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Visual Scout AI
3
+ emoji: 🦅
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: streamlit
7
+ sdk_version: 1.30.0
8
+ app_file: src/app.py
9
+ pinned: false
10
+ models:
11
+ - Qwen/Qwen2-VL-2B-Instruct
12
+ ---
13
+
14
+ # Visual Scout: Agentic Video Understanding
15
+
16
+ An agentic AI system that watches videos, builds a semantic index, and answers natural language questions using **Qwen2-VL**.
17
+
18
+ ## 🚀 How to Run Locally
19
+
20
+ 1. **Install Dependencies:**
21
+ ```bash
22
+ pip install -r requirements.txt
23
+ ```
24
+
25
+ 2. **Download Model:**
26
+ ```bash
27
+ python scripts/download_model.py
28
+ ```
29
+
30
+ 3. **Run App:**
31
+ ```bash
32
+ streamlit run src/app.py
33
+ ```
34
+
35
+ ## ☁️ Deployment (Hugging Face Spaces)
36
+
37
+ This repository is configured for immediate deployment on Hugging Face Spaces.
38
+
39
+ 1. Create a new Space on [Hugging Face](https://huggingface.co/spaces).
40
+ 2. Select **Streamlit** as the SDK.
41
+ 3. Connect this Git repository.
42
+ 4. The system will automatically build using `requirements.txt`.
check_environment.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import shutil
4
+
5
def check_gpu():
    """Probe for a working NVIDIA driver by shelling out to nvidia-smi.

    Prints a status line for each outcome; returns None.
    """
    print("\n--- GPU Check (via nvidia-smi) ---")
    # Guard clause: if the binary is not on PATH there is nothing to run.
    if not shutil.which("nvidia-smi"):
        print("❌ nvidia-smi not found. CUDA might not be in PATH.")
        return
    exit_status = os.system("nvidia-smi")
    if exit_status == 0:
        print("✅ NVIDIA Driver detected.")
    else:
        print("⚠️ nvidia-smi found but returned error.")
15
+
16
def check_llama_cuda():
    """Verify that llama-cpp-python is importable; exit(1) when it is missing."""
    print("\n--- Llama.cpp CUDA Check ---")
    try:
        from llama_cpp import Llama
    except ImportError:
        # Hard failure: the rest of the project cannot run without the engine.
        print("❌ llama-cpp-python is NOT installed.")
        sys.exit(1)
    print("✅ llama-cpp-python is installed.")
    print(f"Llama.cpp package location: {sys.modules['llama_cpp'].__file__}")
25
+
26
+ if __name__ == "__main__":
27
+ print(f"Python Version: {sys.version}")
28
+ check_gpu()
29
+ check_llama_cuda()
30
+ print("\nEnvironment check complete.")
packages.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ffmpeg
2
+ libsm6
3
+ libxext6
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core AI & Inference
2
+ llama-cpp-python>=0.2.82 # The engine for Qwen2-VL
3
+ numpy>=1.24.0 # Array manipulation
4
+ opencv-python-headless>=4.8.0 # Video processing (headless for server/CLI environments)
5
+
6
+ # Utility & CLI
7
+ rich>=13.0.0 # Beautiful terminal output
8
+ pydantic>=2.0.0 # Data validation and settings management
9
+ Pillow>=10.0.0 # Image handling
10
+
11
+ # Development & Testing
12
+ pytest>=7.0.0 # Testing framework
13
+ black>=23.0.0 # Code formatter (for dev use)
14
+ huggingface_hub>=0.19.0
15
+ langgraph>=0.0.10
16
+ langchain>=0.1.0
17
+ langchain-core>=0.1.0
18
+ streamlit>=1.30.0
19
+ sentence-transformers>=2.2.2
20
+ scikit-learn>=1.3.0
21
+ decord>=0.6.0
scripts/download_model.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import list_repo_files, hf_hub_download
2
+ import os
3
+
4
+ REPO_ID = "bartowski/Qwen2-VL-2B-Instruct-GGUF"
5
+ MODEL_DIR = "models"
6
+
7
def list_files():
    """Return the list of file names published in the target HF repo."""
    print(f"Inspecting repo: {REPO_ID}")
    return list_repo_files(repo_id=REPO_ID)
11
+
12
def download_file(filename):
    """Download one file from REPO_ID into MODEL_DIR.

    Args:
        filename: Repo-relative name of the file to fetch.
    """
    # BUG FIX: both progress messages previously printed a literal
    # "(unknown)" placeholder instead of the actual file name, so the
    # user could not tell which artifact was being fetched or where it
    # was saved.
    print(f"Downloading {filename}...")
    hf_hub_download(
        repo_id=REPO_ID,
        filename=filename,
        local_dir=MODEL_DIR,
        local_dir_use_symlinks=False
    )
    print(f"✅ Saved to {MODEL_DIR}/{filename}")
21
+
22
+ if __name__ == "__main__":
23
+ if not os.path.exists(MODEL_DIR):
24
+ os.makedirs(MODEL_DIR)
25
+
26
+ files = list_files()
27
+
28
+ # 1. Find the best Quantization (Q4_K_M is the sweet spot)
29
+ target_quant = "q4_k_m.gguf"
30
+ model_file = next((f for f in files if target_quant in f.lower()), None)
31
+
32
+ if model_file:
33
+ print(f"Found model: {model_file}")
34
+ if not os.path.exists(os.path.join(MODEL_DIR, model_file)):
35
+ download_file(model_file)
36
+ else:
37
+ print("✅ Model already exists.")
38
+ else:
39
+ print("❌ Could not find Q4_K_M model file.")
40
+
41
+ # 2. Look for mmproj (Vision Adapter) if it exists separately
42
+ # Qwen2-VL GGUFs usually embed it, but let's check for 'mmproj' just in case.
43
+ mmproj_file = next((f for f in files if "mmproj" in f.lower()), None)
44
+ if mmproj_file:
45
+ print(f"Found projector: {mmproj_file}")
46
+ if not os.path.exists(os.path.join(MODEL_DIR, mmproj_file)):
47
+ download_file(mmproj_file)
48
+ else:
49
+ print("✅ Projector already exists.")
50
+ else:
51
+ print("ℹ️ No separate mmproj file found (likely embedded or not needed for this repo).")
src/app.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import shutil
3
+ import time
4
+ import os
5
+ import sys
6
+ import cv2
7
+ from pathlib import Path
8
+ from huggingface_hub import hf_hub_download
9
+
10
+ # Add project root to python path
11
+ sys.path.append(os.getcwd())
12
+
13
+ # Internal Modules
14
+ from src.perception.engine import Qwen2PerceptionEngine
15
+ from src.perception.scout import VisualScout
16
+ from src.memory.manager import SimpleMemoryManager
17
+ from src.memory.vector_index import VectorIndex
18
+ from src.core.orchestrator import VideoAgent
19
+ from src.config.settings import settings
20
+ from src.utils.video import extract_frames_decord
21
+
22
+ # --- PAGE CONFIGURATION ---
23
+ st.set_page_config(
24
+ page_title="Visual Scout AI",
25
+ page_icon="🦅",
26
+ layout="wide",
27
+ initial_sidebar_state="expanded"
28
+ )
29
+
30
+ # --- SYSTEM SETUP ---
31
+
32
def ensure_models_exist():
    """
    Checks if the AI models are present.
    If not (first run or cloud deploy), it downloads them automatically.
    """
    REPO_ID = "bartowski/Qwen2-VL-2B-Instruct-GGUF"
    MODEL_FILENAME = "Qwen2-VL-2B-Instruct-Q4_K_M.gguf"
    VISION_ADAPTER_FILENAME = "Qwen2-VL-2B-Instruct-f16-mmproj.gguf"

    if not settings.paths.models_dir.exists():
        settings.paths.models_dir.mkdir(parents=True)

    model_path = settings.paths.models_dir / MODEL_FILENAME
    adapter_path = settings.paths.models_dir / VISION_ADAPTER_FILENAME

    missing_model = not model_path.exists()
    missing_adapter = not adapter_path.exists()

    # Nothing to do when both artifacts are already on disk.
    if not (missing_model or missing_adapter):
        return

    with st.spinner("📥 Performing First-Time Setup: Downloading AI Models..."):
        if missing_model:
            st.toast("Downloading Main Model (1.5GB)...")
            hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME, local_dir=settings.paths.models_dir)

        if missing_adapter:
            st.toast("Downloading Vision Adapter...")
            # The adapter may be embedded in the main GGUF; a failed download
            # is therefore only a warning, not a fatal error.
            try:
                hf_hub_download(repo_id=REPO_ID, filename=VISION_ADAPTER_FILENAME, local_dir=settings.paths.models_dir)
            except Exception:
                st.warning("Could not download specific adapter. Trying to proceed...")

    st.success("Models Ready!")
62
+
63
@st.cache_resource
def initialize_system():
    """
    Loads the heavy AI models once and caches them.

    Returns the tuple (perception engine, visual scout, memory manager,
    video agent) reused across every Streamlit rerun.
    """
    ensure_models_exist()

    print("🚀 System Startup: Initializing AI Engines...")

    # 1. The Analyst (High Intelligence, GPU) — fatal if it cannot load.
    analyst = Qwen2PerceptionEngine()
    try:
        analyst.load_model(settings.paths.model_path)
    except Exception as error:
        st.error(f"Critical Error Loading AI: {error}")
        st.stop()

    # 2. The Scout (Fast Search, CPU)
    scout = VisualScout()

    # 3. The Memory Manager (JSON-backed event store)
    memory = SimpleMemoryManager(storage_dir=settings.paths.data_dir / "metadata")

    # 4. The Agent Orchestrator (LangGraph pipeline)
    agent = VideoAgent(analyst, memory)

    return analyst, scout, memory, agent
90
+
91
# ===== Module-level Streamlit page flow =====
# Runs top-to-bottom on every rerun: load cached engines, take an upload,
# build the semantic index once per video, then serve a chat loop.

# Load the system
perception_engine, visual_scout, memory_manager, video_agent = initialize_system()

# --- SIDEBAR ---
st.sidebar.title("1. Upload Video")
uploaded_file = st.sidebar.file_uploader("Select a video file", type=["mp4", "avi", "mov"])

# --- SESSION STATE INITIALIZATION ---
if "active_video_id" not in st.session_state:
    st.session_state.active_video_id = None
if "is_video_processed" not in st.session_state:
    st.session_state.is_video_processed = False
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
if "visual_memory" not in st.session_state:
    st.session_state.visual_memory = None
if "text_memory" not in st.session_state:
    st.session_state.text_memory = None

# --- MAIN UI ---
st.title("🦅 Visual Scout")
st.caption("Agentic Video Understanding System")

if uploaded_file is not None:
    # Generate a simple ID from the filename
    # NOTE(review): split(".")[0] truncates at the FIRST dot, so a name like
    # "my.video.mp4" becomes id "my" — consider rsplit/Path.stem; confirm intent.
    current_video_id = uploaded_file.name.split(".")[0]

    # Detect if a new video was uploaded
    if st.session_state.active_video_id != current_video_id:
        st.session_state.active_video_id = current_video_id
        st.session_state.is_video_processed = False
        st.session_state.chat_history = []
        st.session_state.visual_memory = None
        st.session_state.text_memory = None

        # Save the file locally (once, when a new video is detected)
        local_video_path = settings.paths.data_dir / f"{current_video_id}.mp4"
        with open(local_video_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.toast(f"Video '{current_video_id}' loaded.")

    # --- PROCESSING PIPELINE ---
    if not st.session_state.is_video_processed:
        st.divider()
        st.header("🧠 Analyzing Video Content")
        st.info("The agent is watching the video to build a semantic index. This happens once per video.")

        video_path = settings.paths.data_dir / f"{current_video_id}.mp4"

        # Initialize Memory Indices (CLIP-space visual index + caption text index)
        visual_index_path = settings.paths.data_dir / f"{current_video_id}.visual.idx"
        text_index_path = settings.paths.data_dir / f"{current_video_id}.text.idx"

        visual_memory_index = VectorIndex(visual_index_path)
        text_memory_index = VectorIndex(text_index_path)

        # Store in session
        st.session_state.visual_memory = visual_memory_index
        st.session_state.text_memory = text_memory_index

        memory_manager.initialize_storage(current_video_id)

        with st.status("🦅 Scout: Scanning video timeline...", expanded=True) as status:

            # Step 1: Extract Frames
            status.write("Extracting frames at 1 FPS...")
            raw_frames = list(extract_frames_decord(video_path, fps=1.0))

            # Step 2: Semantic Cuts
            status.write(f"Analyzing {len(raw_frames)} frames for scene changes...")
            key_events = visual_scout.detect_semantic_changes(raw_frames, sensitivity=0.90)

            # Index Visuals — every extracted frame, not just key events
            for timestamp, frame in raw_frames:
                embedding = visual_scout.embed_image(frame)
                visual_memory_index.add(timestamp, embedding)
            visual_memory_index.save()

            status.write(f"Detected {len(key_events)} key semantic events.")

            # Step 3: Deep Captioning (slow VLM pass, key events only)
            progress_bar = st.progress(0)
            event_log = []

            ANALYSIS_PROMPT = """Analyze this scene.
1. Describe the main action and subject.
2. Note any text or signs visible.
3. Describe the environment."""

            for i, (timestamp, frame) in enumerate(key_events):
                status.write(f"👁️ Analyst: Describing Event {i+1} at {timestamp:.1f}s...")

                # Save temp frame for the VLM
                # NOTE(review): temp_scene.jpg is overwritten for every event;
                # fine single-threaded, unsafe if sessions ever run concurrently.
                temp_frame_path = settings.paths.data_dir / "temp_scene.jpg"
                cv2.imwrite(str(temp_frame_path), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

                description = perception_engine.analyze_frame(str(temp_frame_path), ANALYSIS_PROMPT)

                time_string = time.strftime('%M:%S', time.gmtime(timestamp))
                memory_manager.commit_event(current_video_id, time_string, description, {})
                event_log.append(f"**{time_string}**: {description}")

                # Index Text (for semantic search later)
                text_embedding = visual_scout.embed_text(description)
                text_memory_index.add(timestamp, text_embedding, extra_data={"text": description})

                progress_bar.progress((i + 1) / len(key_events))

            text_memory_index.save()

            # Step 4: Summary
            status.write("📝 Writing Global Summary...")
            full_timeline_text = "\n".join(event_log)
            summary_prompt = f"""<|im_start|>system
You are a video editor. Read the timeline below and write a concise summary of the entire video.
TIMELINE:
{full_timeline_text}
<|im_end|>
<|im_start|>assistant
"""
            summary = perception_engine.generate_text(summary_prompt, stop=["<|im_end|>"])
            memory_manager.save_summary(current_video_id, summary)

            status.update(label="✅ Analysis Complete!", state="complete", expanded=False)

        st.session_state.is_video_processed = True
        st.success("Video Index Ready.")
        st.markdown(f"**Summary:** {summary}")

        with st.expander("See Detailed Timeline"):
            for event in event_log:
                st.write(event)

        # Rerun to switch to Chat Mode cleanly
        st.rerun()

    # --- CHAT INTERFACE ---
    else:
        st.divider()

        # Display History
        for message in st.session_state.chat_history:
            with st.chat_message(message["role"]):
                st.write(message["content"])

        # Chat Input
        if user_query := st.chat_input("Ask about the video..."):

            # Add User Message
            st.session_state.chat_history.append({"role": "user", "content": user_query})
            with st.chat_message("user"):
                st.write(user_query)

            # Generate Answer
            with st.chat_message("assistant"):
                with st.spinner("Agent is thinking..."):

                    # Inject Tools/Context into the Agent (keys must match
                    # what graph.py's retriever_node unpacks from state["context"])
                    video_agent.context = {
                        "scout": visual_scout,
                        "vis_index": st.session_state.visual_memory,
                        "txt_index": st.session_state.text_memory
                    }

                    response_text = video_agent.ask(user_query, st.session_state.active_video_id)
                    st.write(response_text)

            # Add Assistant Message
            st.session_state.chat_history.append({"role": "assistant", "content": response_text})
src/config/settings.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from pydantic import BaseModel, Field
3
+
4
class ProjectPaths(BaseModel):
    """Defines the standard file paths for the project."""
    # Repo root, resolved relative to this file (src/config/settings.py -> up 3).
    root: Path = Path(__file__).parent.parent.parent
    # GGUF weights live here; populated by scripts/download_model.py.
    models_dir: Path = Field(default_factory=lambda: Path(__file__).parent.parent.parent / "models")
    # Uploaded videos, vector indices and JSON metadata.
    data_dir: Path = Field(default_factory=lambda: Path(__file__).parent.parent.parent / "data")

    @property
    def model_path(self) -> Path:
        # Default model name (must match the file ensure_models_exist downloads)
        return self.models_dir / "Qwen2-VL-2B-Instruct-Q4_K_M.gguf"
14
+
15
class PerceptionSettings(BaseModel):
    """Tunable parameters for the vision system."""
    frame_interval: int = 2  # Process 1 frame every X seconds
    ssim_threshold: float = 0.90  # Similarity threshold to skip frames (0.0 - 1.0)
19
+
20
class Config(BaseModel):
    """Global Application Configuration.

    Aggregates the path and perception sub-settings; instantiated exactly
    once as the module-level `settings` singleton below.
    """
    paths: ProjectPaths = Field(default_factory=ProjectPaths)
    perception: PerceptionSettings = Field(default_factory=PerceptionSettings)

    class Config:
        # NOTE(review): a nested `class Config` is the pydantic v1 configuration
        # style and is deprecated under pydantic>=2 (requirements.txt pins >=2.0);
        # the v2 idiom is `model_config = ConfigDict(arbitrary_types_allowed=True)`.
        # Confirm the installed major version before changing.
        arbitrary_types_allowed = True

# Singleton instance
settings = Config()
src/core/graph.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langgraph.graph import StateGraph, END
2
+ from typing import TypedDict, Annotated, Sequence, Dict, Any, List
3
+ from langchain_core.messages import BaseMessage
4
+ import operator
5
+
6
class AgentState(TypedDict):
    """
    Represents the 'Brain State' of the agent as it thinks.
    This dictionary is passed between all the nodes in the graph.
    """
    user_query: str                  # raw question from the user
    video_id: str                    # which indexed video is being reasoned about
    plan: str                        # planner's chosen strategy (currently always "SEARCH")
    search_topic: str                # what the retriever should look up
    target_timestamps: List[float]   # moments (seconds) the analyst should inspect
    observations: Annotated[List[str], operator.add] # 'add' means new observations are appended, not overwritten
    final_answer: str                # synthesized natural-language reply
    context: Dict[str, Any] # Holds references to the Search Index and Tools
19
+
20
def create_agent_graph(perception_engine, memory_manager):
    """
    Builds the Decision Graph (The 'Flowchart' of the AI).

    Compiles a linear LangGraph pipeline:
    planner -> retriever -> analyst -> synthesizer -> END.
    """
    # NOTE(review): memory_manager is accepted but never referenced by any
    # node below — confirm whether it should back the retriever or be dropped.

    # --- NODE 1: PLANNER ---
    def planner_node(state: AgentState):
        query = state["user_query"]
        print(f"🤖 Planner: Receiving query -> '{query}'")
        # For now, we assume every query requires a search.
        # Future improvement: Distinguish between 'Summary' and 'Specific Search'.
        return {
            "plan": "SEARCH",
            "search_topic": query,
            "observations": []
        }

    # --- NODE 2: RETRIEVER (The Librarian) ---
    def retriever_node(state: AgentState):
        """
        Searches both Text (Metadata) and Vision (CLIP) indices.
        """
        video_id = state["video_id"]
        search_topic = state["search_topic"]

        # Unpack tools from context (injected by the UI via VideoAgent.context)
        scout = state["context"]["scout"]
        visual_memory = state["context"]["vis_index"]
        text_memory = state["context"]["txt_index"]

        found_observations = []
        timestamps_to_investigate = []

        print(f"📚 Retriever: Looking up '{search_topic}'...")
        query_vector = scout.embed_text(search_topic)

        # A. Search Semantic Text Memory (Captions we generated earlier)
        if text_memory:
            # Find top 3 text matches
            text_matches = text_memory.search(query_vector, top_k=3)

            # Filter matches that are actually relevant (Score > 0.35)
            relevant_text_matches = [match for match in text_matches if match[1] > 0.35]

            if relevant_text_matches:
                print(f" ✅ Found {len(relevant_text_matches)} relevant text records.")

                # Note: ideally we'd fetch the exact text from the index metadata here.
                # For this implementation, we rely on the generic system memory or
                # we accept the timestamp and let the Analyst re-verify.
                # Let's map these timestamps to potential investigation points.
                for timestamp, score in relevant_text_matches:
                    timestamps_to_investigate.append(timestamp)
                    found_observations.append(f"Memory Hint: Something relevant might be at {timestamp:.1f}s (Confidence: {score:.2f})")

            else:
                print(" ⚠️ No strong text matches found. Switching to Visual Search.")

        # B. Visual Fallback (If text failed, or to double-check)
        # We look for frames that *look* like the user's query
        if not found_observations and visual_memory:
            visual_matches = visual_memory.search(query_vector, top_k=3)

            # Visual similarity needs a lower threshold usually
            valid_visual_matches = [match for match in visual_matches if match[1] > 0.22]

            if valid_visual_matches:
                found_timestamps = [match[0] for match in valid_visual_matches]
                print(f" 🦅 Visual Scout suggests checking times: {found_timestamps}")
                timestamps_to_investigate.extend(found_timestamps)
            else:
                found_observations.append("No direct visual matches found.")

        # Remove duplicates and sort
        unique_timestamps = sorted(list(set(timestamps_to_investigate)))

        return {
            "observations": found_observations,
            "target_timestamps": unique_timestamps
        }

    # --- NODE 3: ANALYST (The Eyes) ---
    def analyst_node(state: AgentState):
        """
        Visits the specific timestamps found by the Retriever and looks closely.
        """
        video_id = state["video_id"]
        timestamps = state["target_timestamps"]
        search_topic = state["search_topic"]
        new_findings = []

        if not timestamps:
            return {"observations": ["Analyst: I have nowhere to look."]}

        print(f"👁️ Analyst: Zooming in on {len(timestamps)} moments...")

        # We give the Vision Model a very specific task
        verification_prompt = f"Look specifically for '{search_topic}'. If you see it, describe it in detail. If not, say 'Not visible'."

        for time_point in timestamps:
            # NOTE(review): the video path is hard-coded relative to the CWD
            # ("data/..."); the rest of the app resolves paths via settings —
            # confirm this holds when launched from a different directory.
            description = perception_engine.analyze_video_segment(
                video_path=f"data/{video_id}.mp4",
                start_time=time_point,
                end_time=time_point + 1.0,
                prompt=verification_prompt
            )

            log_entry = f"Visual Inspection at {time_point:.1f}s: {description}"
            new_findings.append(log_entry)
            print(f" > {log_entry}")

        return {"observations": new_findings}

    # --- NODE 4: SYNTHESIZER (The Speaker) ---
    def synthesizer_node(state: AgentState):
        """
        Compiles all observations into a final natural language answer.
        """
        user_query = state["user_query"]
        all_evidence = state["observations"]

        if not all_evidence:
            return {"final_answer": "I'm sorry, I couldn't find any information about that in the video."}

        evidence_text = "\n".join(all_evidence)

        system_prompt = f"""<|im_start|>system
You are a helpful video assistant.
Answer the user's question based strictly on the evidence below.
If the evidence contradicts itself, trust the 'Visual Inspection' over the 'Memory Hint'.

EVIDENCE COLLECTED:
{evidence_text}
<|im_end|>
<|im_start|>user
{user_query}
<|im_end|>
<|im_start|>assistant
"""
        # We use the raw text generation here for direct control
        answer = perception_engine.generate_text(system_prompt, stop=["<|im_end|>"])
        return {"final_answer": answer}

    # --- GRAPH CONSTRUCTION ---
    workflow = StateGraph(AgentState)

    # Add Nodes
    workflow.add_node("planner", planner_node)
    workflow.add_node("retriever", retriever_node)
    workflow.add_node("analyst", analyst_node)
    workflow.add_node("synthesizer", synthesizer_node)

    # Define Edges (The Flow) — strictly linear, no conditional routing yet
    workflow.set_entry_point("planner")
    workflow.add_edge("planner", "retriever")
    workflow.add_edge("retriever", "analyst")
    workflow.add_edge("analyst", "synthesizer")
    workflow.add_edge("synthesizer", END)

    return workflow.compile()
src/core/orchestrator.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.interfaces.base import PerceptionEngine, MemoryManager
2
+ from src.core.graph import create_agent_graph
3
+
4
class VideoAgent:
    """Facade around the LangGraph pipeline: plan -> retrieve -> analyze -> answer."""

    def __init__(self, perception: PerceptionEngine, memory: MemoryManager):
        self.graph = create_agent_graph(perception, memory)
        self.context = {}  # Scout/Index injection (set by the UI before ask())

    def ask(self, question: str, video_id: str) -> str:
        """
        Runs the Linear Pipeline and returns the synthesized answer.

        BUG FIX: the initial-state keys must match the AgentState TypedDict
        consumed by the graph nodes ('user_query', 'search_topic',
        'target_timestamps'). The previous keys ('query', 'search_term',
        'timestamps') left state['user_query'] undefined, so the planner
        node failed on its very first lookup.
        """
        initial_state = {
            "user_query": question,
            "video_id": video_id,
            "plan": "",
            "search_topic": "",
            "target_timestamps": [],
            "observations": [],
            "final_answer": "",
            "context": self.context
        }

        final_state = self.graph.invoke(initial_state)
        return final_state["final_answer"]
src/interfaces/base.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import List, Dict, Optional, Any
3
+ from pathlib import Path
4
+
5
class PerceptionEngine(ABC):
    """Abstract Base Class for the Visual Perception System."""

    @abstractmethod
    def load_model(self, model_path: Path) -> None:
        """Load model weights from disk; callers invoke this before any analysis."""
        pass

    @abstractmethod
    def analyze_frame(self, frame_path: str, prompt: str) -> str:
        """Describe a single image file on disk according to `prompt`."""
        pass

    @abstractmethod
    def analyze_video_segment(self, video_path: Path, start_time: float, end_time: float, prompt: str) -> str:
        """Describe the clip between start_time and end_time (seconds) per `prompt`."""
        pass

    @abstractmethod
    def chat(self, messages: List[Dict[str, str]]) -> str:
        """General purpose chat completion (for the Agent brain)."""
        pass

    @abstractmethod
    def generate_text(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Raw text completion for advanced prompting."""
        pass
29
+
30
class MemoryManager(ABC):
    """Abstract Base Class for the Knowledge Graph.

    Implementations persist a per-video timeline of timestamped event
    descriptions plus one global summary (see SimpleMemoryManager).
    """

    @abstractmethod
    def initialize_storage(self, video_id: str) -> None:
        """Create/reset the backing store for a video."""
        pass

    @abstractmethod
    def commit_event(self, video_id: str, timestamp: str, description: str, metadata: Dict[str, Any]) -> None:
        """Append one timestamped observation to the video's timeline."""
        pass

    @abstractmethod
    def query_knowledge(self, video_id: str, query: str) -> List[Dict[str, Any]]:
        """Return stored events matching the query."""
        pass

    @abstractmethod
    def get_summary(self, video_id: str) -> str:
        """Return the stored global summary (empty string if none)."""
        pass

    def save_summary(self, video_id: str, summary_text: str) -> None:
        """Persist a global summary for the video.

        Added to the interface because callers already invoke it on every
        MemoryManager (the Streamlit app calls memory_manager.save_summary).
        Deliberately NOT abstract so existing subclasses remain instantiable;
        concrete managers should override this default.
        """
        raise NotImplementedError("save_summary must be overridden by the concrete MemoryManager")
src/main.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from pathlib import Path
3
+ from rich.console import Console
4
+ from rich.prompt import Prompt
5
+
6
+ from src.perception.engine import Qwen2PerceptionEngine
7
+ from src.memory.manager import SimpleMemoryManager
8
+ from src.core.orchestrator import VideoAgent
9
+ from src.config.settings import settings
10
+
11
+ console = Console()
12
+
13
def main():
    """CLI entry point: load the models, bind the hard-coded test video,
    then answer questions in an interactive loop until the user exits."""
    console.rule("[bold blue]Agentic Video Understanding System[/]")

    # 1. Initialize Components
    with console.status("[bold green]Loading AI Models... (This uses GPU)[/]"):
        perception = Qwen2PerceptionEngine()
        # Pre-load to avoid delay on first query
        try:
            perception.load_model(settings.paths.model_path)
        except Exception as e:
            # Fatal: without the VLM nothing else can run.
            console.print(f"[bold red]Critical Error:[/] Failed to load Qwen2-VL. {e}")
            console.print("Ensure you ran 'python scripts/download_model.py'")
            sys.exit(1)

    memory = SimpleMemoryManager(storage_dir=settings.paths.data_dir / "metadata")

    agent = VideoAgent(perception, memory)

    # 2. Select Video — CLI mode expects a fixed file; the Streamlit app
    # (src/app.py) handles arbitrary uploads instead.
    video_id = "test_video"
    video_path = settings.paths.data_dir / f"{video_id}.mp4"

    if not video_path.exists():
        console.print(f"[yellow]Warning:[/] Video {video_path} not found.")
        console.print("Please place a video at data/test_video.mp4")
        return

    # 3. Initialize Memory for this video
    # NOTE(review): this resets the metadata file on every launch, discarding
    # any previously indexed events for this video — confirm that is intended.
    memory.initialize_storage(video_id)
    console.print(f"[green]Video Loaded:[/] {video_id}.mp4")

    # 4. Interactive Loop
    while True:
        console.print()
        query = Prompt.ask("[bold cyan]Ask a question[/] (or 'exit')")

        if query.lower() in ["exit", "quit", "q"]:
            break

        with console.status("[bold yellow]Agent Thinking...[/]"):
            try:
                response = agent.ask(query, video_id)
                console.print(f"[bold white]Answer:[/] {response}")
            except Exception as e:
                # Keep the REPL alive on per-question failures.
                console.print(f"[bold red]Error:[/] {e}")

if __name__ == "__main__":
    main()
+ main()
src/memory/manager.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.interfaces.base import MemoryManager
2
+ from pathlib import Path
3
+ from typing import List, Dict, Any
4
+ import json
5
+
6
class SimpleMemoryManager(MemoryManager):
    """
    A lightweight file-based memory manager using JSON.
    Perfect for single-video sessions.

    One JSON document per video: {"video_id", "events", "entities", "summary"}.
    """

    def __init__(self, storage_dir: Path):
        self.storage_dir = storage_dir
        if not self.storage_dir.exists():
            self.storage_dir.mkdir(parents=True)

    def _get_file_path(self, video_id: str) -> Path:
        # One metadata document per video.
        return self.storage_dir / f"{video_id}_metadata.json"

    def _load_data(self, video_id: str) -> Dict[str, Any]:
        # Read the full document; missing file means the video was never indexed.
        path = self._get_file_path(video_id)
        if not path.exists():
            raise FileNotFoundError(f"Metadata not found for {video_id}")
        return json.loads(path.read_text())

    def _save_data(self, video_id: str, data: Dict[str, Any]) -> None:
        # Rewrite the whole document (small files, so this is cheap).
        self._get_file_path(video_id).write_text(json.dumps(data, indent=2))

    def initialize_storage(self, video_id: str) -> None:
        """Sets up the storage structure for a new video."""
        self._save_data(video_id, {
            "video_id": video_id,
            "events": [],
            "entities": {},
            "summary": ""
        })

    def commit_event(self, video_id: str, timestamp: str, description: str, metadata: Dict[str, Any]) -> None:
        """Saves a new event to the timeline."""
        data = self._load_data(video_id)
        data["events"].append({
            "timestamp": timestamp,
            "description": description,
            "metadata": metadata
        })
        self._save_data(video_id, data)

    def query_knowledge(self, video_id: str, query: str) -> List[Dict[str, Any]]:
        """Searches the existing knowledge base (case-insensitive substring match)."""
        data = self._load_data(video_id)
        needle = query.lower()
        return [event for event in data["events"] if needle in event["description"].lower()]

    def get_summary(self, video_id: str) -> str:
        """Returns the stored global summary, or '' if none was saved."""
        return self._load_data(video_id).get("summary", "")

    def save_summary(self, video_id: str, summary_text: str) -> None:
        """Updates the global summary for the video."""
        data = self._load_data(video_id)
        data["summary"] = summary_text
        self._save_data(video_id, data)
src/memory/vector_index.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from pathlib import Path
3
+ from typing import List, Tuple, Dict, Any, Optional
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ import pickle
6
+
7
class VectorIndex:
    """
    In-memory Vector Database.

    This acts as the 'Long Term Memory' for visual concepts.
    It maps a Timestamp (when something happened) to a Vector (what it looked like).
    """

    def __init__(self, index_file_path: Path):
        """Create an index persisted at *index_file_path*; loads it if present."""
        self.file_path = index_file_path
        self.timestamps: List[float] = []
        # One row per stored memory; rows are unit-normalized in add().
        self.embedding_matrix: Optional[np.ndarray] = None
        self.metadata_store: List[Dict[str, Any]] = []

        # Load existing index if available
        if self.file_path.exists():
            self.load()

    def add(self, timestamp_seconds: float, vector: np.ndarray, extra_data: Optional[Dict[str, Any]] = None):
        """Adds a new memory entry (timestamp + vector + optional metadata)."""
        self.timestamps.append(timestamp_seconds)
        self.metadata_store.append(extra_data or {})

        # Normalize the vector to length 1.
        # This is crucial so that 'Cosine Similarity' is just a Dot Product (faster).
        vector_norm = np.linalg.norm(vector)
        if vector_norm > 0:
            vector = vector / vector_norm

        if self.embedding_matrix is None:
            self.embedding_matrix = vector.reshape(1, -1)
        else:
            self.embedding_matrix = np.vstack([self.embedding_matrix, vector])

    def search(self, query_vector: np.ndarray, top_k: int = 5) -> List[Tuple[float, float]]:
        """
        Finds the moments in the video that are most similar to the query.

        Returns:
            A list of tuples (timestamp_seconds, similarity_score), best first.
        """
        if self.embedding_matrix is None:
            return []

        # Normalize the query too
        query_norm = np.linalg.norm(query_vector)
        if query_norm > 0:
            query_vector = query_vector / query_norm

        # FIX: rows were normalized in add() and the query is normalized above,
        # so a plain matrix-vector product IS the cosine similarity -- exactly
        # what the comment in add() promises. No need for sklearn here.
        similarity_scores = self.embedding_matrix @ query_vector.reshape(-1)

        # Sort by highest score first
        best_indices = np.argsort(similarity_scores)[::-1][:top_k]

        return [
            (self.timestamps[index], float(similarity_scores[index]))
            for index in best_indices
        ]

    def save(self):
        """Persists the index to the disk using Pickle."""
        data_packet = {
            "timestamps": self.timestamps,
            "vectors": self.embedding_matrix,
            "metadata": self.metadata_store,
        }
        with open(self.file_path, "wb") as f:
            pickle.dump(data_packet, f)

    def load(self):
        """Loads the index from disk.

        SECURITY NOTE: pickle.load executes arbitrary code embedded in the
        file -- only load index files this application wrote itself.
        """
        with open(self.file_path, "rb") as f:
            data_packet = pickle.load(f)
        self.timestamps = data_packet["timestamps"]
        self.embedding_matrix = data_packet["vectors"]
        self.metadata_store = data_packet.get("metadata", [])
+ self.metadata_store = data_packet.get("metadata", [])
src/perception/engine.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional, List, Dict
4
+ import base64
5
+
6
+ # Third-party imports
7
+ from llama_cpp import Llama
8
+ from llama_cpp.llama_chat_format import Llava15ChatHandler
9
+ import cv2
10
+
11
+ # Local imports
12
+ from src.interfaces.base import PerceptionEngine
13
+ from src.config.settings import settings
14
+
15
class Qwen2PerceptionEngine(PerceptionEngine):
    """
    The 'Eyes' of the system.

    This class wraps the Qwen2-VL (Vision-Language) model running via llama.cpp.
    It handles loading the heavy GPU weights and formatting images for the AI to 'see'.
    """

    def __init__(self):
        # We hold the model in memory here.
        # It's set to None initially to allow for lazy loading (saving RAM until needed).
        self._vision_language_model: Optional[Llama] = None

    def _find_vision_adapter(self) -> Path:
        """
        Locates the 'mmproj' file (Multimedia Projector).
        This file acts as a translator between the Image Encoder and the Language Model.

        Returns:
            Path of the first *mmproj*.gguf match in the models directory.

        Raises:
            FileNotFoundError: if no projector file is present.
        """
        candidates = list(settings.paths.models_dir.glob("*mmproj*.gguf"))
        if not candidates:
            raise FileNotFoundError("Critical: Could not find the vision adapter (mmproj) in models/ directory.")
        # If several adapters exist, the first glob match wins (glob order is arbitrary).
        return candidates[0]

    def load_model(self, model_file_path: Path) -> None:
        """Loads the AI model into GPU memory. Idempotent: a second call is a no-op."""
        if self._vision_language_model is not None:
            return  # Already loaded

        print(f"Loading Qwen2-VL from {model_file_path}...")

        try:
            # The ChatHandler takes care of the complex CLIP image processing.
            # NOTE(review): Llava15ChatHandler implements the LLaVA-1.5 prompt
            # format -- confirm it matches what this Qwen2-VL GGUF expects.
            vision_handler = Llava15ChatHandler(clip_model_path=str(self._find_vision_adapter()))

            self._vision_language_model = Llama(
                model_path=str(model_file_path),
                chat_handler=vision_handler,
                n_ctx=2048,  # Context Window (how much text/image data it can hold)
                n_gpu_layers=-1,  # -1 means "Put everything on the GPU"
                n_batch=512,
                verbose=False  # Keep logs clean
            )
            print("✅ Vision Model loaded successfully on GPU.")
        except Exception as error:
            print(f"❌ Failed to load model: {error}")
            raise

    def _convert_image_to_base64(self, local_image_path: str) -> str:
        """Reads an image file and encodes it as a base64 string for the chat API."""
        with open(local_image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def analyze_frame(self, frame_path: str, user_prompt: str) -> str:
        """
        Main Vision Function: Looks at a single image and answers a prompt.

        Args:
            frame_path: Path to an image file on disk (a JPEG data URI is built from it).
            user_prompt: The question/instruction about the image.

        Returns:
            The model's text answer.
        """
        # Lazy-load the model on first use.
        if self._vision_language_model is None:
            self.load_model(settings.paths.model_path)

        # Create the data URI that the model expects
        image_uri = f"data:image/jpeg;base64,{self._convert_image_to_base64(frame_path)}"

        # Construct the conversation history: a single user turn (image + text).
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_uri}},
                    {"type": "text", "text": user_prompt}
                ]
            }
        ]

        # Ask the model
        response = self._vision_language_model.create_chat_completion(
            messages=conversation,
            max_tokens=256,  # Limit response length to avoid rambling
            temperature=0.3  # Low temperature = More factual, less creative
        )

        return response["choices"][0]["message"]["content"]

    def analyze_video_segment(self, video_path: Path, start_time: float, end_time: float, analysis_prompt: str) -> str:
        """
        Analyzes a specific time range in the video.
        Currently extracts the middle frame of that segment.

        Returns:
            The model's answer, or an error string if the frame could not be read.
        """
        # 1. Open the video file
        video_capture = cv2.VideoCapture(str(video_path))
        fps = video_capture.get(cv2.CAP_PROP_FPS)

        # 2. Jump to the middle of the requested segment
        middle_timestamp = (start_time + end_time) / 2
        target_frame_number = int(middle_timestamp * fps)

        video_capture.set(cv2.CAP_PROP_POS_FRAMES, target_frame_number)
        success, video_frame = video_capture.read()
        video_capture.release()

        if not success:
            return "Error: Could not read video frame at this timestamp."

        # 3. Save a temporary snapshot to disk (Model reads from disk)
        # NOTE(review): a single shared temp file is not safe if this method is
        # ever called concurrently -- verify single-threaded use.
        temp_snapshot_path = settings.paths.data_dir / "temp_analysis_frame.jpg"

        # Ensure directory exists
        if not temp_snapshot_path.parent.exists():
            temp_snapshot_path.parent.mkdir(parents=True)

        cv2.imwrite(str(temp_snapshot_path), video_frame)

        # 4. Perform the analysis
        return self.analyze_frame(str(temp_snapshot_path), analysis_prompt)

    def chat(self, chat_history: List[Dict[str, str]], stop_sequences: Optional[List[str]] = None) -> str:
        """Standard text-only chat (for reasoning without new images)."""
        if self._vision_language_model is None:
            self.load_model(settings.paths.model_path)

        response = self._vision_language_model.create_chat_completion(
            messages=chat_history,
            max_tokens=512,
            temperature=0.7,
            stop=stop_sequences
        )
        return response["choices"][0]["message"]["content"]

    def generate_text(self, raw_prompt: str, stop_sequences: Optional[List[str]] = None) -> str:
        """
        Raw text completion.
        Useful when we want strict control over the output format (like standardizing a summary).
        """
        if self._vision_language_model is None:
            self.load_model(settings.paths.model_path)

        response = self._vision_language_model.create_completion(
            prompt=raw_prompt,
            max_tokens=512,
            temperature=0.7,
            stop=stop_sequences
        )
        return response["choices"][0]["text"]
src/perception/scout.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from PIL import Image
3
+ import numpy as np
4
+ from typing import Union, List, Tuple
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+
7
class VisualScout:
    """
    The 'Scout' Agent.

    Fast, lightweight semantic pass over the video. Frames are turned into
    CLIP (Vision Transformer) embeddings so content can be searched
    numerically, without putting the heavy LLM in the loop.
    """

    def __init__(self, model_name: str = "clip-ViT-B-32"):
        print(f"Initializing Visual Scout with model: {model_name}...")
        # We use CPU here to save the GPU VRAM for the main Chat Model (Qwen)
        self.embedding_model = SentenceTransformer(model_name, device="cpu")

    def embed_image(self, image_data: "Union[np.ndarray, Image.Image]") -> "np.ndarray":
        """Convert a single video frame into an embedding vector."""
        # The encoder wants PIL images, so translate raw OpenCV arrays first.
        if isinstance(image_data, np.ndarray):
            image_data = Image.fromarray(image_data)
        return self.embedding_model.encode(image_data)

    def embed_text(self, search_text: str) -> "np.ndarray":
        """Turn a user's search query into a vector comparable to frame vectors."""
        return self.embedding_model.encode(search_text)

    def detect_semantic_changes(self, video_frames: "List[Tuple[float, np.ndarray]]", sensitivity: float = 0.85) -> "List[Tuple[float, np.ndarray]]":
        """
        Condense raw frames into 'scenes'.

        Each frame is compared (by embedding similarity) against the last kept
        keyframe; when similarity drops below *sensitivity*, a new scene begins.
        """
        if not video_frames:
            return []

        print(f"🦅 Scout: Analyzing {len(video_frames)} frames for scene changes...")

        # Encode all frames in one batched pass -- far cheaper than per-frame calls.
        pil_images = [Image.fromarray(pixels) for _, pixels in video_frames]
        frame_embeddings = self.embedding_model.encode(pil_images, batch_size=32, show_progress_bar=True)

        # The first frame always opens the first scene.
        significant_events = [video_frames[0]]
        anchor_vector = frame_embeddings[0].reshape(1, -1)

        for position in range(1, len(frame_embeddings)):
            candidate_vector = frame_embeddings[position].reshape(1, -1)

            # Similarity to the last kept keyframe (0.0 .. 1.0).
            similarity_score = cosine_similarity(anchor_vector, candidate_vector)[0][0]

            if similarity_score < sensitivity:
                timestamp = video_frames[position][0]
                print(f"  ✂️ New Scene detected at {timestamp:.1f}s (Similarity: {similarity_score:.2f})")
                significant_events.append(video_frames[position])
                anchor_vector = candidate_vector

        print(f"🦅 Scout: Condensed video into {len(significant_events)} key semantic events.")
        return significant_events
src/utils/video.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ from pathlib import Path
4
+ from typing import Generator, Tuple, List
5
+ import decord
6
+ from decord import VideoReader, cpu
7
+
8
# Route decord's output through the torch bridge (get_batch then returns
# torch tensors instead of decord NDArrays).
# NOTE(review): the previous comment claimed this "fixes the decord seed",
# which set_bridge does not do. Also, extract_frames_decord() below calls
# .asnumpy() on the batch, which torch tensors do not provide -- verify the
# intended bridge setting.
decord.bridge.set_bridge('torch')
10
+
11
def extract_frames_decord(video_path: Path, fps: float = 1.0) -> Generator[Tuple[float, np.ndarray], None, None]:
    """Efficiently extracts frames from a video using Decord.

    Yields (timestamp_seconds, frame) pairs sampled at roughly *fps* frames
    per second, reading frames in batches for speed.

    Raises:
        FileNotFoundError: if *video_path* does not exist.
    """
    if not video_path.exists():
        raise FileNotFoundError(f"Video not found: {video_path}")

    vr = VideoReader(str(video_path), ctx=cpu(0))
    original_fps = vr.get_avg_fps()

    # Sample every `step`-th frame to approximate the requested fps.
    step = max(1, int(original_fps / fps))

    indices = list(range(0, len(vr), step))

    # Batch extraction is much faster than seeking frame-by-frame.
    batch_size = 32
    for i in range(0, len(indices), batch_size):
        batch_indices = indices[i : i + batch_size]
        batch = vr.get_batch(batch_indices)
        # BUG FIX: with decord's bridge set to 'torch' (module level),
        # get_batch returns a torch.Tensor, which has no .asnumpy() --
        # the old code crashed with AttributeError. Support both bridges.
        frames = batch.asnumpy() if hasattr(batch, "asnumpy") else batch.numpy()

        for j, frame in enumerate(frames):
            idx = batch_indices[j]
            timestamp = idx / original_fps
            yield timestamp, frame
35
+
36
def calculate_ssim_simplified(img1: np.ndarray, img2: np.ndarray) -> float:
    """Calculates a simple structural similarity score (MSE based).

    Returns a value in (0.0, 1.0]; 1.0 means identical grayscale content.
    *img2* is resized to match *img1* if their shapes differ.
    """
    if img1.shape != img2.shape:
        img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0]))

    g1 = cv2.cvtColor(img1, cv2.COLOR_RGB2GRAY)
    g2 = cv2.cvtColor(img2, cv2.COLOR_RGB2GRAY)

    # BUG FIX: the grayscale frames are uint8; subtracting them directly wraps
    # around (e.g. 0 - 1 == 255), wildly inflating the MSE for dark-vs-bright
    # pixels in either direction. Promote to float before differencing.
    diff = g1.astype(np.float64) - g2.astype(np.float64)
    mse = float(np.mean(diff ** 2))

    if mse == 0:
        return 1.0
    return 1.0 / (1.0 + (mse / 1000.0))
47
+
48
def extract_key_scenes(video_path: Path, threshold: float = 0.85) -> List[Tuple[float, np.ndarray]]:
    """
    Extracts ONLY significant scene changes (Keyframes).
    Reduces 60 frames -> 5-10 keyframes.
    """
    print("🎬 Detecting Scenes...")
    selected: List[Tuple[float, np.ndarray]] = []
    reference_frame = None

    # Walk the video at 1 FPS, keeping a frame whenever it diverges enough
    # from the last keyframe we kept.
    for timestamp, frame in extract_frames_decord(video_path, fps=1.0):
        is_new_scene = (
            reference_frame is None
            or calculate_ssim_simplified(reference_frame, frame) < threshold
        )
        if is_new_scene:
            selected.append((timestamp, frame))
            reference_frame = frame

    print(f"🎬 Found {len(selected)} unique scenes.")
    return selected
tests/test_graph.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from unittest.mock import MagicMock
3
+ from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
4
+ from src.core.graph import create_agent_graph
5
+
6
class TestAgentGraph:
    def test_graph_structure(self):
        """The agent graph should compile when given mocked dependencies."""
        fake_perception = MagicMock()
        fake_memory = MagicMock()

        compiled_app = create_agent_graph(fake_perception, fake_memory)

        # The compiled object exposes no easy node introspection from the
        # outside; a successful compile is itself the structural check.
        assert compiled_app is not None

    def test_tool_node_execution(self):
        """
        We can't easily mock the entire LangGraph execution loop in a unit test
        without spinning up the full runtime.
        Instead, we will test the 'Tools' individually here to ensure they
        interface with the Graph correctly.
        """
        pass
tests/test_memory.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from src.memory.manager import SimpleMemoryManager
3
+ import json
4
+ import os
5
+
6
class TestMemoryManager:

    def test_initialize_storage(self, tmp_path):
        """A fresh video gets an empty metadata structure on disk."""
        video_id = "test_video_01"
        manager = SimpleMemoryManager(storage_dir=tmp_path)

        manager.initialize_storage(video_id)

        metadata_file = tmp_path / f"{video_id}_metadata.json"
        assert metadata_file.exists()

        payload = json.loads(metadata_file.read_text())
        assert payload["video_id"] == video_id
        assert payload["events"] == []
        assert payload["entities"] == {}

    def test_commit_event(self, tmp_path):
        """Committed events land in the persisted timeline."""
        video_id = "test_video_01"
        manager = SimpleMemoryManager(storage_dir=tmp_path)
        manager.initialize_storage(video_id)

        manager.commit_event(
            video_id=video_id,
            timestamp="00:05",
            description="Man walks dog",
            metadata={"objects": ["man", "dog"]}
        )

        payload = json.loads((tmp_path / f"{video_id}_metadata.json").read_text())
        assert len(payload["events"]) == 1
        first_event = payload["events"][0]
        assert first_event["description"] == "Man walks dog"
        assert first_event["timestamp"] == "00:05"

    def test_query_knowledge(self, tmp_path):
        """Keyword search only returns matching events."""
        video_id = "test_video_01"
        manager = SimpleMemoryManager(storage_dir=tmp_path)
        manager.initialize_storage(video_id)

        manager.commit_event(video_id, "00:01", "Car drives by", {})
        manager.commit_event(video_id, "00:05", "Man walks dog", {})

        hits = manager.query_knowledge(video_id, "dog")
        assert len(hits) == 1
        assert hits[0]["description"] == "Man walks dog"
tests/test_orchestrator.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from unittest.mock import MagicMock
3
+ from src.core.orchestrator import VideoAgent
4
+ from src.interfaces.base import PerceptionEngine, MemoryManager
5
+
6
class TestVideoAgent:
    def test_agent_answers_from_memory(self):
        """When memory already holds the answer, no vision call is made."""
        perception_stub = MagicMock(spec=PerceptionEngine)
        memory_stub = MagicMock(spec=MemoryManager)

        # Memory returns a hit for the question.
        memory_stub.query_knowledge.return_value = [
            {"timestamp": "00:05", "description": "The dog is jumping."}
        ]

        agent = VideoAgent(perception=perception_stub, memory=memory_stub)

        response = agent.ask("What is the dog doing?", video_id="test_vid")

        assert "jumping" in response
        # The vision engine must stay idle when memory already had the answer.
        perception_stub.analyze_video_segment.assert_not_called()

    def test_agent_uses_perception_if_memory_fails(self):
        """When memory misses, the agent falls back to vision lookups."""
        perception_stub = MagicMock(spec=PerceptionEngine)
        memory_stub = MagicMock(spec=MemoryManager)

        # Memory miss should force a visual scan.
        memory_stub.query_knowledge.return_value = []

        # Vision engine returns info
        perception_stub.analyze_video_segment.return_value = "The car is red."

        agent = VideoAgent(perception=perception_stub, memory=memory_stub)

        response = agent.ask("What color is the car?", video_id="test_vid")

        # Deciding *where* to look on a memory miss is not yet defined for the
        # MVP (scan a default segment? the beginning?), so the fallback
        # assertion is deliberately left open until that logic exists.
        pass  # To be defined in logic
tests/test_perception.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from unittest.mock import MagicMock, patch, ANY
3
+ from pathlib import Path
4
+ import sys
5
+
6
+ # 1. Mock the dependencies BEFORE import
7
+ mock_llama = MagicMock()
8
+ mock_chat_handler = MagicMock()
9
+ sys.modules["llama_cpp"] = mock_llama
10
+ sys.modules["llama_cpp.llama_chat_format"] = MagicMock()
11
+ sys.modules["llama_cpp.llama_chat_format"].Llava15ChatHandler = mock_chat_handler
12
+
13
+ from src.perception.engine import Qwen2PerceptionEngine
14
+ from src.config.settings import settings
15
+
16
class TestPerceptionEngine:

    @patch("src.perception.engine.settings")
    def test_load_model_calls_llama_correctly(self, mock_settings):
        """Test that load_model initializes the Llama class with GPU settings."""
        # Setup
        engine = Qwen2PerceptionEngine()
        fake_path = Path("/tmp/fake_model.gguf")
        fake_projector = Path("/tmp/mmproj.gguf")

        # Mock the glob search for the projector
        mock_settings.paths.models_dir.glob.return_value = [fake_projector]

        # Act
        engine.load_model(fake_path)

        # Assert
        # 1. Check if it looked for the projector
        mock_settings.paths.models_dir.glob.assert_called()

        # 2. Check if ChatHandler was initialized
        mock_chat_handler.assert_called_with(clip_model_path=str(fake_projector))

        # 3. Check if Llama was instantiated with GPU layers
        mock_llama.Llama.assert_called_with(
            model_path=str(fake_path),
            chat_handler=ANY,  # The instance of chat handler
            n_ctx=2048,
            n_gpu_layers=-1,  # Important: Must be -1 for full GPU
            n_batch=512,
            verbose=False
        )

    def test_analyze_frame_structure(self):
        """Test that analyze_frame constructs the correct message format for Qwen."""
        engine = Qwen2PerceptionEngine()

        # BUG FIX: the engine stores its model as `_vision_language_model`,
        # not `_model` -- mocking the wrong attribute left the real lazy-load
        # path active and the mock unused.
        engine._vision_language_model = MagicMock()
        engine._vision_language_model.create_chat_completion.return_value = {
            "choices": [{"message": {"content": "A dog on a bike"}}]
        }

        # Mock image loader to avoid file IO
        with patch("src.perception.engine.open", create=True) as mock_open:
            mock_open.return_value.__enter__.return_value.read.return_value = b"fake_image_bytes"

            result = engine.analyze_frame("test.jpg", "Describe this")

        # Assert
        assert result == "A dog on a bike"

        # Verify the prompt structure
        calls = engine._vision_language_model.create_chat_completion.call_args
        messages = calls.kwargs['messages']

        # BUG FIX: the engine sends a single user turn (no system prompt),
        # so the user message is at index 0, not 1.
        assert len(messages) == 1
        assert messages[0]['role'] == 'user'
        # Check content list (Image + Text)
        content = messages[0]['content']
        assert content[0]['type'] == 'image_url'
        assert content[1]['type'] == 'text'
        assert content[1]['text'] == 'Describe this'
tests/verify_pipeline.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from unittest.mock import MagicMock
3
+ import numpy as np
4
+
5
+ # Mock heavy dependencies before imports
6
+ sys.modules["sentence_transformers"] = MagicMock()
7
+ sys.modules["llama_cpp"] = MagicMock()
8
+ sys.modules["llama_cpp.llama_chat_format"] = MagicMock()
9
+ sys.modules["decord"] = MagicMock()
10
+
11
+ # Import core modules
12
+ from src.core.orchestrator import VideoAgent
13
+ from src.memory.vector_index import VectorIndex
14
+ from src.interfaces.base import PerceptionEngine, MemoryManager
15
+
16
def test_pipeline_flow():
    """End-to-end smoke test of the agent pipeline with mocked heavy deps.

    Verifies two flows: a semantic SEARCH query (scout -> index -> analyst)
    and a SUMMARY query (skips search, samples multiple segments).
    """
    print("🧪 Starting Pipeline Verification Test...")

    # 1. Setup Mocks
    mock_perception = MagicMock(spec=PerceptionEngine)
    # Mock text generation
    mock_perception.generate_text.side_effect = ["The man is walking."]  # Synthesizer output
    # Mock vision analysis
    mock_perception.analyze_video_segment.return_value = "A man walking on the street."

    mock_memory = MagicMock(spec=MemoryManager)

    # Mock Context (Scout + Index)
    mock_scout = MagicMock()
    mock_scout.embed_text.return_value = np.array([0.1, 0.2])

    mock_index = MagicMock(spec=VectorIndex)
    mock_index.timestamps = [0.0, 5.0, 10.0]  # Dummy data
    mock_index.search.return_value = [(5.0, 0.95)]  # Found match at 5s

    # 2. Initialize Agent
    # BUG FIX: the original referenced an undefined name `perception`,
    # raising NameError before any verification could run.
    agent = VideoAgent(mock_perception, mock_memory)
    agent.context = {
        "scout": mock_scout,
        "index": mock_index
    }

    # 3. Execute "Search" Flow
    print("   🔹 Testing Search Query: 'Find the man'")
    response = agent.ask("Find the man", "video_id")

    print(f"   ✅ Response: {response}")

    # Verifications
    # Planner should have chosen SEARCH (implied by calling scout)
    mock_scout.embed_text.assert_called_once()
    mock_index.search.assert_called_once()
    # Analyst should have been called for timestamp 5.0
    mock_perception.analyze_video_segment.assert_called()
    args = mock_perception.analyze_video_segment.call_args[1]
    assert args['start_time'] == 5.0

    print("   🎉 Search Pipeline Verified!")

    # 4. Execute "Summary" Flow
    print("   🔹 Testing Summary Query: 'Summarize the video'")
    # Reset mocks (NOTE: reset_mock() does not restore an exhausted
    # side_effect -- the summary flow relies on return_value paths).
    mock_perception.reset_mock()
    mock_scout.reset_mock()

    agent.ask("Summarize the video", "video_id")

    # Verifications
    # Planner should choose SUMMARY -> Skip search, go straight to sampling
    mock_scout.embed_text.assert_not_called()
    # Should scan multiple timestamps (we defined 5 evenly spaced in graph.py)
    assert mock_perception.analyze_video_segment.call_count >= 2

    print("   🎉 Summary Pipeline Verified!")
75
+
76
if __name__ == "__main__":
    # Run the smoke test as a script; a non-zero exit signals CI failure.
    try:
        test_pipeline_flow()
    except Exception as e:
        print(f"\n❌ TEST FAILED: {e}")
        sys.exit(1)
    else:
        print("\n✅ SYSTEM INTEGRITY CONFIRMED.")