ashleshp commited on
Commit
fca155a
·
0 Parent(s):

first commit

Browse files
.gitignore ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Documentation & Tasks (Excluded)
2
+ ACTION_PLAN_TASK_*.md
3
+ ARCHITECTURE_*.md
4
+ PROGRESS_CHECKPOINT.md
5
+ PROJECT_OVERVIEW.md
6
+ RESTART_CONTEXT.md
7
+ TECHNICAL_GUIDELINES.md
8
+ project.txt
9
+
10
+ # Python
11
+ __pycache__/
12
+ *.pyc
13
+ venv/
14
+ .pytest_cache/
15
+
16
+ # Data & Models (Generated at runtime or too large)
17
+ models/*.gguf
18
+ data/*.mp4
19
+ data/*.idx
20
+ data/*.json
21
+ data/temp*
22
+ data/metadata/
23
+
24
+ # Environment & Config
25
+ .env
26
+ .streamlit/secrets.toml
27
+ .gemini/
README.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Visual Scout AI
3
+ emoji: 🦅
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: streamlit
7
+ sdk_version: 1.30.0
8
+ app_file: src/app.py
9
+ pinned: false
10
+ models:
11
+ - Qwen/Qwen2-VL-2B-Instruct
12
+ ---
13
+
14
+ # Visual Scout: Agentic Video Understanding
15
+
16
+ An agentic AI system that watches videos, builds a semantic index, and answers natural language questions using **Qwen2-VL**.
17
+
18
+ ## 🚀 How to Run Locally
19
+
20
+ 1. **Install Dependencies:**
21
+ ```bash
22
+ pip install -r requirements.txt
23
+ ```
24
+
25
+ 2. **Download Model:**
26
+ ```bash
27
+ python scripts/download_model.py
28
+ ```
29
+
30
+ 3. **Run App:**
31
+ ```bash
32
+ streamlit run src/app.py
33
+ ```
34
+
35
+ ## ☁️ Deployment (Hugging Face Spaces)
36
+
37
+ This repository is configured for immediate deployment on Hugging Face Spaces.
38
+
39
+ 1. Create a new Space on [Hugging Face](https://huggingface.co/spaces).
40
+ 2. Select **Streamlit** as the SDK.
41
+ 3. Connect this Git repository.
42
+ 4. The system will automatically build using `requirements.txt`.
check_environment.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import shutil
4
+
5
def check_gpu():
    """Probe for a working NVIDIA driver by shelling out to nvidia-smi.

    Prints a status line for each outcome; returns None.
    """
    print("\n--- GPU Check (via nvidia-smi) ---")
    # Guard clause: if the binary is not on PATH there is nothing to run.
    if not shutil.which("nvidia-smi"):
        print("❌ nvidia-smi not found. CUDA might not be in PATH.")
        return
    exit_status = os.system("nvidia-smi")
    if exit_status == 0:
        print("✅ NVIDIA Driver detected.")
    else:
        print("⚠️ nvidia-smi found but returned error.")
15
+
16
def check_llama_cuda():
    """Verify that llama-cpp-python is importable; exit(1) when it is missing."""
    print("\n--- Llama.cpp CUDA Check ---")
    try:
        from llama_cpp import Llama
    except ImportError:
        # Hard failure: the rest of the project cannot run without the engine.
        print("❌ llama-cpp-python is NOT installed.")
        sys.exit(1)
    print("✅ llama-cpp-python is installed.")
    print(f"Llama.cpp package location: {sys.modules['llama_cpp'].__file__}")
25
+
26
+ if __name__ == "__main__":
27
+ print(f"Python Version: {sys.version}")
28
+ check_gpu()
29
+ check_llama_cuda()
30
+ print("\nEnvironment check complete.")
packages.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ffmpeg
2
+ libsm6
3
+ libxext6
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core AI & Inference
2
+ llama-cpp-python>=0.2.82 # The engine for Qwen2-VL
3
+ numpy>=1.24.0 # Array manipulation
4
+ opencv-python-headless>=4.8.0 # Video processing (headless for server/CLI environments)
5
+
6
+ # Utility & CLI
7
+ rich>=13.0.0 # Beautiful terminal output
8
+ pydantic>=2.0.0 # Data validation and settings management
9
+ Pillow>=10.0.0 # Image handling
10
+
11
+ # Development & Testing
12
+ pytest>=7.0.0 # Testing framework
13
+ black>=23.0.0 # Code formatter (for dev use)
14
+ huggingface_hub>=0.19.0
15
+ langgraph>=0.0.10
16
+ langchain>=0.1.0
17
+ langchain-core>=0.1.0
18
+ streamlit>=1.30.0
19
+ sentence-transformers>=2.2.2
20
+ scikit-learn>=1.3.0
21
+ decord>=0.6.0
scripts/download_model.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import list_repo_files, hf_hub_download
2
+ import os
3
+
4
+ REPO_ID = "bartowski/Qwen2-VL-2B-Instruct-GGUF"
5
+ MODEL_DIR = "models"
6
+
7
def list_files():
    """Return the list of file names published in the target HF repo."""
    print(f"Inspecting repo: {REPO_ID}")
    return list_repo_files(repo_id=REPO_ID)
11
+
12
def download_file(filename):
    """Download one file from REPO_ID into MODEL_DIR.

    Args:
        filename: Repo-relative name of the file to fetch.
    """
    # BUG FIX: both progress messages previously printed a literal
    # "(unknown)" placeholder instead of the actual file name, so the
    # user could not tell which artifact was being fetched or where it
    # was saved.
    print(f"Downloading {filename}...")
    hf_hub_download(
        repo_id=REPO_ID,
        filename=filename,
        local_dir=MODEL_DIR,
        local_dir_use_symlinks=False
    )
    print(f"✅ Saved to {MODEL_DIR}/{filename}")
21
+
22
+ if __name__ == "__main__":
23
+ if not os.path.exists(MODEL_DIR):
24
+ os.makedirs(MODEL_DIR)
25
+
26
+ files = list_files()
27
+
28
+ # 1. Find the best Quantization (Q4_K_M is the sweet spot)
29
+ target_quant = "q4_k_m.gguf"
30
+ model_file = next((f for f in files if target_quant in f.lower()), None)
31
+
32
+ if model_file:
33
+ print(f"Found model: {model_file}")
34
+ if not os.path.exists(os.path.join(MODEL_DIR, model_file)):
35
+ download_file(model_file)
36
+ else:
37
+ print("✅ Model already exists.")
38
+ else:
39
+ print("❌ Could not find Q4_K_M model file.")
40
+
41
+ # 2. Look for mmproj (Vision Adapter) if it exists separately
42
+ # Qwen2-VL GGUFs usually embed it, but let's check for 'mmproj' just in case.
43
+ mmproj_file = next((f for f in files if "mmproj" in f.lower()), None)
44
+ if mmproj_file:
45
+ print(f"Found projector: {mmproj_file}")
46
+ if not os.path.exists(os.path.join(MODEL_DIR, mmproj_file)):
47
+ download_file(mmproj_file)
48
+ else:
49
+ print("✅ Projector already exists.")
50
+ else:
51
+ print("ℹ️ No separate mmproj file found (likely embedded or not needed for this repo).")
src/app.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import shutil
3
+ import time
4
+ import os
5
+ import sys
6
+ import cv2
7
+ from pathlib import Path
8
+ from huggingface_hub import hf_hub_download
9
+
10
+ # Add project root to python path
11
+ sys.path.append(os.getcwd())
12
+
13
+ # Internal Modules
14
+ from src.perception.engine import Qwen2PerceptionEngine
15
+ from src.perception.scout import VisualScout
16
+ from src.memory.manager import SimpleMemoryManager
17
+ from src.memory.vector_index import VectorIndex
18
+ from src.core.orchestrator import VideoAgent
19
+ from src.config.settings import settings
20
+ from src.utils.video import extract_frames_decord
21
+
22
+ # --- PAGE CONFIGURATION ---
23
+ st.set_page_config(
24
+ page_title="Visual Scout AI",
25
+ page_icon="🦅",
26
+ layout="wide",
27
+ initial_sidebar_state="expanded"
28
+ )
29
+
30
+ # --- SYSTEM SETUP ---
31
+
32
def ensure_models_exist():
    """
    Checks if the AI models are present.
    If not (first run or cloud deploy), it downloads them automatically.
    """
    REPO_ID = "bartowski/Qwen2-VL-2B-Instruct-GGUF"
    MODEL_FILENAME = "Qwen2-VL-2B-Instruct-Q4_K_M.gguf"
    VISION_ADAPTER_FILENAME = "Qwen2-VL-2B-Instruct-f16-mmproj.gguf"

    if not settings.paths.models_dir.exists():
        settings.paths.models_dir.mkdir(parents=True)

    model_path = settings.paths.models_dir / MODEL_FILENAME
    adapter_path = settings.paths.models_dir / VISION_ADAPTER_FILENAME

    missing_model = not model_path.exists()
    missing_adapter = not adapter_path.exists()

    # Nothing to do when both artifacts are already on disk.
    if not (missing_model or missing_adapter):
        return

    with st.spinner("📥 Performing First-Time Setup: Downloading AI Models..."):
        if missing_model:
            st.toast("Downloading Main Model (1.5GB)...")
            hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME, local_dir=settings.paths.models_dir)

        if missing_adapter:
            st.toast("Downloading Vision Adapter...")
            # The adapter may be embedded in the main GGUF; a failed download
            # is therefore only a warning, not a fatal error.
            try:
                hf_hub_download(repo_id=REPO_ID, filename=VISION_ADAPTER_FILENAME, local_dir=settings.paths.models_dir)
            except Exception:
                st.warning("Could not download specific adapter. Trying to proceed...")

    st.success("Models Ready!")
62
+
63
@st.cache_resource
def initialize_system():
    """
    Loads the heavy AI models once and caches them.

    Returns the tuple (perception engine, visual scout, memory manager,
    video agent) reused across every Streamlit rerun.
    """
    ensure_models_exist()

    print("🚀 System Startup: Initializing AI Engines...")

    # 1. The Analyst (High Intelligence, GPU) — fatal if it cannot load.
    analyst = Qwen2PerceptionEngine()
    try:
        analyst.load_model(settings.paths.model_path)
    except Exception as error:
        st.error(f"Critical Error Loading AI: {error}")
        st.stop()

    # 2. The Scout (Fast Search, CPU)
    scout = VisualScout()

    # 3. The Memory Manager (JSON-backed event store)
    memory = SimpleMemoryManager(storage_dir=settings.paths.data_dir / "metadata")

    # 4. The Agent Orchestrator (LangGraph pipeline)
    agent = VideoAgent(analyst, memory)

    return analyst, scout, memory, agent
90
+
91
# ===== Module-level Streamlit page flow =====
# Runs top-to-bottom on every rerun: load cached engines, take an upload,
# build the semantic index once per video, then serve a chat loop.

# Load the system
perception_engine, visual_scout, memory_manager, video_agent = initialize_system()

# --- SIDEBAR ---
st.sidebar.title("1. Upload Video")
uploaded_file = st.sidebar.file_uploader("Select a video file", type=["mp4", "avi", "mov"])

# --- SESSION STATE INITIALIZATION ---
if "active_video_id" not in st.session_state:
    st.session_state.active_video_id = None
if "is_video_processed" not in st.session_state:
    st.session_state.is_video_processed = False
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
if "visual_memory" not in st.session_state:
    st.session_state.visual_memory = None
if "text_memory" not in st.session_state:
    st.session_state.text_memory = None

# --- MAIN UI ---
st.title("🦅 Visual Scout")
st.caption("Agentic Video Understanding System")

if uploaded_file is not None:
    # Generate a simple ID from the filename
    # NOTE(review): split(".")[0] truncates at the FIRST dot, so a name like
    # "my.video.mp4" becomes id "my" — consider rsplit/Path.stem; confirm intent.
    current_video_id = uploaded_file.name.split(".")[0]

    # Detect if a new video was uploaded
    if st.session_state.active_video_id != current_video_id:
        st.session_state.active_video_id = current_video_id
        st.session_state.is_video_processed = False
        st.session_state.chat_history = []
        st.session_state.visual_memory = None
        st.session_state.text_memory = None

        # Save the file locally (once, when a new video is detected)
        local_video_path = settings.paths.data_dir / f"{current_video_id}.mp4"
        with open(local_video_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.toast(f"Video '{current_video_id}' loaded.")

    # --- PROCESSING PIPELINE ---
    if not st.session_state.is_video_processed:
        st.divider()
        st.header("🧠 Analyzing Video Content")
        st.info("The agent is watching the video to build a semantic index. This happens once per video.")

        video_path = settings.paths.data_dir / f"{current_video_id}.mp4"

        # Initialize Memory Indices (CLIP-space visual index + caption text index)
        visual_index_path = settings.paths.data_dir / f"{current_video_id}.visual.idx"
        text_index_path = settings.paths.data_dir / f"{current_video_id}.text.idx"

        visual_memory_index = VectorIndex(visual_index_path)
        text_memory_index = VectorIndex(text_index_path)

        # Store in session
        st.session_state.visual_memory = visual_memory_index
        st.session_state.text_memory = text_memory_index

        memory_manager.initialize_storage(current_video_id)

        with st.status("🦅 Scout: Scanning video timeline...", expanded=True) as status:

            # Step 1: Extract Frames
            status.write("Extracting frames at 1 FPS...")
            raw_frames = list(extract_frames_decord(video_path, fps=1.0))

            # Step 2: Semantic Cuts
            status.write(f"Analyzing {len(raw_frames)} frames for scene changes...")
            key_events = visual_scout.detect_semantic_changes(raw_frames, sensitivity=0.90)

            # Index Visuals — every extracted frame, not just key events
            for timestamp, frame in raw_frames:
                embedding = visual_scout.embed_image(frame)
                visual_memory_index.add(timestamp, embedding)
            visual_memory_index.save()

            status.write(f"Detected {len(key_events)} key semantic events.")

            # Step 3: Deep Captioning (slow VLM pass, key events only)
            progress_bar = st.progress(0)
            event_log = []

            ANALYSIS_PROMPT = """Analyze this scene.
1. Describe the main action and subject.
2. Note any text or signs visible.
3. Describe the environment."""

            for i, (timestamp, frame) in enumerate(key_events):
                status.write(f"👁️ Analyst: Describing Event {i+1} at {timestamp:.1f}s...")

                # Save temp frame for the VLM
                # NOTE(review): temp_scene.jpg is overwritten for every event;
                # fine single-threaded, unsafe if sessions ever run concurrently.
                temp_frame_path = settings.paths.data_dir / "temp_scene.jpg"
                cv2.imwrite(str(temp_frame_path), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

                description = perception_engine.analyze_frame(str(temp_frame_path), ANALYSIS_PROMPT)

                time_string = time.strftime('%M:%S', time.gmtime(timestamp))
                memory_manager.commit_event(current_video_id, time_string, description, {})
                event_log.append(f"**{time_string}**: {description}")

                # Index Text (for semantic search later)
                text_embedding = visual_scout.embed_text(description)
                text_memory_index.add(timestamp, text_embedding, extra_data={"text": description})

                progress_bar.progress((i + 1) / len(key_events))

            text_memory_index.save()

            # Step 4: Summary
            status.write("📝 Writing Global Summary...")
            full_timeline_text = "\n".join(event_log)
            summary_prompt = f"""<|im_start|>system
You are a video editor. Read the timeline below and write a concise summary of the entire video.
TIMELINE:
{full_timeline_text}
<|im_end|>
<|im_start|>assistant
"""
            summary = perception_engine.generate_text(summary_prompt, stop=["<|im_end|>"])
            memory_manager.save_summary(current_video_id, summary)

            status.update(label="✅ Analysis Complete!", state="complete", expanded=False)

        st.session_state.is_video_processed = True
        st.success("Video Index Ready.")
        st.markdown(f"**Summary:** {summary}")

        with st.expander("See Detailed Timeline"):
            for event in event_log:
                st.write(event)

        # Rerun to switch to Chat Mode cleanly
        st.rerun()

    # --- CHAT INTERFACE ---
    else:
        st.divider()

        # Display History
        for message in st.session_state.chat_history:
            with st.chat_message(message["role"]):
                st.write(message["content"])

        # Chat Input
        if user_query := st.chat_input("Ask about the video..."):

            # Add User Message
            st.session_state.chat_history.append({"role": "user", "content": user_query})
            with st.chat_message("user"):
                st.write(user_query)

            # Generate Answer
            with st.chat_message("assistant"):
                with st.spinner("Agent is thinking..."):

                    # Inject Tools/Context into the Agent (keys must match
                    # what graph.py's retriever_node unpacks from state["context"])
                    video_agent.context = {
                        "scout": visual_scout,
                        "vis_index": st.session_state.visual_memory,
                        "txt_index": st.session_state.text_memory
                    }

                    response_text = video_agent.ask(user_query, st.session_state.active_video_id)
                    st.write(response_text)

            # Add Assistant Message
            st.session_state.chat_history.append({"role": "assistant", "content": response_text})
src/config/settings.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from pydantic import BaseModel, Field
3
+
4
class ProjectPaths(BaseModel):
    """Defines the standard file paths for the project."""
    # Repo root, resolved relative to this file (src/config/settings.py -> up 3).
    root: Path = Path(__file__).parent.parent.parent
    # GGUF weights live here; populated by scripts/download_model.py.
    models_dir: Path = Field(default_factory=lambda: Path(__file__).parent.parent.parent / "models")
    # Uploaded videos, vector indices and JSON metadata.
    data_dir: Path = Field(default_factory=lambda: Path(__file__).parent.parent.parent / "data")

    @property
    def model_path(self) -> Path:
        # Default model name (must match the file ensure_models_exist downloads)
        return self.models_dir / "Qwen2-VL-2B-Instruct-Q4_K_M.gguf"
14
+
15
class PerceptionSettings(BaseModel):
    """Tunable parameters for the vision system."""
    frame_interval: int = 2  # Process 1 frame every X seconds
    ssim_threshold: float = 0.90  # Similarity threshold to skip frames (0.0 - 1.0)
19
+
20
class Config(BaseModel):
    """Global Application Configuration.

    Aggregates the path and perception sub-settings; instantiated exactly
    once as the module-level `settings` singleton below.
    """
    paths: ProjectPaths = Field(default_factory=ProjectPaths)
    perception: PerceptionSettings = Field(default_factory=PerceptionSettings)

    class Config:
        # NOTE(review): a nested `class Config` is the pydantic v1 configuration
        # style and is deprecated under pydantic>=2 (requirements.txt pins >=2.0);
        # the v2 idiom is `model_config = ConfigDict(arbitrary_types_allowed=True)`.
        # Confirm the installed major version before changing.
        arbitrary_types_allowed = True

# Singleton instance
settings = Config()
src/core/graph.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langgraph.graph import StateGraph, END
2
+ from typing import TypedDict, Annotated, Sequence, Dict, Any, List
3
+ from langchain_core.messages import BaseMessage
4
+ import operator
5
+
6
class AgentState(TypedDict):
    """
    Represents the 'Brain State' of the agent as it thinks.
    This dictionary is passed between all the nodes in the graph.
    """
    user_query: str                  # raw question from the user
    video_id: str                    # which indexed video is being reasoned about
    plan: str                        # planner's chosen strategy (currently always "SEARCH")
    search_topic: str                # what the retriever should look up
    target_timestamps: List[float]   # moments (seconds) the analyst should inspect
    observations: Annotated[List[str], operator.add] # 'add' means new observations are appended, not overwritten
    final_answer: str                # synthesized natural-language reply
    context: Dict[str, Any] # Holds references to the Search Index and Tools
19
+
20
def create_agent_graph(perception_engine, memory_manager):
    """
    Builds the Decision Graph (The 'Flowchart' of the AI).

    Compiles a linear LangGraph pipeline:
    planner -> retriever -> analyst -> synthesizer -> END.
    """
    # NOTE(review): memory_manager is accepted but never referenced by any
    # node below — confirm whether it should back the retriever or be dropped.

    # --- NODE 1: PLANNER ---
    def planner_node(state: AgentState):
        query = state["user_query"]
        print(f"🤖 Planner: Receiving query -> '{query}'")
        # For now, we assume every query requires a search.
        # Future improvement: Distinguish between 'Summary' and 'Specific Search'.
        return {
            "plan": "SEARCH",
            "search_topic": query,
            "observations": []
        }

    # --- NODE 2: RETRIEVER (The Librarian) ---
    def retriever_node(state: AgentState):
        """
        Searches both Text (Metadata) and Vision (CLIP) indices.
        """
        video_id = state["video_id"]
        search_topic = state["search_topic"]

        # Unpack tools from context (injected by the UI via VideoAgent.context)
        scout = state["context"]["scout"]
        visual_memory = state["context"]["vis_index"]
        text_memory = state["context"]["txt_index"]

        found_observations = []
        timestamps_to_investigate = []

        print(f"📚 Retriever: Looking up '{search_topic}'...")
        query_vector = scout.embed_text(search_topic)

        # A. Search Semantic Text Memory (Captions we generated earlier)
        if text_memory:
            # Find top 3 text matches
            text_matches = text_memory.search(query_vector, top_k=3)

            # Filter matches that are actually relevant (Score > 0.35)
            relevant_text_matches = [match for match in text_matches if match[1] > 0.35]

            if relevant_text_matches:
                print(f" ✅ Found {len(relevant_text_matches)} relevant text records.")

                # Note: ideally we'd fetch the exact text from the index metadata here.
                # For this implementation, we rely on the generic system memory or
                # we accept the timestamp and let the Analyst re-verify.
                # Let's map these timestamps to potential investigation points.
                for timestamp, score in relevant_text_matches:
                    timestamps_to_investigate.append(timestamp)
                    found_observations.append(f"Memory Hint: Something relevant might be at {timestamp:.1f}s (Confidence: {score:.2f})")

            else:
                print(" ⚠️ No strong text matches found. Switching to Visual Search.")

        # B. Visual Fallback (If text failed, or to double-check)
        # We look for frames that *look* like the user's query
        if not found_observations and visual_memory:
            visual_matches = visual_memory.search(query_vector, top_k=3)

            # Visual similarity needs a lower threshold usually
            valid_visual_matches = [match for match in visual_matches if match[1] > 0.22]

            if valid_visual_matches:
                found_timestamps = [match[0] for match in valid_visual_matches]
                print(f" 🦅 Visual Scout suggests checking times: {found_timestamps}")
                timestamps_to_investigate.extend(found_timestamps)
            else:
                found_observations.append("No direct visual matches found.")

        # Remove duplicates and sort
        unique_timestamps = sorted(list(set(timestamps_to_investigate)))

        return {
            "observations": found_observations,
            "target_timestamps": unique_timestamps
        }

    # --- NODE 3: ANALYST (The Eyes) ---
    def analyst_node(state: AgentState):
        """
        Visits the specific timestamps found by the Retriever and looks closely.
        """
        video_id = state["video_id"]
        timestamps = state["target_timestamps"]
        search_topic = state["search_topic"]
        new_findings = []

        if not timestamps:
            return {"observations": ["Analyst: I have nowhere to look."]}

        print(f"👁️ Analyst: Zooming in on {len(timestamps)} moments...")

        # We give the Vision Model a very specific task
        verification_prompt = f"Look specifically for '{search_topic}'. If you see it, describe it in detail. If not, say 'Not visible'."

        for time_point in timestamps:
            # NOTE(review): the video path is hard-coded relative to the CWD
            # ("data/..."); the rest of the app resolves paths via settings —
            # confirm this holds when launched from a different directory.
            description = perception_engine.analyze_video_segment(
                video_path=f"data/{video_id}.mp4",
                start_time=time_point,
                end_time=time_point + 1.0,
                prompt=verification_prompt
            )

            log_entry = f"Visual Inspection at {time_point:.1f}s: {description}"
            new_findings.append(log_entry)
            print(f" > {log_entry}")

        return {"observations": new_findings}

    # --- NODE 4: SYNTHESIZER (The Speaker) ---
    def synthesizer_node(state: AgentState):
        """
        Compiles all observations into a final natural language answer.
        """
        user_query = state["user_query"]
        all_evidence = state["observations"]

        if not all_evidence:
            return {"final_answer": "I'm sorry, I couldn't find any information about that in the video."}

        evidence_text = "\n".join(all_evidence)

        system_prompt = f"""<|im_start|>system
You are a helpful video assistant.
Answer the user's question based strictly on the evidence below.
If the evidence contradicts itself, trust the 'Visual Inspection' over the 'Memory Hint'.

EVIDENCE COLLECTED:
{evidence_text}
<|im_end|>
<|im_start|>user
{user_query}
<|im_end|>
<|im_start|>assistant
"""
        # We use the raw text generation here for direct control
        answer = perception_engine.generate_text(system_prompt, stop=["<|im_end|>"])
        return {"final_answer": answer}

    # --- GRAPH CONSTRUCTION ---
    workflow = StateGraph(AgentState)

    # Add Nodes
    workflow.add_node("planner", planner_node)
    workflow.add_node("retriever", retriever_node)
    workflow.add_node("analyst", analyst_node)
    workflow.add_node("synthesizer", synthesizer_node)

    # Define Edges (The Flow) — strictly linear, no conditional routing yet
    workflow.set_entry_point("planner")
    workflow.add_edge("planner", "retriever")
    workflow.add_edge("retriever", "analyst")
    workflow.add_edge("analyst", "synthesizer")
    workflow.add_edge("synthesizer", END)

    return workflow.compile()
src/core/orchestrator.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.interfaces.base import PerceptionEngine, MemoryManager
2
+ from src.core.graph import create_agent_graph
3
+
4
class VideoAgent:
    """Facade around the LangGraph pipeline: plan -> retrieve -> analyze -> answer."""

    def __init__(self, perception: PerceptionEngine, memory: MemoryManager):
        self.graph = create_agent_graph(perception, memory)
        self.context = {}  # Scout/Index injection (set by the UI before ask())

    def ask(self, question: str, video_id: str) -> str:
        """
        Runs the Linear Pipeline and returns the synthesized answer.

        BUG FIX: the initial-state keys must match the AgentState TypedDict
        consumed by the graph nodes ('user_query', 'search_topic',
        'target_timestamps'). The previous keys ('query', 'search_term',
        'timestamps') left state['user_query'] undefined, so the planner
        node failed on its very first lookup.
        """
        initial_state = {
            "user_query": question,
            "video_id": video_id,
            "plan": "",
            "search_topic": "",
            "target_timestamps": [],
            "observations": [],
            "final_answer": "",
            "context": self.context
        }

        final_state = self.graph.invoke(initial_state)
        return final_state["final_answer"]
src/interfaces/base.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import List, Dict, Optional, Any
3
+ from pathlib import Path
4
+
5
class PerceptionEngine(ABC):
    """Abstract Base Class for the Visual Perception System."""

    @abstractmethod
    def load_model(self, model_path: Path) -> None:
        """Load model weights from disk; callers invoke this before any analysis."""
        pass

    @abstractmethod
    def analyze_frame(self, frame_path: str, prompt: str) -> str:
        """Describe a single image file on disk according to `prompt`."""
        pass

    @abstractmethod
    def analyze_video_segment(self, video_path: Path, start_time: float, end_time: float, prompt: str) -> str:
        """Describe the clip between start_time and end_time (seconds) per `prompt`."""
        pass

    @abstractmethod
    def chat(self, messages: List[Dict[str, str]]) -> str:
        """General purpose chat completion (for the Agent brain)."""
        pass

    @abstractmethod
    def generate_text(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Raw text completion for advanced prompting."""
        pass
29
+
30
class MemoryManager(ABC):
    """Abstract Base Class for the Knowledge Graph.

    Implementations persist a per-video timeline of timestamped event
    descriptions plus one global summary (see SimpleMemoryManager).
    """

    @abstractmethod
    def initialize_storage(self, video_id: str) -> None:
        """Create/reset the backing store for a video."""
        pass

    @abstractmethod
    def commit_event(self, video_id: str, timestamp: str, description: str, metadata: Dict[str, Any]) -> None:
        """Append one timestamped observation to the video's timeline."""
        pass

    @abstractmethod
    def query_knowledge(self, video_id: str, query: str) -> List[Dict[str, Any]]:
        """Return stored events matching the query."""
        pass

    @abstractmethod
    def get_summary(self, video_id: str) -> str:
        """Return the stored global summary (empty string if none)."""
        pass

    def save_summary(self, video_id: str, summary_text: str) -> None:
        """Persist a global summary for the video.

        Added to the interface because callers already invoke it on every
        MemoryManager (the Streamlit app calls memory_manager.save_summary).
        Deliberately NOT abstract so existing subclasses remain instantiable;
        concrete managers should override this default.
        """
        raise NotImplementedError("save_summary must be overridden by the concrete MemoryManager")
src/main.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from pathlib import Path
3
+ from rich.console import Console
4
+ from rich.prompt import Prompt
5
+
6
+ from src.perception.engine import Qwen2PerceptionEngine
7
+ from src.memory.manager import SimpleMemoryManager
8
+ from src.core.orchestrator import VideoAgent
9
+ from src.config.settings import settings
10
+
11
+ console = Console()
12
+
13
def main():
    """CLI entry point: load the models, bind the hard-coded test video,
    then answer questions in an interactive loop until the user exits."""
    console.rule("[bold blue]Agentic Video Understanding System[/]")

    # 1. Initialize Components
    with console.status("[bold green]Loading AI Models... (This uses GPU)[/]"):
        perception = Qwen2PerceptionEngine()
        # Pre-load to avoid delay on first query
        try:
            perception.load_model(settings.paths.model_path)
        except Exception as e:
            # Fatal: without the VLM nothing else can run.
            console.print(f"[bold red]Critical Error:[/] Failed to load Qwen2-VL. {e}")
            console.print("Ensure you ran 'python scripts/download_model.py'")
            sys.exit(1)

    memory = SimpleMemoryManager(storage_dir=settings.paths.data_dir / "metadata")

    agent = VideoAgent(perception, memory)

    # 2. Select Video — CLI mode expects a fixed file; the Streamlit app
    # (src/app.py) handles arbitrary uploads instead.
    video_id = "test_video"
    video_path = settings.paths.data_dir / f"{video_id}.mp4"

    if not video_path.exists():
        console.print(f"[yellow]Warning:[/] Video {video_path} not found.")
        console.print("Please place a video at data/test_video.mp4")
        return

    # 3. Initialize Memory for this video
    # NOTE(review): this resets the metadata file on every launch, discarding
    # any previously indexed events for this video — confirm that is intended.
    memory.initialize_storage(video_id)
    console.print(f"[green]Video Loaded:[/] {video_id}.mp4")

    # 4. Interactive Loop
    while True:
        console.print()
        query = Prompt.ask("[bold cyan]Ask a question[/] (or 'exit')")

        if query.lower() in ["exit", "quit", "q"]:
            break

        with console.status("[bold yellow]Agent Thinking...[/]"):
            try:
                response = agent.ask(query, video_id)
                console.print(f"[bold white]Answer:[/] {response}")
            except Exception as e:
                # Keep the REPL alive on per-question failures.
                console.print(f"[bold red]Error:[/] {e}")

if __name__ == "__main__":
    main()
+ main()
src/memory/manager.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.interfaces.base import MemoryManager
2
+ from pathlib import Path
3
+ from typing import List, Dict, Any
4
+ import json
5
+
6
class SimpleMemoryManager(MemoryManager):
    """
    A lightweight file-based memory manager using JSON.
    Perfect for single-video sessions.

    One JSON document per video: {"video_id", "events", "entities", "summary"}.
    """

    def __init__(self, storage_dir: Path):
        self.storage_dir = storage_dir
        if not self.storage_dir.exists():
            self.storage_dir.mkdir(parents=True)

    def _get_file_path(self, video_id: str) -> Path:
        # One metadata document per video.
        return self.storage_dir / f"{video_id}_metadata.json"

    def _load_data(self, video_id: str) -> Dict[str, Any]:
        # Read the full document; missing file means the video was never indexed.
        path = self._get_file_path(video_id)
        if not path.exists():
            raise FileNotFoundError(f"Metadata not found for {video_id}")
        return json.loads(path.read_text())

    def _save_data(self, video_id: str, data: Dict[str, Any]) -> None:
        # Rewrite the whole document (small files, so this is cheap).
        self._get_file_path(video_id).write_text(json.dumps(data, indent=2))

    def initialize_storage(self, video_id: str) -> None:
        """Sets up the storage structure for a new video."""
        self._save_data(video_id, {
            "video_id": video_id,
            "events": [],
            "entities": {},
            "summary": ""
        })

    def commit_event(self, video_id: str, timestamp: str, description: str, metadata: Dict[str, Any]) -> None:
        """Saves a new event to the timeline."""
        data = self._load_data(video_id)
        data["events"].append({
            "timestamp": timestamp,
            "description": description,
            "metadata": metadata
        })
        self._save_data(video_id, data)

    def query_knowledge(self, video_id: str, query: str) -> List[Dict[str, Any]]:
        """Searches the existing knowledge base (case-insensitive substring match)."""
        data = self._load_data(video_id)
        needle = query.lower()
        return [event for event in data["events"] if needle in event["description"].lower()]

    def get_summary(self, video_id: str) -> str:
        """Returns the stored global summary, or '' if none was saved."""
        return self._load_data(video_id).get("summary", "")

    def save_summary(self, video_id: str, summary_text: str) -> None:
        """Updates the global summary for the video."""
        data = self._load_data(video_id)
        data["summary"] = summary_text
        self._save_data(video_id, data)
src/memory/vector_index.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from pathlib import Path
3
+ from typing import List, Tuple, Dict, Any, Optional
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ import pickle
6
+
7
class VectorIndex:
    """
    In-memory Vector Database.

    This acts as the 'Long Term Memory' for visual concepts.
    It maps a Timestamp (when something happened) to a Vector (what it looked like).
    """

    def __init__(self, index_file_path: Path):
        """Create an index persisted at *index_file_path*; loads it if present."""
        self.file_path = index_file_path
        self.timestamps: List[float] = []
        # One row per stored memory; rows are unit-normalized in add().
        self.embedding_matrix: Optional[np.ndarray] = None
        self.metadata_store: List[Dict[str, Any]] = []

        # Load existing index if available
        if self.file_path.exists():
            self.load()

    def add(self, timestamp_seconds: float, vector: np.ndarray, extra_data: Optional[Dict[str, Any]] = None):
        """Adds a new memory entry (timestamp + vector + optional metadata)."""
        self.timestamps.append(timestamp_seconds)
        self.metadata_store.append(extra_data or {})

        # Normalize the vector to length 1.
        # This is crucial so that 'Cosine Similarity' is just a Dot Product (faster).
        vector_norm = np.linalg.norm(vector)
        if vector_norm > 0:
            vector = vector / vector_norm

        if self.embedding_matrix is None:
            self.embedding_matrix = vector.reshape(1, -1)
        else:
            self.embedding_matrix = np.vstack([self.embedding_matrix, vector])

    def search(self, query_vector: np.ndarray, top_k: int = 5) -> List[Tuple[float, float]]:
        """
        Finds the moments in the video that are most similar to the query.

        Returns:
            A list of tuples (timestamp_seconds, similarity_score), best first.
        """
        if self.embedding_matrix is None:
            return []

        # Normalize the query too
        query_norm = np.linalg.norm(query_vector)
        if query_norm > 0:
            query_vector = query_vector / query_norm

        # FIX: rows were normalized in add() and the query is normalized above,
        # so a plain matrix-vector product IS the cosine similarity -- exactly
        # what the comment in add() promises. No need for sklearn here.
        similarity_scores = self.embedding_matrix @ query_vector.reshape(-1)

        # Sort by highest score first
        best_indices = np.argsort(similarity_scores)[::-1][:top_k]

        return [
            (self.timestamps[index], float(similarity_scores[index]))
            for index in best_indices
        ]

    def save(self):
        """Persists the index to the disk using Pickle."""
        data_packet = {
            "timestamps": self.timestamps,
            "vectors": self.embedding_matrix,
            "metadata": self.metadata_store,
        }
        with open(self.file_path, "wb") as f:
            pickle.dump(data_packet, f)

    def load(self):
        """Loads the index from disk.

        SECURITY NOTE: pickle.load executes arbitrary code embedded in the
        file -- only load index files this application wrote itself.
        """
        with open(self.file_path, "rb") as f:
            data_packet = pickle.load(f)
        self.timestamps = data_packet["timestamps"]
        self.embedding_matrix = data_packet["vectors"]
        self.metadata_store = data_packet.get("metadata", [])
+ self.metadata_store = data_packet.get("metadata", [])
src/perception/engine.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional, List, Dict
4
+ import base64
5
+
6
+ # Third-party imports
7
+ from llama_cpp import Llama
8
+ from llama_cpp.llama_chat_format import Llava15ChatHandler
9
+ import cv2
10
+
11
+ # Local imports
12
+ from src.interfaces.base import PerceptionEngine
13
+ from src.config.settings import settings
14
+
15
class Qwen2PerceptionEngine(PerceptionEngine):
    """
    The 'Eyes' of the system.

    This class wraps the Qwen2-VL (Vision-Language) model running via llama.cpp.
    It handles loading the heavy GPU weights and formatting images for the AI to 'see'.
    """

    def __init__(self):
        # We hold the model in memory here.
        # It's set to None initially to allow for lazy loading (saving RAM until needed).
        self._vision_language_model: Optional[Llama] = None

    def _find_vision_adapter(self) -> Path:
        """
        Locates the 'mmproj' file (Multimedia Projector).
        This file acts as a translator between the Image Encoder and the Language Model.

        Returns:
            Path of the first *mmproj*.gguf match in the models directory.

        Raises:
            FileNotFoundError: if no projector file is present.
        """
        candidates = list(settings.paths.models_dir.glob("*mmproj*.gguf"))
        if not candidates:
            raise FileNotFoundError("Critical: Could not find the vision adapter (mmproj) in models/ directory.")
        # If several adapters exist, the first glob match wins (glob order is arbitrary).
        return candidates[0]

    def load_model(self, model_file_path: Path) -> None:
        """Loads the AI model into GPU memory. Idempotent: a second call is a no-op."""
        if self._vision_language_model is not None:
            return  # Already loaded

        print(f"Loading Qwen2-VL from {model_file_path}...")

        try:
            # The ChatHandler takes care of the complex CLIP image processing.
            # NOTE(review): Llava15ChatHandler implements the LLaVA-1.5 prompt
            # format -- confirm it matches what this Qwen2-VL GGUF expects.
            vision_handler = Llava15ChatHandler(clip_model_path=str(self._find_vision_adapter()))

            self._vision_language_model = Llama(
                model_path=str(model_file_path),
                chat_handler=vision_handler,
                n_ctx=2048,  # Context Window (how much text/image data it can hold)
                n_gpu_layers=-1,  # -1 means "Put everything on the GPU"
                n_batch=512,
                verbose=False  # Keep logs clean
            )
            print("✅ Vision Model loaded successfully on GPU.")
        except Exception as error:
            print(f"❌ Failed to load model: {error}")
            raise

    def _convert_image_to_base64(self, local_image_path: str) -> str:
        """Reads an image file and encodes it as a base64 string for the chat API."""
        with open(local_image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def analyze_frame(self, frame_path: str, user_prompt: str) -> str:
        """
        Main Vision Function: Looks at a single image and answers a prompt.

        Args:
            frame_path: Path to an image file on disk (a JPEG data URI is built from it).
            user_prompt: The question/instruction about the image.

        Returns:
            The model's text answer.
        """
        # Lazy-load the model on first use.
        if self._vision_language_model is None:
            self.load_model(settings.paths.model_path)

        # Create the data URI that the model expects
        image_uri = f"data:image/jpeg;base64,{self._convert_image_to_base64(frame_path)}"

        # Construct the conversation history: a single user turn (image + text).
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_uri}},
                    {"type": "text", "text": user_prompt}
                ]
            }
        ]

        # Ask the model
        response = self._vision_language_model.create_chat_completion(
            messages=conversation,
            max_tokens=256,  # Limit response length to avoid rambling
            temperature=0.3  # Low temperature = More factual, less creative
        )

        return response["choices"][0]["message"]["content"]

    def analyze_video_segment(self, video_path: Path, start_time: float, end_time: float, analysis_prompt: str) -> str:
        """
        Analyzes a specific time range in the video.
        Currently extracts the middle frame of that segment.

        Returns:
            The model's answer, or an error string if the frame could not be read.
        """
        # 1. Open the video file
        video_capture = cv2.VideoCapture(str(video_path))
        fps = video_capture.get(cv2.CAP_PROP_FPS)

        # 2. Jump to the middle of the requested segment
        middle_timestamp = (start_time + end_time) / 2
        target_frame_number = int(middle_timestamp * fps)

        video_capture.set(cv2.CAP_PROP_POS_FRAMES, target_frame_number)
        success, video_frame = video_capture.read()
        video_capture.release()

        if not success:
            return "Error: Could not read video frame at this timestamp."

        # 3. Save a temporary snapshot to disk (Model reads from disk)
        # NOTE(review): a single shared temp file is not safe if this method is
        # ever called concurrently -- verify single-threaded use.
        temp_snapshot_path = settings.paths.data_dir / "temp_analysis_frame.jpg"

        # Ensure directory exists
        if not temp_snapshot_path.parent.exists():
            temp_snapshot_path.parent.mkdir(parents=True)

        cv2.imwrite(str(temp_snapshot_path), video_frame)

        # 4. Perform the analysis
        return self.analyze_frame(str(temp_snapshot_path), analysis_prompt)

    def chat(self, chat_history: List[Dict[str, str]], stop_sequences: Optional[List[str]] = None) -> str:
        """Standard text-only chat (for reasoning without new images)."""
        if self._vision_language_model is None:
            self.load_model(settings.paths.model_path)

        response = self._vision_language_model.create_chat_completion(
            messages=chat_history,
            max_tokens=512,
            temperature=0.7,
            stop=stop_sequences
        )
        return response["choices"][0]["message"]["content"]

    def generate_text(self, raw_prompt: str, stop_sequences: Optional[List[str]] = None) -> str:
        """
        Raw text completion.
        Useful when we want strict control over the output format (like standardizing a summary).
        """
        if self._vision_language_model is None:
            self.load_model(settings.paths.model_path)

        response = self._vision_language_model.create_completion(
            prompt=raw_prompt,
            max_tokens=512,
            temperature=0.7,
            stop=stop_sequences
        )
        return response["choices"][0]["text"]
src/perception/scout.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from PIL import Image
3
+ import numpy as np
4
+ from typing import Union, List, Tuple
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+
7
class VisualScout:
    """
    The 'Scout' Agent.

    Fast, lightweight semantic pass over the video. Frames are turned into
    CLIP (Vision Transformer) embeddings so content can be searched
    numerically, without putting the heavy LLM in the loop.
    """

    def __init__(self, model_name: str = "clip-ViT-B-32"):
        print(f"Initializing Visual Scout with model: {model_name}...")
        # We use CPU here to save the GPU VRAM for the main Chat Model (Qwen)
        self.embedding_model = SentenceTransformer(model_name, device="cpu")

    def embed_image(self, image_data: "Union[np.ndarray, Image.Image]") -> "np.ndarray":
        """Convert a single video frame into an embedding vector."""
        # The encoder wants PIL images, so translate raw OpenCV arrays first.
        if isinstance(image_data, np.ndarray):
            image_data = Image.fromarray(image_data)
        return self.embedding_model.encode(image_data)

    def embed_text(self, search_text: str) -> "np.ndarray":
        """Turn a user's search query into a vector comparable to frame vectors."""
        return self.embedding_model.encode(search_text)

    def detect_semantic_changes(self, video_frames: "List[Tuple[float, np.ndarray]]", sensitivity: float = 0.85) -> "List[Tuple[float, np.ndarray]]":
        """
        Condense raw frames into 'scenes'.

        Each frame is compared (by embedding similarity) against the last kept
        keyframe; when similarity drops below *sensitivity*, a new scene begins.
        """
        if not video_frames:
            return []

        print(f"🦅 Scout: Analyzing {len(video_frames)} frames for scene changes...")

        # Encode all frames in one batched pass -- far cheaper than per-frame calls.
        pil_images = [Image.fromarray(pixels) for _, pixels in video_frames]
        frame_embeddings = self.embedding_model.encode(pil_images, batch_size=32, show_progress_bar=True)

        # The first frame always opens the first scene.
        significant_events = [video_frames[0]]
        anchor_vector = frame_embeddings[0].reshape(1, -1)

        for position in range(1, len(frame_embeddings)):
            candidate_vector = frame_embeddings[position].reshape(1, -1)

            # Similarity to the last kept keyframe (0.0 .. 1.0).
            similarity_score = cosine_similarity(anchor_vector, candidate_vector)[0][0]

            if similarity_score < sensitivity:
                timestamp = video_frames[position][0]
                print(f"  ✂️ New Scene detected at {timestamp:.1f}s (Similarity: {similarity_score:.2f})")
                significant_events.append(video_frames[position])
                anchor_vector = candidate_vector

        print(f"🦅 Scout: Condensed video into {len(significant_events)} key semantic events.")
        return significant_events
src/utils/video.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ from pathlib import Path
4
+ from typing import Generator, Tuple, List
5
+ import decord
6
+ from decord import VideoReader, cpu
7
+
8
# Route decord's output through the torch bridge (get_batch then returns
# torch tensors instead of decord NDArrays).
# NOTE(review): the previous comment claimed this "fixes the decord seed",
# which set_bridge does not do. Also, extract_frames_decord() below calls
# .asnumpy() on the batch, which torch tensors do not provide -- verify the
# intended bridge setting.
decord.bridge.set_bridge('torch')
10
+
11
def extract_frames_decord(video_path: Path, fps: float = 1.0) -> Generator[Tuple[float, np.ndarray], None, None]:
    """Efficiently extracts frames from a video using Decord.

    Yields (timestamp_seconds, frame) pairs sampled at roughly *fps* frames
    per second, reading frames in batches for speed.

    Raises:
        FileNotFoundError: if *video_path* does not exist.
    """
    if not video_path.exists():
        raise FileNotFoundError(f"Video not found: {video_path}")

    vr = VideoReader(str(video_path), ctx=cpu(0))
    original_fps = vr.get_avg_fps()

    # Sample every `step`-th frame to approximate the requested fps.
    step = max(1, int(original_fps / fps))

    indices = list(range(0, len(vr), step))

    # Batch extraction is much faster than seeking frame-by-frame.
    batch_size = 32
    for i in range(0, len(indices), batch_size):
        batch_indices = indices[i : i + batch_size]
        batch = vr.get_batch(batch_indices)
        # BUG FIX: with decord's bridge set to 'torch' (module level),
        # get_batch returns a torch.Tensor, which has no .asnumpy() --
        # the old code crashed with AttributeError. Support both bridges.
        frames = batch.asnumpy() if hasattr(batch, "asnumpy") else batch.numpy()

        for j, frame in enumerate(frames):
            idx = batch_indices[j]
            timestamp = idx / original_fps
            yield timestamp, frame
35
+
36
def calculate_ssim_simplified(img1: np.ndarray, img2: np.ndarray) -> float:
    """Calculates a simple structural similarity score (MSE based).

    Returns a value in (0.0, 1.0]; 1.0 means identical grayscale content.
    *img2* is resized to match *img1* if their shapes differ.
    """
    if img1.shape != img2.shape:
        img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0]))

    g1 = cv2.cvtColor(img1, cv2.COLOR_RGB2GRAY)
    g2 = cv2.cvtColor(img2, cv2.COLOR_RGB2GRAY)

    # BUG FIX: the grayscale frames are uint8; subtracting them directly wraps
    # around (e.g. 0 - 1 == 255), wildly inflating the MSE for dark-vs-bright
    # pixels in either direction. Promote to float before differencing.
    diff = g1.astype(np.float64) - g2.astype(np.float64)
    mse = float(np.mean(diff ** 2))

    if mse == 0:
        return 1.0
    return 1.0 / (1.0 + (mse / 1000.0))
47
+
48
def extract_key_scenes(video_path: Path, threshold: float = 0.85) -> List[Tuple[float, np.ndarray]]:
    """
    Extracts ONLY significant scene changes (Keyframes).
    Reduces 60 frames -> 5-10 keyframes.
    """
    print("🎬 Detecting Scenes...")
    selected: List[Tuple[float, np.ndarray]] = []
    reference_frame = None

    # Walk the video at 1 FPS, keeping a frame whenever it diverges enough
    # from the last keyframe we kept.
    for timestamp, frame in extract_frames_decord(video_path, fps=1.0):
        is_new_scene = (
            reference_frame is None
            or calculate_ssim_simplified(reference_frame, frame) < threshold
        )
        if is_new_scene:
            selected.append((timestamp, frame))
            reference_frame = frame

    print(f"🎬 Found {len(selected)} unique scenes.")
    return selected
tests/test_graph.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from unittest.mock import MagicMock
3
+ from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
4
+ from src.core.graph import create_agent_graph
5
+
6
class TestAgentGraph:
    def test_graph_structure(self):
        """The agent graph should compile when given mocked dependencies."""
        fake_perception = MagicMock()
        fake_memory = MagicMock()

        compiled_app = create_agent_graph(fake_perception, fake_memory)

        # The compiled object exposes no easy node introspection from the
        # outside; a successful compile is itself the structural check.
        assert compiled_app is not None

    def test_tool_node_execution(self):
        """
        We can't easily mock the entire LangGraph execution loop in a unit test
        without spinning up the full runtime.
        Instead, we will test the 'Tools' individually here to ensure they
        interface with the Graph correctly.
        """
        pass
tests/test_memory.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from src.memory.manager import SimpleMemoryManager
3
+ import json
4
+ import os
5
+
6
class TestMemoryManager:

    def test_initialize_storage(self, tmp_path):
        """A fresh video gets an empty metadata structure on disk."""
        video_id = "test_video_01"
        manager = SimpleMemoryManager(storage_dir=tmp_path)

        manager.initialize_storage(video_id)

        metadata_file = tmp_path / f"{video_id}_metadata.json"
        assert metadata_file.exists()

        payload = json.loads(metadata_file.read_text())
        assert payload["video_id"] == video_id
        assert payload["events"] == []
        assert payload["entities"] == {}

    def test_commit_event(self, tmp_path):
        """Committed events land in the persisted timeline."""
        video_id = "test_video_01"
        manager = SimpleMemoryManager(storage_dir=tmp_path)
        manager.initialize_storage(video_id)

        manager.commit_event(
            video_id=video_id,
            timestamp="00:05",
            description="Man walks dog",
            metadata={"objects": ["man", "dog"]}
        )

        payload = json.loads((tmp_path / f"{video_id}_metadata.json").read_text())
        assert len(payload["events"]) == 1
        first_event = payload["events"][0]
        assert first_event["description"] == "Man walks dog"
        assert first_event["timestamp"] == "00:05"

    def test_query_knowledge(self, tmp_path):
        """Keyword search only returns matching events."""
        video_id = "test_video_01"
        manager = SimpleMemoryManager(storage_dir=tmp_path)
        manager.initialize_storage(video_id)

        manager.commit_event(video_id, "00:01", "Car drives by", {})
        manager.commit_event(video_id, "00:05", "Man walks dog", {})

        hits = manager.query_knowledge(video_id, "dog")
        assert len(hits) == 1
        assert hits[0]["description"] == "Man walks dog"
tests/test_orchestrator.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from unittest.mock import MagicMock
3
+ from src.core.orchestrator import VideoAgent
4
+ from src.interfaces.base import PerceptionEngine, MemoryManager
5
+
6
class TestVideoAgent:
    def test_agent_answers_from_memory(self):
        """When memory already holds the answer, no vision call is made."""
        perception_stub = MagicMock(spec=PerceptionEngine)
        memory_stub = MagicMock(spec=MemoryManager)

        # Memory returns a hit for the question.
        memory_stub.query_knowledge.return_value = [
            {"timestamp": "00:05", "description": "The dog is jumping."}
        ]

        agent = VideoAgent(perception=perception_stub, memory=memory_stub)

        response = agent.ask("What is the dog doing?", video_id="test_vid")

        assert "jumping" in response
        # The vision engine must stay idle when memory already had the answer.
        perception_stub.analyze_video_segment.assert_not_called()

    def test_agent_uses_perception_if_memory_fails(self):
        """When memory misses, the agent falls back to vision lookups."""
        perception_stub = MagicMock(spec=PerceptionEngine)
        memory_stub = MagicMock(spec=MemoryManager)

        # Memory miss should force a visual scan.
        memory_stub.query_knowledge.return_value = []

        # Vision engine returns info
        perception_stub.analyze_video_segment.return_value = "The car is red."

        agent = VideoAgent(perception=perception_stub, memory=memory_stub)

        response = agent.ask("What color is the car?", video_id="test_vid")

        # Deciding *where* to look on a memory miss is not yet defined for the
        # MVP (scan a default segment? the beginning?), so the fallback
        # assertion is deliberately left open until that logic exists.
        pass  # To be defined in logic
tests/test_perception.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from unittest.mock import MagicMock, patch, ANY
3
+ from pathlib import Path
4
+ import sys
5
+
6
+ # 1. Mock the dependencies BEFORE import
7
+ mock_llama = MagicMock()
8
+ mock_chat_handler = MagicMock()
9
+ sys.modules["llama_cpp"] = mock_llama
10
+ sys.modules["llama_cpp.llama_chat_format"] = MagicMock()
11
+ sys.modules["llama_cpp.llama_chat_format"].Llava15ChatHandler = mock_chat_handler
12
+
13
+ from src.perception.engine import Qwen2PerceptionEngine
14
+ from src.config.settings import settings
15
+
16
class TestPerceptionEngine:

    @patch("src.perception.engine.settings")
    def test_load_model_calls_llama_correctly(self, mock_settings):
        """Test that load_model initializes the Llama class with GPU settings."""
        # Setup
        engine = Qwen2PerceptionEngine()
        fake_path = Path("/tmp/fake_model.gguf")
        fake_projector = Path("/tmp/mmproj.gguf")

        # Mock the glob search for the projector
        mock_settings.paths.models_dir.glob.return_value = [fake_projector]

        # Act
        engine.load_model(fake_path)

        # Assert
        # 1. Check if it looked for the projector
        mock_settings.paths.models_dir.glob.assert_called()

        # 2. Check if ChatHandler was initialized
        mock_chat_handler.assert_called_with(clip_model_path=str(fake_projector))

        # 3. Check if Llama was instantiated with GPU layers
        mock_llama.Llama.assert_called_with(
            model_path=str(fake_path),
            chat_handler=ANY,  # The instance of chat handler
            n_ctx=2048,
            n_gpu_layers=-1,  # Important: Must be -1 for full GPU
            n_batch=512,
            verbose=False
        )

    def test_analyze_frame_structure(self):
        """Test that analyze_frame constructs the correct message format for Qwen."""
        engine = Qwen2PerceptionEngine()

        # BUG FIX: the engine stores its model as `_vision_language_model`,
        # not `_model` -- mocking the wrong attribute left the real lazy-load
        # path active and the mock unused.
        engine._vision_language_model = MagicMock()
        engine._vision_language_model.create_chat_completion.return_value = {
            "choices": [{"message": {"content": "A dog on a bike"}}]
        }

        # Mock image loader to avoid file IO
        with patch("src.perception.engine.open", create=True) as mock_open:
            mock_open.return_value.__enter__.return_value.read.return_value = b"fake_image_bytes"

            result = engine.analyze_frame("test.jpg", "Describe this")

        # Assert
        assert result == "A dog on a bike"

        # Verify the prompt structure
        calls = engine._vision_language_model.create_chat_completion.call_args
        messages = calls.kwargs['messages']

        # BUG FIX: the engine sends a single user turn (no system prompt),
        # so the user message is at index 0, not 1.
        assert len(messages) == 1
        assert messages[0]['role'] == 'user'
        # Check content list (Image + Text)
        content = messages[0]['content']
        assert content[0]['type'] == 'image_url'
        assert content[1]['type'] == 'text'
        assert content[1]['text'] == 'Describe this'
tests/verify_pipeline.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from unittest.mock import MagicMock
3
+ import numpy as np
4
+
5
+ # Mock heavy dependencies before imports
6
+ sys.modules["sentence_transformers"] = MagicMock()
7
+ sys.modules["llama_cpp"] = MagicMock()
8
+ sys.modules["llama_cpp.llama_chat_format"] = MagicMock()
9
+ sys.modules["decord"] = MagicMock()
10
+
11
+ # Import core modules
12
+ from src.core.orchestrator import VideoAgent
13
+ from src.memory.vector_index import VectorIndex
14
+ from src.interfaces.base import PerceptionEngine, MemoryManager
15
+
16
def test_pipeline_flow():
    """End-to-end smoke test of the agent pipeline with mocked heavy deps.

    Verifies two flows: a semantic SEARCH query (scout -> index -> analyst)
    and a SUMMARY query (skips search, samples multiple segments).
    """
    print("🧪 Starting Pipeline Verification Test...")

    # 1. Setup Mocks
    mock_perception = MagicMock(spec=PerceptionEngine)
    # Mock text generation
    mock_perception.generate_text.side_effect = ["The man is walking."]  # Synthesizer output
    # Mock vision analysis
    mock_perception.analyze_video_segment.return_value = "A man walking on the street."

    mock_memory = MagicMock(spec=MemoryManager)

    # Mock Context (Scout + Index)
    mock_scout = MagicMock()
    mock_scout.embed_text.return_value = np.array([0.1, 0.2])

    mock_index = MagicMock(spec=VectorIndex)
    mock_index.timestamps = [0.0, 5.0, 10.0]  # Dummy data
    mock_index.search.return_value = [(5.0, 0.95)]  # Found match at 5s

    # 2. Initialize Agent
    # BUG FIX: the original referenced an undefined name `perception`,
    # raising NameError before any verification could run.
    agent = VideoAgent(mock_perception, mock_memory)
    agent.context = {
        "scout": mock_scout,
        "index": mock_index
    }

    # 3. Execute "Search" Flow
    print("   🔹 Testing Search Query: 'Find the man'")
    response = agent.ask("Find the man", "video_id")

    print(f"   ✅ Response: {response}")

    # Verifications
    # Planner should have chosen SEARCH (implied by calling scout)
    mock_scout.embed_text.assert_called_once()
    mock_index.search.assert_called_once()
    # Analyst should have been called for timestamp 5.0
    mock_perception.analyze_video_segment.assert_called()
    args = mock_perception.analyze_video_segment.call_args[1]
    assert args['start_time'] == 5.0

    print("   🎉 Search Pipeline Verified!")

    # 4. Execute "Summary" Flow
    print("   🔹 Testing Summary Query: 'Summarize the video'")
    # Reset mocks (NOTE: reset_mock() does not restore an exhausted
    # side_effect -- the summary flow relies on return_value paths).
    mock_perception.reset_mock()
    mock_scout.reset_mock()

    agent.ask("Summarize the video", "video_id")

    # Verifications
    # Planner should choose SUMMARY -> Skip search, go straight to sampling
    mock_scout.embed_text.assert_not_called()
    # Should scan multiple timestamps (we defined 5 evenly spaced in graph.py)
    assert mock_perception.analyze_video_segment.call_count >= 2

    print("   🎉 Summary Pipeline Verified!")
75
+
76
if __name__ == "__main__":
    # Run the smoke test as a script; a non-zero exit signals CI failure.
    try:
        test_pipeline_flow()
    except Exception as e:
        print(f"\n❌ TEST FAILED: {e}")
        sys.exit(1)
    else:
        print("\n✅ SYSTEM INTEGRITY CONFIRMED.")