File size: 8,605 Bytes
fca155a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6192e4
fca155a
b6192e4
fca155a
b6192e4
fca155a
b6192e4
fca155a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import streamlit as st
import shutil
import time
import os
import sys
import cv2
from pathlib import Path
from huggingface_hub import hf_hub_download

# Add project root to python path
sys.path.append(os.getcwd())

# Internal Modules
from src.perception.engine import Qwen2PerceptionEngine
from src.perception.scout import VisualScout
from src.memory.manager import SimpleMemoryManager
from src.memory.vector_index import VectorIndex
from src.core.orchestrator import VideoAgent
from src.config.settings import settings
from src.utils.video import extract_frames_decord

# --- PAGE CONFIGURATION ---
st.set_page_config(
    page_title="Visual Scout AI", 
    page_icon="πŸ¦…", 
    layout="wide",
    initial_sidebar_state="expanded"
)

# --- SYSTEM SETUP ---

@st.cache_resource
def initialize_system():
    """
    Build and wire the core subsystems once per server process.

    Cached via st.cache_resource, so Streamlit reruns reuse the same
    loaded models instead of re-initializing them.

    Returns:
        tuple: (perception engine, visual scout, memory manager,
        agent orchestrator) — in the order callers unpack them.
    """
    print("🚀 System Startup: Initializing Native Transformers Engine...")

    # The Analyst: native Qwen2-VL (weights lazy-load on first use).
    analyst = Qwen2PerceptionEngine()

    # The Scout: fast CPU-side visual search / embedding.
    scout = VisualScout()

    # The Memory Manager: persists event metadata under the data dir.
    memory = SimpleMemoryManager(storage_dir=settings.paths.data_dir / "metadata")

    # The Orchestrator: ties the analyst to the memory store.
    agent = VideoAgent(analyst, memory)

    return analyst, scout, memory, agent

# Load the system
perception_engine, visual_scout, memory_manager, video_agent = initialize_system()

# --- SIDEBAR ---
st.sidebar.title("1. Upload Video")
uploaded_file = st.sidebar.file_uploader("Select a video file", type=["mp4", "avi", "mov"])

# --- SESSION STATE INITIALIZATION ---
if "active_video_id" not in st.session_state: 
    st.session_state.active_video_id = None
if "is_video_processed" not in st.session_state: 
    st.session_state.is_video_processed = False
if "chat_history" not in st.session_state: 
    st.session_state.chat_history = []
if "visual_memory" not in st.session_state: 
    st.session_state.visual_memory = None
if "text_memory" not in st.session_state: 
    st.session_state.text_memory = None

# --- MAIN UI ---
st.title("πŸ¦… Visual Scout")
st.caption("Agentic Video Understanding System")

if uploaded_file is not None:
    # Derive a stable ID from the filename. Path(...).stem preserves
    # multi-dot names ("clip.final.mp4" -> "clip.final"); the previous
    # split(".")[0] truncated at the first dot, so two different uploads
    # could collide on the same ID.
    current_video_id = Path(uploaded_file.name).stem
    
    # A different upload invalidates every piece of per-video state.
    if st.session_state.active_video_id != current_video_id:
        st.session_state.active_video_id = current_video_id
        st.session_state.is_video_processed = False
        st.session_state.chat_history = []
        st.session_state.visual_memory = None
        st.session_state.text_memory = None
        
        # Persist the upload so the frame extractor can read it from disk.
        # NOTE(review): the file is always written with an .mp4 suffix even
        # for .avi/.mov uploads; the container bytes are unchanged so
        # decoding still works, but the on-disk extension can mislead.
        local_video_path = settings.paths.data_dir / f"{current_video_id}.mp4"
        with open(local_video_path, "wb") as f: 
            f.write(uploaded_file.getbuffer())
        st.toast(f"Video '{current_video_id}' loaded.")

    # --- PROCESSING PIPELINE (runs once per video) ---
    if not st.session_state.is_video_processed:
        st.divider()
        st.header("🧠 Analyzing Video Content")
        st.info("The agent is watching the video to build a semantic index. This happens once per video.")
        
        video_path = settings.paths.data_dir / f"{current_video_id}.mp4"
        
        # One index per modality: frame embeddings for visual search,
        # caption embeddings for semantic text search.
        visual_index_path = settings.paths.data_dir / f"{current_video_id}.visual.idx"
        text_index_path = settings.paths.data_dir / f"{current_video_id}.text.idx"
        
        visual_memory_index = VectorIndex(visual_index_path)
        text_memory_index = VectorIndex(text_index_path)
        
        # Keep the indices in session state so the chat branch can hand
        # them to the agent on later reruns.
        st.session_state.visual_memory = visual_memory_index
        st.session_state.text_memory = text_memory_index
        
        memory_manager.initialize_storage(current_video_id)
        
        with st.status("🦅 Scout: Scanning video timeline...", expanded=True) as status:
            
            # Step 1: Sample the timeline at 1 frame per second.
            status.write("Extracting frames at 1 FPS...")
            raw_frames = list(extract_frames_decord(video_path, fps=1.0))
            
            # Step 2: Cheap scene-change detection selects the few frames
            # worth sending to the expensive VLM.
            status.write(f"Analyzing {len(raw_frames)} frames for scene changes...")
            key_events = visual_scout.detect_semantic_changes(raw_frames, sensitivity=0.90)
            
            # Index every sampled frame for visual-similarity search.
            for timestamp, frame in raw_frames:
                embedding = visual_scout.embed_image(frame)
                visual_memory_index.add(timestamp, embedding)
            visual_memory_index.save()
            
            status.write(f"Detected {len(key_events)} key semantic events.")
            
            # Step 3: Deep captioning of each key event with the VLM.
            progress_bar = st.progress(0)
            event_log = []
            
            ANALYSIS_PROMPT = """Analyze this scene.
1. Describe the main action and subject.
2. Note any text or signs visible.
3. Describe the environment."""
            
            for i, (timestamp, frame) in enumerate(key_events):
                status.write(f"👁️ Analyst: Describing Event {i+1} at {timestamp:.1f}s...")
                
                # The VLM reads from a file path; reuse one scratch file.
                # Frames arrive RGB, cv2.imwrite expects BGR — hence the
                # color conversion.
                temp_frame_path = settings.paths.data_dir / "temp_scene.jpg"
                cv2.imwrite(str(temp_frame_path), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                
                description = perception_engine.analyze_frame(str(temp_frame_path), ANALYSIS_PROMPT)
                
                # Include hours past the one-hour mark; a bare %M:%S would
                # silently wrap (e.g. 1h05m -> "05:00") for long videos.
                time_format = '%H:%M:%S' if timestamp >= 3600 else '%M:%S'
                time_string = time.strftime(time_format, time.gmtime(timestamp))
                memory_manager.commit_event(current_video_id, time_string, description, {})
                event_log.append(f"**{time_string}**: {description}")
                
                # Index the caption text for semantic search later.
                text_embedding = visual_scout.embed_text(description)
                text_memory_index.add(timestamp, text_embedding, extra_data={"text": description})
                
                progress_bar.progress((i + 1) / len(key_events))
            
            text_memory_index.save()
            
            # Step 4: Condense the event timeline into a global summary.
            status.write("📝 Writing Global Summary...")
            full_timeline_text = "\n".join(event_log)
            summary_prompt = f"""<|im_start|>system
You are a video editor. Read the timeline below and write a concise summary of the entire video.
TIMELINE:
{full_timeline_text}
<|im_end|>
<|im_start|>assistant
"""
            summary = perception_engine.generate_text(summary_prompt, stop=["<|im_end|>"])
            memory_manager.save_summary(current_video_id, summary)
            
            status.update(label="✅ Analysis Complete!", state="complete", expanded=False)
            
        st.session_state.is_video_processed = True
        st.success("Video Index Ready.")
        st.markdown(f"**Summary:** {summary}")
        
        with st.expander("See Detailed Timeline"):
            for event in event_log: 
                st.write(event)
        
        # Rerun to switch to Chat Mode cleanly.
        # NOTE(review): the rerun immediately clears the summary/timeline
        # widgets rendered just above — confirm that flash is intended
        # (the summary remains retrievable via the memory manager).
        st.rerun()

    # --- CHAT INTERFACE (shown once the index exists) ---
    else:
        st.divider()
        
        # Replay prior conversation turns.
        for message in st.session_state.chat_history:
            with st.chat_message(message["role"]):
                st.write(message["content"])
                
        # Chat Input
        if user_query := st.chat_input("Ask about the video..."):
            
            # Record and echo the user's turn.
            st.session_state.chat_history.append({"role": "user", "content": user_query})
            with st.chat_message("user"):
                st.write(user_query)
                
            # Generate the assistant's answer.
            with st.chat_message("assistant"):
                with st.spinner("Agent is thinking..."):
                    
                    # Hand the agent its retrieval tools for this video.
                    video_agent.context = {
                        "scout": visual_scout, 
                        "vis_index": st.session_state.visual_memory,
                        "txt_index": st.session_state.text_memory
                    }
                    
                    response_text = video_agent.ask(user_query, st.session_state.active_video_id)
                    st.write(response_text)
            
            # Persist the assistant turn for replay on the next rerun.
            st.session_state.chat_history.append({"role": "assistant", "content": response_text})