# Video-Scout — src/app.py
# Author: ashleshp — commit b6192e4 ("Switch transformers")
import streamlit as st
import shutil
import time
import os
import sys
import cv2
from pathlib import Path
from huggingface_hub import hf_hub_download
# Add project root to python path
sys.path.append(os.getcwd())
# Internal Modules
from src.perception.engine import Qwen2PerceptionEngine
from src.perception.scout import VisualScout
from src.memory.manager import SimpleMemoryManager
from src.memory.vector_index import VectorIndex
from src.core.orchestrator import VideoAgent
from src.config.settings import settings
from src.utils.video import extract_frames_decord
# --- PAGE CONFIGURATION ---
st.set_page_config(
page_title="Visual Scout AI",
page_icon="πŸ¦…",
layout="wide",
initial_sidebar_state="expanded"
)
# --- SYSTEM SETUP ---
@st.cache_resource
def initialize_system():
    """
    Construct the core system components once per process (cached by Streamlit).

    Returns:
        tuple: (perception_engine, visual_scout, memory_manager, video_agent)
    """
    print("🚀 System Startup: Initializing Native Transformers Engine...")
    # The Analyst: native Qwen2-VL wrapper (model lazy-loads on first use).
    engine = Qwen2PerceptionEngine()
    # The Scout: lightweight CPU-side visual search.
    scout = VisualScout()
    # The Memory Manager: persists event metadata under <data_dir>/metadata.
    memory = SimpleMemoryManager(storage_dir=settings.paths.data_dir / "metadata")
    # The Orchestrator: ties the analyst and memory together to answer queries.
    agent = VideoAgent(engine, memory)
    return engine, scout, memory, agent
# Load (or fetch from Streamlit's resource cache) the core components.
perception_engine, visual_scout, memory_manager, video_agent = initialize_system()

# --- SIDEBAR ---
st.sidebar.title("1. Upload Video")
uploaded_file = st.sidebar.file_uploader("Select a video file", type=["mp4", "avi", "mov"])

# --- SESSION STATE INITIALIZATION ---
# Seed each per-session key exactly once; Streamlit keeps them across reruns.
_SESSION_DEFAULTS = {
    "active_video_id": None,
    "is_video_processed": False,
    "chat_history": [],
    "visual_memory": None,
    "text_memory": None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
# --- MAIN UI ---
st.title("🦅 Visual Scout")
st.caption("Agentic Video Understanding System")

if uploaded_file is not None:
    # Generate a simple ID from the filename (text before the FIRST dot).
    # NOTE(review): "my.clip.mp4" yields "my" — Path(name).stem may be safer; confirm IDs stay unique.
    current_video_id = uploaded_file.name.split(".")[0]

    # Detect if a new video was uploaded: reset all per-video session state.
    if st.session_state.active_video_id != current_video_id:
        st.session_state.active_video_id = current_video_id
        st.session_state.is_video_processed = False
        st.session_state.chat_history = []
        st.session_state.visual_memory = None
        st.session_state.text_memory = None

    # Save the file locally. Note this runs on EVERY Streamlit rerun while a file is attached.
    # NOTE(review): the upload is always written with a .mp4 suffix, even for .avi/.mov inputs —
    # confirm downstream readers (decord/cv2) accept that.
    local_video_path = settings.paths.data_dir / f"{current_video_id}.mp4"
    with open(local_video_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.toast(f"Video '{current_video_id}' loaded.")

    # --- PROCESSING PIPELINE (runs once per video, then flips is_video_processed) ---
    if not st.session_state.is_video_processed:
        st.divider()
        st.header("🧠 Analyzing Video Content")
        st.info("The agent is watching the video to build a semantic index. This happens once per video.")
        video_path = settings.paths.data_dir / f"{current_video_id}.mp4"

        # Initialize memory indices: one over raw frame embeddings, one over caption text.
        visual_index_path = settings.paths.data_dir / f"{current_video_id}.visual.idx"
        text_index_path = settings.paths.data_dir / f"{current_video_id}.text.idx"
        visual_memory_index = VectorIndex(visual_index_path)
        text_memory_index = VectorIndex(text_index_path)

        # Store in session so the chat branch can reuse them after st.rerun().
        st.session_state.visual_memory = visual_memory_index
        st.session_state.text_memory = text_memory_index
        memory_manager.initialize_storage(current_video_id)

        with st.status("🦅 Scout: Scanning video timeline...", expanded=True) as status:
            # Step 1: Extract frames (1 frame per second).
            status.write("Extracting frames at 1 FPS...")
            raw_frames = list(extract_frames_decord(video_path, fps=1.0))

            # Step 2: Semantic cuts — frames where the scene visibly changes.
            status.write(f"Analyzing {len(raw_frames)} frames for scene changes...")
            key_events = visual_scout.detect_semantic_changes(raw_frames, sensitivity=0.90)

            # Index every frame's visual embedding, keyed by timestamp.
            for timestamp, frame in raw_frames:
                embedding = visual_scout.embed_image(frame)
                visual_memory_index.add(timestamp, embedding)
            visual_memory_index.save()
            status.write(f"Detected {len(key_events)} key semantic events.")

            # Step 3: Deep captioning of each key event with the VLM.
            progress_bar = st.progress(0)
            event_log = []
            ANALYSIS_PROMPT = """Analyze this scene.
1. Describe the main action and subject.
2. Note any text or signs visible.
3. Describe the environment."""
            for i, (timestamp, frame) in enumerate(key_events):
                status.write(f"👁️ Analyst: Describing Event {i+1} at {timestamp:.1f}s...")
                # Save a temp frame for the VLM (the engine takes a file path).
                # cv2.imwrite expects BGR, hence the conversion — assumes frames arrive as RGB; TODO confirm.
                temp_frame_path = settings.paths.data_dir / "temp_scene.jpg"
                cv2.imwrite(str(temp_frame_path), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                description = perception_engine.analyze_frame(str(temp_frame_path), ANALYSIS_PROMPT)
                # Human-readable MM:SS label for the event log and stored metadata.
                time_string = time.strftime('%M:%S', time.gmtime(timestamp))
                memory_manager.commit_event(current_video_id, time_string, description, {})
                event_log.append(f"**{time_string}**: {description}")
                # Index caption text (for semantic search later).
                text_embedding = visual_scout.embed_text(description)
                text_memory_index.add(timestamp, text_embedding, extra_data={"text": description})
                progress_bar.progress((i + 1) / len(key_events))
            text_memory_index.save()

            # Step 4: Global summary written from the full captioned timeline.
            status.write("📝 Writing Global Summary...")
            full_timeline_text = "\n".join(event_log)
            summary_prompt = f"""<|im_start|>system
You are a video editor. Read the timeline below and write a concise summary of the entire video.
TIMELINE:
{full_timeline_text}
<|im_end|>
<|im_start|>assistant
"""
            summary = perception_engine.generate_text(summary_prompt, stop=["<|im_end|>"])
            memory_manager.save_summary(current_video_id, summary)
            status.update(label="✅ Analysis Complete!", state="complete", expanded=False)

        st.session_state.is_video_processed = True
        st.success("Video Index Ready.")
        st.markdown(f"**Summary:** {summary}")
        with st.expander("See Detailed Timeline"):
            for event in event_log:
                st.write(event)
        # Rerun to switch to Chat Mode cleanly.
        st.rerun()

    # --- CHAT INTERFACE (video already indexed) ---
    else:
        st.divider()
        # Replay stored chat history on every rerun.
        for message in st.session_state.chat_history:
            with st.chat_message(message["role"]):
                st.write(message["content"])

        # Chat input
        if user_query := st.chat_input("Ask about the video..."):
            # Record and echo the user message.
            st.session_state.chat_history.append({"role": "user", "content": user_query})
            with st.chat_message("user"):
                st.write(user_query)

            # Generate the answer.
            with st.chat_message("assistant"):
                with st.spinner("Agent is thinking..."):
                    # Inject tools/indices into the agent before asking.
                    video_agent.context = {
                        "scout": visual_scout,
                        "vis_index": st.session_state.visual_memory,
                        "txt_index": st.session_state.text_memory
                    }
                    response_text = video_agent.ask(user_query, st.session_state.active_video_id)
                    st.write(response_text)
            # Persist the assistant reply so it survives the next rerun.
            st.session_state.chat_history.append({"role": "assistant", "content": response_text})