# Video-Scout — src/app.py
# Author: ashleshp — commit b6192e4 ("Switch transformers")
import streamlit as st
import shutil
import time
import os
import sys
import cv2
from pathlib import Path
from huggingface_hub import hf_hub_download
# Add project root to python path
sys.path.append(os.getcwd())
# Internal Modules
from src.perception.engine import Qwen2PerceptionEngine
from src.perception.scout import VisualScout
from src.memory.manager import SimpleMemoryManager
from src.memory.vector_index import VectorIndex
from src.core.orchestrator import VideoAgent
from src.config.settings import settings
from src.utils.video import extract_frames_decord
# --- PAGE CONFIGURATION ---
st.set_page_config(
page_title="Visual Scout AI",
page_icon="πŸ¦…",
layout="wide",
initial_sidebar_state="expanded"
)
# --- SYSTEM SETUP ---
@st.cache_resource
def initialize_system():
    """
    Construct the core system components once per process (cached by Streamlit).

    Returns:
        tuple: (perception_engine, visual_scout, memory_manager, video_agent)
    """
    print("🚀 System Startup: Initializing Native Transformers Engine...")
    # The Analyst: native Qwen2-VL wrapper (model lazy-loads on first use).
    engine = Qwen2PerceptionEngine()
    # The Scout: lightweight CPU-side visual search.
    scout = VisualScout()
    # The Memory Manager: persists event metadata under <data_dir>/metadata.
    memory = SimpleMemoryManager(storage_dir=settings.paths.data_dir / "metadata")
    # The Orchestrator: ties the analyst and memory together to answer queries.
    agent = VideoAgent(engine, memory)
    return engine, scout, memory, agent
# Load (or fetch from Streamlit's resource cache) the core components.
perception_engine, visual_scout, memory_manager, video_agent = initialize_system()

# --- SIDEBAR ---
st.sidebar.title("1. Upload Video")
uploaded_file = st.sidebar.file_uploader("Select a video file", type=["mp4", "avi", "mov"])

# --- SESSION STATE INITIALIZATION ---
# Seed each per-session key exactly once; Streamlit keeps them across reruns.
_SESSION_DEFAULTS = {
    "active_video_id": None,
    "is_video_processed": False,
    "chat_history": [],
    "visual_memory": None,
    "text_memory": None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
# --- MAIN UI ---
st.title("🦅 Visual Scout")
st.caption("Agentic Video Understanding System")

if uploaded_file is not None:
    # Generate a simple ID from the filename (text before the FIRST dot).
    # NOTE(review): "my.clip.mp4" yields "my" — Path(name).stem may be safer; confirm IDs stay unique.
    current_video_id = uploaded_file.name.split(".")[0]

    # Detect if a new video was uploaded: reset all per-video session state.
    if st.session_state.active_video_id != current_video_id:
        st.session_state.active_video_id = current_video_id
        st.session_state.is_video_processed = False
        st.session_state.chat_history = []
        st.session_state.visual_memory = None
        st.session_state.text_memory = None

    # Save the file locally. Note this runs on EVERY Streamlit rerun while a file is attached.
    # NOTE(review): the upload is always written with a .mp4 suffix, even for .avi/.mov inputs —
    # confirm downstream readers (decord/cv2) accept that.
    local_video_path = settings.paths.data_dir / f"{current_video_id}.mp4"
    with open(local_video_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.toast(f"Video '{current_video_id}' loaded.")

    # --- PROCESSING PIPELINE (runs once per video, then flips is_video_processed) ---
    if not st.session_state.is_video_processed:
        st.divider()
        st.header("🧠 Analyzing Video Content")
        st.info("The agent is watching the video to build a semantic index. This happens once per video.")
        video_path = settings.paths.data_dir / f"{current_video_id}.mp4"

        # Initialize memory indices: one over raw frame embeddings, one over caption text.
        visual_index_path = settings.paths.data_dir / f"{current_video_id}.visual.idx"
        text_index_path = settings.paths.data_dir / f"{current_video_id}.text.idx"
        visual_memory_index = VectorIndex(visual_index_path)
        text_memory_index = VectorIndex(text_index_path)

        # Store in session so the chat branch can reuse them after st.rerun().
        st.session_state.visual_memory = visual_memory_index
        st.session_state.text_memory = text_memory_index
        memory_manager.initialize_storage(current_video_id)

        with st.status("🦅 Scout: Scanning video timeline...", expanded=True) as status:
            # Step 1: Extract frames (1 frame per second).
            status.write("Extracting frames at 1 FPS...")
            raw_frames = list(extract_frames_decord(video_path, fps=1.0))

            # Step 2: Semantic cuts — frames where the scene visibly changes.
            status.write(f"Analyzing {len(raw_frames)} frames for scene changes...")
            key_events = visual_scout.detect_semantic_changes(raw_frames, sensitivity=0.90)

            # Index every frame's visual embedding, keyed by timestamp.
            for timestamp, frame in raw_frames:
                embedding = visual_scout.embed_image(frame)
                visual_memory_index.add(timestamp, embedding)
            visual_memory_index.save()
            status.write(f"Detected {len(key_events)} key semantic events.")

            # Step 3: Deep captioning of each key event with the VLM.
            progress_bar = st.progress(0)
            event_log = []
            ANALYSIS_PROMPT = """Analyze this scene.
1. Describe the main action and subject.
2. Note any text or signs visible.
3. Describe the environment."""
            for i, (timestamp, frame) in enumerate(key_events):
                status.write(f"👁️ Analyst: Describing Event {i+1} at {timestamp:.1f}s...")
                # Save a temp frame for the VLM (the engine takes a file path).
                # cv2.imwrite expects BGR, hence the conversion — assumes frames arrive as RGB; TODO confirm.
                temp_frame_path = settings.paths.data_dir / "temp_scene.jpg"
                cv2.imwrite(str(temp_frame_path), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                description = perception_engine.analyze_frame(str(temp_frame_path), ANALYSIS_PROMPT)
                # Human-readable MM:SS label for the event log and stored metadata.
                time_string = time.strftime('%M:%S', time.gmtime(timestamp))
                memory_manager.commit_event(current_video_id, time_string, description, {})
                event_log.append(f"**{time_string}**: {description}")
                # Index caption text (for semantic search later).
                text_embedding = visual_scout.embed_text(description)
                text_memory_index.add(timestamp, text_embedding, extra_data={"text": description})
                progress_bar.progress((i + 1) / len(key_events))
            text_memory_index.save()

            # Step 4: Global summary written from the full captioned timeline.
            status.write("📝 Writing Global Summary...")
            full_timeline_text = "\n".join(event_log)
            summary_prompt = f"""<|im_start|>system
You are a video editor. Read the timeline below and write a concise summary of the entire video.
TIMELINE:
{full_timeline_text}
<|im_end|>
<|im_start|>assistant
"""
            summary = perception_engine.generate_text(summary_prompt, stop=["<|im_end|>"])
            memory_manager.save_summary(current_video_id, summary)
            status.update(label="✅ Analysis Complete!", state="complete", expanded=False)

        st.session_state.is_video_processed = True
        st.success("Video Index Ready.")
        st.markdown(f"**Summary:** {summary}")
        with st.expander("See Detailed Timeline"):
            for event in event_log:
                st.write(event)
        # Rerun to switch to Chat Mode cleanly.
        st.rerun()

    # --- CHAT INTERFACE (video already indexed) ---
    else:
        st.divider()
        # Replay stored chat history on every rerun.
        for message in st.session_state.chat_history:
            with st.chat_message(message["role"]):
                st.write(message["content"])

        # Chat input
        if user_query := st.chat_input("Ask about the video..."):
            # Record and echo the user message.
            st.session_state.chat_history.append({"role": "user", "content": user_query})
            with st.chat_message("user"):
                st.write(user_query)

            # Generate the answer.
            with st.chat_message("assistant"):
                with st.spinner("Agent is thinking..."):
                    # Inject tools/indices into the agent before asking.
                    video_agent.context = {
                        "scout": visual_scout,
                        "vis_index": st.session_state.visual_memory,
                        "txt_index": st.session_state.text_memory
                    }
                    response_text = video_agent.ask(user_query, st.session_state.active_video_id)
                    st.write(response_text)
            # Persist the assistant reply so it survives the next rerun.
            st.session_state.chat_history.append({"role": "assistant", "content": response_text})