import streamlit as st
import shutil
import time
import os
import sys
import cv2
from pathlib import Path
from huggingface_hub import hf_hub_download

# Add project root to python path so the `src` package resolves when the
# app is launched from the repository root.
sys.path.append(os.getcwd())

# Internal Modules
from src.perception.engine import Qwen2PerceptionEngine
from src.perception.scout import VisualScout
from src.memory.manager import SimpleMemoryManager
from src.memory.vector_index import VectorIndex
from src.core.orchestrator import VideoAgent
from src.config.settings import settings
from src.utils.video import extract_frames_decord

# --- PAGE CONFIGURATION ---
st.set_page_config(
    page_title="Visual Scout AI",
    page_icon="🦅",
    layout="wide",
    initial_sidebar_state="expanded",
)


def _format_timestamp(seconds: float) -> str:
    """Render a timestamp (seconds) as MM:SS, or H:MM:SS past one hour.

    Replaces ``time.strftime('%M:%S', time.gmtime(ts))``, which silently
    wrapped at 60 minutes and mislabeled events in videos longer than an hour.
    """
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    if hours:
        return f"{hours}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"


# --- SYSTEM SETUP ---
@st.cache_resource
def initialize_system():
    """Build and cache the heavyweight components of the agent stack.

    Decorated with ``st.cache_resource`` so models load once per server
    process rather than on every Streamlit rerun.

    Returns:
        tuple: (perception_engine, visual_scout, memory_manager, video_agent)
    """
    print("🚀 System Startup: Initializing Native Transformers Engine...")

    # 1. The Analyst (Native Qwen2-VL).
    # Model will lazy-load on first use or we can trigger it here.
    perception_engine = Qwen2PerceptionEngine()

    # 2. The Scout (Fast Search, CPU)
    visual_scout = VisualScout()

    # 3. The Memory Manager (persists event metadata per video)
    memory_manager = SimpleMemoryManager(storage_dir=settings.paths.data_dir / "metadata")

    # 4. The Agent Orchestrator ties perception and memory together
    video_agent = VideoAgent(perception_engine, memory_manager)

    return perception_engine, visual_scout, memory_manager, video_agent


# Load the system (cached across reruns)
perception_engine, visual_scout, memory_manager, video_agent = initialize_system()

# --- SIDEBAR ---
st.sidebar.title("1. Upload Video")
uploaded_file = st.sidebar.file_uploader("Select a video file", type=["mp4", "avi", "mov"])

# --- SESSION STATE INITIALIZATION ---
# All keys are created up-front so later code can read them unconditionally.
if "active_video_id" not in st.session_state:
    st.session_state.active_video_id = None
if "is_video_processed" not in st.session_state:
    st.session_state.is_video_processed = False
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
if "visual_memory" not in st.session_state:
    st.session_state.visual_memory = None
if "text_memory" not in st.session_state:
    st.session_state.text_memory = None

# --- MAIN UI ---
st.title("🦅 Visual Scout")
st.caption("Agentic Video Understanding System")

if uploaded_file is not None:
    # Derive a stable ID from the filename. Path(...).stem keeps everything
    # before the final extension — the previous split(".")[0] truncated names
    # containing dots ("demo.v2.mp4" -> "demo"), colliding across files.
    current_video_id = Path(uploaded_file.name).stem

    # Detect if a new video was uploaded; reset all per-video state if so.
    if st.session_state.active_video_id != current_video_id:
        st.session_state.active_video_id = current_video_id
        st.session_state.is_video_processed = False
        st.session_state.chat_history = []
        st.session_state.visual_memory = None
        st.session_state.text_memory = None

        # Save the file locally (always stored as .mp4; the downstream
        # pipeline reads this same path).
        local_video_path = settings.paths.data_dir / f"{current_video_id}.mp4"
        with open(local_video_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.toast(f"Video '{current_video_id}' loaded.")

    # --- PROCESSING PIPELINE ---
    if not st.session_state.is_video_processed:
        st.divider()
        st.header("🧠 Analyzing Video Content")
        st.info("The agent is watching the video to build a semantic index. This happens once per video.")

        video_path = settings.paths.data_dir / f"{current_video_id}.mp4"

        # Initialize Memory Indices (one visual, one textual, both per video)
        visual_index_path = settings.paths.data_dir / f"{current_video_id}.visual.idx"
        text_index_path = settings.paths.data_dir / f"{current_video_id}.text.idx"
        visual_memory_index = VectorIndex(visual_index_path)
        text_memory_index = VectorIndex(text_index_path)

        # Store in session so the chat branch can inject them into the agent
        st.session_state.visual_memory = visual_memory_index
        st.session_state.text_memory = text_memory_index

        memory_manager.initialize_storage(current_video_id)

        with st.status("🦅 Scout: Scanning video timeline...", expanded=True) as status:
            # Step 1: Extract Frames
            status.write("Extracting frames at 1 FPS...")
            raw_frames = list(extract_frames_decord(video_path, fps=1.0))

            # Step 2: Semantic Cuts — keep only frames where the scene changes
            status.write(f"Analyzing {len(raw_frames)} frames for scene changes...")
            key_events = visual_scout.detect_semantic_changes(raw_frames, sensitivity=0.90)

            # Index every raw frame visually so "find the moment where..."
            # queries can hit frames that are not key events.
            for timestamp, frame in raw_frames:
                embedding = visual_scout.embed_image(frame)
                visual_memory_index.add(timestamp, embedding)
            visual_memory_index.save()

            status.write(f"Detected {len(key_events)} key semantic events.")

            # Step 3: Deep Captioning of key events only (VLM calls are slow)
            progress_bar = st.progress(0)
            event_log = []
            ANALYSIS_PROMPT = """Analyze this scene.
1. Describe the main action and subject.
2. Note any text or signs visible.
3. Describe the environment."""

            # Scratch file reused for every frame handed to the VLM
            temp_frame_path = settings.paths.data_dir / "temp_scene.jpg"
            for i, (timestamp, frame) in enumerate(key_events):
                status.write(f"👁️ Analyst: Describing Event {i+1} at {timestamp:.1f}s...")

                # Frames arrive RGB; OpenCV writes BGR, hence the conversion.
                cv2.imwrite(str(temp_frame_path), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                description = perception_engine.analyze_frame(str(temp_frame_path), ANALYSIS_PROMPT)

                # Hour-safe timestamp label (MM:SS or H:MM:SS)
                time_string = _format_timestamp(timestamp)
                memory_manager.commit_event(current_video_id, time_string, description, {})
                event_log.append(f"**{time_string}**: {description}")

                # Index Text (for semantic search later)
                text_embedding = visual_scout.embed_text(description)
                text_memory_index.add(timestamp, text_embedding, extra_data={"text": description})

                progress_bar.progress((i + 1) / len(key_events))

            # Clean up the scratch frame once captioning is done
            temp_frame_path.unlink(missing_ok=True)
            text_memory_index.save()

            # Step 4: Summary over the whole timeline
            status.write("📝 Writing Global Summary...")
            full_timeline_text = "\n".join(event_log)
            summary_prompt = f"""<|im_start|>system
You are a video editor. Read the timeline below and write a concise summary of the entire video.

TIMELINE:
{full_timeline_text}
<|im_end|>
<|im_start|>assistant
"""
            summary = perception_engine.generate_text(summary_prompt, stop=["<|im_end|>"])
            memory_manager.save_summary(current_video_id, summary)

            status.update(label="✅ Analysis Complete!", state="complete", expanded=False)

        st.session_state.is_video_processed = True
        st.success("Video Index Ready.")
        st.markdown(f"**Summary:** {summary}")
        with st.expander("See Detailed Timeline"):
            for event in event_log:
                st.write(event)

        # Rerun to switch to Chat Mode cleanly
        st.rerun()

    # --- CHAT INTERFACE ---
    else:
        st.divider()

        # Display History
        for message in st.session_state.chat_history:
            with st.chat_message(message["role"]):
                st.write(message["content"])

        # Chat Input
        if user_query := st.chat_input("Ask about the video..."):
            # Add User Message
            st.session_state.chat_history.append({"role": "user", "content": user_query})
            with st.chat_message("user"):
                st.write(user_query)

            # Generate Answer
            with st.chat_message("assistant"):
                with st.spinner("Agent is thinking..."):
                    # Inject Tools/Context into the Agent so it can search
                    # both the visual and textual indices built above.
                    video_agent.context = {
                        "scout": visual_scout,
                        "vis_index": st.session_state.visual_memory,
                        "txt_index": st.session_state.text_memory,
                    }
                    response_text = video_agent.ask(user_query, st.session_state.active_video_id)
                    st.write(response_text)

            # Add Assistant Message
            st.session_state.chat_history.append({"role": "assistant", "content": response_text})