# Scraped Hugging Face Spaces page header (Space status was "Runtime error") — not app code.
# --- Imports ---
# Standard library
import os
import shutil
import sys
import time
from pathlib import Path

# Third-party
import cv2
import streamlit as st
from huggingface_hub import hf_hub_download  # NOTE(review): unused in this file — confirm before removing

# Make the project root importable before pulling in the internal modules.
sys.path.append(os.getcwd())

# Internal modules
from src.perception.engine import Qwen2PerceptionEngine
from src.perception.scout import VisualScout
from src.memory.manager import SimpleMemoryManager
from src.memory.vector_index import VectorIndex
from src.core.orchestrator import VideoAgent
from src.config.settings import settings
from src.utils.video import extract_frames_decord
# --- PAGE CONFIGURATION ---
# Must run before any other st.* call in the script.
_PAGE_OPTIONS = {
    "page_title": "Visual Scout AI",
    "page_icon": "π¦ ",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_OPTIONS)
# --- SYSTEM SETUP ---
@st.cache_resource
def initialize_system():
    """Build the core pipeline components (engine, scout, memory, agent).

    Returns:
        tuple: (perception_engine, visual_scout, memory_manager, video_agent)

    Cached with ``st.cache_resource``: Streamlit re-executes this script on
    every user interaction, and without caching the heavy models would be
    re-instantiated on each rerun.
    """
    print("π System Startup: Initializing Native Transformers Engine...")
    # 1. The Analyst (native Qwen2-VL); the model lazy-loads on first use.
    perception_engine = Qwen2PerceptionEngine()
    # 2. The Scout (fast search, CPU)
    visual_scout = VisualScout()
    # 3. The Memory Manager (metadata stored under the configured data dir)
    memory_manager = SimpleMemoryManager(storage_dir=settings.paths.data_dir / "metadata")
    # 4. The Agent Orchestrator wiring perception and memory together
    video_agent = VideoAgent(perception_engine, memory_manager)
    return perception_engine, visual_scout, memory_manager, video_agent
# Load the system
perception_engine, visual_scout, memory_manager, video_agent = initialize_system()

# --- SIDEBAR ---
st.sidebar.title("1. Upload Video")
uploaded_file = st.sidebar.file_uploader("Select a video file", type=["mp4", "avi", "mov"])

# --- SESSION STATE INITIALIZATION ---
# Seed each per-session key exactly once; Streamlit keeps them across reruns.
_SESSION_DEFAULTS = {
    "active_video_id": None,
    "is_video_processed": False,
    "chat_history": [],
    "visual_memory": None,
    "text_memory": None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
# --- MAIN UI ---
st.title("π¦ Visual Scout")
st.caption("Agentic Video Understanding System")

if uploaded_file is not None:
    # Derive a stable ID from the filename. Path.stem (instead of
    # name.split(".")[0]) keeps filenames that contain extra dots intact.
    current_video_id = Path(uploaded_file.name).stem
    # Preserve the real container extension: previously every upload was
    # saved as ".mp4" even when the user uploaded .avi/.mov, mislabeling
    # the container for the downstream readers.
    video_suffix = Path(uploaded_file.name).suffix or ".mp4"
    local_video_path = settings.paths.data_dir / f"{current_video_id}{video_suffix}"

    # Detect a newly uploaded video and reset all per-video state.
    if st.session_state.active_video_id != current_video_id:
        st.session_state.active_video_id = current_video_id
        st.session_state.is_video_processed = False
        st.session_state.chat_history = []
        st.session_state.visual_memory = None
        st.session_state.text_memory = None
        # Persist the upload locally for the processing pipeline.
        local_video_path.parent.mkdir(parents=True, exist_ok=True)
        with open(local_video_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.toast(f"Video '{current_video_id}' loaded.")

    # --- PROCESSING PIPELINE ---
    if not st.session_state.is_video_processed:
        st.divider()
        st.header("π§ Analyzing Video Content")
        st.info("The agent is watching the video to build a semantic index. This happens once per video.")
        video_path = local_video_path

        # Initialize memory indices (one visual, one textual per video).
        visual_memory_index = VectorIndex(settings.paths.data_dir / f"{current_video_id}.visual.idx")
        text_memory_index = VectorIndex(settings.paths.data_dir / f"{current_video_id}.text.idx")
        # Store in session so the chat branch can hand them to the agent.
        st.session_state.visual_memory = visual_memory_index
        st.session_state.text_memory = text_memory_index
        memory_manager.initialize_storage(current_video_id)

        with st.status("π¦ Scout: Scanning video timeline...", expanded=True) as status:
            # Step 1: Extract frames at a fixed 1 FPS sampling rate.
            status.write("Extracting frames at 1 FPS...")
            raw_frames = list(extract_frames_decord(video_path, fps=1.0))

            # Step 2: Semantic cut detection over the sampled frames.
            status.write(f"Analyzing {len(raw_frames)} frames for scene changes...")
            key_events = visual_scout.detect_semantic_changes(raw_frames, sensitivity=0.90)

            # Index every sampled frame's visual embedding for similarity search.
            for timestamp, frame in raw_frames:
                visual_memory_index.add(timestamp, visual_scout.embed_image(frame))
            visual_memory_index.save()
            status.write(f"Detected {len(key_events)} key semantic events.")

            # Step 3: Deep captioning of each key event with the VLM.
            progress_bar = st.progress(0)
            event_log = []
            ANALYSIS_PROMPT = """Analyze this scene.
1. Describe the main action and subject.
2. Note any text or signs visible.
3. Describe the environment."""
            for i, (timestamp, frame) in enumerate(key_events):
                status.write(f"ποΈ Analyst: Describing Event {i+1} at {timestamp:.1f}s...")
                # The VLM consumes an image file path, so dump the frame to
                # disk; cv2.imwrite expects BGR while frames arrive as RGB.
                temp_frame_path = settings.paths.data_dir / "temp_scene.jpg"
                cv2.imwrite(str(temp_frame_path), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                description = perception_engine.analyze_frame(str(temp_frame_path), ANALYSIS_PROMPT)
                time_string = time.strftime('%M:%S', time.gmtime(timestamp))
                memory_manager.commit_event(current_video_id, time_string, description, {})
                event_log.append(f"**{time_string}**: {description}")
                # Index the caption text for semantic search later.
                text_embedding = visual_scout.embed_text(description)
                text_memory_index.add(timestamp, text_embedding, extra_data={"text": description})
                progress_bar.progress((i + 1) / len(key_events))
            text_memory_index.save()

            # Step 4: Global summary distilled from the event timeline.
            status.write("π Writing Global Summary...")
            full_timeline_text = "\n".join(event_log)
            summary_prompt = f"""<|im_start|>system
You are a video editor. Read the timeline below and write a concise summary of the entire video.
TIMELINE:
{full_timeline_text}
<|im_end|>
<|im_start|>assistant
"""
            summary = perception_engine.generate_text(summary_prompt, stop=["<|im_end|>"])
            memory_manager.save_summary(current_video_id, summary)
            status.update(label="β Analysis Complete!", state="complete", expanded=False)

        st.session_state.is_video_processed = True
        st.success("Video Index Ready.")
        st.markdown(f"**Summary:** {summary}")
        with st.expander("See Detailed Timeline"):
            for event in event_log:
                st.write(event)
        # Rerun so the next pass lands straight in chat mode.
        st.rerun()

    # --- CHAT INTERFACE ---
    else:
        st.divider()
        # Replay the conversation so far.
        for message in st.session_state.chat_history:
            with st.chat_message(message["role"]):
                st.write(message["content"])

        # Chat input
        if user_query := st.chat_input("Ask about the video..."):
            # Record and echo the user turn.
            st.session_state.chat_history.append({"role": "user", "content": user_query})
            with st.chat_message("user"):
                st.write(user_query)
            # Generate the assistant turn.
            with st.chat_message("assistant"):
                with st.spinner("Agent is thinking..."):
                    # Inject the search tools/indices the agent may use.
                    video_agent.context = {
                        "scout": visual_scout,
                        "vis_index": st.session_state.visual_memory,
                        "txt_index": st.session_state.text_memory,
                    }
                    response_text = video_agent.ask(user_query, st.session_state.active_video_id)
                    st.write(response_text)
            st.session_state.chat_history.append({"role": "assistant", "content": response_text})