Spaces:
Runtime error
Runtime error
File size: 8,605 Bytes
fca155a b6192e4 fca155a b6192e4 fca155a b6192e4 fca155a b6192e4 fca155a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 |
import streamlit as st
import shutil
import time
import os
import sys
import cv2
from pathlib import Path
from huggingface_hub import hf_hub_download
# Add project root to python path
sys.path.append(os.getcwd())
# Internal Modules
from src.perception.engine import Qwen2PerceptionEngine
from src.perception.scout import VisualScout
from src.memory.manager import SimpleMemoryManager
from src.memory.vector_index import VectorIndex
from src.core.orchestrator import VideoAgent
from src.config.settings import settings
from src.utils.video import extract_frames_decord
# --- PAGE CONFIGURATION ---
st.set_page_config(
    page_title="Visual Scout AI",
    # NOTE(review): this icon literal was mojibake-garbled and split across
    # two physical lines in the source (an embedded newline inside the
    # string). Restored to a single emoji — confirm the intended glyph.
    page_icon="🦅",
    layout="wide",
    initial_sidebar_state="expanded",
)
# --- SYSTEM SETUP ---
@st.cache_resource
def initialize_system():
    """Build and cache the core pipeline components.

    Returns:
        A 4-tuple ``(perception_engine, visual_scout, memory_manager,
        video_agent)``. Wrapped in ``st.cache_resource`` so the heavy
        model construction runs once per Streamlit server process rather
        than on every script rerun.
    """
    # NOTE(review): emoji in this message was garbled in the source; restored.
    print("🚀 System Startup: Initializing Native Transformers Engine...")
    # 1. The Analyst (native Qwen2-VL).
    #    Model lazy-loads on first use, or could be triggered here.
    perception_engine = Qwen2PerceptionEngine()
    # 2. The Scout (fast search, CPU).
    visual_scout = VisualScout()
    # 3. The Memory Manager — per-video metadata persisted under data_dir.
    memory_manager = SimpleMemoryManager(storage_dir=settings.paths.data_dir / "metadata")
    # 4. The Agent Orchestrator ties perception and memory together.
    video_agent = VideoAgent(perception_engine, memory_manager)
    return perception_engine, visual_scout, memory_manager, video_agent


# Load the (cached) system once at module import.
perception_engine, visual_scout, memory_manager, video_agent = initialize_system()
# --- SIDEBAR ---
st.sidebar.title("1. Upload Video")
uploaded_file = st.sidebar.file_uploader("Select a video file", type=["mp4", "avi", "mov"])

# --- SESSION STATE INITIALIZATION ---
# Seed every key on the first run so downstream code can read them
# unconditionally on later reruns.
if "active_video_id" not in st.session_state:
    st.session_state.active_video_id = None
if "is_video_processed" not in st.session_state:
    st.session_state.is_video_processed = False
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
if "visual_memory" not in st.session_state:
    st.session_state.visual_memory = None
if "text_memory" not in st.session_state:
    st.session_state.text_memory = None
# --- MAIN UI ---
# NOTE(review): the title emoji was garbled and split across two physical
# lines in the source (newline embedded in the literal); restored.
st.title("🦅 Visual Scout")
st.caption("Agentic Video Understanding System")
if uploaded_file is not None:
    # Derive a simple stable ID from the filename (without extension).
    current_video_id = uploaded_file.name.split(".")[0]

    # Detect a freshly uploaded video: reset all per-video state and
    # persist the bytes to disk once, so reruns don't re-save the file.
    if st.session_state.active_video_id != current_video_id:
        st.session_state.active_video_id = current_video_id
        st.session_state.is_video_processed = False
        st.session_state.chat_history = []
        st.session_state.visual_memory = None
        st.session_state.text_memory = None

        # Save the upload locally so the frame extractor can read it.
        # NOTE(review): the file is always saved with a .mp4 suffix even for
        # .avi/.mov uploads — works because readers go by content, but the
        # extension is misleading; confirm this is intentional.
        local_video_path = settings.paths.data_dir / f"{current_video_id}.mp4"
        with open(local_video_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.toast(f"Video '{current_video_id}' loaded.")

    # --- PROCESSING PIPELINE ---
    if not st.session_state.is_video_processed:
        st.divider()
        # NOTE(review): emoji in the strings below were garbled in the
        # source; restored to plausible glyphs — confirm intent.
        st.header("🧠 Analyzing Video Content")
        st.info("The agent is watching the video to build a semantic index. This happens once per video.")

        video_path = settings.paths.data_dir / f"{current_video_id}.mp4"

        # Initialize the two memory indices: one over raw frame embeddings,
        # one over caption-text embeddings (for later semantic search).
        visual_index_path = settings.paths.data_dir / f"{current_video_id}.visual.idx"
        text_index_path = settings.paths.data_dir / f"{current_video_id}.text.idx"
        visual_memory_index = VectorIndex(visual_index_path)
        text_memory_index = VectorIndex(text_index_path)

        # Keep the indices in session state so the chat branch can use them.
        st.session_state.visual_memory = visual_memory_index
        st.session_state.text_memory = text_memory_index
        memory_manager.initialize_storage(current_video_id)

        with st.status("🦅 Scout: Scanning video timeline...", expanded=True) as status:
            # Step 1: Extract frames at a fixed 1 FPS sampling rate.
            status.write("Extracting frames at 1 FPS...")
            raw_frames = list(extract_frames_decord(video_path, fps=1.0))

            # Step 2: Find semantic scene changes among the sampled frames.
            status.write(f"Analyzing {len(raw_frames)} frames for scene changes...")
            key_events = visual_scout.detect_semantic_changes(raw_frames, sensitivity=0.90)

            # Index every sampled frame visually (not just key events).
            for timestamp, frame in raw_frames:
                embedding = visual_scout.embed_image(frame)
                visual_memory_index.add(timestamp, embedding)
            visual_memory_index.save()
            status.write(f"Detected {len(key_events)} key semantic events.")

            # Step 3: Deep-caption each key event with the VLM.
            progress_bar = st.progress(0)
            event_log = []
            ANALYSIS_PROMPT = """Analyze this scene.
1. Describe the main action and subject.
2. Note any text or signs visible.
3. Describe the environment."""
            for i, (timestamp, frame) in enumerate(key_events):
                status.write(f"👁️ Analyst: Describing Event {i+1} at {timestamp:.1f}s...")
                # Round-trip the frame through a temp JPEG for the VLM;
                # frames arrive RGB, cv2.imwrite expects BGR.
                temp_frame_path = settings.paths.data_dir / "temp_scene.jpg"
                cv2.imwrite(str(temp_frame_path), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                description = perception_engine.analyze_frame(str(temp_frame_path), ANALYSIS_PROMPT)
                time_string = time.strftime('%M:%S', time.gmtime(timestamp))
                memory_manager.commit_event(current_video_id, time_string, description, {})
                event_log.append(f"**{time_string}**: {description}")
                # Index the caption text for semantic search later.
                text_embedding = visual_scout.embed_text(description)
                text_memory_index.add(timestamp, text_embedding, extra_data={"text": description})
                progress_bar.progress((i + 1) / len(key_events))
            text_memory_index.save()

            # Step 4: Global summary distilled from the event timeline.
            status.write("📝 Writing Global Summary...")
            full_timeline_text = "\n".join(event_log)
            summary_prompt = f"""<|im_start|>system
You are a video editor. Read the timeline below and write a concise summary of the entire video.
TIMELINE:
{full_timeline_text}
<|im_end|>
<|im_start|>assistant
"""
            summary = perception_engine.generate_text(summary_prompt, stop=["<|im_end|>"])
            memory_manager.save_summary(current_video_id, summary)
            status.update(label="✅ Analysis Complete!", state="complete", expanded=False)

        st.session_state.is_video_processed = True
        st.success("Video Index Ready.")
        st.markdown(f"**Summary:** {summary}")
        with st.expander("See Detailed Timeline"):
            for event in event_log:
                st.write(event)
        # Rerun to switch to Chat Mode cleanly.
        st.rerun()

    # --- CHAT INTERFACE ---
    else:
        st.divider()
        # Replay the conversation history on every rerun.
        for message in st.session_state.chat_history:
            with st.chat_message(message["role"]):
                st.write(message["content"])

        if user_query := st.chat_input("Ask about the video..."):
            # Record and echo the user's message.
            st.session_state.chat_history.append({"role": "user", "content": user_query})
            with st.chat_message("user"):
                st.write(user_query)

            # Generate the agent's answer with tool context injected.
            with st.chat_message("assistant"):
                with st.spinner("Agent is thinking..."):
                    video_agent.context = {
                        "scout": visual_scout,
                        "vis_index": st.session_state.visual_memory,
                        "txt_index": st.session_state.text_memory,
                    }
                    response_text = video_agent.ask(user_query, st.session_state.active_video_id)
                    st.write(response_text)
            st.session_state.chat_history.append({"role": "assistant", "content": response_text})