File size: 8,605 Bytes
fca155a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6192e4
fca155a
b6192e4
fca155a
b6192e4
fca155a
b6192e4
fca155a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import streamlit as st
import shutil
import time
import os
import sys
import cv2
from pathlib import Path
from huggingface_hub import hf_hub_download

# Add project root to python path
sys.path.append(os.getcwd())

# Internal Modules
from src.perception.engine import Qwen2PerceptionEngine
from src.perception.scout import VisualScout
from src.memory.manager import SimpleMemoryManager
from src.memory.vector_index import VectorIndex
from src.core.orchestrator import VideoAgent
from src.config.settings import settings
from src.utils.video import extract_frames_decord

# --- PAGE CONFIGURATION ---
st.set_page_config(
    page_title="Visual Scout AI", 
    page_icon="πŸ¦…", 
    layout="wide",
    initial_sidebar_state="expanded"
)

# --- SYSTEM SETUP ---

@st.cache_resource
def initialize_system():
    """
    Build and wire the core subsystems once per server process.

    Cached via st.cache_resource, so Streamlit reruns reuse the same
    loaded models instead of re-initializing them.

    Returns:
        tuple: (perception engine, visual scout, memory manager,
        agent orchestrator) — in the order callers unpack them.
    """
    print("🚀 System Startup: Initializing Native Transformers Engine...")

    # The Analyst: native Qwen2-VL (weights lazy-load on first use).
    analyst = Qwen2PerceptionEngine()

    # The Scout: fast CPU-side visual search / embedding.
    scout = VisualScout()

    # The Memory Manager: persists event metadata under the data dir.
    memory = SimpleMemoryManager(storage_dir=settings.paths.data_dir / "metadata")

    # The Orchestrator: ties the analyst to the memory store.
    agent = VideoAgent(analyst, memory)

    return analyst, scout, memory, agent

# Load the system
perception_engine, visual_scout, memory_manager, video_agent = initialize_system()

# --- SIDEBAR ---
st.sidebar.title("1. Upload Video")
uploaded_file = st.sidebar.file_uploader("Select a video file", type=["mp4", "avi", "mov"])

# --- SESSION STATE INITIALIZATION ---
if "active_video_id" not in st.session_state: 
    st.session_state.active_video_id = None
if "is_video_processed" not in st.session_state: 
    st.session_state.is_video_processed = False
if "chat_history" not in st.session_state: 
    st.session_state.chat_history = []
if "visual_memory" not in st.session_state: 
    st.session_state.visual_memory = None
if "text_memory" not in st.session_state: 
    st.session_state.text_memory = None

# --- MAIN UI ---
st.title("πŸ¦… Visual Scout")
st.caption("Agentic Video Understanding System")

if uploaded_file is not None:
    # Derive a stable ID from the filename. Path(...).stem preserves
    # multi-dot names ("clip.final.mp4" -> "clip.final"); the previous
    # split(".")[0] truncated at the first dot, so two different uploads
    # could collide on the same ID.
    current_video_id = Path(uploaded_file.name).stem
    
    # A different upload invalidates every piece of per-video state.
    if st.session_state.active_video_id != current_video_id:
        st.session_state.active_video_id = current_video_id
        st.session_state.is_video_processed = False
        st.session_state.chat_history = []
        st.session_state.visual_memory = None
        st.session_state.text_memory = None
        
        # Persist the upload so the frame extractor can read it from disk.
        # NOTE(review): the file is always written with an .mp4 suffix even
        # for .avi/.mov uploads; the container bytes are unchanged so
        # decoding still works, but the on-disk extension can mislead.
        local_video_path = settings.paths.data_dir / f"{current_video_id}.mp4"
        with open(local_video_path, "wb") as f: 
            f.write(uploaded_file.getbuffer())
        st.toast(f"Video '{current_video_id}' loaded.")

    # --- PROCESSING PIPELINE (runs once per video) ---
    if not st.session_state.is_video_processed:
        st.divider()
        st.header("🧠 Analyzing Video Content")
        st.info("The agent is watching the video to build a semantic index. This happens once per video.")
        
        video_path = settings.paths.data_dir / f"{current_video_id}.mp4"
        
        # One index per modality: frame embeddings for visual search,
        # caption embeddings for semantic text search.
        visual_index_path = settings.paths.data_dir / f"{current_video_id}.visual.idx"
        text_index_path = settings.paths.data_dir / f"{current_video_id}.text.idx"
        
        visual_memory_index = VectorIndex(visual_index_path)
        text_memory_index = VectorIndex(text_index_path)
        
        # Keep the indices in session state so the chat branch can hand
        # them to the agent on later reruns.
        st.session_state.visual_memory = visual_memory_index
        st.session_state.text_memory = text_memory_index
        
        memory_manager.initialize_storage(current_video_id)
        
        with st.status("🦅 Scout: Scanning video timeline...", expanded=True) as status:
            
            # Step 1: Sample the timeline at 1 frame per second.
            status.write("Extracting frames at 1 FPS...")
            raw_frames = list(extract_frames_decord(video_path, fps=1.0))
            
            # Step 2: Cheap scene-change detection selects the few frames
            # worth sending to the expensive VLM.
            status.write(f"Analyzing {len(raw_frames)} frames for scene changes...")
            key_events = visual_scout.detect_semantic_changes(raw_frames, sensitivity=0.90)
            
            # Index every sampled frame for visual-similarity search.
            for timestamp, frame in raw_frames:
                embedding = visual_scout.embed_image(frame)
                visual_memory_index.add(timestamp, embedding)
            visual_memory_index.save()
            
            status.write(f"Detected {len(key_events)} key semantic events.")
            
            # Step 3: Deep captioning of each key event with the VLM.
            progress_bar = st.progress(0)
            event_log = []
            
            ANALYSIS_PROMPT = """Analyze this scene.
1. Describe the main action and subject.
2. Note any text or signs visible.
3. Describe the environment."""
            
            for i, (timestamp, frame) in enumerate(key_events):
                status.write(f"👁️ Analyst: Describing Event {i+1} at {timestamp:.1f}s...")
                
                # The VLM reads from a file path; reuse one scratch file.
                # Frames arrive RGB, cv2.imwrite expects BGR — hence the
                # color conversion.
                temp_frame_path = settings.paths.data_dir / "temp_scene.jpg"
                cv2.imwrite(str(temp_frame_path), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                
                description = perception_engine.analyze_frame(str(temp_frame_path), ANALYSIS_PROMPT)
                
                # Include hours past the one-hour mark; a bare %M:%S would
                # silently wrap (e.g. 1h05m -> "05:00") for long videos.
                time_format = '%H:%M:%S' if timestamp >= 3600 else '%M:%S'
                time_string = time.strftime(time_format, time.gmtime(timestamp))
                memory_manager.commit_event(current_video_id, time_string, description, {})
                event_log.append(f"**{time_string}**: {description}")
                
                # Index the caption text for semantic search later.
                text_embedding = visual_scout.embed_text(description)
                text_memory_index.add(timestamp, text_embedding, extra_data={"text": description})
                
                progress_bar.progress((i + 1) / len(key_events))
            
            text_memory_index.save()
            
            # Step 4: Condense the event timeline into a global summary.
            status.write("📝 Writing Global Summary...")
            full_timeline_text = "\n".join(event_log)
            summary_prompt = f"""<|im_start|>system
You are a video editor. Read the timeline below and write a concise summary of the entire video.
TIMELINE:
{full_timeline_text}
<|im_end|>
<|im_start|>assistant
"""
            summary = perception_engine.generate_text(summary_prompt, stop=["<|im_end|>"])
            memory_manager.save_summary(current_video_id, summary)
            
            status.update(label="✅ Analysis Complete!", state="complete", expanded=False)
            
        st.session_state.is_video_processed = True
        st.success("Video Index Ready.")
        st.markdown(f"**Summary:** {summary}")
        
        with st.expander("See Detailed Timeline"):
            for event in event_log: 
                st.write(event)
        
        # Rerun to switch to Chat Mode cleanly.
        # NOTE(review): the rerun immediately clears the summary/timeline
        # widgets rendered just above — confirm that flash is intended
        # (the summary remains retrievable via the memory manager).
        st.rerun()

    # --- CHAT INTERFACE (shown once the index exists) ---
    else:
        st.divider()
        
        # Replay prior conversation turns.
        for message in st.session_state.chat_history:
            with st.chat_message(message["role"]):
                st.write(message["content"])
                
        # Chat Input
        if user_query := st.chat_input("Ask about the video..."):
            
            # Record and echo the user's turn.
            st.session_state.chat_history.append({"role": "user", "content": user_query})
            with st.chat_message("user"):
                st.write(user_query)
                
            # Generate the assistant's answer.
            with st.chat_message("assistant"):
                with st.spinner("Agent is thinking..."):
                    
                    # Hand the agent its retrieval tools for this video.
                    video_agent.context = {
                        "scout": visual_scout, 
                        "vis_index": st.session_state.visual_memory,
                        "txt_index": st.session_state.text_memory
                    }
                    
                    response_text = video_agent.ask(user_query, st.session_state.active_video_id)
                    st.write(response_text)
            
            # Persist the assistant turn for replay on the next rerun.
            st.session_state.chat_history.append({"role": "assistant", "content": response_text})