import streamlit as st
import shutil
import time
import os
import sys
import cv2
from pathlib import Path
from huggingface_hub import hf_hub_download

# Add project root to python path so the `src` package resolves when the
# app is launched from the repository root.
sys.path.append(os.getcwd())

# Internal Modules
from src.perception.engine import Qwen2PerceptionEngine
from src.perception.scout import VisualScout
from src.memory.manager import SimpleMemoryManager
from src.memory.vector_index import VectorIndex
from src.core.orchestrator import VideoAgent
from src.config.settings import settings
from src.utils.video import extract_frames_decord

# --- PAGE CONFIGURATION ---
st.set_page_config(
    page_title="Visual Scout AI",
    page_icon="🦅",
    layout="wide",
    initial_sidebar_state="expanded",
)


def _format_timestamp(seconds: float) -> str:
    """Render a timestamp (seconds) as MM:SS, or H:MM:SS past one hour.

    Replaces ``time.strftime('%M:%S', time.gmtime(ts))``, which silently
    wrapped at 60 minutes and mislabeled events in videos longer than an hour.
    """
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    if hours:
        return f"{hours}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"


# --- SYSTEM SETUP ---
@st.cache_resource
def initialize_system():
    """Build and cache the heavyweight components of the agent stack.

    Decorated with ``st.cache_resource`` so models load once per server
    process rather than on every Streamlit rerun.

    Returns:
        tuple: (perception_engine, visual_scout, memory_manager, video_agent)
    """
    print("🚀 System Startup: Initializing Native Transformers Engine...")

    # 1. The Analyst (Native Qwen2-VL).
    # Model will lazy-load on first use or we can trigger it here.
    perception_engine = Qwen2PerceptionEngine()

    # 2. The Scout (Fast Search, CPU)
    visual_scout = VisualScout()

    # 3. The Memory Manager (persists event metadata per video)
    memory_manager = SimpleMemoryManager(storage_dir=settings.paths.data_dir / "metadata")

    # 4. The Agent Orchestrator ties perception and memory together
    video_agent = VideoAgent(perception_engine, memory_manager)

    return perception_engine, visual_scout, memory_manager, video_agent


# Load the system (cached across reruns)
perception_engine, visual_scout, memory_manager, video_agent = initialize_system()

# --- SIDEBAR ---
st.sidebar.title("1. Upload Video")
uploaded_file = st.sidebar.file_uploader("Select a video file", type=["mp4", "avi", "mov"])

# --- SESSION STATE INITIALIZATION ---
# All keys are created up-front so later code can read them unconditionally.
if "active_video_id" not in st.session_state:
    st.session_state.active_video_id = None
if "is_video_processed" not in st.session_state:
    st.session_state.is_video_processed = False
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
if "visual_memory" not in st.session_state:
    st.session_state.visual_memory = None
if "text_memory" not in st.session_state:
    st.session_state.text_memory = None

# --- MAIN UI ---
st.title("🦅 Visual Scout")
st.caption("Agentic Video Understanding System")

if uploaded_file is not None:
    # Derive a stable ID from the filename. Path(...).stem keeps everything
    # before the final extension — the previous split(".")[0] truncated names
    # containing dots ("demo.v2.mp4" -> "demo"), colliding across files.
    current_video_id = Path(uploaded_file.name).stem

    # Detect if a new video was uploaded; reset all per-video state if so.
    if st.session_state.active_video_id != current_video_id:
        st.session_state.active_video_id = current_video_id
        st.session_state.is_video_processed = False
        st.session_state.chat_history = []
        st.session_state.visual_memory = None
        st.session_state.text_memory = None

        # Save the file locally (always stored as .mp4; the downstream
        # pipeline reads this same path).
        local_video_path = settings.paths.data_dir / f"{current_video_id}.mp4"
        with open(local_video_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.toast(f"Video '{current_video_id}' loaded.")

    # --- PROCESSING PIPELINE ---
    if not st.session_state.is_video_processed:
        st.divider()
        st.header("🧠 Analyzing Video Content")
        st.info("The agent is watching the video to build a semantic index. This happens once per video.")

        video_path = settings.paths.data_dir / f"{current_video_id}.mp4"

        # Initialize Memory Indices (one visual, one textual, both per video)
        visual_index_path = settings.paths.data_dir / f"{current_video_id}.visual.idx"
        text_index_path = settings.paths.data_dir / f"{current_video_id}.text.idx"
        visual_memory_index = VectorIndex(visual_index_path)
        text_memory_index = VectorIndex(text_index_path)

        # Store in session so the chat branch can inject them into the agent
        st.session_state.visual_memory = visual_memory_index
        st.session_state.text_memory = text_memory_index

        memory_manager.initialize_storage(current_video_id)

        with st.status("🦅 Scout: Scanning video timeline...", expanded=True) as status:
            # Step 1: Extract Frames
            status.write("Extracting frames at 1 FPS...")
            raw_frames = list(extract_frames_decord(video_path, fps=1.0))

            # Step 2: Semantic Cuts — keep only frames where the scene changes
            status.write(f"Analyzing {len(raw_frames)} frames for scene changes...")
            key_events = visual_scout.detect_semantic_changes(raw_frames, sensitivity=0.90)

            # Index every raw frame visually so "find the moment where..."
            # queries can hit frames that are not key events.
            for timestamp, frame in raw_frames:
                embedding = visual_scout.embed_image(frame)
                visual_memory_index.add(timestamp, embedding)
            visual_memory_index.save()

            status.write(f"Detected {len(key_events)} key semantic events.")

            # Step 3: Deep Captioning of key events only (VLM calls are slow)
            progress_bar = st.progress(0)
            event_log = []
            ANALYSIS_PROMPT = """Analyze this scene.
1. Describe the main action and subject.
2. Note any text or signs visible.
3. Describe the environment."""

            # Scratch file reused for every frame handed to the VLM
            temp_frame_path = settings.paths.data_dir / "temp_scene.jpg"
            for i, (timestamp, frame) in enumerate(key_events):
                status.write(f"👁️ Analyst: Describing Event {i+1} at {timestamp:.1f}s...")

                # Frames arrive RGB; OpenCV writes BGR, hence the conversion.
                cv2.imwrite(str(temp_frame_path), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                description = perception_engine.analyze_frame(str(temp_frame_path), ANALYSIS_PROMPT)

                # Hour-safe timestamp label (MM:SS or H:MM:SS)
                time_string = _format_timestamp(timestamp)
                memory_manager.commit_event(current_video_id, time_string, description, {})
                event_log.append(f"**{time_string}**: {description}")

                # Index Text (for semantic search later)
                text_embedding = visual_scout.embed_text(description)
                text_memory_index.add(timestamp, text_embedding, extra_data={"text": description})

                progress_bar.progress((i + 1) / len(key_events))

            # Clean up the scratch frame once captioning is done
            temp_frame_path.unlink(missing_ok=True)
            text_memory_index.save()

            # Step 4: Summary over the whole timeline
            status.write("📝 Writing Global Summary...")
            full_timeline_text = "\n".join(event_log)
            summary_prompt = f"""<|im_start|>system
You are a video editor. Read the timeline below and write a concise summary of the entire video.

TIMELINE:
{full_timeline_text}
<|im_end|>
<|im_start|>assistant
"""
            summary = perception_engine.generate_text(summary_prompt, stop=["<|im_end|>"])
            memory_manager.save_summary(current_video_id, summary)

            status.update(label="✅ Analysis Complete!", state="complete", expanded=False)

        st.session_state.is_video_processed = True
        st.success("Video Index Ready.")
        st.markdown(f"**Summary:** {summary}")
        with st.expander("See Detailed Timeline"):
            for event in event_log:
                st.write(event)

        # Rerun to switch to Chat Mode cleanly
        st.rerun()

    # --- CHAT INTERFACE ---
    else:
        st.divider()

        # Display History
        for message in st.session_state.chat_history:
            with st.chat_message(message["role"]):
                st.write(message["content"])

        # Chat Input
        if user_query := st.chat_input("Ask about the video..."):
            # Add User Message
            st.session_state.chat_history.append({"role": "user", "content": user_query})
            with st.chat_message("user"):
                st.write(user_query)

            # Generate Answer
            with st.chat_message("assistant"):
                with st.spinner("Agent is thinking..."):
                    # Inject Tools/Context into the Agent so it can search
                    # both the visual and textual indices built above.
                    video_agent.context = {
                        "scout": visual_scout,
                        "vis_index": st.session_state.visual_memory,
                        "txt_index": st.session_state.text_memory,
                    }
                    response_text = video_agent.ask(user_query, st.session_state.active_video_id)
                    st.write(response_text)

            # Add Assistant Message
            st.session_state.chat_history.append({"role": "assistant", "content": response_text})