ashleshp commited on
Commit
505e74e
·
1 Parent(s): ae80468
Files changed (3) hide show
  1. Dockerfile +0 -32
  2. README.md +6 -33
  3. app_gradio.py +107 -0
Dockerfile DELETED
@@ -1,32 +0,0 @@
1
- # Use official Python runtime
2
- FROM python:3.10-slim
3
-
4
- # Set working directory
5
- WORKDIR /app
6
-
7
- # Install system dependencies for OpenCV and Video processing
8
- RUN apt-get update && apt-get install -y \
9
- build-essential \
10
- curl \
11
- software-properties-common \
12
- git \
13
- ffmpeg \
14
- libsm6 \
15
- libxext6 \
16
- && rm -rf /var/lib/apt/lists/*
17
-
18
- # Copy requirements first for caching
19
- COPY requirements.txt .
20
- RUN pip install --no-cache-dir -r requirements.txt
21
-
22
- # Copy the rest of the application
23
- COPY . .
24
-
25
- # Expose Streamlit port
26
- EXPOSE 7860
27
-
28
- # Healthcheck
29
- HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
30
-
31
- # Run the app
32
- ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=7860", "--server.address=0.0.0.0"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -3,40 +3,13 @@ title: Visual Scout AI
3
  emoji: 🦅
4
  colorFrom: blue
5
  colorTo: indigo
6
- sdk: streamlit
7
- sdk_version: 1.30.0
8
- app_file: src/app.py
9
  pinned: false
10
- models:
11
- - Qwen/Qwen2-VL-2B-Instruct
12
  ---
13
 
14
  # Visual Scout: Agentic Video Understanding
15
-
16
- An agentic AI system that watches videos, builds a semantic index, and answers natural language questions using **Qwen2-VL**.
17
-
18
- ## 🚀 How to Run Locally
19
-
20
- 1. **Install Dependencies:**
21
- ```bash
22
- pip install -r requirements.txt
23
- ```
24
-
25
- 2. **Download Model:**
26
- ```bash
27
- python scripts/download_model.py
28
- ```
29
-
30
- 3. **Run App:**
31
- ```bash
32
- streamlit run src/app.py
33
- ```
34
-
35
- ## ☁️ Deployment (Hugging Face Spaces)
36
-
37
- This repository is configured for immediate deployment on Hugging Face Spaces.
38
-
39
- 1. Create a new Space on [Hugging Face](https://huggingface.co/spaces).
40
- 2. Select **Streamlit** as the SDK.
41
- 3. Connect this Git repository.
42
- 4. The system will automatically build using `requirements.txt`.
 
3
  emoji: 🦅
4
  colorFrom: blue
5
  colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 4.19.2
8
+ app_file: app_gradio.py
9
  pinned: false
10
+ sdk_params:
11
+ enable_queue: true
12
  ---
13
 
14
  # Visual Scout: Agentic Video Understanding
15
+ Powered by Qwen2-VL and Hugging Face ZeroGPU.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app_gradio.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Gradio front-end for Visual Scout on Hugging Face ZeroGPU Spaces.

Lightweight CPU-side components (scout, memory manager) are created at import
time; the heavy Qwen2-VL perception engine is constructed here but only
activates the GPU inside the @spaces.GPU-decorated handler defined below.
"""
import gradio as gr
import spaces
import torch  # NOTE(review): not referenced in this file as shown — presumably needed so ZeroGPU/torch CUDA init happens; confirm before removing
import os
import sys
import time  # NOTE(review): unused in the visible code — confirm before removing
from pathlib import Path
from PIL import Image

# Add project root to python path so the `src.*` imports below resolve
# when the Space launches from the repository root.
sys.path.append(os.getcwd())

from src.perception.engine import Qwen2PerceptionEngine
from src.perception.scout import VisualScout
from src.memory.manager import SimpleMemoryManager
from src.memory.vector_index import VectorIndex
from src.core.orchestrator import VideoAgent
from src.utils.video import extract_frames_decord

# Initialize the lightweight Scout and Memory (CPU-bound).
visual_scout = VisualScout()
memory_manager = SimpleMemoryManager(storage_dir=Path("data/metadata"))

# We keep the engine global but it will only "activate" the GPU inside the
# @spaces.GPU-decorated function (ZeroGPU leases the device per call).
perception_engine = Qwen2PerceptionEngine()
26
+
27
@spaces.GPU(duration=120)
def process_video_and_answer(video_path, user_query):
    """Index an uploaded video and answer a question about it in one GPU lease.

    Powered by ZeroGPU: the ``@spaces.GPU`` decorator grants an accelerator
    for up to ``duration`` seconds, so frame indexing AND answering happen
    inside a single lease.

    Args:
        video_path: Filesystem path to the uploaded video (from ``gr.Video``),
            or ``None``/empty when nothing was uploaded.
        user_query: Natural-language question about the video.

    Returns:
        The agent's answer as a string, or a short prompt message when the
        video or the question is missing.
    """
    if not video_path:
        return "Please upload a video first."
    if not user_query or not user_query.strip():
        # Guard added: don't burn a 120s GPU lease on an empty question.
        return "Please enter a question about the video."

    video_id = Path(video_path).stem

    # 1. Initialize system — the VLM weights load while the GPU is live.
    perception_engine.load_model()
    video_agent = VideoAgent(perception_engine, memory_manager)

    # 2. Build per-video vector indexes (lightweight, CPU-side).
    visual_memory_index = VectorIndex(Path(f"data/{video_id}.visual.idx"))
    text_memory_index = VectorIndex(Path(f"data/{video_id}.text.idx"))
    memory_manager.initialize_storage(video_id)

    # Extract frames at 1 fps; materialize the generator since the frames
    # are consumed twice (change detection + visual indexing).
    raw_frames = list(extract_frames_decord(Path(video_path), fps=1.0))

    # Detect semantically distinct "key event" frames.
    key_events = visual_scout.detect_semantic_changes(raw_frames, sensitivity=0.90)

    # Index every frame's visual embedding.
    for timestamp, frame in raw_frames:
        visual_memory_index.add(timestamp, visual_scout.embed_image(frame))

    # Quick analyst pass on key events: caption at most 5 (keeps the run
    # inside the GPU lease) and index the captions for text retrieval.
    for i, (timestamp, frame) in enumerate(key_events[:5]):
        temp_path = f"temp_{i}.jpg"
        Image.fromarray(frame).save(temp_path)
        try:
            desc = perception_engine.analyze_frame(temp_path, "Describe this scene briefly.")
        finally:
            # Fix: the original left these scratch JPEGs behind; remove them
            # so repeated runs don't litter the working directory.
            try:
                os.remove(temp_path)
            except OSError:
                pass

        text_emb = visual_scout.embed_text(desc)
        text_memory_index.add(timestamp, text_emb, extra_data={"text": desc})

    # 3. Hand the agent its retrieval tools and answer the query.
    video_agent.context = {
        "scout": visual_scout,
        "vis_index": visual_memory_index,
        "txt_index": text_memory_index,
    }

    return video_agent.ask(user_query, video_id)
83
+
84
# --- GRADIO UI ---
# Two-column layout: inputs (video, question, trigger) on the left,
# the agent's read-only answer on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🦅 Visual Scout (ZeroGPU Edition)")
    gr.Markdown("Upload a video and ask a question. This app uses Hugging Face ZeroGPU for A100-powered reasoning.")

    with gr.Row():
        with gr.Column():
            video_in = gr.Video(label="Upload Video")
            question_in = gr.Textbox(
                label="Ask a question about the video",
                placeholder="e.g. What happens at the end?",
            )
            run_btn = gr.Button("Analyze & Answer", variant="primary")

        with gr.Column():
            answer_out = gr.Textbox(label="Agent Response", interactive=False)

    # One click handler drives indexing + answering in a single GPU lease.
    run_btn.click(
        fn=process_video_and_answer,
        inputs=[video_in, question_in],
        outputs=[answer_out],
    )
103
+
104
if __name__ == "__main__":
    # The memory manager persists metadata under data/metadata; make sure the
    # directory tree exists before the UI starts serving requests.
    Path("data/metadata").mkdir(parents=True, exist_ok=True)
    demo.launch()