ashleshp commited on
Commit
505e74e
·
1 Parent(s): ae80468
Files changed (3) hide show
  1. Dockerfile +0 -32
  2. README.md +6 -33
  3. app_gradio.py +107 -0
Dockerfile DELETED
@@ -1,32 +0,0 @@
1
- # Use official Python runtime
2
- FROM python:3.10-slim
3
-
4
- # Set working directory
5
- WORKDIR /app
6
-
7
- # Install system dependencies for OpenCV and Video processing
8
- RUN apt-get update && apt-get install -y \
9
- build-essential \
10
- curl \
11
- software-properties-common \
12
- git \
13
- ffmpeg \
14
- libsm6 \
15
- libxext6 \
16
- && rm -rf /var/lib/apt/lists/*
17
-
18
- # Copy requirements first for caching
19
- COPY requirements.txt .
20
- RUN pip install --no-cache-dir -r requirements.txt
21
-
22
- # Copy the rest of the application
23
- COPY . .
24
-
25
- # Expose Streamlit port
26
- EXPOSE 7860
27
-
28
- # Healthcheck
29
- HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
30
-
31
- # Run the app
32
- ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=7860", "--server.address=0.0.0.0"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -3,40 +3,13 @@ title: Visual Scout AI
3
  emoji: 🦅
4
  colorFrom: blue
5
  colorTo: indigo
6
- sdk: streamlit
7
- sdk_version: 1.30.0
8
- app_file: src/app.py
9
  pinned: false
10
- models:
11
- - Qwen/Qwen2-VL-2B-Instruct
12
  ---
13
 
14
  # Visual Scout: Agentic Video Understanding
15
-
16
- An agentic AI system that watches videos, builds a semantic index, and answers natural language questions using **Qwen2-VL**.
17
-
18
- ## 🚀 How to Run Locally
19
-
20
- 1. **Install Dependencies:**
21
- ```bash
22
- pip install -r requirements.txt
23
- ```
24
-
25
- 2. **Download Model:**
26
- ```bash
27
- python scripts/download_model.py
28
- ```
29
-
30
- 3. **Run App:**
31
- ```bash
32
- streamlit run src/app.py
33
- ```
34
-
35
- ## ☁️ Deployment (Hugging Face Spaces)
36
-
37
- This repository is configured for immediate deployment on Hugging Face Spaces.
38
-
39
- 1. Create a new Space on [Hugging Face](https://huggingface.co/spaces).
40
- 2. Select **Streamlit** as the SDK.
41
- 3. Connect this Git repository.
42
- 4. The system will automatically build using `requirements.txt`.
 
3
  emoji: 🦅
4
  colorFrom: blue
5
  colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 4.19.2
8
+ app_file: app_gradio.py
9
  pinned: false
10
+ sdk_params:
11
+ enable_queue: true
12
  ---
13
 
14
  # Visual Scout: Agentic Video Understanding
15
+ Powered by Qwen2-VL and Hugging Face ZeroGPU.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app_gradio.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Gradio front-end for Visual Scout on Hugging Face ZeroGPU Spaces.

Lightweight CPU-side components (scout, memory manager) are created at import
time; the heavy Qwen2-VL perception engine is constructed here but only
activates the GPU inside the @spaces.GPU-decorated handler defined below.
"""
import gradio as gr
import spaces
import torch  # NOTE(review): not referenced in this file as shown — presumably needed so ZeroGPU/torch CUDA init happens; confirm before removing
import os
import sys
import time  # NOTE(review): unused in the visible code — confirm before removing
from pathlib import Path
from PIL import Image

# Add project root to python path so the `src.*` imports below resolve
# when the Space launches from the repository root.
sys.path.append(os.getcwd())

from src.perception.engine import Qwen2PerceptionEngine
from src.perception.scout import VisualScout
from src.memory.manager import SimpleMemoryManager
from src.memory.vector_index import VectorIndex
from src.core.orchestrator import VideoAgent
from src.utils.video import extract_frames_decord

# Initialize the lightweight Scout and Memory (CPU-bound).
visual_scout = VisualScout()
memory_manager = SimpleMemoryManager(storage_dir=Path("data/metadata"))

# We keep the engine global but it will only "activate" the GPU inside the
# @spaces.GPU-decorated function (ZeroGPU leases the device per call).
perception_engine = Qwen2PerceptionEngine()
26
+
27
@spaces.GPU(duration=120)
def process_video_and_answer(video_path, user_query):
    """Index an uploaded video and answer a question about it in one GPU lease.

    Powered by ZeroGPU: the ``@spaces.GPU`` decorator grants an accelerator
    for up to ``duration`` seconds, so frame indexing AND answering happen
    inside a single lease.

    Args:
        video_path: Filesystem path to the uploaded video (from ``gr.Video``),
            or ``None``/empty when nothing was uploaded.
        user_query: Natural-language question about the video.

    Returns:
        The agent's answer as a string, or a short prompt message when the
        video or the question is missing.
    """
    if not video_path:
        return "Please upload a video first."
    if not user_query or not user_query.strip():
        # Guard added: don't burn a 120s GPU lease on an empty question.
        return "Please enter a question about the video."

    video_id = Path(video_path).stem

    # 1. Initialize system — the VLM weights load while the GPU is live.
    perception_engine.load_model()
    video_agent = VideoAgent(perception_engine, memory_manager)

    # 2. Build per-video vector indexes (lightweight, CPU-side).
    visual_memory_index = VectorIndex(Path(f"data/{video_id}.visual.idx"))
    text_memory_index = VectorIndex(Path(f"data/{video_id}.text.idx"))
    memory_manager.initialize_storage(video_id)

    # Extract frames at 1 fps; materialize the generator since the frames
    # are consumed twice (change detection + visual indexing).
    raw_frames = list(extract_frames_decord(Path(video_path), fps=1.0))

    # Detect semantically distinct "key event" frames.
    key_events = visual_scout.detect_semantic_changes(raw_frames, sensitivity=0.90)

    # Index every frame's visual embedding.
    for timestamp, frame in raw_frames:
        visual_memory_index.add(timestamp, visual_scout.embed_image(frame))

    # Quick analyst pass on key events: caption at most 5 (keeps the run
    # inside the GPU lease) and index the captions for text retrieval.
    for i, (timestamp, frame) in enumerate(key_events[:5]):
        temp_path = f"temp_{i}.jpg"
        Image.fromarray(frame).save(temp_path)
        try:
            desc = perception_engine.analyze_frame(temp_path, "Describe this scene briefly.")
        finally:
            # Fix: the original left these scratch JPEGs behind; remove them
            # so repeated runs don't litter the working directory.
            try:
                os.remove(temp_path)
            except OSError:
                pass

        text_emb = visual_scout.embed_text(desc)
        text_memory_index.add(timestamp, text_emb, extra_data={"text": desc})

    # 3. Hand the agent its retrieval tools and answer the query.
    video_agent.context = {
        "scout": visual_scout,
        "vis_index": visual_memory_index,
        "txt_index": text_memory_index,
    }

    return video_agent.ask(user_query, video_id)
83
+
84
# --- GRADIO UI ---
# Two-column layout: inputs (video, question, trigger) on the left,
# the agent's read-only answer on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🦅 Visual Scout (ZeroGPU Edition)")
    gr.Markdown("Upload a video and ask a question. This app uses Hugging Face ZeroGPU for A100-powered reasoning.")

    with gr.Row():
        with gr.Column():
            video_in = gr.Video(label="Upload Video")
            question_in = gr.Textbox(
                label="Ask a question about the video",
                placeholder="e.g. What happens at the end?",
            )
            run_btn = gr.Button("Analyze & Answer", variant="primary")

        with gr.Column():
            answer_out = gr.Textbox(label="Agent Response", interactive=False)

    # One click handler drives indexing + answering in a single GPU lease.
    run_btn.click(
        fn=process_video_and_answer,
        inputs=[video_in, question_in],
        outputs=[answer_out],
    )
103
+
104
if __name__ == "__main__":
    # The memory manager persists metadata under data/metadata; make sure the
    # directory tree exists before the UI starts serving requests.
    Path("data/metadata").mkdir(parents=True, exist_ok=True)
    demo.launch()