Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,6 +5,10 @@ import base64
|
|
| 5 |
import json
|
| 6 |
from PIL import Image
|
| 7 |
import io
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
# Global variable to store the OpenAI client
|
| 10 |
client = None
|
|
@@ -41,6 +45,145 @@ def encode_image(image):
|
|
| 41 |
# Encode to base64
|
| 42 |
return base64.b64encode(img_bytes).decode('utf-8')
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
def create_message_content(text, images=None):
|
| 45 |
"""Create message content with text and optional images"""
|
| 46 |
content = []
|
|
@@ -153,6 +296,85 @@ def process_request(api_key, task_type, image1=None, image2=None, image3=None, i
|
|
| 153 |
"reasoning": ""
|
| 154 |
})
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
# Enhanced custom CSS with the React design aesthetic
|
| 157 |
custom_css = """
|
| 158 |
/* Base styling */
|
|
@@ -374,6 +596,32 @@ body, .gradio-container {
|
|
| 374 |
font-size: 0.9rem;
|
| 375 |
}
|
| 376 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
/* Loading animation */
|
| 378 |
@keyframes spin {
|
| 379 |
0% { transform: rotate(0deg); }
|
|
@@ -457,7 +705,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base(), title="NVIDIA Nemotron Na
|
|
| 457 |
with gr.Column(scale=8):
|
| 458 |
gr.Markdown("""
|
| 459 |
# ⚡ NVIDIA Nemotron Nano 2 VL
|
| 460 |
-
### 12B Parameter Multimodal Reasoning Model
|
| 461 |
Advanced document intelligence, chart analysis, video understanding, and reasoning capabilities
|
| 462 |
""", elem_classes="markdown-content")
|
| 463 |
with gr.Column(scale=2):
|
|
|
|
| 5 |
import json
|
| 6 |
from PIL import Image
|
| 7 |
import io
|
| 8 |
+
import cv2
|
| 9 |
+
import tempfile
|
| 10 |
+
import numpy as np
|
| 11 |
+
from pathlib import Path
|
| 12 |
|
| 13 |
# Global variable to store the OpenAI client
|
| 14 |
client = None
|
|
|
|
| 45 |
# Encode to base64
|
| 46 |
return base64.b64encode(img_bytes).decode('utf-8')
|
| 47 |
|
| 48 |
+
def _frame_to_pil(frame, max_side=1280):
    """Convert one BGR OpenCV frame into an RGB PIL image bounded by max_side."""
    pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    # thumbnail() resizes in place and keeps the aspect ratio.
    pil_image.thumbnail((max_side, max_side), Image.Resampling.LANCZOS)
    return pil_image


def _read_frames(cap, frame_indices):
    """Seek to each index in turn and return the frames that could be decoded."""
    frames = []
    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ret, frame = cap.read()
        if ret:  # frames that fail to decode are skipped, not fatal
            frames.append(_frame_to_pil(frame))
    return frames


def _uniform_indices(total_frames, num_frames):
    """Return num_frames evenly spaced frame indices over the whole video."""
    return np.linspace(0, total_frames - 1, num_frames, dtype=int).tolist()


def _keyframe_indices(cap, total_frames, num_frames, threshold=30.0):
    """Pick indices whose grayscale content differs strongly from the previous probe."""
    if num_frames <= 0:
        return []

    # Probe roughly three candidates per requested frame.
    step = max(1, total_frames // (num_frames * 3))
    selected = []
    prev_gray = None

    for i in range(0, total_frames, step):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if not ret:
            continue

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        # The first readable probe is always kept; later ones only when the
        # mean absolute pixel difference exceeds the threshold.
        if prev_gray is None or np.mean(cv2.absdiff(prev_gray, gray)) > threshold:
            selected.append(i)
        prev_gray = gray

        if len(selected) >= num_frames:
            break

    # Not enough scene changes: top up with uniform samples not already chosen.
    if len(selected) < num_frames:
        missing = num_frames - len(selected)
        seen = set(selected)
        fillers = [
            idx for idx in _uniform_indices(total_frames, num_frames)
            if idx not in seen
        ]
        selected.extend(fillers[:missing])

    return sorted(selected)[:num_frames]


def _adaptive_indices(total_frames, num_frames):
    """Return indices split across the first 20%, middle 60% and last 20% of the video."""
    if num_frames <= 0:
        return []

    last = total_frames - 1
    start_count = end_count = num_frames // 3
    middle_count = num_frames - start_count - end_count
    start_cut = total_frames * 0.2
    end_cut = total_frames * 0.8

    sections = [
        # endpoint=False so a section boundary is not sampled by two sections.
        np.linspace(0, start_cut, start_count, endpoint=False),
        np.linspace(start_cut, end_cut, middle_count, endpoint=False),
        # On very short clips 0.8 * total can exceed the last valid index.
        np.linspace(min(end_cut, last), last, end_count),
    ]
    positions = np.clip(np.concatenate(sections), 0, last).astype(int)
    # Integer truncation can still collide on tiny videos; drop the duplicates.
    return sorted(set(positions.tolist()))


def extract_frames_evs(video_path, num_frames=8, method="uniform"):
    """
    Extract frames from video using Efficient Video Sampling (EVS)

    Args:
        video_path: Path to video file
        num_frames: Number of frames to extract (default: 8). Coerced to int
            and capped at the number of frames in the video; values <= 0
            yield an empty frame list.
        method: Sampling method - "uniform", "keyframe", or "adaptive"

    Returns:
        Tuple ``(frames, info)`` where ``frames`` is a list of PIL Images
        (each at most 1280px on its longest side) and ``info`` is a dict with
        the keys "total_frames", "fps", "duration", "extracted_frames" and
        "method". Fewer frames than requested may be returned when frames
        cannot be decoded or when the video is too short for distinct
        adaptive samples.

    Raises:
        Exception: Any failure (unopenable file, empty video, unknown method,
            decoding error) is re-raised as ``Exception`` with the message
            prefix "Error extracting frames: "; the original error is chained
            as ``__cause__``.
    """
    cap = None

    try:
        # Open video file
        cap = cv2.VideoCapture(video_path)

        if not cap.isOpened():
            raise ValueError("Could not open video file")

        # Get video properties
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames / fps if fps > 0 else 0

        if total_frames <= 0:
            raise ValueError("Video has no frames")

        # UI widgets may deliver the count as a float; np.linspace and range
        # both need an int. Also never ask for more frames than exist.
        num_frames = max(0, min(int(num_frames), total_frames))

        if method == "uniform":
            frame_indices = _uniform_indices(total_frames, num_frames)
        elif method == "keyframe":
            frame_indices = _keyframe_indices(cap, total_frames, num_frames)
        elif method == "adaptive":
            frame_indices = _adaptive_indices(total_frames, num_frames)
        else:
            # Previously an unknown method silently produced zero frames.
            raise ValueError(f"Unknown sampling method: {method!r}")

        frames = _read_frames(cap, frame_indices)

        return frames, {
            "total_frames": total_frames,
            "fps": fps,
            "duration": duration,
            "extracted_frames": len(frames),
            "method": method
        }

    except Exception as e:
        # Exception type and message format kept for existing callers.
        raise Exception(f"Error extracting frames: {str(e)}") from e
    finally:
        # Release the capture on every path, success or failure.
        if cap is not None:
            cap.release()
|
| 186 |
+
|
| 187 |
def create_message_content(text, images=None):
|
| 188 |
"""Create message content with text and optional images"""
|
| 189 |
content = []
|
|
|
|
| 296 |
"reasoning": ""
|
| 297 |
})
|
| 298 |
|
| 299 |
+
def process_video(api_key, video_file, question, num_frames, sampling_method, enable_reasoning):
    """Process video with frame extraction and analysis

    Samples frames from the uploaded video with ``extract_frames_evs``, sends
    them together with a prompt to the Nemotron model through the module-level
    ``client``, and formats the answer for the UI.

    Args:
        api_key: API key handed to ``initialize_client``.
        video_file: Video input forwarded to ``extract_frames_evs`` as the
            video path; ``None`` means nothing was uploaded.
        question: Optional user question; when blank a generic
            "describe the video" prompt is used.
        num_frames: Requested number of frames (coerced to ``int``).
        sampling_method: Sampling method name passed to ``extract_frames_evs``.
        enable_reasoning: When truthy, asks the API for reasoning output via
            ``extra_body``.

    Returns:
        A 4-tuple ``(analysis_markdown, reasoning_text, frames, status_text)``.
        On any failure the first element is an error message, ``frames`` is
        ``None`` and no exception propagates.
    """
    if not initialize_client(api_key):
        return "❌ Please enter a valid OpenRouter API key.", "", None, ""

    if video_file is None:
        return "❌ Please upload a video file.", "", None, ""

    try:
        status_msg = "⏳ Extracting frames from video using EVS...\n"

        # The frame count comes from a UI control and may arrive as a float;
        # frame sampling needs an integer.
        frames, video_info = extract_frames_evs(
            video_file,
            num_frames=int(num_frames),
            method=sampling_method
        )

        if not frames:
            return "❌ Could not extract frames from video.", "", None, ""

        # Update status with video info
        status_msg += (
            "\n✅ Video Analysis:\n"
            f" • Total frames: {video_info['total_frames']}\n"
            f" • FPS: {video_info['fps']:.2f}\n"
            f" • Duration: {video_info['duration']:.2f} seconds\n"
            f" • Extracted: {video_info['extracted_frames']} frames\n"
            f" • Method: {video_info['method']}\n"
            "\n⏳ Analyzing frames with Nemotron AI...\n"
        )

        # Create prompt
        if not question or not question.strip():
            prompt = f"Analyze this video by examining these {len(frames)} frames extracted from it. Provide a comprehensive description of:\n1. What is happening in the video\n2. Key events or actions\n3. Any changes or progression throughout\n4. Overall context and meaning\n5. Temporal relationships between frames"
        else:
            prompt = f"Based on these {len(frames)} frames from a video, {question}"

        # Prepare API call: a single user message carrying the prompt and all frames
        api_params = {
            "model": "nvidia/nemotron-nano-12b-v2-vl:free",
            "messages": [{
                "role": "user",
                "content": create_message_content(prompt, frames)
            }],
            "max_tokens": 4000,
        }

        if enable_reasoning:
            api_params["extra_body"] = {"reasoning": {"enabled": True}}

        # Make API call
        response = client.chat.completions.create(**api_params)

        # An empty choices list would otherwise surface as a bare IndexError.
        choices = getattr(response, "choices", None)
        if not choices:
            raise ValueError("The model returned no choices")

        message = choices[0].message
        # content can be None; avoid rendering the literal text "None".
        result = message.content or "The model returned an empty response."

        # Extract reasoning if available. default=str is deliberate: the value
        # is only displayed, and a non-JSON-serialisable entry must not throw
        # away an otherwise successful analysis.
        reasoning_details = ""
        raw_reasoning = getattr(message, "reasoning_details", None)
        if raw_reasoning:
            reasoning_details = json.dumps(raw_reasoning, indent=2, default=str)

        status_msg += "\n✅ Analysis complete!\n"

        return (
            f"🎥 **Video Analysis Complete**\n\n{result}",
            reasoning_details if reasoning_details else "No reasoning details available.",
            frames,  # shown as the frame gallery
            status_msg
        )

    except Exception as e:
        # UI boundary: report the failure in the outputs instead of raising.
        return f"❌ Error processing video: {str(e)}", "", None, f"❌ Error: {str(e)}"
|
| 377 |
+
|
| 378 |
# Enhanced custom CSS with the React design aesthetic
|
| 379 |
custom_css = """
|
| 380 |
/* Base styling */
|
|
|
|
| 596 |
font-size: 0.9rem;
|
| 597 |
}
|
| 598 |
|
| 599 |
+
/* Gallery */
|
| 600 |
+
.gr-gallery {
|
| 601 |
+
background: rgba(0, 0, 0, 0.3) !important;
|
| 602 |
+
border-radius: 16px !important;
|
| 603 |
+
border: 1px solid var(--border-color) !important;
|
| 604 |
+
}
|
| 605 |
+
|
| 606 |
+
/* Slider */
|
| 607 |
+
.gr-slider {
|
| 608 |
+
background: rgba(0, 0, 0, 0.3) !important;
|
| 609 |
+
border-radius: 12px !important;
|
| 610 |
+
}
|
| 611 |
+
|
| 612 |
+
/* Radio */
|
| 613 |
+
.gr-radio {
|
| 614 |
+
background: rgba(0, 0, 0, 0.3) !important;
|
| 615 |
+
border-radius: 12px !important;
|
| 616 |
+
padding: 12px !important;
|
| 617 |
+
}
|
| 618 |
+
|
| 619 |
+
/* Checkbox */
|
| 620 |
+
.gr-checkbox {
|
| 621 |
+
background: rgba(0, 0, 0, 0.2) !important;
|
| 622 |
+
border-radius: 8px !important;
|
| 623 |
+
}
|
| 624 |
+
|
| 625 |
/* Loading animation */
|
| 626 |
@keyframes spin {
|
| 627 |
0% { transform: rotate(0deg); }
|
|
|
|
| 705 |
with gr.Column(scale=8):
|
| 706 |
gr.Markdown("""
|
| 707 |
# ⚡ NVIDIA Nemotron Nano 2 VL
|
| 708 |
+
### 12B Parameter Multimodal Reasoning Model with EVS Video Analysis
|
| 709 |
Advanced document intelligence, chart analysis, video understanding, and reasoning capabilities
|
| 710 |
""", elem_classes="markdown-content")
|
| 711 |
with gr.Column(scale=2):
|