shukdevdattaEX committed on
Commit
6b9a020
·
verified ·
1 Parent(s): fe58263

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +249 -1
app.py CHANGED
@@ -5,6 +5,10 @@ import base64
5
  import json
6
  from PIL import Image
7
  import io
 
 
 
 
8
 
9
  # Global variable to store the OpenAI client
10
  client = None
@@ -41,6 +45,145 @@ def encode_image(image):
41
  # Encode to base64
42
  return base64.b64encode(img_bytes).decode('utf-8')
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  def create_message_content(text, images=None):
45
  """Create message content with text and optional images"""
46
  content = []
@@ -153,6 +296,85 @@ def process_request(api_key, task_type, image1=None, image2=None, image3=None, i
153
  "reasoning": ""
154
  })
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  # Enhanced custom CSS with the React design aesthetic
157
  custom_css = """
158
  /* Base styling */
@@ -374,6 +596,32 @@ body, .gradio-container {
374
  font-size: 0.9rem;
375
  }
376
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  /* Loading animation */
378
  @keyframes spin {
379
  0% { transform: rotate(0deg); }
@@ -457,7 +705,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base(), title="NVIDIA Nemotron Na
457
  with gr.Column(scale=8):
458
  gr.Markdown("""
459
  # ⚡ NVIDIA Nemotron Nano 2 VL
460
- ### 12B Parameter Multimodal Reasoning Model
461
  Advanced document intelligence, chart analysis, video understanding, and reasoning capabilities
462
  """, elem_classes="markdown-content")
463
  with gr.Column(scale=2):
 
5
  import json
6
  from PIL import Image
7
  import io
8
+ import cv2
9
+ import tempfile
10
+ import numpy as np
11
+ from pathlib import Path
12
 
13
  # Global variable to store the OpenAI client
14
  client = None
 
45
  # Encode to base64
46
  return base64.b64encode(img_bytes).decode('utf-8')
47
 
48
def extract_frames_evs(video_path, num_frames=8, method="uniform"):
    """
    Extract representative frames from a video using Efficient Video Sampling (EVS).

    Args:
        video_path: Path to the video file on disk.
        num_frames: Maximum number of frames to extract (default: 8).
        method: Sampling strategy — "uniform" (evenly spaced), "keyframe"
            (frames with significant inter-frame change), or "adaptive"
            (denser sampling at the start and end of the clip). Any other
            value produces an empty frame list.

    Returns:
        Tuple ``(frames, info)`` where ``frames`` is a list of RGB PIL Images
        (downscaled to at most 1280px on the longest side) and ``info`` is a
        dict with keys: total_frames, fps, duration, extracted_frames, method.
        (The original docstring claimed only a list was returned — the tuple
        is what callers actually unpack.)

    Raises:
        Exception: If the video cannot be opened, has no frames, or any
            extraction step fails; the original cause is chained.
    """
    frames = []
    cap = None

    def _grab(index):
        """Seek to `index`, decode one frame, append it as a resized RGB PIL image."""
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(index))
        ret, frame = cap.read()
        if ret:
            # OpenCV decodes BGR; PIL expects RGB.
            pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            # Cap resolution to keep the per-frame payload sent to the model small.
            pil_image.thumbnail((1280, 1280), Image.Resampling.LANCZOS)
            frames.append(pil_image)

    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise ValueError("Could not open video file")

        # Basic stream metadata (fps may be 0 for some containers).
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames / fps if fps > 0 else 0

        if total_frames == 0:
            raise ValueError("Video has no frames")

        # Never request more frames than the video contains.
        num_frames = min(num_frames, total_frames)

        if method == "uniform":
            # Evenly spaced sample across the whole clip.
            for idx in np.linspace(0, total_frames - 1, num_frames, dtype=int):
                _grab(idx)

        elif method == "keyframe":
            # Scan a coarse grid of frames and keep those whose grayscale
            # difference from the previously scanned frame exceeds a threshold.
            prev_gray = None
            frame_indices = []
            threshold = 30.0  # mean absolute pixel difference

            step = max(1, total_frames // (num_frames * 3))
            for i in range(0, total_frames, step):
                cap.set(cv2.CAP_PROP_POS_FRAMES, i)
                ret, frame = cap.read()
                if not ret:
                    continue

                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                if prev_gray is not None:
                    if np.mean(cv2.absdiff(prev_gray, gray)) > threshold:
                        frame_indices.append(i)
                else:
                    # Always keep the first decodable frame.
                    frame_indices.append(i)

                prev_gray = gray
                if len(frame_indices) >= num_frames:
                    break

            # Pad with uniform samples if keyframe detection found too few.
            if len(frame_indices) < num_frames:
                additional = num_frames - len(frame_indices)
                uniform_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
                frame_indices.extend(
                    [idx for idx in uniform_indices if idx not in frame_indices][:additional]
                )

            for idx in sorted(frame_indices)[:num_frames]:
                _grab(idx)

        elif method == "adaptive":
            # Bias sampling toward the start and end of the clip, where
            # action often concentrates; sample the middle more sparsely.
            start_frames = num_frames // 3
            end_frames = num_frames // 3
            middle_frames = num_frames - start_frames - end_frames

            start_indices = np.linspace(0, total_frames * 0.2, start_frames, dtype=int)
            middle_indices = np.linspace(total_frames * 0.2, total_frames * 0.8, middle_frames, dtype=int)
            end_indices = np.linspace(total_frames * 0.8, total_frames - 1, end_frames, dtype=int)

            for idx in np.concatenate([start_indices, middle_indices, end_indices]):
                _grab(idx)

        return frames, {
            "total_frames": total_frames,
            "fps": fps,
            "duration": duration,
            "extracted_frames": len(frames),
            "method": method,
        }

    except Exception as e:
        # Same outward message as before, with the original cause chained.
        raise Exception(f"Error extracting frames: {str(e)}") from e
    finally:
        # Release the capture on success and failure alike (replaces the
        # fragile `if 'cap' in locals()` guard that ran only on error).
        if cap is not None:
            cap.release()
186
+
187
  def create_message_content(text, images=None):
188
  """Create message content with text and optional images"""
189
  content = []
 
296
  "reasoning": ""
297
  })
298
 
299
def process_video(api_key, video_file, question, num_frames, sampling_method, enable_reasoning):
    """Process video with frame extraction and analysis"""

    # Guard clauses: a working client and an uploaded file are both required.
    if not initialize_client(api_key):
        return "❌ Please enter a valid OpenRouter API key.", "", None, ""
    if video_file is None:
        return "❌ Please upload a video file.", "", None, ""

    try:
        # Accumulate status text as parts; joined once at the end.
        status_parts = ["⏳ Extracting frames from video using EVS...\n"]

        # Sample frames from the uploaded video with the chosen strategy.
        frames, video_info = extract_frames_evs(
            video_file,
            num_frames=num_frames,
            method=sampling_method,
        )
        if not frames:
            return "❌ Could not extract frames from video.", "", None, ""

        # Summarize the extraction for the status panel.
        status_parts.append(f"\n✅ Video Analysis:\n")
        status_parts.append(f" • Total frames: {video_info['total_frames']}\n")
        status_parts.append(f" • FPS: {video_info['fps']:.2f}\n")
        status_parts.append(f" • Duration: {video_info['duration']:.2f} seconds\n")
        status_parts.append(f" • Extracted: {video_info['extracted_frames']} frames\n")
        status_parts.append(f" • Method: {video_info['method']}\n")
        status_parts.append(f"\n⏳ Analyzing frames with Nemotron AI...\n")

        # Build the user prompt: the caller's question when one was given,
        # otherwise a generic comprehensive-description request.
        if question and question.strip():
            prompt = f"Based on these {len(frames)} frames from a video, {question}"
        else:
            prompt = f"Analyze this video by examining these {len(frames)} frames extracted from it. Provide a comprehensive description of:\n1. What is happening in the video\n2. Key events or actions\n3. Any changes or progression throughout\n4. Overall context and meaning\n5. Temporal relationships between frames"

        # One user turn carrying the prompt plus every sampled frame.
        api_params = {
            "model": "nvidia/nemotron-nano-12b-v2-vl:free",
            "messages": [{
                "role": "user",
                "content": create_message_content(prompt, frames),
            }],
            "max_tokens": 4000,
        }
        if enable_reasoning:
            api_params["extra_body"] = {"reasoning": {"enabled": True}}

        response = client.chat.completions.create(**api_params)
        message = response.choices[0].message
        result = message.content

        # Surface reasoning traces when the API returned them.
        reasoning_details = ""
        if hasattr(message, 'reasoning_details') and message.reasoning_details:
            reasoning_details = json.dumps(message.reasoning_details, indent=2)

        status_parts.append(f"\n✅ Analysis complete!\n")

        return (
            f"🎥 **Video Analysis Complete**\n\n{result}",
            reasoning_details if reasoning_details else "No reasoning details available.",
            frames,
            "".join(status_parts),
        )

    except Exception as e:
        return f"❌ Error processing video: {str(e)}", "", None, f"❌ Error: {str(e)}"
377
+
378
  # Enhanced custom CSS with the React design aesthetic
379
  custom_css = """
380
  /* Base styling */
 
596
  font-size: 0.9rem;
597
  }
598
 
599
+ /* Gallery */
600
+ .gr-gallery {
601
+ background: rgba(0, 0, 0, 0.3) !important;
602
+ border-radius: 16px !important;
603
+ border: 1px solid var(--border-color) !important;
604
+ }
605
+
606
+ /* Slider */
607
+ .gr-slider {
608
+ background: rgba(0, 0, 0, 0.3) !important;
609
+ border-radius: 12px !important;
610
+ }
611
+
612
+ /* Radio */
613
+ .gr-radio {
614
+ background: rgba(0, 0, 0, 0.3) !important;
615
+ border-radius: 12px !important;
616
+ padding: 12px !important;
617
+ }
618
+
619
+ /* Checkbox */
620
+ .gr-checkbox {
621
+ background: rgba(0, 0, 0, 0.2) !important;
622
+ border-radius: 8px !important;
623
+ }
624
+
625
  /* Loading animation */
626
  @keyframes spin {
627
  0% { transform: rotate(0deg); }
 
705
  with gr.Column(scale=8):
706
  gr.Markdown("""
707
  # ⚡ NVIDIA Nemotron Nano 2 VL
708
+ ### 12B Parameter Multimodal Reasoning Model with EVS Video Analysis
709
  Advanced document intelligence, chart analysis, video understanding, and reasoning capabilities
710
  """, elem_classes="markdown-content")
711
  with gr.Column(scale=2):