Spaces:

shukdevdattaEX
/

NemoVision

Paused

App Files Files Community

shukdevdattaEX committed on Dec 26, 2025

Commit

6b9a020

verified ·

1 Parent(s): fe58263

Update app.py

Browse files

Files changed (1) hide show

app.py +249 -1

app.py CHANGED Viewed

@@ -5,6 +5,10 @@ import base64
 import json
 from PIL import Image
 import io
 # Global variable to store the OpenAI client
 client = None
@@ -41,6 +45,145 @@ def encode_image(image):
     # Encode to base64
     return base64.b64encode(img_bytes).decode('utf-8')
 def create_message_content(text, images=None):
     """Create message content with text and optional images"""
     content = []
@@ -153,6 +296,85 @@ def process_request(api_key, task_type, image1=None, image2=None, image3=None, i
             "reasoning": ""
         })
 # Enhanced custom CSS with the React design aesthetic
 custom_css = """
 /* Base styling */
@@ -374,6 +596,32 @@ body, .gradio-container {
     font-size: 0.9rem;
 }
 /* Loading animation */
 @keyframes spin {
     0% { transform: rotate(0deg); }
@@ -457,7 +705,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base(), title="NVIDIA Nemotron Na
         with gr.Column(scale=8):
             gr.Markdown("""
             # ⚡ NVIDIA Nemotron Nano 2 VL
-            ### 12B Parameter Multimodal Reasoning Model
             Advanced document intelligence, chart analysis, video understanding, and reasoning capabilities
             """, elem_classes="markdown-content")
         with gr.Column(scale=2):

 import json
 from PIL import Image
 import io
+import cv2
+import tempfile
+import numpy as np
+from pathlib import Path
 # Global variable to store the OpenAI client
 client = None
     # Encode to base64
     return base64.b64encode(img_bytes).decode('utf-8')
+def extract_frames_evs(video_path, num_frames=8, method="uniform"):
+    """
+    Extract frames from video using Efficient Video Sampling (EVS)
+    Args:
+        video_path: Path to video file
+        num_frames: Number of frames to extract (default: 8)
+        method: Sampling method - "uniform", "keyframe", or "adaptive"
+    Returns:
+        List of PIL Images
+    """
+    frames = []
+    try:
+        # Open video file
+        cap = cv2.VideoCapture(video_path)
+        if not cap.isOpened():
+            raise ValueError("Could not open video file")
+        # Get video properties
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        duration = total_frames / fps if fps > 0 else 0
+        if total_frames == 0:
+            raise ValueError("Video has no frames")
+        # Adjust num_frames if video is too short
+        num_frames = min(num_frames, total_frames)
+        if method == "uniform":
+            # Uniform sampling - evenly spaced frames
+            frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+            for idx in frame_indices:
+                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+                ret, frame = cap.read()
+                if ret:
+                    # Convert BGR to RGB
+                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    # Convert to PIL Image
+                    pil_image = Image.fromarray(frame_rgb)
+                    # Resize for efficiency (max 1280px on longest side)
+                    pil_image.thumbnail((1280, 1280), Image.Resampling.LANCZOS)
+                    frames.append(pil_image)
+        elif method == "keyframe":
+            # Keyframe detection - extract frames with significant changes
+            prev_frame = None
+            frame_indices = []
+            threshold = 30.0  # Difference threshold
+            for i in range(0, total_frames, max(1, total_frames // (num_frames * 3))):
+                cap.set(cv2.CAP_PROP_POS_FRAMES, i)
+                ret, frame = cap.read()
+                if not ret:
+                    continue
+                # Convert to grayscale for comparison
+                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+                if prev_frame is not None:
+                    # Calculate difference
+                    diff = cv2.absdiff(prev_frame, gray)
+                    diff_score = np.mean(diff)
+                    if diff_score > threshold:
+                        frame_indices.append(i)
+                else:
+                    frame_indices.append(i)
+                prev_frame = gray
+                if len(frame_indices) >= num_frames:
+                    break
+            # If we didn't get enough keyframes, add uniform samples
+            if len(frame_indices) < num_frames:
+                additional = num_frames - len(frame_indices)
+                uniform_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+                frame_indices.extend([idx for idx in uniform_indices if idx not in frame_indices][:additional])
+            frame_indices = sorted(frame_indices)[:num_frames]
+            for idx in frame_indices:
+                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+                ret, frame = cap.read()
+                if ret:
+                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    pil_image = Image.fromarray(frame_rgb)
+                    pil_image.thumbnail((1280, 1280), Image.Resampling.LANCZOS)
+                    frames.append(pil_image)
+        elif method == "adaptive":
+            # Adaptive sampling - more frames at beginning and end, fewer in middle
+            # This is useful for videos with action at start/end
+            start_frames = num_frames // 3
+            end_frames = num_frames // 3
+            middle_frames = num_frames - start_frames - end_frames
+            # Start section
+            start_indices = np.linspace(0, total_frames * 0.2, start_frames, dtype=int)
+            # Middle section
+            middle_indices = np.linspace(total_frames * 0.2, total_frames * 0.8, middle_frames, dtype=int)
+            # End section
+            end_indices = np.linspace(total_frames * 0.8, total_frames - 1, end_frames, dtype=int)
+            frame_indices = np.concatenate([start_indices, middle_indices, end_indices])
+            for idx in frame_indices:
+                cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
+                ret, frame = cap.read()
+                if ret:
+                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    pil_image = Image.fromarray(frame_rgb)
+                    pil_image.thumbnail((1280, 1280), Image.Resampling.LANCZOS)
+                    frames.append(pil_image)
+        cap.release()
+        return frames, {
+            "total_frames": total_frames,
+            "fps": fps,
+            "duration": duration,
+            "extracted_frames": len(frames),
+            "method": method
+        }
+    except Exception as e:
+        if 'cap' in locals():
+            cap.release()
+        raise Exception(f"Error extracting frames: {str(e)}")
 def create_message_content(text, images=None):
     """Create message content with text and optional images"""
     content = []
             "reasoning": ""
         })
+def process_video(api_key, video_file, question, num_frames, sampling_method, enable_reasoning):
+    """Process video with frame extraction and analysis"""
+    if not initialize_client(api_key):
+        return "❌ Please enter a valid OpenRouter API key.", "", None, ""
+    if video_file is None:
+        return "❌ Please upload a video file.", "", None, ""
+    try:
+        # Update status
+        status_msg = "⏳ Extracting frames from video using EVS...\n"
+        # Extract frames
+        frames, video_info = extract_frames_evs(
+            video_file,
+            num_frames=num_frames,
+            method=sampling_method
+        )
+        if not frames:
+            return "❌ Could not extract frames from video.", "", None, ""
+        # Update status with video info
+        status_msg += f"\n✅ Video Analysis:\n"
+        status_msg += f"   • Total frames: {video_info['total_frames']}\n"
+        status_msg += f"   • FPS: {video_info['fps']:.2f}\n"
+        status_msg += f"   • Duration: {video_info['duration']:.2f} seconds\n"
+        status_msg += f"   • Extracted: {video_info['extracted_frames']} frames\n"
+        status_msg += f"   • Method: {video_info['method']}\n"
+        status_msg += f"\n⏳ Analyzing frames with Nemotron AI...\n"
+        # Create prompt
+        if not question or not question.strip():
+            prompt = f"Analyze this video by examining these {len(frames)} frames extracted from it. Provide a comprehensive description of:\n1. What is happening in the video\n2. Key events or actions\n3. Any changes or progression throughout\n4. Overall context and meaning\n5. Temporal relationships between frames"
+        else:
+            prompt = f"Based on these {len(frames)} frames from a video, {question}"
+        # Create message content with all frames
+        messages = [{
+            "role": "user",
+            "content": create_message_content(prompt, frames)
+        }]
+        # Prepare API call
+        api_params = {
+            "model": "nvidia/nemotron-nano-12b-v2-vl:free",
+            "messages": messages,
+            "max_tokens": 4000,
+        }
+        if enable_reasoning:
+            api_params["extra_body"] = {"reasoning": {"enabled": True}}
+        # Make API call
+        response = client.chat.completions.create(**api_params)
+        result = response.choices[0].message.content
+        reasoning_details = ""
+        # Extract reasoning if available
+        if hasattr(response.choices[0].message, 'reasoning_details') and response.choices[0].message.reasoning_details:
+            reasoning_details = json.dumps(response.choices[0].message.reasoning_details, indent=2)
+        # Create frame gallery
+        frame_gallery = frames
+        status_msg += f"\n✅ Analysis complete!\n"
+        return (
+            f"🎥 **Video Analysis Complete**\n\n{result}",
+            reasoning_details if reasoning_details else "No reasoning details available.",
+            frame_gallery,
+            status_msg
+        )
+    except Exception as e:
+        return f"❌ Error processing video: {str(e)}", "", None, f"❌ Error: {str(e)}"
 # Enhanced custom CSS with the React design aesthetic
 custom_css = """
 /* Base styling */
     font-size: 0.9rem;
 }
+/* Gallery */
+.gr-gallery {
+    background: rgba(0, 0, 0, 0.3) !important;
+    border-radius: 16px !important;
+    border: 1px solid var(--border-color) !important;
+}
+/* Slider */
+.gr-slider {
+    background: rgba(0, 0, 0, 0.3) !important;
+    border-radius: 12px !important;
+}
+/* Radio */
+.gr-radio {
+    background: rgba(0, 0, 0, 0.3) !important;
+    border-radius: 12px !important;
+    padding: 12px !important;
+}
+/* Checkbox */
+.gr-checkbox {
+    background: rgba(0, 0, 0, 0.2) !important;
+    border-radius: 8px !important;
+}
 /* Loading animation */
 @keyframes spin {
     0% { transform: rotate(0deg); }
         with gr.Column(scale=8):
             gr.Markdown("""
             # ⚡ NVIDIA Nemotron Nano 2 VL
+            ### 12B Parameter Multimodal Reasoning Model with EVS Video Analysis
             Advanced document intelligence, chart analysis, video understanding, and reasoning capabilities
             """, elem_classes="markdown-content")
         with gr.Column(scale=2):