shukdevdattaEX commited on
Commit
c0c6801
·
verified ·
1 Parent(s): 389bfd4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +262 -933
app.py CHANGED
@@ -1,974 +1,303 @@
1
  import gradio as gr
2
- import os
3
  from openai import OpenAI
4
  import base64
5
- import json
 
6
  from PIL import Image
7
  import io
8
- import cv2
9
- import tempfile
10
- import numpy as np
11
- from pathlib import Path
12
-
13
- # Global variable to store the OpenAI client
14
- client = None
15
-
16
- def initialize_client(api_key):
17
- """Initialize the OpenAI client with the provided API key"""
18
- global client
19
- if api_key and api_key.strip():
20
- client = OpenAI(
21
- base_url="https://openrouter.ai/api/v1",
22
- api_key=api_key.strip(),
23
- )
24
- return True
25
- return False
26
-
27
- def encode_image(image):
28
- """Encode image to base64 string"""
29
- if image is None:
30
- return None
31
-
32
- # Convert to PIL Image if it's not already
33
- if not isinstance(image, Image.Image):
34
- image = Image.fromarray(image)
35
-
36
- # Convert to RGB if needed
37
- if image.mode != 'RGB':
38
- image = image.convert('RGB')
39
-
40
- # Save to bytes
41
- buffered = io.BytesIO()
42
- image.save(buffered, format="JPEG", quality=95)
43
- img_bytes = buffered.getvalue()
44
-
45
- # Encode to base64
46
- return base64.b64encode(img_bytes).decode('utf-8')
47
-
48
- def extract_frames_evs(video_path, num_frames=8, method="uniform"):
49
- """
50
- Extract frames from video using Efficient Video Sampling (EVS)
51
-
52
- Args:
53
- video_path: Path to video file
54
- num_frames: Number of frames to extract (default: 8)
55
- method: Sampling method - "uniform", "keyframe", or "adaptive"
56
-
57
- Returns:
58
- List of PIL Images
59
- """
60
- frames = []
61
-
62
- try:
63
- # Open video file
64
- cap = cv2.VideoCapture(video_path)
65
-
66
- if not cap.isOpened():
67
- raise ValueError("Could not open video file")
68
-
69
- # Get video properties
70
- total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
71
- fps = cap.get(cv2.CAP_PROP_FPS)
72
- duration = total_frames / fps if fps > 0 else 0
73
-
74
- if total_frames == 0:
75
- raise ValueError("Video has no frames")
76
-
77
- # Adjust num_frames if video is too short
78
- num_frames = min(num_frames, total_frames)
79
-
80
- if method == "uniform":
81
- # Uniform sampling - evenly spaced frames
82
- frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
83
-
84
- for idx in frame_indices:
85
- cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
86
- ret, frame = cap.read()
87
-
88
- if ret:
89
- # Convert BGR to RGB
90
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
91
- # Convert to PIL Image
92
- pil_image = Image.fromarray(frame_rgb)
93
- # Resize for efficiency (max 1280px on longest side)
94
- pil_image.thumbnail((1280, 1280), Image.Resampling.LANCZOS)
95
- frames.append(pil_image)
96
-
97
- elif method == "keyframe":
98
- # Keyframe detection - extract frames with significant changes
99
- prev_frame = None
100
- frame_indices = []
101
- threshold = 30.0 # Difference threshold
102
-
103
- for i in range(0, total_frames, max(1, total_frames // (num_frames * 3))):
104
- cap.set(cv2.CAP_PROP_POS_FRAMES, i)
105
- ret, frame = cap.read()
106
-
107
- if not ret:
108
- continue
109
-
110
- # Convert to grayscale for comparison
111
- gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
112
-
113
- if prev_frame is not None:
114
- # Calculate difference
115
- diff = cv2.absdiff(prev_frame, gray)
116
- diff_score = np.mean(diff)
117
-
118
- if diff_score > threshold:
119
- frame_indices.append(i)
120
- else:
121
- frame_indices.append(i)
122
-
123
- prev_frame = gray
124
-
125
- if len(frame_indices) >= num_frames:
126
- break
127
-
128
- # If we didn't get enough keyframes, add uniform samples
129
- if len(frame_indices) < num_frames:
130
- additional = num_frames - len(frame_indices)
131
- uniform_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
132
- frame_indices.extend([idx for idx in uniform_indices if idx not in frame_indices][:additional])
133
-
134
- frame_indices = sorted(frame_indices)[:num_frames]
135
-
136
- for idx in frame_indices:
137
- cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
138
- ret, frame = cap.read()
139
-
140
- if ret:
141
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
142
- pil_image = Image.fromarray(frame_rgb)
143
- pil_image.thumbnail((1280, 1280), Image.Resampling.LANCZOS)
144
- frames.append(pil_image)
145
-
146
- elif method == "adaptive":
147
- # Adaptive sampling - more frames at beginning and end, fewer in middle
148
- # This is useful for videos with action at start/end
149
- start_frames = num_frames // 3
150
- end_frames = num_frames // 3
151
- middle_frames = num_frames - start_frames - end_frames
152
-
153
- # Start section
154
- start_indices = np.linspace(0, total_frames * 0.2, start_frames, dtype=int)
155
- # Middle section
156
- middle_indices = np.linspace(total_frames * 0.2, total_frames * 0.8, middle_frames, dtype=int)
157
- # End section
158
- end_indices = np.linspace(total_frames * 0.8, total_frames - 1, end_frames, dtype=int)
159
-
160
- frame_indices = np.concatenate([start_indices, middle_indices, end_indices])
161
-
162
- for idx in frame_indices:
163
- cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
164
- ret, frame = cap.read()
165
-
166
- if ret:
167
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
168
- pil_image = Image.fromarray(frame_rgb)
169
- pil_image.thumbnail((1280, 1280), Image.Resampling.LANCZOS)
170
- frames.append(pil_image)
171
-
172
- cap.release()
173
-
174
- return frames, {
175
- "total_frames": total_frames,
176
- "fps": fps,
177
- "duration": duration,
178
- "extracted_frames": len(frames),
179
- "method": method
180
- }
181
-
182
- except Exception as e:
183
- if 'cap' in locals():
184
- cap.release()
185
- raise Exception(f"Error extracting frames: {str(e)}")
186
-
187
- def create_message_content(text, images=None):
188
- """Create message content with text and optional images"""
189
- content = []
190
-
191
- # Add images first if provided
192
- if images:
193
- for img in images:
194
- if img is not None:
195
- img_base64 = encode_image(img)
196
- if img_base64:
197
- content.append({
198
- "type": "image_url",
199
- "image_url": {
200
- "url": f"data:image/jpeg;base64,{img_base64}"
201
- }
202
- })
203
-
204
- # Add text
205
- if text and text.strip():
206
- content.append({
207
- "type": "text",
208
- "text": text
209
- })
210
-
211
- return content if content else [{"type": "text", "text": "Please analyze this content."}]
212
-
213
- def process_request(api_key, task_type, image1=None, image2=None, image3=None, image4=None, text_input="", enable_reasoning=False):
214
- """Main processing function that handles all types of requests"""
215
-
216
- if not initialize_client(api_key):
217
- return json.dumps({
218
- "success": False,
219
- "error": "Please enter a valid OpenRouter API key.",
220
- "response": "",
221
- "reasoning": ""
222
- })
223
-
224
- try:
225
- # Collect all valid images
226
- images = [img for img in [image1, image2, image3, image4] if img is not None]
227
-
228
- # Validate inputs based on task type
229
- if task_type in ["ocr", "chart", "multimodal"] and not images and not text_input.strip():
230
- return json.dumps({
231
- "success": False,
232
- "error": "Please upload at least one image or enter text.",
233
- "response": "",
234
- "reasoning": ""
235
- })
236
-
237
- if task_type == "reasoning" and not text_input.strip():
238
- return json.dumps({
239
- "success": False,
240
- "error": "Please enter a question or problem to solve.",
241
- "response": "",
242
- "reasoning": ""
243
- })
244
-
245
- # Set default prompts based on task type
246
- if not text_input.strip():
247
- prompts = {
248
- "ocr": "Extract and analyze all text from this image. Provide a detailed analysis of the content, structure, and any key information.",
249
- "chart": "Analyze this chart in detail. Describe the type of chart, extract all data points, identify trends, and provide insights.",
250
- "video": "Analyze this video content frame by frame. Describe what you see and provide comprehensive insights.",
251
- "multimodal": f"Analyze these {len(images)} images. Compare and contrast them, identify relationships, and provide comprehensive insights."
252
- }
253
- text_input = prompts.get(task_type, "Please analyze this content.")
254
-
255
- # Create message content
256
- messages = [{
257
- "role": "user",
258
- "content": create_message_content(text_input, images if images else None)
259
- }]
260
-
261
- # Prepare API call parameters
262
- api_params = {
263
- "model": "nvidia/nemotron-nano-12b-v2-vl:free",
264
- "messages": messages,
265
- "max_tokens": 3000,
266
- }
267
-
268
- # Add reasoning if enabled
269
- if enable_reasoning or task_type == "reasoning":
270
- api_params["extra_body"] = {"reasoning": {"enabled": True}}
271
-
272
- # Make API call
273
- response = client.chat.completions.create(**api_params)
274
-
275
- result = response.choices[0].message.content
276
- reasoning_details = ""
277
-
278
- # Extract reasoning details if available
279
- if hasattr(response.choices[0].message, 'reasoning_details') and response.choices[0].message.reasoning_details:
280
- reasoning_details = json.dumps(response.choices[0].message.reasoning_details, indent=2)
281
-
282
- return json.dumps({
283
- "success": True,
284
- "error": "",
285
- "response": result,
286
- "reasoning": reasoning_details,
287
- "task_type": task_type,
288
- "image_count": len(images)
289
- })
290
-
291
- except Exception as e:
292
- return json.dumps({
293
- "success": False,
294
- "error": f"Error: {str(e)}",
295
- "response": "",
296
- "reasoning": ""
297
- })
298
 
299
- def process_video(api_key, video_file, question, num_frames, sampling_method, enable_reasoning):
300
- """Process video with frame extraction and analysis"""
301
-
302
- if not initialize_client(api_key):
303
- return "❌ Please enter a valid OpenRouter API key.", "", None, ""
304
-
305
- if video_file is None:
306
- return "❌ Please upload a video file.", "", None, ""
307
-
308
- try:
309
- # Update status
310
- status_msg = "⏳ Extracting frames from video using EVS...\n"
311
-
312
- # Extract frames
313
- frames, video_info = extract_frames_evs(
314
- video_file,
315
- num_frames=num_frames,
316
- method=sampling_method
317
- )
318
-
319
- if not frames:
320
- return "❌ Could not extract frames from video.", "", None, ""
321
-
322
- # Update status with video info
323
- status_msg += f"\n✅ Video Analysis:\n"
324
- status_msg += f" • Total frames: {video_info['total_frames']}\n"
325
- status_msg += f" • FPS: {video_info['fps']:.2f}\n"
326
- status_msg += f" • Duration: {video_info['duration']:.2f} seconds\n"
327
- status_msg += f" • Extracted: {video_info['extracted_frames']} frames\n"
328
- status_msg += f" • Method: {video_info['method']}\n"
329
- status_msg += f"\n⏳ Analyzing frames with Nemotron AI...\n"
330
-
331
- # Create prompt
332
- if not question or not question.strip():
333
- prompt = f"Analyze this video by examining these {len(frames)} frames extracted from it. Provide a comprehensive description of:\n1. What is happening in the video\n2. Key events or actions\n3. Any changes or progression throughout\n4. Overall context and meaning\n5. Temporal relationships between frames"
334
- else:
335
- prompt = f"Based on these {len(frames)} frames from a video, {question}"
336
-
337
- # Create message content with all frames
338
- messages = [{
339
- "role": "user",
340
- "content": create_message_content(prompt, frames)
341
- }]
342
-
343
- # Prepare API call
344
- api_params = {
345
- "model": "nvidia/nemotron-nano-12b-v2-vl:free",
346
- "messages": messages,
347
- "max_tokens": 4000,
348
- }
349
-
350
- if enable_reasoning:
351
- api_params["extra_body"] = {"reasoning": {"enabled": True}}
352
-
353
- # Make API call
354
- response = client.chat.completions.create(**api_params)
355
-
356
- result = response.choices[0].message.content
357
- reasoning_details = ""
358
-
359
- # Extract reasoning if available
360
- if hasattr(response.choices[0].message, 'reasoning_details') and response.choices[0].message.reasoning_details:
361
- reasoning_details = json.dumps(response.choices[0].message.reasoning_details, indent=2)
362
-
363
- # Create frame gallery
364
- frame_gallery = frames
365
-
366
- status_msg += f"\n✅ Analysis complete!\n"
367
-
368
- return (
369
- f"🎥 **Video Analysis Complete**\n\n{result}",
370
- reasoning_details if reasoning_details else "No reasoning details available.",
371
- frame_gallery,
372
- status_msg
373
- )
374
-
375
- except Exception as e:
376
- return f"❌ Error processing video: {str(e)}", "", None, f"❌ Error: {str(e)}"
377
-
378
- # Enhanced custom CSS with the React design aesthetic
379
- custom_css = """
380
- /* Base styling */
381
- :root {
382
- --primary-purple: #7e22ce;
383
- --primary-pink: #db2777;
384
- --bg-dark: #0f172a;
385
- --bg-darker: #020617;
386
- --border-color: rgba(168, 85, 247, 0.3);
387
- }
388
-
389
- body, .gradio-container {
390
- background: linear-gradient(135deg, #1e1b4b 0%, #7e22ce 50%, #1e1b4b 100%) !important;
391
- font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
392
  }
393
-
394
- /* Main container */
395
- .main-container {
396
- max-width: 1400px;
397
  margin: 0 auto;
398
- padding: 20px;
399
- }
400
-
401
- /* Header styling */
402
- #header-section {
403
- background: rgba(0, 0, 0, 0.3);
404
- backdrop-filter: blur(20px);
405
- border-radius: 24px;
406
- padding: 32px;
407
- margin-bottom: 24px;
408
- border: 1px solid var(--border-color);
409
- box-shadow: 0 8px 32px rgba(126, 34, 206, 0.2);
410
- }
411
-
412
- #header-section h1 {
413
- color: white;
414
- font-size: 2.5rem;
415
- font-weight: 700;
416
- margin: 0;
417
- letter-spacing: -0.02em;
418
- }
419
-
420
- #header-section p {
421
- color: #c084fc;
422
- font-size: 1.1rem;
423
- margin: 8px 0 0 0;
424
- }
425
-
426
- /* API Key Section */
427
- #api-key-container {
428
- background: linear-gradient(135deg, rgba(126, 34, 206, 0.4) 0%, rgba(219, 39, 119, 0.4) 100%);
429
- backdrop-filter: blur(20px);
430
  border-radius: 20px;
431
- padding: 28px;
432
- margin-bottom: 24px;
433
- border: 1px solid rgba(168, 85, 247, 0.4);
434
- box-shadow: 0 8px 32px rgba(219, 39, 119, 0.2);
435
  }
436
-
437
- #api-key-container .label-wrap {
438
- color: white !important;
439
- font-weight: 600;
 
 
 
 
 
440
  }
441
-
442
- /* Input fields */
443
- .gr-textbox, .gr-file, .gr-image {
444
- background: rgba(0, 0, 0, 0.4) !important;
445
- border: 1px solid var(--border-color) !important;
446
- border-radius: 16px !important;
447
- color: white !important;
448
  backdrop-filter: blur(10px);
449
  }
450
-
451
- .gr-textbox:focus, .gr-file:focus, .gr-image:focus {
452
- border-color: #a855f7 !important;
453
- box-shadow: 0 0 0 3px rgba(168, 85, 247, 0.2) !important;
454
- }
455
-
456
- /* Tabs */
457
- .tab-nav {
458
- background: rgba(0, 0, 0, 0.3) !important;
459
- backdrop-filter: blur(20px) !important;
460
- border-radius: 20px !important;
461
- padding: 8px !important;
462
- border: 1px solid rgba(168, 85, 247, 0.2) !important;
463
- gap: 8px !important;
464
- }
465
-
466
- .tab-nav button {
467
- background: transparent !important;
468
- color: #c084fc !important;
469
- border-radius: 14px !important;
470
- padding: 14px 24px !important;
471
- font-weight: 600 !important;
472
- transition: all 0.3s ease !important;
473
- border: none !important;
474
- }
475
-
476
- .tab-nav button:hover {
477
- background: rgba(255, 255, 255, 0.05) !important;
478
- color: white !important;
479
- }
480
-
481
- .tab-nav button.selected {
482
- background: linear-gradient(135deg, #7e22ce 0%, #db2777 100%) !important;
483
- color: white !important;
484
- box-shadow: 0 4px 16px rgba(126, 34, 206, 0.5) !important;
485
- }
486
-
487
- /* Buttons */
488
  .gr-button {
489
- background: linear-gradient(135deg, #7e22ce 0%, #db2777 100%) !important;
490
- color: white !important;
491
- border: none !important;
492
- border-radius: 14px !important;
493
- padding: 14px 28px !important;
494
- font-weight: 600 !important;
495
- font-size: 1rem !important;
496
- cursor: pointer !important;
497
- transition: all 0.3s ease !important;
498
- box-shadow: 0 4px 16px rgba(126, 34, 206, 0.4) !important;
499
  }
500
-
501
  .gr-button:hover {
502
  transform: translateY(-2px);
503
- box-shadow: 0 6px 24px rgba(126, 34, 206, 0.6) !important;
504
- }
505
-
506
- .gr-button:active {
507
- transform: translateY(0px);
508
- }
509
-
510
- .gr-button.secondary {
511
- background: rgba(255, 255, 255, 0.1) !important;
512
- backdrop-filter: blur(10px);
513
- }
514
-
515
- /* Output boxes */
516
- .output-container {
517
- background: rgba(0, 0, 0, 0.5) !important;
518
- backdrop-filter: blur(20px);
519
- border-radius: 20px !important;
520
- padding: 24px !important;
521
- border: 1px solid var(--border-color) !important;
522
- min-height: 400px;
523
- box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
524
- }
525
-
526
- .output-container .label-wrap {
527
- color: white !important;
528
- font-weight: 600;
529
- font-size: 1.1rem;
530
- }
531
-
532
- .output-container textarea {
533
- background: rgba(0, 0, 0, 0.3) !important;
534
- color: #e9d5ff !important;
535
- border: none !important;
536
- font-family: 'SF Mono', 'Monaco', 'Courier New', monospace;
537
- font-size: 0.95rem;
538
- line-height: 1.6;
539
- }
540
-
541
- /* Reasoning box */
542
- .reasoning-container {
543
- background: linear-gradient(135deg, rgba(219, 39, 119, 0.3) 0%, rgba(126, 34, 206, 0.3) 100%) !important;
544
- backdrop-filter: blur(20px);
545
- border-radius: 20px !important;
546
- padding: 24px !important;
547
- border: 1px solid rgba(236, 72, 153, 0.4) !important;
548
- margin-top: 20px;
549
- box-shadow: 0 8px 32px rgba(219, 39, 119, 0.2);
550
- }
551
-
552
- .reasoning-container .label-wrap {
553
- color: #fda4af !important;
554
- font-weight: 600;
555
- font-size: 1.1rem;
556
- }
557
-
558
- /* Feature cards */
559
- .feature-card {
560
- background: rgba(0, 0, 0, 0.4);
561
- backdrop-filter: blur(20px);
562
- border-radius: 20px;
563
- padding: 28px;
564
- border: 1px solid rgba(168, 85, 247, 0.2);
565
- transition: all 0.3s ease;
566
- }
567
-
568
- .feature-card:hover {
569
- transform: translateY(-4px);
570
- border-color: rgba(168, 85, 247, 0.5);
571
- box-shadow: 0 12px 32px rgba(126, 34, 206, 0.3);
572
  }
573
-
574
- .feature-card h3 {
 
 
575
  color: white;
576
- font-size: 1.3rem;
577
- margin-bottom: 12px;
578
- font-weight: 700;
579
- }
580
-
581
- .feature-card p {
582
- color: #c084fc;
583
- font-size: 0.95rem;
584
- line-height: 1.6;
585
- }
586
-
587
- /* Status badge */
588
- .status-badge {
589
- display: inline-block;
590
- background: rgba(34, 197, 94, 0.2);
591
- border: 1px solid rgba(34, 197, 94, 0.5);
592
- padding: 8px 20px;
593
- border-radius: 12px;
594
- color: #86efac;
595
- font-weight: 600;
596
- font-size: 0.9rem;
597
- }
598
-
599
- /* Gallery */
600
- .gr-gallery {
601
- background: rgba(0, 0, 0, 0.3) !important;
602
- border-radius: 16px !important;
603
- border: 1px solid var(--border-color) !important;
604
- }
605
-
606
- /* Slider */
607
- .gr-slider {
608
- background: rgba(0, 0, 0, 0.3) !important;
609
- border-radius: 12px !important;
610
- }
611
-
612
- /* Radio */
613
- .gr-radio {
614
- background: rgba(0, 0, 0, 0.3) !important;
615
- border-radius: 12px !important;
616
- padding: 12px !important;
617
- }
618
-
619
- /* Checkbox */
620
- .gr-checkbox {
621
- background: rgba(0, 0, 0, 0.2) !important;
622
- border-radius: 8px !important;
623
  }
624
-
625
- /* Loading animation */
626
- @keyframes spin {
627
- 0% { transform: rotate(0deg); }
628
- 100% { transform: rotate(360deg); }
629
  }
630
-
631
- .loading-spinner {
632
- border: 4px solid rgba(168, 85, 247, 0.2);
633
- border-top: 4px solid #a855f7;
634
- border-radius: 50%;
635
- width: 48px;
636
- height: 48px;
637
- animation: spin 1s linear infinite;
638
- margin: 0 auto;
639
  }
 
640
 
641
- /* Footer */
642
- #footer-section {
643
- background: rgba(0, 0, 0, 0.3);
644
- backdrop-filter: blur(20px);
645
- border-radius: 20px;
646
- padding: 24px;
647
- margin-top: 32px;
648
- text-align: center;
649
- border: 1px solid rgba(168, 85, 247, 0.2);
650
- color: #c084fc;
651
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652
 
653
- /* Markdown styling */
654
- .markdown-content h1, .markdown-content h2, .markdown-content h3 {
655
- color: white !important;
656
- }
 
 
 
657
 
658
- .markdown-content p {
659
- color: #e9d5ff !important;
660
- }
 
 
 
 
661
 
662
- /* Scrollbar */
663
- ::-webkit-scrollbar {
664
- width: 10px;
665
- }
666
 
667
- ::-webkit-scrollbar-track {
668
- background: rgba(0, 0, 0, 0.3);
669
- border-radius: 10px;
670
- }
 
 
 
 
671
 
672
- ::-webkit-scrollbar-thumb {
673
- background: linear-gradient(135deg, #7e22ce 0%, #db2777 100%);
674
- border-radius: 10px;
675
- }
676
 
677
- ::-webkit-scrollbar-thumb:hover {
678
- background: linear-gradient(135deg, #6b21a8 0%, #be185d 100%);
679
- }
680
 
681
- /* Responsive adjustments */
682
- @media (max-width: 768px) {
683
- #header-section h1 {
684
- font-size: 1.8rem;
685
- }
686
-
687
- #header-section p {
688
- font-size: 0.95rem;
689
- }
690
-
691
- .feature-card {
692
- padding: 20px;
693
- }
694
- }
695
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
696
 
697
- # Build the Gradio interface with React-inspired design
698
- with gr.Blocks(css=custom_css, theme=gr.themes.Ocean(), title="NVIDIA Nemotron Nano 2 VL") as demo:
699
-
700
- # Hidden state for API key
701
- api_key_state = gr.State("")
702
-
703
- # Header
704
- with gr.Row(elem_id="header-section"):
705
- with gr.Column(scale=8):
706
- gr.Markdown("""
707
- # NVIDIA Nemotron Nano 2 VL
708
- ### 12B Parameter Multimodal Reasoning Model with EVS Video Analysis
709
- Advanced document intelligence, chart analysis, video understanding, and reasoning capabilities
710
- """, elem_classes="markdown-content")
711
- with gr.Column(scale=2):
712
- gr.HTML("""
713
- <div style='text-align: right; padding: 12px 20px; background: rgba(34, 197, 94, 0.2); border-radius: 12px; border: 1px solid rgba(34, 197, 94, 0.5);'>
714
- <b style='color: #86efac; font-size: 0.9rem;'>✓ FREE ACCESS</b>
715
- </div>
716
- """)
717
-
718
- # API Key Section
719
- with gr.Row(elem_id="api-key-container"):
720
- with gr.Column():
721
- gr.Markdown("""
722
- ### 🔐 OpenRouter API Key
723
- Enter your OpenRouter API key to access the NVIDIA Nemotron model. Get yours at [openrouter.ai](https://openrouter.ai)
724
- """, elem_classes="markdown-content")
725
  api_key_input = gr.Textbox(
726
- label="API Key",
727
- placeholder="sk-or-v1-...",
728
  type="password",
729
- scale=4,
730
- elem_classes="api-key-input"
731
- )
732
-
733
- # Tabs for different functionalities
734
- with gr.Tabs(elem_classes="tab-nav"):
735
-
736
- # OCR & Document Intelligence Tab
737
- with gr.Tab("📄 OCR & Document", elem_classes="tab-item"):
738
- with gr.Row():
739
- with gr.Column(scale=1):
740
- gr.Markdown("### 📤 Upload Document")
741
- ocr_image = gr.Image(type="pil", label="Upload Image/Document", height=300)
742
- ocr_text = gr.Textbox(
743
- label="Instructions (Optional)",
744
- placeholder="Describe what you want to extract or analyze...",
745
- lines=4
746
- )
747
- ocr_btn = gr.Button("🔍 Analyze Document", variant="primary", size="lg")
748
-
749
- with gr.Column(scale=1):
750
- gr.Markdown("### 📊 Analysis Result")
751
- ocr_output = gr.Textbox(
752
- label="Response",
753
- lines=15,
754
- elem_classes="output-container",
755
- show_copy_button=True
756
- )
757
- ocr_reasoning = gr.Textbox(
758
- label="Reasoning Details",
759
- lines=5,
760
- elem_classes="reasoning-container",
761
- visible=False
762
- )
763
-
764
- def ocr_wrapper(api_key, image, text):
765
- result = process_request(api_key, "ocr", image1=image, text_input=text)
766
- data = json.loads(result)
767
- if data["success"]:
768
- return data["response"], data["reasoning"] if data["reasoning"] else ""
769
- else:
770
- return f"❌ {data['error']}", ""
771
-
772
- ocr_btn.click(
773
- fn=ocr_wrapper,
774
- inputs=[api_key_input, ocr_image, ocr_text],
775
- outputs=[ocr_output, ocr_reasoning]
776
  )
777
-
778
- # Chart Analysis Tab
779
- with gr.Tab("📊 Chart Analysis", elem_classes="tab-item"):
780
- with gr.Row():
781
- with gr.Column(scale=1):
782
- gr.Markdown("### 📈 Upload Chart/Graph")
783
- chart_image = gr.Image(type="pil", label="Upload Chart", height=300)
784
- chart_question = gr.Textbox(
785
- label="Question (Optional)",
786
- placeholder="What insights do you want from this chart?",
787
- lines=3
788
- )
789
- chart_btn = gr.Button("📈 Analyze Chart", variant="primary", size="lg")
790
-
791
- with gr.Column(scale=1):
792
- gr.Markdown("### 📊 Chart Insights")
793
- chart_output = gr.Textbox(
794
- label="Response",
795
- lines=15,
796
- elem_classes="output-container",
797
- show_copy_button=True
798
- )
799
-
800
- def chart_wrapper(api_key, image, question):
801
- result = process_request(api_key, "chart", image1=image, text_input=question)
802
- data = json.loads(result)
803
- if data["success"]:
804
- return data["response"]
805
- else:
806
- return f"❌ {data['error']}"
807
-
808
- chart_btn.click(
809
- fn=chart_wrapper,
810
- inputs=[api_key_input, chart_image, chart_question],
811
- outputs=[chart_output]
812
- )
813
-
814
- # Video Understanding Tab
815
- with gr.Tab("🎥 Video Understanding", elem_classes="tab-item"):
816
- with gr.Row():
817
- with gr.Column(scale=1):
818
- gr.Markdown("### 🎬 Upload Video")
819
- gr.Markdown("""
820
- **Note**: Full video analysis requires frame extraction and EVS implementation.
821
- Upload video frames as images in the Multi-Image tab for now.
822
- """)
823
- video_input = gr.Video(label="Upload Video")
824
- video_question = gr.Textbox(
825
- label="Question",
826
- placeholder="What would you like to know about this video?",
827
- lines=4
828
- )
829
- video_btn = gr.Button("🎬 Analyze Video", variant="primary", size="lg")
830
-
831
- with gr.Column(scale=1):
832
- gr.Markdown("### 🎥 Video Analysis")
833
- video_output = gr.Textbox(
834
- label="Response",
835
- lines=15,
836
- elem_classes="output-container"
837
- )
838
-
839
- def video_wrapper(api_key, video, question):
840
- return "🎥 **Video Analysis Placeholder**\n\nVideo analysis requires:\n\n1. Frame extraction from video\n2. EVS (Efficient Video Sampling) implementation\n3. Multi-frame context processing\n\nFor now, extract key frames and use the Multi-Image Analysis tab.\n\nFull implementation coming soon!"
841
-
842
- video_btn.click(
843
- fn=video_wrapper,
844
- inputs=[api_key_input, video_input, video_question],
845
- outputs=[video_output]
846
- )
847
-
848
- # Advanced Reasoning Tab
849
- with gr.Tab("🧠 Advanced Reasoning", elem_classes="tab-item"):
850
- with gr.Row():
851
- with gr.Column(scale=1):
852
- gr.Markdown("""
853
- ### 💡 Complex Problem Solving
854
- Ask complex questions and get detailed step-by-step reasoning
855
- """)
856
- reasoning_input = gr.Textbox(
857
- label="Question",
858
- placeholder="Ask a complex reasoning question...\n\nExamples:\n- How many R's are in 'strawberry'?\n- Solve this logic puzzle...\n- Calculate the average speed...",
859
- lines=10
860
- )
861
- reasoning_btn = gr.Button("💡 Start Reasoning", variant="primary", size="lg")
862
-
863
- with gr.Column(scale=1):
864
- gr.Markdown("### 🎯 Answer & Reasoning")
865
- reasoning_output = gr.Textbox(
866
- label="Response",
867
- lines=12,
868
- elem_classes="output-container",
869
- show_copy_button=True
870
- )
871
- reasoning_details = gr.Textbox(
872
- label="🧠 Reasoning Process",
873
- lines=8,
874
- elem_classes="reasoning-container",
875
- show_copy_button=True
876
- )
877
-
878
- def reasoning_wrapper(api_key, question):
879
- result = process_request(api_key, "reasoning", text_input=question, enable_reasoning=True)
880
- data = json.loads(result)
881
- if data["success"]:
882
- reasoning_text = data["reasoning"] if data["reasoning"] else "Reasoning details not available for this response."
883
- return data["response"], reasoning_text
884
- else:
885
- return f"❌ {data['error']}", ""
886
-
887
- reasoning_btn.click(
888
- fn=reasoning_wrapper,
889
- inputs=[api_key_input, reasoning_input],
890
- outputs=[reasoning_output, reasoning_details]
891
- )
892
-
893
- # Multi-Image Analysis Tab
894
- with gr.Tab("🖼️ Multi-Image Analysis", elem_classes="tab-item"):
895
- with gr.Row():
896
- with gr.Column(scale=1):
897
- gr.Markdown("### 🖼️ Upload Multiple Images (1-4)")
898
- with gr.Row():
899
- multi_image1 = gr.Image(type="pil", label="Image 1", height=200)
900
- multi_image2 = gr.Image(type="pil", label="Image 2", height=200)
901
- with gr.Row():
902
- multi_image3 = gr.Image(type="pil", label="Image 3", height=200)
903
- multi_image4 = gr.Image(type="pil", label="Image 4", height=200)
904
- multi_question = gr.Textbox(
905
- label="Question (Optional)",
906
- placeholder="Compare these images, find differences, identify patterns...",
907
- lines=3
908
- )
909
- multi_btn = gr.Button("🔍 Analyze Images", variant="primary", size="lg")
910
-
911
- with gr.Column(scale=1):
912
- gr.Markdown("### 🎨 Multi-Image Insights")
913
- multi_output = gr.Textbox(
914
- label="Response",
915
- lines=20,
916
- elem_classes="output-container",
917
- show_copy_button=True
918
- )
919
-
920
- def multi_wrapper(api_key, img1, img2, img3, img4, question):
921
- result = process_request(
922
- api_key, "multimodal",
923
- image1=img1, image2=img2, image3=img3, image4=img4,
924
- text_input=question
925
- )
926
- data = json.loads(result)
927
- if data["success"]:
928
- return f"🖼️ **Analyzing {data['image_count']} image(s)**\n\n{data['response']}"
929
- else:
930
- return f"❌ {data['error']}"
931
-
932
- multi_btn.click(
933
- fn=multi_wrapper,
934
- inputs=[api_key_input, multi_image1, multi_image2, multi_image3, multi_image4, multi_question],
935
- outputs=[multi_output]
936
  )
937
-
938
- # Features Section
939
- gr.Markdown("## 🚀 Key Features", elem_classes="markdown-content")
940
  with gr.Row():
941
- with gr.Column(elem_classes="feature-card"):
942
- gr.Markdown("""
943
- ### Hybrid Architecture
944
- Transformer-Mamba fusion for efficient processing with higher throughput and lower latency
945
- """)
946
-
947
- with gr.Column(elem_classes="feature-card"):
948
- gr.Markdown("""
949
- ### 📊 74% Benchmark Average
950
- Leading performance across MMMU, MathVista, AI2D, OCRBench, ChartQA, DocVQA, and more
951
- """)
952
-
953
- with gr.Column(elem_classes="feature-card"):
954
- gr.Markdown("""
955
- ### 🎥 EVS Technology
956
- Efficient Video Sampling for long-form video understanding with reduced inference cost
957
- """)
958
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
959
  # Footer
960
- with gr.Row(elem_id="footer-section"):
961
- gr.Markdown("""
962
- Powered by **NVIDIA Nemotron Nano 12B 2 VL** via OpenRouter API | Open-weights model with permissive NVIDIA license
963
-
964
- Built with ❤️ using Gradio | [Documentation](https://docs.nvidia.com) | [Report Issues](https://github.com)
965
- """, elem_classes="markdown-content")
966
 
967
- # Launch the app
968
  if __name__ == "__main__":
969
  demo.launch(
970
- server_name="0.0.0.0",
971
- server_port=7860,
972
- share=True,
973
- show_error=True
 
974
  )
 
1
  import gradio as gr
 
2
  from openai import OpenAI
3
  import base64
4
+ import os
5
+ from typing import List, Tuple, Any, Dict, Optional
6
  from PIL import Image
7
  import io
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
# Custom CSS injected via gr.Blocks(css=...): dark gradient page background,
# glassmorphism (blurred translucent) panels, and cyan accent styling for
# headings, buttons, and inputs. Class names target Gradio's generated DOM
# (.gradio-container, .gr-button, ...) — presumably stable for the installed
# Gradio version; verify after upgrades.
CUSTOM_CSS = """
body {
    background: linear-gradient(135deg, #0f0f23 0%, #1a1a2e 50%, #16213e 100%);
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    color: #e0e0e0;
}
.gradio-container {
    max-width: 1400px !important;
    margin: 0 auto;
    background: rgba(0, 0, 0, 0.1);
    border-radius: 20px;
    box-shadow: 0 20px 40px rgba(0, 0, 0, 0.5);
    overflow: hidden;
}
h1 {
    background: linear-gradient(45deg, #00d4ff, #0099cc);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    text-align: center;
    margin: 0;
    padding: 20px;
    font-size: 2.5em;
    text-shadow: 0 0 20px rgba(0, 212, 255, 0.5);
}
.gr-chatbot {
    background: rgba(255, 255, 255, 0.05);
    border-radius: 15px;
    border: 1px solid rgba(0, 212, 255, 0.2);
    backdrop-filter: blur(10px);
}
.gr-button {
    background: linear-gradient(45deg, #00d4ff, #0099cc);
    border: none;
    border-radius: 10px;
    color: white;
    font-weight: bold;
    transition: all 0.3s ease;
    box-shadow: 0 5px 15px rgba(0, 212, 255, 0.3);
}
.gr-button:hover {
    transform: translateY(-2px);
    box-shadow: 0 8px 25px rgba(0, 212, 255, 0.4);
}
.gr-textbox, .gr-file {
    background: rgba(255, 255, 255, 0.1);
    border: 1px solid rgba(0, 212, 255, 0.3);
    border-radius: 10px;
    color: white;
    backdrop-filter: blur(5px);
}
.gr-textbox::placeholder {
    color: #a0a0a0;
}
.sidebar {
    background: rgba(0, 0, 0, 0.2);
    padding: 20px;
    border-radius: 15px;
    margin: 10px;
    border: 1px solid rgba(0, 212, 255, 0.1);
}
"""
71
 
72
# Helper: turn an on-disk image into the base64 payload the API expects.
def encode_image_to_base64(image_path: str) -> str:
    """Return the contents of the file at *image_path* as a base64 ASCII string."""
    with open(image_path, "rb") as handle:
        raw = handle.read()
    return base64.b64encode(raw).decode("utf-8")
76
+
77
+ # Function to build user content for multimodal input
78
+ def build_user_content(message: str, files: List[str], video_url: str) -> List[Dict[str, Any]]:
79
+ content = [{"type": "text", "text": message}]
80
+ if files:
81
+ for file_path in files:
82
+ if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
83
+ base64_image = encode_image_to_base64(file_path)
84
+ content.append({
85
+ "type": "image_url",
86
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
87
+ })
88
+ # Note: For PDFs, we'd need extraction (e.g., via pdf2image), but skipped for simplicity
89
+ # Users can upload image screenshots of documents
90
+ if video_url and video_url.strip():
91
+ content.append({
92
+ "type": "video_url",
93
+ "video_url": {"url": video_url.strip()}
94
+ })
95
+ return content
96
+
97
# Main response function
def respond_to_query(
    message: str,
    history: List[Tuple[str, str]],
    files: Optional[List[str]],
    video_url: str,
    api_key: str,
    messages_state: List[Dict[str, Any]]
) -> Tuple[List[Tuple[str, str]], str, Optional[List[str]], str, List[Dict[str, Any]], str]:
    """Handle one chat turn against the OpenRouter Nemotron endpoint.

    Returns a 6-tuple wired positionally to the Gradio outputs:
    (chatbot history, cleared message box, cleared file upload, cleared
    video-URL box, updated messages_state, status/error text).

    ``history`` holds display-only (user, assistant) text pairs for the
    Chatbot widget, while ``messages_state`` holds the full OpenAI-format
    message dicts (including reasoning_details) used as multi-turn context.
    """
    # Guard clauses: return early without mutating any state.
    if not api_key or not api_key.strip():
        return history, "", None, "", messages_state, "⚠️ Please enter your OpenRouter API key to start chatting."

    if not message.strip():
        return history, "", None, "", messages_state, "⚠️ Please enter a message."

    # A fresh client per call picks up the key currently in the textbox.
    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=api_key.strip(),
    )

    # Copy current messages state (shallow copy, so the stored list itself
    # is not extended if the API call below fails mid-way).
    current_messages = messages_state.copy() if messages_state else []

    # Add user input: text plus optional base64 images and video URL.
    user_content = build_user_content(message, files or [], video_url)
    current_messages.append({"role": "user", "content": user_content})

    try:
        # API call with reasoning enabled (OpenRouter extra_body passthrough).
        response = client.chat.completions.create(
            model="nvidia/nemotron-nano-12b-v2-vl:free",
            messages=current_messages,
            extra_body={"reasoning": {"enabled": True}}
        )

        resp_message = response.choices[0].message
        content = resp_message.content or "No response generated."

        # Preserve reasoning details for multi-turn continuity
        assistant_msg = {"role": "assistant", "content": content}
        if hasattr(resp_message, 'reasoning_details') and resp_message.reasoning_details:
            assistant_msg["reasoning_details"] = resp_message.reasoning_details

        current_messages.append(assistant_msg)

        # Append to history (text-only for display; attachments noted)
        attachment_note = ""
        if files:
            attachment_note += f" + {len(files)} image(s)"
        if video_url.strip():
            attachment_note += f" + video URL"
        display_message = message + (attachment_note if attachment_note else "")
        display_response = content + ("\n\n*(Reasoning preserved for follow-up)*" if "reasoning_details" in assistant_msg else "")

        history.append((display_message, display_response))

        # Clear inputs: empty strings / None reset the corresponding widgets.
        return history, "", None, "", current_messages, ""

    except Exception as e:
        # NOTE(review): the failed user turn remains in current_messages, so a
        # retry re-sends it and duplicates the context entry — confirm intended.
        error_msg = f"❌ Error: {str(e)}. Check your API key, file sizes (keep images <5MB), or video URL."
        history.append((message, error_msg))
        return history, "", None, "", current_messages, error_msg
160
+
161
# Canned prompts showcasing text, image, document, and video capabilities.
# Each row is (message, files, video_url), matching the gr.Examples inputs.
EXAMPLES = [
    ["How many 'r's are in the word 'strawberry'? Think step by step.", None, ""],
    ["Describe this image in detail and reason about its contents.", None, ""],
    ["Analyze this chart: What trends do you see? Extract key data points.", None, ""],
    ["Read the text in this document image and summarize the main points.", None, ""],
    ["Count the objects in these multiple images and compare them.", None, ""],
    # Placeholder URL; replace with a real, publicly reachable video.
    ["What happens in this video? Summarize the key events.", None, "https://example.com/sample-video.mp4"],
]
194
+
195
# Main Gradio Blocks layout.
# Fixes vs. the previous revision:
#   * gr.themes has no `Dark` theme (would raise AttributeError at startup);
#     Base is used and CUSTOM_CSS supplies the dark look.
#   * gr.Examples(...).style(container=False) — `.style()` was removed from
#     Gradio and gr.Examples never supported it; dropped.
#   * run_on_click examples passed 3 inputs to the 6-parameter handler
#     (TypeError on click); examples now only pre-fill the inputs.
#   * avatar_images pointed at "user_avatar.png", an asset not shipped with
#     the app; default avatars are used instead.
with gr.Blocks(theme=gr.themes.Base(), css=CUSTOM_CSS) as demo:
    gr.HTML("""
    <div style='text-align: center; padding: 10px;'>
        <h1>🚀 Nemotron Nano 2 VL Premium Demo</h1>
        <p style='color: #a0a0a0; font-size: 1.1em;'>Unleash multimodal magic: Text, Images, Documents & Videos | Powered by NVIDIA's Hybrid Transformer-Mamba</p>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # Sidebar: collapsible capability notes plus the API-key field.
            with gr.Accordion("📖 Model Capabilities & Tips", open=False):
                gr.Markdown("""
                **Key Features:**
                - **Text Reasoning:** Chain-of-thought with preserved reasoning.
                - **Image/Document Intelligence:** OCR, chart analysis, multi-image docs (upload screenshots).
                - **Video Understanding:** Enter public video URL (supports long-form with EVS).
                - **Pro Tip:** For documents, upload multiple page images. Keep files small for fast inference.
                - **License:** NVIDIA Open | Free tier via OpenRouter.
                """)

            api_key_input = gr.Textbox(
                label="🔑 OpenRouter API Key",
                placeholder="Enter your API key here (keep secure!)",
                type="password",
                lines=1
            )

        with gr.Column(scale=4):
            # Chat transcript (display only; full context lives in messages_state).
            chatbot = gr.Chatbot(
                height=600,
                show_label=False,
                bubble_full_width=False
            )

            with gr.Row():
                msg_input = gr.Textbox(
                    label="💭 Your Message",
                    placeholder="Ask anything: 'Count the apples' or 'Summarize this video'...",
                    lines=2,
                    scale=3
                )
                file_upload = gr.File(
                    label="🖼️ Attachments (Images for OCR/Charts/Docs)",
                    file_types=["image"],
                    file_count="multiple",
                    scale=1
                )
                video_input = gr.Textbox(
                    label="🎥 Video URL (Optional)",
                    placeholder="e.g., https://example.com/video.mp4",
                    lines=1
                )

            with gr.Row():
                submit_btn = gr.Button("✨ Send & Reason", variant="primary", scale=3)
                clear_btn = gr.Button("🗑️ Clear Chat", scale=1)

    # State for multi-turn messages (OpenAI-format dicts, incl. reasoning).
    messages_state = gr.State([])

    # Event handlers: respond_to_query returns its 6 outputs in this order.
    submit_btn.click(
        fn=respond_to_query,
        inputs=[msg_input, chatbot, file_upload, video_input, api_key_input, messages_state],
        outputs=[chatbot, msg_input, file_upload, video_input, messages_state, msg_input]
    ).then(
        fn=lambda: gr.Info("Message sent! Reasoning active."),
        outputs=[]
    )

    clear_btn.click(
        fn=lambda: ([], "", None, "", [], ""),
        outputs=[chatbot, msg_input, file_upload, video_input, messages_state, msg_input]
    ).then(
        fn=lambda: gr.Info("Chat cleared."),
        outputs=[]
    )

    # Examples pre-fill the inputs; the user then presses Send to run them.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[msg_input, file_upload, video_input],
        label="💡 Quick Starts",
        examples_per_page=6
    )

    # Footer
    gr.Markdown("""
    <div style='text-align: center; padding: 20px; color: #a0a0a0;'>
    Built with ❤️ for creative multimodal exploration | © 2025 Inspired by NVIDIA Nemotron
    </div>
    """)
 
295
 
 
296
# Script entry point: start the Gradio server.
if __name__ == "__main__":
    # share=True requests a public *.gradio.live tunnel for the demo;
    # hosted platforms that serve the app directly simply ignore it.
    demo.launch(share=True)