Spaces:

shukdevdattaEX
/

NemoVision

Paused

App Files Files Community

shukdevdattaEX committed on Dec 26, 2025

Commit

514f4b0

verified ·

1 Parent(s): ceca5d1

Create app.py

Browse files

Files changed (1) hide show

app.py +726 -0

app.py ADDED Viewed

	@@ -0,0 +1,726 @@

+import gradio as gr
+import os
+from openai import OpenAI
+import base64
+import json
+from PIL import Image
+import io
# Shared OpenAI-compatible client; stays None until initialize_client() succeeds.
client = None


def initialize_client(api_key):
    """Create the shared OpenRouter client from ``api_key``.

    The key is stripped of surrounding whitespace before use. When the key
    is missing or blank nothing is assigned and the previous ``client``
    (if any) is left as it was.

    Returns:
        True if a new client was created, False otherwise.
    """
    global client
    cleaned_key = api_key.strip() if api_key else ""
    if not cleaned_key:
        return False
    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=cleaned_key,
    )
    return True
def encode_image(image):
    """Serialize an image to a base64-encoded JPEG string.

    Args:
        image: A PIL image, any object ``Image.fromarray`` accepts
            (presumably a NumPy array coming from Gradio), or ``None``.

    Returns:
        The base64 text of the JPEG bytes, or ``None`` when ``image`` is
        ``None``.
    """
    if image is None:
        return None
    # Convert to a PIL image if the caller handed us something else.
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    # JPEG cannot store transparency. A plain convert('RGB') only drops the
    # alpha channel, exposing whatever colour is stored under transparent
    # pixels (frequently black), which can make dark text on a transparent
    # background unreadable. Flatten onto white instead.
    has_alpha = image.mode in ("RGBA", "LA") or (
        image.mode == "P" and "transparency" in image.info
    )
    if has_alpha:
        rgba = image.convert("RGBA")
        flattened = Image.new("RGB", rgba.size, (255, 255, 255))
        flattened.paste(rgba, mask=rgba.split()[-1])
        image = flattened
    elif image.mode != "RGB":
        image = image.convert("RGB")
    # Encode in memory; nothing is written to disk.
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG", quality=95)
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
def create_message_content(text, images=None):
    """Build the ``content`` list for a single chat message.

    Image parts come first (one ``image_url`` entry per image that encodes
    successfully, embedded as a JPEG data URL), followed by a ``text`` part
    when ``text`` contains something other than whitespace. The text is
    passed through unmodified. If nothing at all was added, a single
    generic text prompt is returned instead.
    """
    parts = []
    if images:
        for picture in images:
            if picture is None:
                continue
            encoded = encode_image(picture)
            if not encoded:
                continue
            data_url = f"data:image/jpeg;base64,{encoded}"
            parts.append({"type": "image_url", "image_url": {"url": data_url}})
    if text and text.strip():
        parts.append({"type": "text", "text": text})
    if not parts:
        return [{"type": "text", "text": "Please analyze this content."}]
    return parts
def _error_payload(message):
    """Serialize the standard failure response carrying ``message``."""
    return json.dumps({
        "success": False,
        "error": message,
        "response": "",
        "reasoning": ""
    })


def process_request(api_key, task_type, image1=None, image2=None, image3=None, image4=None, text_input="", enable_reasoning=False):
    """Validate the inputs, call the model, and return the outcome as JSON.

    Args:
        api_key: OpenRouter API key; blank or missing keys are rejected.
        task_type: One of "ocr", "chart", "video", "multimodal" or
            "reasoning"; selects validation rules and the default prompt.
        image1..image4: Optional images; ``None`` entries are ignored.
        text_input: Optional instruction or question. ``None`` is treated
            as empty text.
        enable_reasoning: Request reasoning output from the model
            (always requested for the "reasoning" task).

    Returns:
        A JSON string with keys ``success``, ``error``, ``response`` and
        ``reasoning``; successful responses additionally carry
        ``task_type`` and ``image_count``. Errors raised while handling
        the request are reported in ``error`` rather than propagated.
    """
    # NOTE(review): initialize_client() rebinds a module-global client that
    # is read again below; if requests from different users can interleave,
    # one user's key could serve another's request — confirm and consider a
    # per-request client.
    if not initialize_client(api_key):
        return _error_payload("Please enter a valid OpenRouter API key.")
    try:
        # A missing text field (None) must behave like an empty one so the
        # validation messages and default prompts below still apply.
        text_input = text_input or ""
        has_text = bool(text_input.strip())
        # Collect all valid images
        images = [img for img in [image1, image2, image3, image4] if img is not None]
        # Validate inputs based on task type
        if task_type in ["ocr", "chart", "multimodal"] and not images and not has_text:
            return _error_payload("Please upload at least one image or enter text.")
        if task_type == "reasoning" and not has_text:
            return _error_payload("Please enter a question or problem to solve.")
        # Fall back to a task-specific default prompt when no text was given
        if not has_text:
            prompts = {
                "ocr": "Extract and analyze all text from this image. Provide a detailed analysis of the content, structure, and any key information.",
                "chart": "Analyze this chart in detail. Describe the type of chart, extract all data points, identify trends, and provide insights.",
                "video": "Analyze this video content frame by frame. Describe what you see and provide comprehensive insights.",
                "multimodal": f"Analyze these {len(images)} images. Compare and contrast them, identify relationships, and provide comprehensive insights."
            }
            text_input = prompts.get(task_type, "Please analyze this content.")
        # Create message content
        messages = [{
            "role": "user",
            "content": create_message_content(text_input, images if images else None)
        }]
        # Prepare API call parameters
        api_params = {
            "model": "nvidia/nemotron-nano-12b-v2-vl:free",
            "messages": messages,
            "max_tokens": 3000,
        }
        # Add reasoning if enabled
        if enable_reasoning or task_type == "reasoning":
            api_params["extra_body"] = {"reasoning": {"enabled": True}}
        # Make API call
        response = client.chat.completions.create(**api_params)
        message = response.choices[0].message
        # The content can be None; hand callers an empty string rather
        # than a JSON null they would render as "None".
        result = message.content or ""
        # Reasoning details are optional on the returned message.
        reasoning_payload = getattr(message, "reasoning_details", None)
        reasoning_details = json.dumps(reasoning_payload, indent=2) if reasoning_payload else ""
        return json.dumps({
            "success": True,
            "error": "",
            "response": result,
            "reasoning": reasoning_details,
            "task_type": task_type,
            "image_count": len(images)
        })
    except Exception as e:
        # Top-level boundary for the UI: surface the failure as text.
        return _error_payload(f"Error: {str(e)}")
# Custom CSS for the dark purple/pink "glass" look of the app.
# Every `backdrop-filter` is preceded by its `-webkit-` twin: older Safari /
# iOS WebKit only honours the prefixed property, and without it the blur is
# silently dropped there.
custom_css = """
/* Base styling */
:root {
    --primary-purple: #7e22ce;
    --primary-pink: #db2777;
    --bg-dark: #0f172a;
    --bg-darker: #020617;
    --border-color: rgba(168, 85, 247, 0.3);
}
body, .gradio-container {
    background: linear-gradient(135deg, #1e1b4b 0%, #7e22ce 50%, #1e1b4b 100%) !important;
    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
}
/* Main container */
.main-container {
    max-width: 1400px;
    margin: 0 auto;
    padding: 20px;
}
/* Header styling */
#header-section {
    background: rgba(0, 0, 0, 0.3);
    -webkit-backdrop-filter: blur(20px);
    backdrop-filter: blur(20px);
    border-radius: 24px;
    padding: 32px;
    margin-bottom: 24px;
    border: 1px solid var(--border-color);
    box-shadow: 0 8px 32px rgba(126, 34, 206, 0.2);
}
#header-section h1 {
    color: white;
    font-size: 2.5rem;
    font-weight: 700;
    margin: 0;
    letter-spacing: -0.02em;
}
#header-section p {
    color: #c084fc;
    font-size: 1.1rem;
    margin: 8px 0 0 0;
}
/* API Key Section */
#api-key-container {
    background: linear-gradient(135deg, rgba(126, 34, 206, 0.4) 0%, rgba(219, 39, 119, 0.4) 100%);
    -webkit-backdrop-filter: blur(20px);
    backdrop-filter: blur(20px);
    border-radius: 20px;
    padding: 28px;
    margin-bottom: 24px;
    border: 1px solid rgba(168, 85, 247, 0.4);
    box-shadow: 0 8px 32px rgba(219, 39, 119, 0.2);
}
#api-key-container .label-wrap {
    color: white !important;
    font-weight: 600;
}
/* Input fields */
.gr-textbox, .gr-file, .gr-image {
    background: rgba(0, 0, 0, 0.4) !important;
    border: 1px solid var(--border-color) !important;
    border-radius: 16px !important;
    color: white !important;
    -webkit-backdrop-filter: blur(10px);
    backdrop-filter: blur(10px);
}
.gr-textbox:focus, .gr-file:focus, .gr-image:focus {
    border-color: #a855f7 !important;
    box-shadow: 0 0 0 3px rgba(168, 85, 247, 0.2) !important;
}
/* Tabs */
.tab-nav {
    background: rgba(0, 0, 0, 0.3) !important;
    -webkit-backdrop-filter: blur(20px) !important;
    backdrop-filter: blur(20px) !important;
    border-radius: 20px !important;
    padding: 8px !important;
    border: 1px solid rgba(168, 85, 247, 0.2) !important;
    gap: 8px !important;
}
.tab-nav button {
    background: transparent !important;
    color: #c084fc !important;
    border-radius: 14px !important;
    padding: 14px 24px !important;
    font-weight: 600 !important;
    transition: all 0.3s ease !important;
    border: none !important;
}
.tab-nav button:hover {
    background: rgba(255, 255, 255, 0.05) !important;
    color: white !important;
}
.tab-nav button.selected {
    background: linear-gradient(135deg, #7e22ce 0%, #db2777 100%) !important;
    color: white !important;
    box-shadow: 0 4px 16px rgba(126, 34, 206, 0.5) !important;
}
/* Buttons */
.gr-button {
    background: linear-gradient(135deg, #7e22ce 0%, #db2777 100%) !important;
    color: white !important;
    border: none !important;
    border-radius: 14px !important;
    padding: 14px 28px !important;
    font-weight: 600 !important;
    font-size: 1rem !important;
    cursor: pointer !important;
    transition: all 0.3s ease !important;
    box-shadow: 0 4px 16px rgba(126, 34, 206, 0.4) !important;
}
.gr-button:hover {
    transform: translateY(-2px);
    box-shadow: 0 6px 24px rgba(126, 34, 206, 0.6) !important;
}
.gr-button:active {
    transform: translateY(0px);
}
.gr-button.secondary {
    background: rgba(255, 255, 255, 0.1) !important;
    -webkit-backdrop-filter: blur(10px);
    backdrop-filter: blur(10px);
}
/* Output boxes */
.output-container {
    background: rgba(0, 0, 0, 0.5) !important;
    -webkit-backdrop-filter: blur(20px);
    backdrop-filter: blur(20px);
    border-radius: 20px !important;
    padding: 24px !important;
    border: 1px solid var(--border-color) !important;
    min-height: 400px;
    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
}
.output-container .label-wrap {
    color: white !important;
    font-weight: 600;
    font-size: 1.1rem;
}
.output-container textarea {
    background: rgba(0, 0, 0, 0.3) !important;
    color: #e9d5ff !important;
    border: none !important;
    font-family: 'SF Mono', 'Monaco', 'Courier New', monospace;
    font-size: 0.95rem;
    line-height: 1.6;
}
/* Reasoning box */
.reasoning-container {
    background: linear-gradient(135deg, rgba(219, 39, 119, 0.3) 0%, rgba(126, 34, 206, 0.3) 100%) !important;
    -webkit-backdrop-filter: blur(20px);
    backdrop-filter: blur(20px);
    border-radius: 20px !important;
    padding: 24px !important;
    border: 1px solid rgba(236, 72, 153, 0.4) !important;
    margin-top: 20px;
    box-shadow: 0 8px 32px rgba(219, 39, 119, 0.2);
}
.reasoning-container .label-wrap {
    color: #fda4af !important;
    font-weight: 600;
    font-size: 1.1rem;
}
/* Feature cards */
.feature-card {
    background: rgba(0, 0, 0, 0.4);
    -webkit-backdrop-filter: blur(20px);
    backdrop-filter: blur(20px);
    border-radius: 20px;
    padding: 28px;
    border: 1px solid rgba(168, 85, 247, 0.2);
    transition: all 0.3s ease;
}
.feature-card:hover {
    transform: translateY(-4px);
    border-color: rgba(168, 85, 247, 0.5);
    box-shadow: 0 12px 32px rgba(126, 34, 206, 0.3);
}
.feature-card h3 {
    color: white;
    font-size: 1.3rem;
    margin-bottom: 12px;
    font-weight: 700;
}
.feature-card p {
    color: #c084fc;
    font-size: 0.95rem;
    line-height: 1.6;
}
/* Status badge */
.status-badge {
    display: inline-block;
    background: rgba(34, 197, 94, 0.2);
    border: 1px solid rgba(34, 197, 94, 0.5);
    padding: 8px 20px;
    border-radius: 12px;
    color: #86efac;
    font-weight: 600;
    font-size: 0.9rem;
}
/* Loading animation */
@keyframes spin {
    0% { transform: rotate(0deg); }
    100% { transform: rotate(360deg); }
}
.loading-spinner {
    border: 4px solid rgba(168, 85, 247, 0.2);
    border-top: 4px solid #a855f7;
    border-radius: 50%;
    width: 48px;
    height: 48px;
    animation: spin 1s linear infinite;
    margin: 0 auto;
}
/* Footer */
#footer-section {
    background: rgba(0, 0, 0, 0.3);
    -webkit-backdrop-filter: blur(20px);
    backdrop-filter: blur(20px);
    border-radius: 20px;
    padding: 24px;
    margin-top: 32px;
    text-align: center;
    border: 1px solid rgba(168, 85, 247, 0.2);
    color: #c084fc;
}
/* Markdown styling */
.markdown-content h1, .markdown-content h2, .markdown-content h3 {
    color: white !important;
}
.markdown-content p {
    color: #e9d5ff !important;
}
/* Scrollbar */
::-webkit-scrollbar {
    width: 10px;
}
::-webkit-scrollbar-track {
    background: rgba(0, 0, 0, 0.3);
    border-radius: 10px;
}
::-webkit-scrollbar-thumb {
    background: linear-gradient(135deg, #7e22ce 0%, #db2777 100%);
    border-radius: 10px;
}
::-webkit-scrollbar-thumb:hover {
    background: linear-gradient(135deg, #6b21a8 0%, #be185d 100%);
}
/* Responsive adjustments */
@media (max-width: 768px) {
    #header-section h1 {
        font-size: 1.8rem;
    }
    #header-section p {
        font-size: 0.95rem;
    }
    .feature-card {
        padding: 20px;
    }
}
"""
# Build the Gradio interface with React-inspired design.
# Layout: header, API-key row, one tab per task, feature cards, footer.
# Each tab's button passes the API key textbox plus that tab's inputs to a
# small wrapper that calls process_request() and formats the result.
with gr.Blocks(css=custom_css, theme=gr.themes.Base(), title="NVIDIA Nemotron Nano 2 VL") as demo:
    def _call_model(api_key, task_type, **request_kwargs):
        """Run process_request() and return its JSON payload as a dict."""
        return json.loads(process_request(api_key, task_type, **request_kwargs))

    # Hidden state for API key
    # NOTE(review): not wired to any event below; the key is read straight
    # from api_key_input on every click.
    api_key_state = gr.State("")
    # Header
    with gr.Row(elem_id="header-section"):
        with gr.Column(scale=8):
            gr.Markdown("""
            # ⚡ NVIDIA Nemotron Nano 2 VL
            ### 12B Parameter Multimodal Reasoning Model
            Advanced document intelligence, chart analysis, video understanding, and reasoning capabilities
            """, elem_classes="markdown-content")
        with gr.Column(scale=2):
            gr.HTML("""
            <div style='text-align: right; padding: 12px 20px; background: rgba(34, 197, 94, 0.2); border-radius: 12px; border: 1px solid rgba(34, 197, 94, 0.5);'>
            <b style='color: #86efac; font-size: 0.9rem;'>✓ FREE ACCESS</b>
            </div>
            """)
    # API Key Section
    with gr.Row(elem_id="api-key-container"):
        with gr.Column():
            gr.Markdown("""
            ### 🔐 OpenRouter API Key
            Enter your OpenRouter API key to access the NVIDIA Nemotron model. Get yours at [openrouter.ai](https://openrouter.ai)
            """, elem_classes="markdown-content")
            api_key_input = gr.Textbox(
                label="API Key",
                placeholder="sk-or-v1-...",
                type="password",
                scale=4,
                elem_classes="api-key-input"
            )
    # Tabs for different functionalities
    with gr.Tabs(elem_classes="tab-nav"):
        # OCR & Document Intelligence Tab
        with gr.Tab("📄 OCR & Document", elem_classes="tab-item"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 📤 Upload Document")
                    ocr_image = gr.Image(type="pil", label="Upload Image/Document", height=300)
                    ocr_text = gr.Textbox(
                        label="Instructions (Optional)",
                        placeholder="Describe what you want to extract or analyze...",
                        lines=4
                    )
                    ocr_btn = gr.Button("🔍 Analyze Document", variant="primary", size="lg")
                with gr.Column(scale=1):
                    gr.Markdown("### 📊 Analysis Result")
                    ocr_output = gr.Textbox(
                        label="Response",
                        lines=15,
                        elem_classes="output-container",
                        show_copy_button=True
                    )
                    # Starts hidden; ocr_wrapper reveals it only when the
                    # model actually returned reasoning details.
                    ocr_reasoning = gr.Textbox(
                        label="Reasoning Details",
                        lines=5,
                        elem_classes="reasoning-container",
                        visible=False
                    )
            def ocr_wrapper(api_key, image, text):
                """Run the OCR task; returns (response text, reasoning box update)."""
                data = _call_model(api_key, "ocr", image1=image, text_input=text)
                if not data["success"]:
                    # Also clears and hides reasoning left over from an earlier run.
                    return f"❌ {data['error']}", gr.update(value="", visible=False)
                reasoning = data["reasoning"] or ""
                # The box is created hidden, so a plain string would never be seen.
                return data["response"], gr.update(value=reasoning, visible=bool(reasoning))
            ocr_btn.click(
                fn=ocr_wrapper,
                inputs=[api_key_input, ocr_image, ocr_text],
                outputs=[ocr_output, ocr_reasoning]
            )
        # Chart Analysis Tab
        with gr.Tab("📊 Chart Analysis", elem_classes="tab-item"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 📈 Upload Chart/Graph")
                    chart_image = gr.Image(type="pil", label="Upload Chart", height=300)
                    chart_question = gr.Textbox(
                        label="Question (Optional)",
                        placeholder="What insights do you want from this chart?",
                        lines=3
                    )
                    chart_btn = gr.Button("📈 Analyze Chart", variant="primary", size="lg")
                with gr.Column(scale=1):
                    gr.Markdown("### 📊 Chart Insights")
                    chart_output = gr.Textbox(
                        label="Response",
                        lines=15,
                        elem_classes="output-container",
                        show_copy_button=True
                    )
            def chart_wrapper(api_key, image, question):
                """Run the chart task; returns the response or an error line."""
                data = _call_model(api_key, "chart", image1=image, text_input=question)
                if not data["success"]:
                    return f"❌ {data['error']}"
                return data["response"]
            chart_btn.click(
                fn=chart_wrapper,
                inputs=[api_key_input, chart_image, chart_question],
                outputs=[chart_output]
            )
        # Video Understanding Tab
        with gr.Tab("🎥 Video Understanding", elem_classes="tab-item"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 🎬 Upload Video")
                    gr.Markdown("""
                    **Note**: Full video analysis requires frame extraction and EVS implementation.
                    Upload video frames as images in the Multi-Image tab for now.
                    """)
                    video_input = gr.Video(label="Upload Video")
                    video_question = gr.Textbox(
                        label="Question",
                        placeholder="What would you like to know about this video?",
                        lines=4
                    )
                    video_btn = gr.Button("🎬 Analyze Video", variant="primary", size="lg")
                with gr.Column(scale=1):
                    gr.Markdown("### 🎥 Video Analysis")
                    video_output = gr.Textbox(
                        label="Response",
                        lines=15,
                        elem_classes="output-container"
                    )
            def video_wrapper(api_key, video, question):
                """Placeholder: ignores its inputs and returns a fixed notice."""
                return "🎥 **Video Analysis Placeholder**\n\nVideo analysis requires:\n\n1. Frame extraction from video\n2. EVS (Efficient Video Sampling) implementation\n3. Multi-frame context processing\n\nFor now, extract key frames and use the Multi-Image Analysis tab.\n\nFull implementation coming soon!"
            video_btn.click(
                fn=video_wrapper,
                inputs=[api_key_input, video_input, video_question],
                outputs=[video_output]
            )
        # Advanced Reasoning Tab
        with gr.Tab("🧠 Advanced Reasoning", elem_classes="tab-item"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("""
                    ### 💡 Complex Problem Solving
                    Ask complex questions and get detailed step-by-step reasoning
                    """)
                    reasoning_input = gr.Textbox(
                        label="Question",
                        placeholder="Ask a complex reasoning question...\n\nExamples:\n- How many R's are in 'strawberry'?\n- Solve this logic puzzle...\n- Calculate the average speed...",
                        lines=10
                    )
                    reasoning_btn = gr.Button("💡 Start Reasoning", variant="primary", size="lg")
                with gr.Column(scale=1):
                    gr.Markdown("### 🎯 Answer & Reasoning")
                    reasoning_output = gr.Textbox(
                        label="Response",
                        lines=12,
                        elem_classes="output-container",
                        show_copy_button=True
                    )
                    reasoning_details = gr.Textbox(
                        label="🧠 Reasoning Process",
                        lines=8,
                        elem_classes="reasoning-container",
                        show_copy_button=True
                    )
            def reasoning_wrapper(api_key, question):
                """Run the reasoning task; returns (response, reasoning text)."""
                data = _call_model(api_key, "reasoning", text_input=question, enable_reasoning=True)
                if not data["success"]:
                    return f"❌ {data['error']}", ""
                reasoning_text = data["reasoning"] or "Reasoning details not available for this response."
                return data["response"], reasoning_text
            reasoning_btn.click(
                fn=reasoning_wrapper,
                inputs=[api_key_input, reasoning_input],
                outputs=[reasoning_output, reasoning_details]
            )
        # Multi-Image Analysis Tab
        with gr.Tab("🖼️ Multi-Image Analysis", elem_classes="tab-item"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 🖼️ Upload Multiple Images (1-4)")
                    with gr.Row():
                        multi_image1 = gr.Image(type="pil", label="Image 1", height=200)
                        multi_image2 = gr.Image(type="pil", label="Image 2", height=200)
                    with gr.Row():
                        multi_image3 = gr.Image(type="pil", label="Image 3", height=200)
                        multi_image4 = gr.Image(type="pil", label="Image 4", height=200)
                    multi_question = gr.Textbox(
                        label="Question (Optional)",
                        placeholder="Compare these images, find differences, identify patterns...",
                        lines=3
                    )
                    multi_btn = gr.Button("🔍 Analyze Images", variant="primary", size="lg")
                with gr.Column(scale=1):
                    gr.Markdown("### 🎨 Multi-Image Insights")
                    multi_output = gr.Textbox(
                        label="Response",
                        lines=20,
                        elem_classes="output-container",
                        show_copy_button=True
                    )
            def multi_wrapper(api_key, img1, img2, img3, img4, question):
                """Run the multi-image task; prefixes the reply with the image count."""
                data = _call_model(
                    api_key, "multimodal",
                    image1=img1, image2=img2, image3=img3, image4=img4,
                    text_input=question
                )
                if not data["success"]:
                    return f"❌ {data['error']}"
                return f"🖼️ **Analyzing {data['image_count']} image(s)**\n\n{data['response']}"
            multi_btn.click(
                fn=multi_wrapper,
                inputs=[api_key_input, multi_image1, multi_image2, multi_image3, multi_image4, multi_question],
                outputs=[multi_output]
            )
    # Features Section
    gr.Markdown("## 🚀 Key Features", elem_classes="markdown-content")
    with gr.Row():
        with gr.Column(elem_classes="feature-card"):
            gr.Markdown("""
            ### ⚡ Hybrid Architecture
            Transformer-Mamba fusion for efficient processing with higher throughput and lower latency
            """)
        with gr.Column(elem_classes="feature-card"):
            gr.Markdown("""
            ### 📊 74% Benchmark Average
            Leading performance across MMMU, MathVista, AI2D, OCRBench, ChartQA, DocVQA, and more
            """)
        with gr.Column(elem_classes="feature-card"):
            gr.Markdown("""
            ### 🎥 EVS Technology
            Efficient Video Sampling for long-form video understanding with reduced inference cost
            """)
    # Footer
    with gr.Row(elem_id="footer-section"):
        gr.Markdown("""
        Powered by **NVIDIA Nemotron Nano 12B 2 VL** via OpenRouter API | Open-weights model with permissive NVIDIA license
        Built with ❤️ using Gradio | [Documentation](https://docs.nvidia.com) | [Report Issues](https://github.com)
        """, elem_classes="markdown-content")
# Entry point: start the Gradio server only when executed as a script.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # bind on all network interfaces
        "server_port": 7860,
        "share": True,
        "show_error": True,
    }
    demo.launch(**launch_options)