shukdevdattaEX commited on
Commit
a4b7006
·
verified ·
1 Parent(s): f4d47c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +478 -266
app.py CHANGED
@@ -1,303 +1,515 @@
1
  import gradio as gr
2
  from openai import OpenAI
3
  import base64
4
- import os
5
- from typing import List, Tuple, Any, Dict, Optional
6
- from PIL import Image
7
- import io
8
 
9
- # Custom CSS for premium, stunning design
10
- CUSTOM_CSS = """
11
- body {
12
- background: linear-gradient(135deg, #0f0f23 0%, #1a1a2e 50%, #16213e 100%);
13
- font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
14
- color: #e0e0e0;
15
- }
16
- .gradio-container {
17
- max-width: 1400px !important;
18
- margin: 0 auto;
19
- background: rgba(0, 0, 0, 0.1);
20
- border-radius: 20px;
21
- box-shadow: 0 20px 40px rgba(0, 0, 0, 0.5);
22
- overflow: hidden;
23
- }
24
- h1 {
25
- background: linear-gradient(45deg, #00d4ff, #0099cc);
26
- -webkit-background-clip: text;
27
- -webkit-text-fill-color: transparent;
28
- text-align: center;
29
- margin: 0;
30
- padding: 20px;
31
- font-size: 2.5em;
32
- text-shadow: 0 0 20px rgba(0, 212, 255, 0.5);
33
- }
34
- .gr-chatbot {
35
- background: rgba(255, 255, 255, 0.05);
36
- border-radius: 15px;
37
- border: 1px solid rgba(0, 212, 255, 0.2);
38
- backdrop-filter: blur(10px);
39
- }
40
- .gr-button {
41
- background: linear-gradient(45deg, #00d4ff, #0099cc);
42
- border: none;
43
- border-radius: 10px;
44
- color: white;
45
- font-weight: bold;
46
- transition: all 0.3s ease;
47
- box-shadow: 0 5px 15px rgba(0, 212, 255, 0.3);
48
- }
49
- .gr-button:hover {
50
- transform: translateY(-2px);
51
- box-shadow: 0 8px 25px rgba(0, 212, 255, 0.4);
52
- }
53
- .gr-textbox, .gr-file {
54
- background: rgba(255, 255, 255, 0.1);
55
- border: 1px solid rgba(0, 212, 255, 0.3);
56
- border-radius: 10px;
57
- color: white;
58
- backdrop-filter: blur(5px);
59
- }
60
- .gr-textbox::placeholder {
61
- color: #a0a0a0;
62
- }
63
- .sidebar {
64
- background: rgba(0, 0, 0, 0.2);
65
- padding: 20px;
66
- border-radius: 15px;
67
- margin: 10px;
68
- border: 1px solid rgba(0, 212, 255, 0.1);
69
- }
70
- """
71
 
72
- # Function to encode image to base64
73
- def encode_image_to_base64(image_path: str) -> str:
74
  with open(image_path, "rb") as image_file:
75
- return base64.b64encode(image_file.read()).decode("utf-8")
76
 
77
- # Function to build user content for multimodal input
78
- def build_user_content(message: str, files: List[str], video_url: str) -> List[Dict[str, Any]]:
79
- content = [{"type": "text", "text": message}]
80
- if files:
81
- for file_path in files:
82
- if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
83
- base64_image = encode_image_to_base64(file_path)
84
- content.append({
85
- "type": "image_url",
86
- "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
87
- })
88
- # Note: For PDFs, we'd need extraction (e.g., via pdf2image), but skipped for simplicity
89
- # Users can upload image screenshots of documents
90
- if video_url and video_url.strip():
91
- content.append({
92
- "type": "video_url",
93
- "video_url": {"url": video_url.strip()}
94
- })
95
- return content
96
 
97
- # Main response function
98
- def respond_to_query(
99
  message: str,
100
  history: List[Tuple[str, str]],
101
- files: Optional[List[str]],
102
- video_url: str,
103
- api_key: str,
104
- messages_state: List[Dict[str, Any]]
105
- ) -> Tuple[List[Tuple[str, str]], str, Optional[List[str]], str, List[Dict[str, Any]], str]:
106
- if not api_key or not api_key.strip():
107
- return history, "", None, "", messages_state, "⚠️ Please enter your OpenRouter API key to start chatting."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
- if not message.strip():
110
- return history, "", None, "", messages_state, "⚠️ Please enter a message."
 
111
 
112
- client = OpenAI(
113
- base_url="https://openrouter.ai/api/v1",
114
- api_key=api_key.strip(),
115
- )
116
 
117
- # Copy current messages state
118
- current_messages = messages_state.copy() if messages_state else []
 
119
 
120
- # Add user input
121
- user_content = build_user_content(message, files or [], video_url)
122
- current_messages.append({"role": "user", "content": user_content})
123
 
124
- try:
125
- # API call with reasoning enabled
126
- response = client.chat.completions.create(
127
- model="nvidia/nemotron-nano-12b-v2-vl:free",
128
- messages=current_messages,
129
- extra_body={"reasoning": {"enabled": True}}
130
- )
131
 
132
- resp_message = response.choices[0].message
133
- content = resp_message.content or "No response generated."
134
-
135
- # Preserve reasoning details for multi-turn continuity
136
- assistant_msg = {"role": "assistant", "content": content}
137
- if hasattr(resp_message, 'reasoning_details') and resp_message.reasoning_details:
138
- assistant_msg["reasoning_details"] = resp_message.reasoning_details
 
 
 
139
 
140
- current_messages.append(assistant_msg)
 
 
 
 
 
 
141
 
142
- # Append to history (text-only for display; attachments noted)
143
- attachment_note = ""
144
- if files:
145
- attachment_note += f" + {len(files)} image(s)"
146
- if video_url.strip():
147
- attachment_note += f" + video URL"
148
- display_message = message + (attachment_note if attachment_note else "")
149
- display_response = content + ("\n\n*(Reasoning preserved for follow-up)*" if "reasoning_details" in assistant_msg else "")
 
 
 
150
 
151
- history.append((display_message, display_response))
 
 
 
 
 
 
 
152
 
153
- # Clear inputs
154
- return history, "", None, "", current_messages, ""
 
 
 
155
 
156
- except Exception as e:
157
- error_msg = f"❌ Error: {str(e)}. Check your API key, file sizes (keep images <5MB), or video URL."
158
- history.append((message, error_msg))
159
- return history, "", None, "", current_messages, error_msg
 
160
 
161
- # Examples for creativity and to showcase capabilities
162
- EXAMPLES = [
163
- [
164
- "How many 'r's are in the word 'strawberry'? Think step by step.",
165
- None, # No files
166
- "" # No video
167
- ],
168
- [
169
- "Describe this image in detail and reason about its contents.",
170
- None,
171
- ""
172
- ],
173
- [
174
- "Analyze this chart: What trends do you see? Extract key data points.",
175
- None,
176
- ""
177
- ],
178
- [
179
- "Read the text in this document image and summarize the main points.",
180
- None,
181
- ""
182
- ],
183
- [
184
- "Count the objects in these multiple images and compare them.",
185
- None,
186
- ""
187
- ],
188
- [
189
- "What happens in this video? Summarize the key events.",
190
- None,
191
- "https://example.com/sample-video.mp4" # Placeholder; replace with real public URL
192
- ]
193
- ]
194
 
195
- # Main Gradio Blocks layout
196
- with gr.Blocks(theme=gr.themes.Ocean(), css=CUSTOM_CSS) as demo:
197
- gr.HTML("""
198
- <div style='text-align: center; padding: 10px;'>
199
- <h1>πŸš€ Nemotron Nano 2 VL Premium Demo</h1>
200
- <p style='color: #a0a0a0; font-size: 1.1em;'>Unleash multimodal magic: Text, Images, Documents & Videos | Powered by NVIDIA's Hybrid Transformer-Mamba</p>
201
- </div>
202
- """)
 
 
203
 
204
- with gr.Row():
205
- with gr.Column(scale=1):
206
- # Sidebar for info and controls
207
- with gr.Accordion("πŸ“– Model Capabilities & Tips", open=False):
208
- gr.Markdown("""
209
- **Key Features:**
210
- - **Text Reasoning:** Chain-of-thought with preserved reasoning.
211
- - **Image/Document Intelligence:** OCR, chart analysis, multi-image docs (upload screenshots).
212
- - **Video Understanding:** Enter public video URL (supports long-form with EVS).
213
- - **Pro Tip:** For documents, upload multiple page images. Keep files small for fast inference.
214
- - **License:** NVIDIA Open | Free tier via OpenRouter.
215
- """)
216
-
217
- api_key_input = gr.Textbox(
218
- label="πŸ”‘ OpenRouter API Key",
219
- placeholder="Enter your API key here (keep secure!)",
220
- type="password",
221
- lines=1
222
- )
223
 
224
- with gr.Column(scale=4):
225
- # Chat interface
226
- chatbot = gr.Chatbot(
227
- height=600,
228
- show_label=False,
229
- avatar_images=("user_avatar.png", None), # Optional: add custom avatars
230
- bubble_full_width=False
231
- )
232
 
233
- with gr.Row():
234
- msg_input = gr.Textbox(
235
- label="πŸ’­ Your Message",
236
- placeholder="Ask anything: 'Count the apples' or 'Summarize this video'...",
237
- lines=2,
238
- scale=3
239
- )
240
- file_upload = gr.File(
241
- label="πŸ–ΌοΈ Attachments (Images for OCR/Charts/Docs)",
242
- file_types=["image"],
243
- file_count="multiple",
244
- scale=1
245
- )
246
- video_input = gr.Textbox(
247
- label="πŸŽ₯ Video URL (Optional)",
248
- placeholder="e.g., https://example.com/video.mp4",
249
- lines=1
250
- )
251
 
252
- with gr.Row():
253
- submit_btn = gr.Button("✨ Send & Reason", variant="primary", scale=3)
254
- clear_btn = gr.Button("πŸ—‘οΈ Clear Chat", scale=1)
 
 
 
 
 
 
255
 
256
- # State for multi-turn messages
257
- messages_state = gr.State([])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
 
259
- # Event handlers
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  submit_btn.click(
261
- fn=respond_to_query,
262
- inputs=[msg_input, chatbot, file_upload, video_input, api_key_input, messages_state],
263
- outputs=[chatbot, msg_input, file_upload, video_input, messages_state, msg_input]
264
  ).then(
265
- fn=lambda: gr.Info("Message sent! Reasoning active."),
266
- outputs=[]
267
  )
268
-
269
- clear_btn.click(
270
- fn=lambda: ([], "", None, "", [], ""),
271
- outputs=[chatbot, msg_input, file_upload, video_input, messages_state, msg_input]
 
272
  ).then(
273
- fn=lambda: gr.Info("Chat cleared."),
274
- outputs=[]
 
 
 
 
 
275
  )
276
 
277
- # Examples
278
- gr.Examples(
279
- examples=EXAMPLES,
280
- inputs=[msg_input, file_upload, video_input],
281
- label="πŸ’‘ Quick Starts",
282
- examples_per_page=6,
283
- run_on_click=True,
284
- fn=respond_to_query,
285
- outputs=[chatbot, msg_input, file_upload, video_input, messages_state, msg_input],
286
- cache_examples=False # Since files are dynamic
287
- ).style(container=False)
288
-
289
- # Footer
290
- gr.Markdown("""
291
- <div style='text-align: center; padding: 20px; color: #a0a0a0;'>
292
- Built with ❀️ for creative multimodal exploration | © 2025 Inspired by NVIDIA Nemotron
293
- </div>
294
- """)
295
-
296
  if __name__ == "__main__":
297
- demo.launch(
298
- share=True, # Enable public link for demo
299
- # server_name="0.0.0.0",
300
- # server_port=7860,
301
- # show_error=True,
302
- # quiet=False
303
  )
 
1
  import gradio as gr
2
  from openai import OpenAI
3
  import base64
4
+ from pathlib import Path
5
+ import json
6
+ from typing import List, Tuple, Optional
7
+ import time
8
 
9
# Module-level OpenAI client; stays None until the user saves an API key
# via initialize_client() in the Settings tab.
client: Optional[OpenAI] = None

12
def initialize_client(api_key: str) -> str:
    """Configure the module-level OpenRouter client.

    Parameters
    ----------
    api_key : the user's OpenRouter API key (leading/trailing spaces ignored).

    Returns a single human-readable status string. The previous version
    returned a ``(message, success)`` tuple, but this function is wired to a
    single Gradio output (``outputs=[api_status]``), and Gradio raises
    "returned too many output values" when a tuple is returned for one
    component — so only the message is returned now.
    """
    global client

    # Reject blank/whitespace-only keys before touching the SDK.
    if not api_key or not api_key.strip():
        return "⚠️ Please enter a valid API key"

    try:
        # OpenRouter exposes an OpenAI-compatible endpoint, so the stock
        # OpenAI SDK works with just a base_url override.
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key.strip()
        )
        return "βœ… API Key configured successfully! You can now start chatting."
    except Exception as e:
        # Surface constructor failures as a status message instead of raising.
        return f"❌ Error initializing client: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
def encode_image(image_path: str) -> str:
    """Return the contents of the file at *image_path* as a base64 string."""
    with open(image_path, "rb") as fh:
        raw_bytes = fh.read()
    return base64.b64encode(raw_bytes).decode('utf-8')

def create_image_content(image_path: str, mime_type: str = "image/jpeg") -> dict:
    """Build an OpenAI-style ``image_url`` content part.

    The image is embedded inline as a ``data:`` URI so no hosting is needed.
    """
    data_uri = f"data:{mime_type};base64,{encode_image(image_path)}"
    return {"type": "image_url", "image_url": {"url": data_uri}}
 
 
 
 
 
 
 
 
 
 
41
 
42
+ def process_message(
 
43
  message: str,
44
  history: List[Tuple[str, str]],
45
+ images: Optional[List] = None,
46
+ enable_reasoning: bool = True,
47
+ temperature: float = 0.7,
48
+ max_tokens: int = 2000
49
+ ) -> Tuple[List[Tuple[str, str]], str]:
50
+ """Process user message and generate response"""
51
+ global client
52
+
53
+ if client is None:
54
+ return history + [(message, "❌ Please configure your API key first in the Settings tab.")], ""
55
+
56
+ try:
57
+ # Build messages array
58
+ messages = []
59
+
60
+ # Add conversation history
61
+ for user_msg, assistant_msg in history:
62
+ messages.append({"role": "user", "content": user_msg})
63
+ if assistant_msg:
64
+ messages.append({"role": "assistant", "content": assistant_msg})
65
+
66
+ # Build current message content
67
+ content = []
68
+
69
+ # Add images if provided
70
+ if images:
71
+ for img in images:
72
+ if img is not None:
73
+ # Determine MIME type
74
+ img_path = Path(img)
75
+ mime_type = "image/jpeg"
76
+ if img_path.suffix.lower() in ['.png']:
77
+ mime_type = "image/png"
78
+ elif img_path.suffix.lower() in ['.webp']:
79
+ mime_type = "image/webp"
80
+
81
+ content.append(create_image_content(img, mime_type))
82
+
83
+ # Add text message
84
+ content.append({"type": "text", "text": message})
85
+
86
+ messages.append({"role": "user", "content": content})
87
+
88
+ # Prepare API call parameters
89
+ api_params = {
90
+ "model": "nvidia/nemotron-nano-12b-v2-vl:free",
91
+ "messages": messages,
92
+ "temperature": temperature,
93
+ "max_tokens": max_tokens
94
+ }
95
+
96
+ # Add reasoning if enabled
97
+ if enable_reasoning:
98
+ api_params["extra_body"] = {"reasoning": {"enabled": True}}
99
+
100
+ # Make API call
101
+ response = client.chat.completions.create(**api_params)
102
+
103
+ assistant_message = response.choices[0].message.content
104
+
105
+ # Extract reasoning if available
106
+ reasoning_text = ""
107
+ if enable_reasoning and hasattr(response.choices[0].message, 'reasoning_details'):
108
+ reasoning_details = response.choices[0].message.reasoning_details
109
+ if reasoning_details:
110
+ reasoning_text = f"\n\n**🧠 Reasoning Process:**\n{json.dumps(reasoning_details, indent=2)}"
111
+
112
+ # Update history
113
+ new_history = history + [(message, assistant_message)]
114
+
115
+ return new_history, reasoning_text
116
+
117
+ except Exception as e:
118
+ error_message = f"❌ Error: {str(e)}"
119
+ return history + [(message, error_message)], ""
120
 
121
def clear_conversation():
    """Reset the chat: empty history plus a blank reasoning pane."""
    fresh_history: list = []
    fresh_reasoning = ""
    return fresh_history, fresh_reasoning
124
 
125
# Custom CSS for the premium look: Inter font, purple gradient backdrop,
# rounded cards/buttons. Class names referenced below via elem_classes/elem_id
# (main-container, tab-nav, primary, secondary, chatbot, info/success boxes).
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');

* {
    font-family: 'Inter', sans-serif;
}

.gradio-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
}

#main-container {
    background: rgba(255, 255, 255, 0.98);
    border-radius: 24px;
    padding: 32px;
    box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
    backdrop-filter: blur(10px);
}

.header-title {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 3em;
    font-weight: 700;
    text-align: center;
    margin-bottom: 0.3em;
    letter-spacing: -0.02em;
}

.header-subtitle {
    text-align: center;
    color: #666;
    font-size: 1.1em;
    margin-bottom: 1.5em;
    font-weight: 500;
}

.feature-badge {
    display: inline-block;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 6px 16px;
    border-radius: 20px;
    font-size: 0.85em;
    font-weight: 600;
    margin: 4px;
    box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3);
}

.capability-card {
    background: linear-gradient(135deg, #f6f8fb 0%, #ffffff 100%);
    border: 2px solid #e0e7ff;
    border-radius: 16px;
    padding: 20px;
    margin: 10px 0;
    transition: all 0.3s ease;
}

.capability-card:hover {
    transform: translateY(-4px);
    box-shadow: 0 12px 24px rgba(102, 126, 234, 0.15);
    border-color: #667eea;
}

.tab-nav button {
    font-weight: 600 !important;
    border-radius: 12px !important;
    transition: all 0.3s ease !important;
}

.tab-nav button.selected {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
}

button.primary {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: 600 !important;
    border-radius: 12px !important;
    padding: 12px 32px !important;
    transition: all 0.3s ease !important;
    box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3) !important;
}

button.primary:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 8px 20px rgba(102, 126, 234, 0.4) !important;
}

button.secondary {
    background: white !important;
    border: 2px solid #667eea !important;
    color: #667eea !important;
    font-weight: 600 !important;
    border-radius: 12px !important;
    transition: all 0.3s ease !important;
}

button.secondary:hover {
    background: #f0f4ff !important;
}

.info-box {
    background: linear-gradient(135deg, #e0e7ff 0%, #f0f4ff 100%);
    border-left: 4px solid #667eea;
    border-radius: 12px;
    padding: 16px 20px;
    margin: 16px 0;
    font-size: 0.95em;
    line-height: 1.6;
}

.success-box {
    background: linear-gradient(135deg, #d4edda 0%, #e8f5e9 100%);
    border-left: 4px solid #28a745;
    border-radius: 12px;
    padding: 16px 20px;
    margin: 16px 0;
    color: #155724;
    font-weight: 500;
}

.chatbot {
    border-radius: 16px !important;
    border: 2px solid #e0e7ff !important;
    box-shadow: 0 8px 24px rgba(102, 126, 234, 0.1) !important;
}
"""
257
 
258
# Build Gradio Interface: header banner, then a 4-tab layout
# (Chat / Settings / Examples / About) plus the event wiring at the bottom.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
    # Page header with gradient title and feature badges (styled by custom_css).
    gr.HTML("""
    <div style='text-align: center; padding: 20px 0;'>
        <h1 class='header-title'>πŸš€ Nemotron Nano VL Studio</h1>
        <p class='header-subtitle'>Advanced Multimodal AI Assistant powered by NVIDIA Nemotron Nano 12B 2 VL</p>
        <div style='margin: 20px 0;'>
            <span class='feature-badge'>πŸ“Š Document Intelligence</span>
            <span class='feature-badge'>🎬 Video Understanding</span>
            <span class='feature-badge'>🧠 Reasoning Engine</span>
            <span class='feature-badge'>πŸ“ˆ Chart Analysis</span>
            <span class='feature-badge'>πŸ”€ OCR Excellence</span>
        </div>
    </div>
    """)

    with gr.Row(elem_id="main-container"):
        with gr.Column():
            with gr.Tabs():
                # Chat Tab: chatbot display, message box, image uploads, reasoning pane.
                with gr.Tab("πŸ’¬ Chat Interface", elem_classes=["tab-nav"]):
                    gr.HTML("""
                    <div class='info-box'>
                        <strong>🎯 What can I do?</strong><br>
                        β€’ Analyze images, documents, and charts<br>
                        β€’ Perform OCR and text extraction<br>
                        β€’ Reason through complex problems<br>
                        β€’ Answer questions about visual content<br>
                        β€’ Process multi-image documents
                    </div>
                    """)

                    chatbot = gr.Chatbot(
                        label="Conversation",
                        height=500,
                        show_copy_button=True,
                        avatar_images=(None, "https://www.nvidia.com/favicon.ico"),
                        elem_classes=["chatbot"]
                    )

                    with gr.Row():
                        msg = gr.Textbox(
                            label="Your Message",
                            placeholder="Ask me anything about images, documents, or reasoning tasks...",
                            lines=3,
                            scale=4
                        )

                    with gr.Row():
                        # Multiple image files per turn; paths are forwarded to process_message.
                        images = gr.File(
                            label="πŸ“Ž Upload Images/Documents (Multi-image support)",
                            file_count="multiple",
                            file_types=["image"],
                            scale=3
                        )

                    with gr.Row():
                        submit_btn = gr.Button("πŸš€ Send", variant="primary", scale=2, elem_classes=["primary"])
                        clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary", scale=1, elem_classes=["secondary"])

                    # Read-only pane; filled only when reasoning mode is on and
                    # the provider returns reasoning_details.
                    reasoning_display = gr.Textbox(
                        label="🧠 Reasoning Process (when enabled)",
                        lines=6,
                        interactive=False
                    )

                # Settings Tab: API key entry plus model sampling parameters.
                with gr.Tab("βš™οΈ Settings", elem_classes=["tab-nav"]):
                    gr.HTML("""
                    <div class='info-box'>
                        <strong>πŸ”‘ API Configuration</strong><br>
                        Get your free API key from <a href='https://openrouter.ai/keys' target='_blank'>OpenRouter</a>
                    </div>
                    """)

                    api_key_input = gr.Textbox(
                        label="OpenRouter API Key",
                        placeholder="sk-or-v1-...",
                        type="password",
                        lines=1
                    )

                    api_status = gr.Textbox(label="Status", interactive=False)

                    save_key_btn = gr.Button("πŸ’Ύ Save API Key", variant="primary", elem_classes=["primary"])

                    gr.HTML("<hr style='margin: 30px 0; border: none; border-top: 2px solid #e0e7ff;'>")

                    gr.HTML("""
                    <div class='info-box'>
                        <strong>πŸŽ›οΈ Model Parameters</strong><br>
                        Fine-tune the model's behavior
                    </div>
                    """)

                    enable_reasoning = gr.Checkbox(
                        label="🧠 Enable Reasoning Mode",
                        value=True,
                        info="Show the model's step-by-step thinking process"
                    )

                    temperature = gr.Slider(
                        minimum=0.0,
                        maximum=2.0,
                        value=0.7,
                        step=0.1,
                        label="🌑️ Temperature",
                        info="Higher = more creative, Lower = more focused"
                    )

                    max_tokens = gr.Slider(
                        minimum=256,
                        maximum=4096,
                        value=2000,
                        step=256,
                        label="πŸ“ Max Tokens",
                        info="Maximum length of response"
                    )

                # Examples Tab: static capability cards, no interactive components.
                with gr.Tab("πŸ“š Examples & Capabilities", elem_classes=["tab-nav"]):
                    gr.HTML("""
                    <div class='capability-card'>
                        <h3>πŸ“Š Document Intelligence</h3>
                        <p><strong>Example:</strong> "Extract all the key metrics from this financial report"</p>
                        <p>Nemotron excels at understanding complex documents, tables, and structured data.</p>
                    </div>

                    <div class='capability-card'>
                        <h3>πŸ”€ OCR Excellence</h3>
                        <p><strong>Example:</strong> "What text appears in this image?"</p>
                        <p>State-of-the-art optical character recognition for any text in images.</p>
                    </div>

                    <div class='capability-card'>
                        <h3>πŸ“ˆ Chart & Graph Analysis</h3>
                        <p><strong>Example:</strong> "What trends do you see in this chart?"</p>
                        <p>Analyze charts, graphs, and data visualizations with high accuracy.</p>
                    </div>

                    <div class='capability-card'>
                        <h3>🧠 Advanced Reasoning</h3>
                        <p><strong>Example:</strong> "How many r's are in 'strawberry'? Think step by step."</p>
                        <p>Transparent reasoning process shows how the model arrives at answers.</p>
                    </div>

                    <div class='capability-card'>
                        <h3>🎬 Video Understanding</h3>
                        <p><strong>Example:</strong> Upload video frames and ask "What's happening in this sequence?"</p>
                        <p>Process multiple frames to understand temporal sequences and events.</p>
                    </div>

                    <div class='capability-card'>
                        <h3>πŸ“‘ Multi-Image Documents</h3>
                        <p><strong>Example:</strong> Upload multiple pages and ask "Summarize this document"</p>
                        <p>Handle multi-page documents and complex layouts with ease.</p>
                    </div>
                    """)

                    gr.HTML("""
                    <div class='success-box' style='margin-top: 30px;'>
                        <strong>πŸ’‘ Pro Tips:</strong><br>
                        β€’ Upload multiple images for document analysis<br>
                        β€’ Enable reasoning mode for complex problems<br>
                        β€’ Adjust temperature for creative vs precise outputs<br>
                        β€’ Use specific questions for better OCR results<br>
                        β€’ Try video frame sequences for temporal analysis
                    </div>
                    """)

                # About Tab: static model description rendered as Markdown.
                with gr.Tab("ℹ️ About", elem_classes=["tab-nav"]):
                    gr.Markdown("""
                    # πŸš€ About Nemotron Nano 12B 2 VL

                    ## 🎯 Model Overview

                    **NVIDIA Nemotron Nano 2 VL** is a cutting-edge 12-billion-parameter open multimodal reasoning model
                    designed for video understanding and document intelligence.

                    ## ✨ Key Features

                    - **πŸ—οΈ Hybrid Architecture**: Combines Transformer accuracy with Mamba's efficient sequence modeling
                    - **⚑ High Performance**: Superior throughput and lower latency
                    - **πŸ“Š Leading Results**: ~74 average score across major benchmarks
                    - **🎯 Specialized Training**: NVIDIA-curated synthetic datasets
                    - **🎬 Video Support**: Efficient Video Sampling (EVS) for long-form content
                    - **πŸ“– Open Source**: Released under permissive NVIDIA open license

                    ## πŸ“ˆ Benchmark Performance

                    Achieves leading results on:
                    - OCRBench v2
                    - MMMU
                    - MathVista
                    - AI2D
                    - OCR-Reasoning
                    - ChartQA
                    - DocVQA
                    - Video-MME

                    ## πŸ”§ Deployment

                    Supported across:
                    - NVIDIA NeMo
                    - NVIDIA NIM
                    - Major inference runtimes

                    ## 🌐 Learn More

                    - [OpenRouter API](https://openrouter.ai/)
                    - [NVIDIA NeMo](https://www.nvidia.com/en-us/ai-data-science/products/nemo/)

                    ---

                    <div style='text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 12px; color: white;'>
                        <strong>Built with ❀️ using Gradio and powered by NVIDIA Nemotron</strong>
                    </div>
                    """)

    # Event Handlers
    # NOTE(review): api_status is the single wired output here, so
    # initialize_client must return exactly one value for this to work.
    save_key_btn.click(
        fn=initialize_client,
        inputs=[api_key_input],
        outputs=[api_status]
    )

    # Send button: run the model, then clear the textbox and file input.
    submit_btn.click(
        fn=process_message,
        inputs=[msg, chatbot, images, enable_reasoning, temperature, max_tokens],
        outputs=[chatbot, reasoning_display]
    ).then(
        lambda: ("", None),
        outputs=[msg, images]
    )

    # Enter key in the textbox mirrors the Send button behavior.
    msg.submit(
        fn=process_message,
        inputs=[msg, chatbot, images, enable_reasoning, temperature, max_tokens],
        outputs=[chatbot, reasoning_display]
    ).then(
        lambda: ("", None),
        outputs=[msg, images]
    )

    # Clear button resets both the chat history and the reasoning pane.
    clear_btn.click(
        fn=clear_conversation,
        outputs=[chatbot, reasoning_display]
    )
507
 
508
# Launch the app (script entry point only; importing this module does not launch).
if __name__ == "__main__":
    app.launch(
        share=True,  # also create a temporary public gradio.live link
        server_name="0.0.0.0",  # bind all interfaces (container/Spaces friendly)
        server_port=7860,
        show_error=True  # surface Python tracebacks in the browser UI
    )