shukdevdattaEX commited on
Commit
c0c6801
·
verified ·
1 Parent(s): 389bfd4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +262 -933
app.py CHANGED
@@ -1,974 +1,303 @@
1
  import gradio as gr
2
- import os
3
  from openai import OpenAI
4
  import base64
5
- import json
 
6
  from PIL import Image
7
  import io
8
- import cv2
9
- import tempfile
10
- import numpy as np
11
- from pathlib import Path
12
-
13
- # Global variable to store the OpenAI client
14
- client = None
15
-
16
- def initialize_client(api_key):
17
- """Initialize the OpenAI client with the provided API key"""
18
- global client
19
- if api_key and api_key.strip():
20
- client = OpenAI(
21
- base_url="https://openrouter.ai/api/v1",
22
- api_key=api_key.strip(),
23
- )
24
- return True
25
- return False
26
-
27
- def encode_image(image):
28
- """Encode image to base64 string"""
29
- if image is None:
30
- return None
31
-
32
- # Convert to PIL Image if it's not already
33
- if not isinstance(image, Image.Image):
34
- image = Image.fromarray(image)
35
-
36
- # Convert to RGB if needed
37
- if image.mode != 'RGB':
38
- image = image.convert('RGB')
39
-
40
- # Save to bytes
41
- buffered = io.BytesIO()
42
- image.save(buffered, format="JPEG", quality=95)
43
- img_bytes = buffered.getvalue()
44
-
45
- # Encode to base64
46
- return base64.b64encode(img_bytes).decode('utf-8')
47
-
48
- def extract_frames_evs(video_path, num_frames=8, method="uniform"):
49
- """
50
- Extract frames from video using Efficient Video Sampling (EVS)
51
-
52
- Args:
53
- video_path: Path to video file
54
- num_frames: Number of frames to extract (default: 8)
55
- method: Sampling method - "uniform", "keyframe", or "adaptive"
56
-
57
- Returns:
58
- List of PIL Images
59
- """
60
- frames = []
61
-
62
- try:
63
- # Open video file
64
- cap = cv2.VideoCapture(video_path)
65
-
66
- if not cap.isOpened():
67
- raise ValueError("Could not open video file")
68
-
69
- # Get video properties
70
- total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
71
- fps = cap.get(cv2.CAP_PROP_FPS)
72
- duration = total_frames / fps if fps > 0 else 0
73
-
74
- if total_frames == 0:
75
- raise ValueError("Video has no frames")
76
-
77
- # Adjust num_frames if video is too short
78
- num_frames = min(num_frames, total_frames)
79
-
80
- if method == "uniform":
81
- # Uniform sampling - evenly spaced frames
82
- frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
83
-
84
- for idx in frame_indices:
85
- cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
86
- ret, frame = cap.read()
87
-
88
- if ret:
89
- # Convert BGR to RGB
90
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
91
- # Convert to PIL Image
92
- pil_image = Image.fromarray(frame_rgb)
93
- # Resize for efficiency (max 1280px on longest side)
94
- pil_image.thumbnail((1280, 1280), Image.Resampling.LANCZOS)
95
- frames.append(pil_image)
96
-
97
- elif method == "keyframe":
98
- # Keyframe detection - extract frames with significant changes
99
- prev_frame = None
100
- frame_indices = []
101
- threshold = 30.0 # Difference threshold
102
-
103
- for i in range(0, total_frames, max(1, total_frames // (num_frames * 3))):
104
- cap.set(cv2.CAP_PROP_POS_FRAMES, i)
105
- ret, frame = cap.read()
106
-
107
- if not ret:
108
- continue
109
-
110
- # Convert to grayscale for comparison
111
- gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
112
-
113
- if prev_frame is not None:
114
- # Calculate difference
115
- diff = cv2.absdiff(prev_frame, gray)
116
- diff_score = np.mean(diff)
117
-
118
- if diff_score > threshold:
119
- frame_indices.append(i)
120
- else:
121
- frame_indices.append(i)
122
-
123
- prev_frame = gray
124
-
125
- if len(frame_indices) >= num_frames:
126
- break
127
-
128
- # If we didn't get enough keyframes, add uniform samples
129
- if len(frame_indices) < num_frames:
130
- additional = num_frames - len(frame_indices)
131
- uniform_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
132
- frame_indices.extend([idx for idx in uniform_indices if idx not in frame_indices][:additional])
133
-
134
- frame_indices = sorted(frame_indices)[:num_frames]
135
-
136
- for idx in frame_indices:
137
- cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
138
- ret, frame = cap.read()
139
-
140
- if ret:
141
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
142
- pil_image = Image.fromarray(frame_rgb)
143
- pil_image.thumbnail((1280, 1280), Image.Resampling.LANCZOS)
144
- frames.append(pil_image)
145
-
146
- elif method == "adaptive":
147
- # Adaptive sampling - more frames at beginning and end, fewer in middle
148
- # This is useful for videos with action at start/end
149
- start_frames = num_frames // 3
150
- end_frames = num_frames // 3
151
- middle_frames = num_frames - start_frames - end_frames
152
-
153
- # Start section
154
- start_indices = np.linspace(0, total_frames * 0.2, start_frames, dtype=int)
155
- # Middle section
156
- middle_indices = np.linspace(total_frames * 0.2, total_frames * 0.8, middle_frames, dtype=int)
157
- # End section
158
- end_indices = np.linspace(total_frames * 0.8, total_frames - 1, end_frames, dtype=int)
159
-
160
- frame_indices = np.concatenate([start_indices, middle_indices, end_indices])
161
-
162
- for idx in frame_indices:
163
- cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
164
- ret, frame = cap.read()
165
-
166
- if ret:
167
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
168
- pil_image = Image.fromarray(frame_rgb)
169
- pil_image.thumbnail((1280, 1280), Image.Resampling.LANCZOS)
170
- frames.append(pil_image)
171
-
172
- cap.release()
173
-
174
- return frames, {
175
- "total_frames": total_frames,
176
- "fps": fps,
177
- "duration": duration,
178
- "extracted_frames": len(frames),
179
- "method": method
180
- }
181
-
182
- except Exception as e:
183
- if 'cap' in locals():
184
- cap.release()
185
- raise Exception(f"Error extracting frames: {str(e)}")
186
-
187
- def create_message_content(text, images=None):
188
- """Create message content with text and optional images"""
189
- content = []
190
-
191
- # Add images first if provided
192
- if images:
193
- for img in images:
194
- if img is not None:
195
- img_base64 = encode_image(img)
196
- if img_base64:
197
- content.append({
198
- "type": "image_url",
199
- "image_url": {
200
- "url": f"data:image/jpeg;base64,{img_base64}"
201
- }
202
- })
203
-
204
- # Add text
205
- if text and text.strip():
206
- content.append({
207
- "type": "text",
208
- "text": text
209
- })
210
-
211
- return content if content else [{"type": "text", "text": "Please analyze this content."}]
212
-
213
- def process_request(api_key, task_type, image1=None, image2=None, image3=None, image4=None, text_input="", enable_reasoning=False):
214
- """Main processing function that handles all types of requests"""
215
-
216
- if not initialize_client(api_key):
217
- return json.dumps({
218
- "success": False,
219
- "error": "Please enter a valid OpenRouter API key.",
220
- "response": "",
221
- "reasoning": ""
222
- })
223
-
224
- try:
225
- # Collect all valid images
226
- images = [img for img in [image1, image2, image3, image4] if img is not None]
227
-
228
- # Validate inputs based on task type
229
- if task_type in ["ocr", "chart", "multimodal"] and not images and not text_input.strip():
230
- return json.dumps({
231
- "success": False,
232
- "error": "Please upload at least one image or enter text.",
233
- "response": "",
234
- "reasoning": ""
235
- })
236
-
237
- if task_type == "reasoning" and not text_input.strip():
238
- return json.dumps({
239
- "success": False,
240
- "error": "Please enter a question or problem to solve.",
241
- "response": "",
242
- "reasoning": ""
243
- })
244
-
245
- # Set default prompts based on task type
246
- if not text_input.strip():
247
- prompts = {
248
- "ocr": "Extract and analyze all text from this image. Provide a detailed analysis of the content, structure, and any key information.",
249
- "chart": "Analyze this chart in detail. Describe the type of chart, extract all data points, identify trends, and provide insights.",
250
- "video": "Analyze this video content frame by frame. Describe what you see and provide comprehensive insights.",
251
- "multimodal": f"Analyze these {len(images)} images. Compare and contrast them, identify relationships, and provide comprehensive insights."
252
- }
253
- text_input = prompts.get(task_type, "Please analyze this content.")
254
-
255
- # Create message content
256
- messages = [{
257
- "role": "user",
258
- "content": create_message_content(text_input, images if images else None)
259
- }]
260
-
261
- # Prepare API call parameters
262
- api_params = {
263
- "model": "nvidia/nemotron-nano-12b-v2-vl:free",
264
- "messages": messages,
265
- "max_tokens": 3000,
266
- }
267
-
268
- # Add reasoning if enabled
269
- if enable_reasoning or task_type == "reasoning":
270
- api_params["extra_body"] = {"reasoning": {"enabled": True}}
271
-
272
- # Make API call
273
- response = client.chat.completions.create(**api_params)
274
-
275
- result = response.choices[0].message.content
276
- reasoning_details = ""
277
-
278
- # Extract reasoning details if available
279
- if hasattr(response.choices[0].message, 'reasoning_details') and response.choices[0].message.reasoning_details:
280
- reasoning_details = json.dumps(response.choices[0].message.reasoning_details, indent=2)
281
-
282
- return json.dumps({
283
- "success": True,
284
- "error": "",
285
- "response": result,
286
- "reasoning": reasoning_details,
287
- "task_type": task_type,
288
- "image_count": len(images)
289
- })
290
-
291
- except Exception as e:
292
- return json.dumps({
293
- "success": False,
294
- "error": f"Error: {str(e)}",
295
- "response": "",
296
- "reasoning": ""
297
- })
298
 
299
- def process_video(api_key, video_file, question, num_frames, sampling_method, enable_reasoning):
300
- """Process video with frame extraction and analysis"""
301
-
302
- if not initialize_client(api_key):
303
- return "❌ Please enter a valid OpenRouter API key.", "", None, ""
304
-
305
- if video_file is None:
306
- return "❌ Please upload a video file.", "", None, ""
307
-
308
- try:
309
- # Update status
310
- status_msg = "⏳ Extracting frames from video using EVS...\n"
311
-
312
- # Extract frames
313
- frames, video_info = extract_frames_evs(
314
- video_file,
315
- num_frames=num_frames,
316
- method=sampling_method
317
- )
318
-
319
- if not frames:
320
- return "❌ Could not extract frames from video.", "", None, ""
321
-
322
- # Update status with video info
323
- status_msg += f"\n✅ Video Analysis:\n"
324
- status_msg += f" • Total frames: {video_info['total_frames']}\n"
325
- status_msg += f" • FPS: {video_info['fps']:.2f}\n"
326
- status_msg += f" • Duration: {video_info['duration']:.2f} seconds\n"
327
- status_msg += f" • Extracted: {video_info['extracted_frames']} frames\n"
328
- status_msg += f" • Method: {video_info['method']}\n"
329
- status_msg += f"\n⏳ Analyzing frames with Nemotron AI...\n"
330
-
331
- # Create prompt
332
- if not question or not question.strip():
333
- prompt = f"Analyze this video by examining these {len(frames)} frames extracted from it. Provide a comprehensive description of:\n1. What is happening in the video\n2. Key events or actions\n3. Any changes or progression throughout\n4. Overall context and meaning\n5. Temporal relationships between frames"
334
- else:
335
- prompt = f"Based on these {len(frames)} frames from a video, {question}"
336
-
337
- # Create message content with all frames
338
- messages = [{
339
- "role": "user",
340
- "content": create_message_content(prompt, frames)
341
- }]
342
-
343
- # Prepare API call
344
- api_params = {
345
- "model": "nvidia/nemotron-nano-12b-v2-vl:free",
346
- "messages": messages,
347
- "max_tokens": 4000,
348
- }
349
-
350
- if enable_reasoning:
351
- api_params["extra_body"] = {"reasoning": {"enabled": True}}
352
-
353
- # Make API call
354
- response = client.chat.completions.create(**api_params)
355
-
356
- result = response.choices[0].message.content
357
- reasoning_details = ""
358
-
359
- # Extract reasoning if available
360
- if hasattr(response.choices[0].message, 'reasoning_details') and response.choices[0].message.reasoning_details:
361
- reasoning_details = json.dumps(response.choices[0].message.reasoning_details, indent=2)
362
-
363
- # Create frame gallery
364
- frame_gallery = frames
365
-
366
- status_msg += f"\n✅ Analysis complete!\n"
367
-
368
- return (
369
- f"🎥 **Video Analysis Complete**\n\n{result}",
370
- reasoning_details if reasoning_details else "No reasoning details available.",
371
- frame_gallery,
372
- status_msg
373
- )
374
-
375
- except Exception as e:
376
- return f"❌ Error processing video: {str(e)}", "", None, f"❌ Error: {str(e)}"
377
-
378
- # Enhanced custom CSS with the React design aesthetic
379
- custom_css = """
380
- /* Base styling */
381
- :root {
382
- --primary-purple: #7e22ce;
383
- --primary-pink: #db2777;
384
- --bg-dark: #0f172a;
385
- --bg-darker: #020617;
386
- --border-color: rgba(168, 85, 247, 0.3);
387
- }
388
-
389
- body, .gradio-container {
390
- background: linear-gradient(135deg, #1e1b4b 0%, #7e22ce 50%, #1e1b4b 100%) !important;
391
- font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
392
  }
393
-
394
- /* Main container */
395
- .main-container {
396
- max-width: 1400px;
397
  margin: 0 auto;
398
- padding: 20px;
399
- }
400
-
401
- /* Header styling */
402
- #header-section {
403
- background: rgba(0, 0, 0, 0.3);
404
- backdrop-filter: blur(20px);
405
- border-radius: 24px;
406
- padding: 32px;
407
- margin-bottom: 24px;
408
- border: 1px solid var(--border-color);
409
- box-shadow: 0 8px 32px rgba(126, 34, 206, 0.2);
410
- }
411
-
412
- #header-section h1 {
413
- color: white;
414
- font-size: 2.5rem;
415
- font-weight: 700;
416
- margin: 0;
417
- letter-spacing: -0.02em;
418
- }
419
-
420
- #header-section p {
421
- color: #c084fc;
422
- font-size: 1.1rem;
423
- margin: 8px 0 0 0;
424
- }
425
-
426
- /* API Key Section */
427
- #api-key-container {
428
- background: linear-gradient(135deg, rgba(126, 34, 206, 0.4) 0%, rgba(219, 39, 119, 0.4) 100%);
429
- backdrop-filter: blur(20px);
430
  border-radius: 20px;
431
- padding: 28px;
432
- margin-bottom: 24px;
433
- border: 1px solid rgba(168, 85, 247, 0.4);
434
- box-shadow: 0 8px 32px rgba(219, 39, 119, 0.2);
435
  }
436
-
437
- #api-key-container .label-wrap {
438
- color: white !important;
439
- font-weight: 600;
 
 
 
 
 
440
  }
441
-
442
- /* Input fields */
443
- .gr-textbox, .gr-file, .gr-image {
444
- background: rgba(0, 0, 0, 0.4) !important;
445
- border: 1px solid var(--border-color) !important;
446
- border-radius: 16px !important;
447
- color: white !important;
448
  backdrop-filter: blur(10px);
449
  }
450
-
451
- .gr-textbox:focus, .gr-file:focus, .gr-image:focus {
452
- border-color: #a855f7 !important;
453
- box-shadow: 0 0 0 3px rgba(168, 85, 247, 0.2) !important;
454
- }
455
-
456
- /* Tabs */
457
- .tab-nav {
458
- background: rgba(0, 0, 0, 0.3) !important;
459
- backdrop-filter: blur(20px) !important;
460
- border-radius: 20px !important;
461
- padding: 8px !important;
462
- border: 1px solid rgba(168, 85, 247, 0.2) !important;
463
- gap: 8px !important;
464
- }
465
-
466
- .tab-nav button {
467
- background: transparent !important;
468
- color: #c084fc !important;
469
- border-radius: 14px !important;
470
- padding: 14px 24px !important;
471
- font-weight: 600 !important;
472
- transition: all 0.3s ease !important;
473
- border: none !important;
474
- }
475
-
476
- .tab-nav button:hover {
477
- background: rgba(255, 255, 255, 0.05) !important;
478
- color: white !important;
479
- }
480
-
481
- .tab-nav button.selected {
482
- background: linear-gradient(135deg, #7e22ce 0%, #db2777 100%) !important;
483
- color: white !important;
484
- box-shadow: 0 4px 16px rgba(126, 34, 206, 0.5) !important;
485
- }
486
-
487
- /* Buttons */
488
  .gr-button {
489
- background: linear-gradient(135deg, #7e22ce 0%, #db2777 100%) !important;
490
- color: white !important;
491
- border: none !important;
492
- border-radius: 14px !important;
493
- padding: 14px 28px !important;
494
- font-weight: 600 !important;
495
- font-size: 1rem !important;
496
- cursor: pointer !important;
497
- transition: all 0.3s ease !important;
498
- box-shadow: 0 4px 16px rgba(126, 34, 206, 0.4) !important;
499
  }
500
-
501
  .gr-button:hover {
502
  transform: translateY(-2px);
503
- box-shadow: 0 6px 24px rgba(126, 34, 206, 0.6) !important;
504
- }
505
-
506
- .gr-button:active {
507
- transform: translateY(0px);
508
- }
509
-
510
- .gr-button.secondary {
511
- background: rgba(255, 255, 255, 0.1) !important;
512
- backdrop-filter: blur(10px);
513
- }
514
-
515
- /* Output boxes */
516
- .output-container {
517
- background: rgba(0, 0, 0, 0.5) !important;
518
- backdrop-filter: blur(20px);
519
- border-radius: 20px !important;
520
- padding: 24px !important;
521
- border: 1px solid var(--border-color) !important;
522
- min-height: 400px;
523
- box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
524
- }
525
-
526
- .output-container .label-wrap {
527
- color: white !important;
528
- font-weight: 600;
529
- font-size: 1.1rem;
530
- }
531
-
532
- .output-container textarea {
533
- background: rgba(0, 0, 0, 0.3) !important;
534
- color: #e9d5ff !important;
535
- border: none !important;
536
- font-family: 'SF Mono', 'Monaco', 'Courier New', monospace;
537
- font-size: 0.95rem;
538
- line-height: 1.6;
539
- }
540
-
541
- /* Reasoning box */
542
- .reasoning-container {
543
- background: linear-gradient(135deg, rgba(219, 39, 119, 0.3) 0%, rgba(126, 34, 206, 0.3) 100%) !important;
544
- backdrop-filter: blur(20px);
545
- border-radius: 20px !important;
546
- padding: 24px !important;
547
- border: 1px solid rgba(236, 72, 153, 0.4) !important;
548
- margin-top: 20px;
549
- box-shadow: 0 8px 32px rgba(219, 39, 119, 0.2);
550
- }
551
-
552
- .reasoning-container .label-wrap {
553
- color: #fda4af !important;
554
- font-weight: 600;
555
- font-size: 1.1rem;
556
- }
557
-
558
- /* Feature cards */
559
- .feature-card {
560
- background: rgba(0, 0, 0, 0.4);
561
- backdrop-filter: blur(20px);
562
- border-radius: 20px;
563
- padding: 28px;
564
- border: 1px solid rgba(168, 85, 247, 0.2);
565
- transition: all 0.3s ease;
566
- }
567
-
568
- .feature-card:hover {
569
- transform: translateY(-4px);
570
- border-color: rgba(168, 85, 247, 0.5);
571
- box-shadow: 0 12px 32px rgba(126, 34, 206, 0.3);
572
  }
573
-
574
- .feature-card h3 {
 
 
575
  color: white;
576
- font-size: 1.3rem;
577
- margin-bottom: 12px;
578
- font-weight: 700;
579
- }
580
-
581
- .feature-card p {
582
- color: #c084fc;
583
- font-size: 0.95rem;
584
- line-height: 1.6;
585
- }
586
-
587
- /* Status badge */
588
- .status-badge {
589
- display: inline-block;
590
- background: rgba(34, 197, 94, 0.2);
591
- border: 1px solid rgba(34, 197, 94, 0.5);
592
- padding: 8px 20px;
593
- border-radius: 12px;
594
- color: #86efac;
595
- font-weight: 600;
596
- font-size: 0.9rem;
597
- }
598
-
599
- /* Gallery */
600
- .gr-gallery {
601
- background: rgba(0, 0, 0, 0.3) !important;
602
- border-radius: 16px !important;
603
- border: 1px solid var(--border-color) !important;
604
- }
605
-
606
- /* Slider */
607
- .gr-slider {
608
- background: rgba(0, 0, 0, 0.3) !important;
609
- border-radius: 12px !important;
610
- }
611
-
612
- /* Radio */
613
- .gr-radio {
614
- background: rgba(0, 0, 0, 0.3) !important;
615
- border-radius: 12px !important;
616
- padding: 12px !important;
617
- }
618
-
619
- /* Checkbox */
620
- .gr-checkbox {
621
- background: rgba(0, 0, 0, 0.2) !important;
622
- border-radius: 8px !important;
623
  }
624
-
625
- /* Loading animation */
626
- @keyframes spin {
627
- 0% { transform: rotate(0deg); }
628
- 100% { transform: rotate(360deg); }
629
  }
630
-
631
- .loading-spinner {
632
- border: 4px solid rgba(168, 85, 247, 0.2);
633
- border-top: 4px solid #a855f7;
634
- border-radius: 50%;
635
- width: 48px;
636
- height: 48px;
637
- animation: spin 1s linear infinite;
638
- margin: 0 auto;
639
  }
 
640
 
641
- /* Footer */
642
- #footer-section {
643
- background: rgba(0, 0, 0, 0.3);
644
- backdrop-filter: blur(20px);
645
- border-radius: 20px;
646
- padding: 24px;
647
- margin-top: 32px;
648
- text-align: center;
649
- border: 1px solid rgba(168, 85, 247, 0.2);
650
- color: #c084fc;
651
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652
 
653
- /* Markdown styling */
654
- .markdown-content h1, .markdown-content h2, .markdown-content h3 {
655
- color: white !important;
656
- }
 
 
 
657
 
658
- .markdown-content p {
659
- color: #e9d5ff !important;
660
- }
 
 
 
 
661
 
662
- /* Scrollbar */
663
- ::-webkit-scrollbar {
664
- width: 10px;
665
- }
666
 
667
- ::-webkit-scrollbar-track {
668
- background: rgba(0, 0, 0, 0.3);
669
- border-radius: 10px;
670
- }
 
 
 
 
671
 
672
- ::-webkit-scrollbar-thumb {
673
- background: linear-gradient(135deg, #7e22ce 0%, #db2777 100%);
674
- border-radius: 10px;
675
- }
676
 
677
- ::-webkit-scrollbar-thumb:hover {
678
- background: linear-gradient(135deg, #6b21a8 0%, #be185d 100%);
679
- }
680
 
681
- /* Responsive adjustments */
682
- @media (max-width: 768px) {
683
- #header-section h1 {
684
- font-size: 1.8rem;
685
- }
686
-
687
- #header-section p {
688
- font-size: 0.95rem;
689
- }
690
-
691
- .feature-card {
692
- padding: 20px;
693
- }
694
- }
695
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
696
 
697
- # Build the Gradio interface with React-inspired design
698
- with gr.Blocks(css=custom_css, theme=gr.themes.Ocean(), title="NVIDIA Nemotron Nano 2 VL") as demo:
699
-
700
- # Hidden state for API key
701
- api_key_state = gr.State("")
702
-
703
- # Header
704
- with gr.Row(elem_id="header-section"):
705
- with gr.Column(scale=8):
706
- gr.Markdown("""
707
- # NVIDIA Nemotron Nano 2 VL
708
- ### 12B Parameter Multimodal Reasoning Model with EVS Video Analysis
709
- Advanced document intelligence, chart analysis, video understanding, and reasoning capabilities
710
- """, elem_classes="markdown-content")
711
- with gr.Column(scale=2):
712
- gr.HTML("""
713
- <div style='text-align: right; padding: 12px 20px; background: rgba(34, 197, 94, 0.2); border-radius: 12px; border: 1px solid rgba(34, 197, 94, 0.5);'>
714
- <b style='color: #86efac; font-size: 0.9rem;'>✓ FREE ACCESS</b>
715
- </div>
716
- """)
717
-
718
- # API Key Section
719
- with gr.Row(elem_id="api-key-container"):
720
- with gr.Column():
721
- gr.Markdown("""
722
- ### 🔐 OpenRouter API Key
723
- Enter your OpenRouter API key to access the NVIDIA Nemotron model. Get yours at [openrouter.ai](https://openrouter.ai)
724
- """, elem_classes="markdown-content")
725
  api_key_input = gr.Textbox(
726
- label="API Key",
727
- placeholder="sk-or-v1-...",
728
  type="password",
729
- scale=4,
730
- elem_classes="api-key-input"
731
- )
732
-
733
- # Tabs for different functionalities
734
- with gr.Tabs(elem_classes="tab-nav"):
735
-
736
- # OCR & Document Intelligence Tab
737
- with gr.Tab("📄 OCR & Document", elem_classes="tab-item"):
738
- with gr.Row():
739
- with gr.Column(scale=1):
740
- gr.Markdown("### 📤 Upload Document")
741
- ocr_image = gr.Image(type="pil", label="Upload Image/Document", height=300)
742
- ocr_text = gr.Textbox(
743
- label="Instructions (Optional)",
744
- placeholder="Describe what you want to extract or analyze...",
745
- lines=4
746
- )
747
- ocr_btn = gr.Button("🔍 Analyze Document", variant="primary", size="lg")
748
-
749
- with gr.Column(scale=1):
750
- gr.Markdown("### 📊 Analysis Result")
751
- ocr_output = gr.Textbox(
752
- label="Response",
753
- lines=15,
754
- elem_classes="output-container",
755
- show_copy_button=True
756
- )
757
- ocr_reasoning = gr.Textbox(
758
- label="Reasoning Details",
759
- lines=5,
760
- elem_classes="reasoning-container",
761
- visible=False
762
- )
763
-
764
- def ocr_wrapper(api_key, image, text):
765
- result = process_request(api_key, "ocr", image1=image, text_input=text)
766
- data = json.loads(result)
767
- if data["success"]:
768
- return data["response"], data["reasoning"] if data["reasoning"] else ""
769
- else:
770
- return f"❌ {data['error']}", ""
771
-
772
- ocr_btn.click(
773
- fn=ocr_wrapper,
774
- inputs=[api_key_input, ocr_image, ocr_text],
775
- outputs=[ocr_output, ocr_reasoning]
776
  )
777
-
778
- # Chart Analysis Tab
779
- with gr.Tab("📊 Chart Analysis", elem_classes="tab-item"):
780
- with gr.Row():
781
- with gr.Column(scale=1):
782
- gr.Markdown("### 📈 Upload Chart/Graph")
783
- chart_image = gr.Image(type="pil", label="Upload Chart", height=300)
784
- chart_question = gr.Textbox(
785
- label="Question (Optional)",
786
- placeholder="What insights do you want from this chart?",
787
- lines=3
788
- )
789
- chart_btn = gr.Button("📈 Analyze Chart", variant="primary", size="lg")
790
-
791
- with gr.Column(scale=1):
792
- gr.Markdown("### 📊 Chart Insights")
793
- chart_output = gr.Textbox(
794
- label="Response",
795
- lines=15,
796
- elem_classes="output-container",
797
- show_copy_button=True
798
- )
799
-
800
- def chart_wrapper(api_key, image, question):
801
- result = process_request(api_key, "chart", image1=image, text_input=question)
802
- data = json.loads(result)
803
- if data["success"]:
804
- return data["response"]
805
- else:
806
- return f"❌ {data['error']}"
807
-
808
- chart_btn.click(
809
- fn=chart_wrapper,
810
- inputs=[api_key_input, chart_image, chart_question],
811
- outputs=[chart_output]
812
- )
813
-
814
- # Video Understanding Tab
815
- with gr.Tab("🎥 Video Understanding", elem_classes="tab-item"):
816
- with gr.Row():
817
- with gr.Column(scale=1):
818
- gr.Markdown("### 🎬 Upload Video")
819
- gr.Markdown("""
820
- **Note**: Full video analysis requires frame extraction and EVS implementation.
821
- Upload video frames as images in the Multi-Image tab for now.
822
- """)
823
- video_input = gr.Video(label="Upload Video")
824
- video_question = gr.Textbox(
825
- label="Question",
826
- placeholder="What would you like to know about this video?",
827
- lines=4
828
- )
829
- video_btn = gr.Button("🎬 Analyze Video", variant="primary", size="lg")
830
-
831
- with gr.Column(scale=1):
832
- gr.Markdown("### 🎥 Video Analysis")
833
- video_output = gr.Textbox(
834
- label="Response",
835
- lines=15,
836
- elem_classes="output-container"
837
- )
838
-
839
- def video_wrapper(api_key, video, question):
840
- return "🎥 **Video Analysis Placeholder**\n\nVideo analysis requires:\n\n1. Frame extraction from video\n2. EVS (Efficient Video Sampling) implementation\n3. Multi-frame context processing\n\nFor now, extract key frames and use the Multi-Image Analysis tab.\n\nFull implementation coming soon!"
841
-
842
- video_btn.click(
843
- fn=video_wrapper,
844
- inputs=[api_key_input, video_input, video_question],
845
- outputs=[video_output]
846
- )
847
-
848
- # Advanced Reasoning Tab
849
- with gr.Tab("🧠 Advanced Reasoning", elem_classes="tab-item"):
850
- with gr.Row():
851
- with gr.Column(scale=1):
852
- gr.Markdown("""
853
- ### 💡 Complex Problem Solving
854
- Ask complex questions and get detailed step-by-step reasoning
855
- """)
856
- reasoning_input = gr.Textbox(
857
- label="Question",
858
- placeholder="Ask a complex reasoning question...\n\nExamples:\n- How many R's are in 'strawberry'?\n- Solve this logic puzzle...\n- Calculate the average speed...",
859
- lines=10
860
- )
861
- reasoning_btn = gr.Button("💡 Start Reasoning", variant="primary", size="lg")
862
-
863
- with gr.Column(scale=1):
864
- gr.Markdown("### 🎯 Answer & Reasoning")
865
- reasoning_output = gr.Textbox(
866
- label="Response",
867
- lines=12,
868
- elem_classes="output-container",
869
- show_copy_button=True
870
- )
871
- reasoning_details = gr.Textbox(
872
- label="🧠 Reasoning Process",
873
- lines=8,
874
- elem_classes="reasoning-container",
875
- show_copy_button=True
876
- )
877
-
878
- def reasoning_wrapper(api_key, question):
879
- result = process_request(api_key, "reasoning", text_input=question, enable_reasoning=True)
880
- data = json.loads(result)
881
- if data["success"]:
882
- reasoning_text = data["reasoning"] if data["reasoning"] else "Reasoning details not available for this response."
883
- return data["response"], reasoning_text
884
- else:
885
- return f"❌ {data['error']}", ""
886
-
887
- reasoning_btn.click(
888
- fn=reasoning_wrapper,
889
- inputs=[api_key_input, reasoning_input],
890
- outputs=[reasoning_output, reasoning_details]
891
- )
892
-
893
- # Multi-Image Analysis Tab
894
- with gr.Tab("🖼️ Multi-Image Analysis", elem_classes="tab-item"):
895
- with gr.Row():
896
- with gr.Column(scale=1):
897
- gr.Markdown("### 🖼️ Upload Multiple Images (1-4)")
898
- with gr.Row():
899
- multi_image1 = gr.Image(type="pil", label="Image 1", height=200)
900
- multi_image2 = gr.Image(type="pil", label="Image 2", height=200)
901
- with gr.Row():
902
- multi_image3 = gr.Image(type="pil", label="Image 3", height=200)
903
- multi_image4 = gr.Image(type="pil", label="Image 4", height=200)
904
- multi_question = gr.Textbox(
905
- label="Question (Optional)",
906
- placeholder="Compare these images, find differences, identify patterns...",
907
- lines=3
908
- )
909
- multi_btn = gr.Button("🔍 Analyze Images", variant="primary", size="lg")
910
-
911
- with gr.Column(scale=1):
912
- gr.Markdown("### 🎨 Multi-Image Insights")
913
- multi_output = gr.Textbox(
914
- label="Response",
915
- lines=20,
916
- elem_classes="output-container",
917
- show_copy_button=True
918
- )
919
-
920
- def multi_wrapper(api_key, img1, img2, img3, img4, question):
921
- result = process_request(
922
- api_key, "multimodal",
923
- image1=img1, image2=img2, image3=img3, image4=img4,
924
- text_input=question
925
- )
926
- data = json.loads(result)
927
- if data["success"]:
928
- return f"🖼️ **Analyzing {data['image_count']} image(s)**\n\n{data['response']}"
929
- else:
930
- return f"❌ {data['error']}"
931
-
932
- multi_btn.click(
933
- fn=multi_wrapper,
934
- inputs=[api_key_input, multi_image1, multi_image2, multi_image3, multi_image4, multi_question],
935
- outputs=[multi_output]
936
  )
937
-
938
- # Features Section
939
- gr.Markdown("## 🚀 Key Features", elem_classes="markdown-content")
940
  with gr.Row():
941
- with gr.Column(elem_classes="feature-card"):
942
- gr.Markdown("""
943
- ### Hybrid Architecture
944
- Transformer-Mamba fusion for efficient processing with higher throughput and lower latency
945
- """)
946
-
947
- with gr.Column(elem_classes="feature-card"):
948
- gr.Markdown("""
949
- ### 📊 74% Benchmark Average
950
- Leading performance across MMMU, MathVista, AI2D, OCRBench, ChartQA, DocVQA, and more
951
- """)
952
-
953
- with gr.Column(elem_classes="feature-card"):
954
- gr.Markdown("""
955
- ### 🎥 EVS Technology
956
- Efficient Video Sampling for long-form video understanding with reduced inference cost
957
- """)
958
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
959
  # Footer
960
- with gr.Row(elem_id="footer-section"):
961
- gr.Markdown("""
962
- Powered by **NVIDIA Nemotron Nano 12B 2 VL** via OpenRouter API | Open-weights model with permissive NVIDIA license
963
-
964
- Built with ❤️ using Gradio | [Documentation](https://docs.nvidia.com) | [Report Issues](https://github.com)
965
- """, elem_classes="markdown-content")
966
 
967
- # Launch the app
968
  if __name__ == "__main__":
969
  demo.launch(
970
- server_name="0.0.0.0",
971
- server_port=7860,
972
- share=True,
973
- show_error=True
 
974
  )
 
1
  import gradio as gr
 
2
  from openai import OpenAI
3
  import base64
4
+ import os
5
+ from typing import List, Tuple, Any, Dict, Optional
6
  from PIL import Image
7
  import io
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
# Custom CSS injected via gr.Blocks(css=...): dark gradient page background,
# glassmorphism (blurred translucent) panels, and cyan accent styling for
# headings, buttons, and inputs. Class names target Gradio's generated DOM
# (.gradio-container, .gr-button, ...) — presumably stable for the installed
# Gradio version; verify after upgrades.
CUSTOM_CSS = """
body {
    background: linear-gradient(135deg, #0f0f23 0%, #1a1a2e 50%, #16213e 100%);
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    color: #e0e0e0;
}
.gradio-container {
    max-width: 1400px !important;
    margin: 0 auto;
    background: rgba(0, 0, 0, 0.1);
    border-radius: 20px;
    box-shadow: 0 20px 40px rgba(0, 0, 0, 0.5);
    overflow: hidden;
}
h1 {
    background: linear-gradient(45deg, #00d4ff, #0099cc);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    text-align: center;
    margin: 0;
    padding: 20px;
    font-size: 2.5em;
    text-shadow: 0 0 20px rgba(0, 212, 255, 0.5);
}
.gr-chatbot {
    background: rgba(255, 255, 255, 0.05);
    border-radius: 15px;
    border: 1px solid rgba(0, 212, 255, 0.2);
    backdrop-filter: blur(10px);
}
.gr-button {
    background: linear-gradient(45deg, #00d4ff, #0099cc);
    border: none;
    border-radius: 10px;
    color: white;
    font-weight: bold;
    transition: all 0.3s ease;
    box-shadow: 0 5px 15px rgba(0, 212, 255, 0.3);
}
.gr-button:hover {
    transform: translateY(-2px);
    box-shadow: 0 8px 25px rgba(0, 212, 255, 0.4);
}
.gr-textbox, .gr-file {
    background: rgba(255, 255, 255, 0.1);
    border: 1px solid rgba(0, 212, 255, 0.3);
    border-radius: 10px;
    color: white;
    backdrop-filter: blur(5px);
}
.gr-textbox::placeholder {
    color: #a0a0a0;
}
.sidebar {
    background: rgba(0, 0, 0, 0.2);
    padding: 20px;
    border-radius: 15px;
    margin: 10px;
    border: 1px solid rgba(0, 212, 255, 0.1);
}
"""
71
 
72
# Helper: turn an on-disk image into the base64 payload the API expects.
def encode_image_to_base64(image_path: str) -> str:
    """Return the contents of the file at *image_path* as a base64 ASCII string."""
    with open(image_path, "rb") as handle:
        raw = handle.read()
    return base64.b64encode(raw).decode("utf-8")
76
+
77
+ # Function to build user content for multimodal input
78
+ def build_user_content(message: str, files: List[str], video_url: str) -> List[Dict[str, Any]]:
79
+ content = [{"type": "text", "text": message}]
80
+ if files:
81
+ for file_path in files:
82
+ if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
83
+ base64_image = encode_image_to_base64(file_path)
84
+ content.append({
85
+ "type": "image_url",
86
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
87
+ })
88
+ # Note: For PDFs, we'd need extraction (e.g., via pdf2image), but skipped for simplicity
89
+ # Users can upload image screenshots of documents
90
+ if video_url and video_url.strip():
91
+ content.append({
92
+ "type": "video_url",
93
+ "video_url": {"url": video_url.strip()}
94
+ })
95
+ return content
96
+
97
# Main response function
def respond_to_query(
    message: str,
    history: List[Tuple[str, str]],
    files: Optional[List[str]],
    video_url: str,
    api_key: str,
    messages_state: List[Dict[str, Any]]
) -> Tuple[List[Tuple[str, str]], str, Optional[List[str]], str, List[Dict[str, Any]], str]:
    """Handle one chat turn against the OpenRouter Nemotron endpoint.

    Returns a 6-tuple wired positionally to the Gradio outputs:
    (chatbot history, cleared message box, cleared file upload, cleared
    video-URL box, updated messages_state, status/error text).

    ``history`` holds display-only (user, assistant) text pairs for the
    Chatbot widget, while ``messages_state`` holds the full OpenAI-format
    message dicts (including reasoning_details) used as multi-turn context.
    """
    # Guard clauses: return early without mutating any state.
    if not api_key or not api_key.strip():
        return history, "", None, "", messages_state, "⚠️ Please enter your OpenRouter API key to start chatting."

    if not message.strip():
        return history, "", None, "", messages_state, "⚠️ Please enter a message."

    # A fresh client per call picks up the key currently in the textbox.
    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=api_key.strip(),
    )

    # Copy current messages state (shallow copy, so the stored list itself
    # is not extended if the API call below fails mid-way).
    current_messages = messages_state.copy() if messages_state else []

    # Add user input: text plus optional base64 images and video URL.
    user_content = build_user_content(message, files or [], video_url)
    current_messages.append({"role": "user", "content": user_content})

    try:
        # API call with reasoning enabled (OpenRouter extra_body passthrough).
        response = client.chat.completions.create(
            model="nvidia/nemotron-nano-12b-v2-vl:free",
            messages=current_messages,
            extra_body={"reasoning": {"enabled": True}}
        )

        resp_message = response.choices[0].message
        content = resp_message.content or "No response generated."

        # Preserve reasoning details for multi-turn continuity
        assistant_msg = {"role": "assistant", "content": content}
        if hasattr(resp_message, 'reasoning_details') and resp_message.reasoning_details:
            assistant_msg["reasoning_details"] = resp_message.reasoning_details

        current_messages.append(assistant_msg)

        # Append to history (text-only for display; attachments noted)
        attachment_note = ""
        if files:
            attachment_note += f" + {len(files)} image(s)"
        if video_url.strip():
            attachment_note += f" + video URL"
        display_message = message + (attachment_note if attachment_note else "")
        display_response = content + ("\n\n*(Reasoning preserved for follow-up)*" if "reasoning_details" in assistant_msg else "")

        history.append((display_message, display_response))

        # Clear inputs: empty strings / None reset the corresponding widgets.
        return history, "", None, "", current_messages, ""

    except Exception as e:
        # NOTE(review): the failed user turn remains in current_messages, so a
        # retry re-sends it and duplicates the context entry — confirm intended.
        error_msg = f"❌ Error: {str(e)}. Check your API key, file sizes (keep images <5MB), or video URL."
        history.append((message, error_msg))
        return history, "", None, "", current_messages, error_msg
160
+
161
# Canned prompts showcasing text, image, document, and video capabilities.
# Each row is (message, files, video_url), matching the gr.Examples inputs.
EXAMPLES = [
    ["How many 'r's are in the word 'strawberry'? Think step by step.", None, ""],
    ["Describe this image in detail and reason about its contents.", None, ""],
    ["Analyze this chart: What trends do you see? Extract key data points.", None, ""],
    ["Read the text in this document image and summarize the main points.", None, ""],
    ["Count the objects in these multiple images and compare them.", None, ""],
    # Placeholder URL; replace with a real, publicly reachable video.
    ["What happens in this video? Summarize the key events.", None, "https://example.com/sample-video.mp4"],
]
194
+
195
# Main Gradio Blocks layout.
# Fixes vs. the previous revision:
#   * gr.themes has no `Dark` theme (would raise AttributeError at startup);
#     Base is used and CUSTOM_CSS supplies the dark look.
#   * gr.Examples(...).style(container=False) — `.style()` was removed from
#     Gradio and gr.Examples never supported it; dropped.
#   * run_on_click examples passed 3 inputs to the 6-parameter handler
#     (TypeError on click); examples now only pre-fill the inputs.
#   * avatar_images pointed at "user_avatar.png", an asset not shipped with
#     the app; default avatars are used instead.
with gr.Blocks(theme=gr.themes.Base(), css=CUSTOM_CSS) as demo:
    gr.HTML("""
    <div style='text-align: center; padding: 10px;'>
        <h1>🚀 Nemotron Nano 2 VL Premium Demo</h1>
        <p style='color: #a0a0a0; font-size: 1.1em;'>Unleash multimodal magic: Text, Images, Documents & Videos | Powered by NVIDIA's Hybrid Transformer-Mamba</p>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # Sidebar: collapsible capability notes plus the API-key field.
            with gr.Accordion("📖 Model Capabilities & Tips", open=False):
                gr.Markdown("""
                **Key Features:**
                - **Text Reasoning:** Chain-of-thought with preserved reasoning.
                - **Image/Document Intelligence:** OCR, chart analysis, multi-image docs (upload screenshots).
                - **Video Understanding:** Enter public video URL (supports long-form with EVS).
                - **Pro Tip:** For documents, upload multiple page images. Keep files small for fast inference.
                - **License:** NVIDIA Open | Free tier via OpenRouter.
                """)

            api_key_input = gr.Textbox(
                label="🔑 OpenRouter API Key",
                placeholder="Enter your API key here (keep secure!)",
                type="password",
                lines=1
            )

        with gr.Column(scale=4):
            # Chat transcript (display only; full context lives in messages_state).
            chatbot = gr.Chatbot(
                height=600,
                show_label=False,
                bubble_full_width=False
            )

            with gr.Row():
                msg_input = gr.Textbox(
                    label="💭 Your Message",
                    placeholder="Ask anything: 'Count the apples' or 'Summarize this video'...",
                    lines=2,
                    scale=3
                )
                file_upload = gr.File(
                    label="🖼️ Attachments (Images for OCR/Charts/Docs)",
                    file_types=["image"],
                    file_count="multiple",
                    scale=1
                )
                video_input = gr.Textbox(
                    label="🎥 Video URL (Optional)",
                    placeholder="e.g., https://example.com/video.mp4",
                    lines=1
                )

            with gr.Row():
                submit_btn = gr.Button("✨ Send & Reason", variant="primary", scale=3)
                clear_btn = gr.Button("🗑️ Clear Chat", scale=1)

    # State for multi-turn messages (OpenAI-format dicts, incl. reasoning).
    messages_state = gr.State([])

    # Event handlers: respond_to_query returns its 6 outputs in this order.
    submit_btn.click(
        fn=respond_to_query,
        inputs=[msg_input, chatbot, file_upload, video_input, api_key_input, messages_state],
        outputs=[chatbot, msg_input, file_upload, video_input, messages_state, msg_input]
    ).then(
        fn=lambda: gr.Info("Message sent! Reasoning active."),
        outputs=[]
    )

    clear_btn.click(
        fn=lambda: ([], "", None, "", [], ""),
        outputs=[chatbot, msg_input, file_upload, video_input, messages_state, msg_input]
    ).then(
        fn=lambda: gr.Info("Chat cleared."),
        outputs=[]
    )

    # Examples pre-fill the inputs; the user then presses Send to run them.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[msg_input, file_upload, video_input],
        label="💡 Quick Starts",
        examples_per_page=6
    )

    # Footer
    gr.Markdown("""
    <div style='text-align: center; padding: 20px; color: #a0a0a0;'>
    Built with ❤️ for creative multimodal exploration | © 2025 Inspired by NVIDIA Nemotron
    </div>
    """)
 
295
 
 
296
# Script entry point: start the Gradio server.
if __name__ == "__main__":
    # share=True requests a public *.gradio.live tunnel for the demo;
    # hosted platforms that serve the app directly simply ignore it.
    demo.launch(share=True)