dung-vpt-uney committed on
Commit
3564f62
·
1 Parent(s): 83428d7

Update Visual-CoT demo - 2025-10-12 23:15:20

Browse files

Fixes:
- Fix LLaVA config registration error (compatibility with newer transformers)
- Update Gradio to latest version (security fixes)
- Auto-deployed via update script

Files changed (1) hide show
  1. app.py +31 -26
app.py CHANGED
@@ -387,17 +387,18 @@ def create_demo():
387
 
388
  # Introduction
389
  gr.Markdown("""
390
- ## 🎯 What is Visual-CoT?
391
 
392
- **Visual Chain-of-Thought (VisCoT)** enables AI models to:
393
- - 🎯 **Identify important regions** in images using bounding boxes
394
- - 💭 **Reason step-by-step** like humans (Chain-of-Thought)
395
- - 💡 **Answer questions** about visual content with interpretable explanations
396
 
397
- ### 📊 Dataset & Model
398
- - **438K** Q&A pairs with bounding box annotations
399
- - **13 diverse benchmarks** (DocVQA, GQA, TextVQA, etc.)
400
- - **LLaVA-1.5 based** architecture with CLIP ViT-L/14
 
 
 
 
401
  """)
402
 
403
  # Authentication notice for Zero GPU
@@ -417,11 +418,15 @@ def create_demo():
417
  # ============================================================
418
  with gr.Tab("Interactive Demo"):
419
  gr.Markdown("""
420
- ### Try Visual-CoT with Your Own Images!
 
 
421
 
422
- Upload an image and ask a question. The model will:
423
- 1. **Detect** the region of interest (ROI) → Output bounding box
424
- 2. **Analyze** the ROI and full image → Generate answer
 
 
425
  """)
426
 
427
  with gr.Row():
@@ -429,23 +434,23 @@ def create_demo():
429
  # Input
430
  image_input = gr.Image(
431
  type="pil",
432
- label="📸 Upload Image",
433
  height=400,
434
  )
435
 
436
  question_input = gr.Textbox(
437
- label="❓ Your Question",
438
  placeholder="Example: What is unusual about this image?",
439
  lines=3,
440
  )
441
 
442
- with gr.Accordion("⚙️ Advanced Settings", open=False):
443
  temperature = gr.Slider(
444
  minimum=0.0,
445
  maximum=1.0,
446
  value=0.2,
447
  step=0.05,
448
- label="🌡️ Temperature",
449
  info="0 = Deterministic, 1 = Creative"
450
  )
451
 
@@ -454,26 +459,26 @@ def create_demo():
454
  maximum=1024,
455
  value=512,
456
  step=64,
457
- label="📝 Max Output Tokens"
458
  )
459
 
460
- submit_btn = gr.Button("🚀 Analyze Image", variant="primary", size="lg")
461
- clear_btn = gr.Button("🗑️ Clear", size="sm")
462
 
463
  with gr.Column(scale=1):
464
  # Output
465
- gr.Markdown("### 📤 Results")
466
 
467
  with gr.Group():
468
- gr.Markdown("#### 🎯 Step 1: Region Detection")
469
  bbox_output = gr.Textbox(
470
- label="Detected Bounding Box",
471
  lines=2,
472
  show_copy_button=True,
473
  )
474
 
475
  with gr.Group():
476
- gr.Markdown("#### 💡 Step 2: Answer")
477
  answer_output = gr.Textbox(
478
  label="Final Answer",
479
  lines=6,
@@ -481,9 +486,9 @@ def create_demo():
481
  )
482
 
483
  with gr.Group():
484
- gr.Markdown("#### Visualization")
485
  image_output = gr.Image(
486
- label="Image with Bounding Box",
487
  type="pil",
488
  height=350,
489
  )
 
387
 
388
  # Introduction
389
  gr.Markdown("""
390
+ ## 1. Introduction to Visual-CoT
391
 
392
+ **Visual Chain-of-Thought (VisCoT)** is a multi-modal language model that enables:
 
 
 
393
 
394
+ 1. **Region Identification**: Detect key regions in images using bounding boxes
395
+ 2. **Step-by-Step Reasoning**: Apply Chain-of-Thought methodology for visual understanding
396
+ 3. **Question Answering**: Provide interpretable explanations for visual content
397
+
398
+ ### 1.1 Dataset Statistics
399
+ - 438,000 question-answer pairs with bounding box annotations
400
+ - 13 diverse benchmarks (DocVQA, GQA, TextVQA, etc.)
401
+ - Based on LLaVA-1.5 architecture with CLIP ViT-L/14 vision encoder
402
  """)
403
 
404
  # Authentication notice for Zero GPU
 
418
  # ============================================================
419
  with gr.Tab("Interactive Demo"):
420
  gr.Markdown("""
421
+ ### 2. Interactive Demonstration
422
+
423
+ **Procedure**:
424
 
425
+ 1. Upload an image
426
+ 2. Enter a question about the image
427
+ 3. The model will:
428
+ - Step 1: Detect region of interest (ROI) and output bounding box
429
+ - Step 2: Analyze the ROI and generate answer
430
  """)
431
 
432
  with gr.Row():
 
434
  # Input
435
  image_input = gr.Image(
436
  type="pil",
437
+ label="Input Image",
438
  height=400,
439
  )
440
 
441
  question_input = gr.Textbox(
442
+ label="Question",
443
  placeholder="Example: What is unusual about this image?",
444
  lines=3,
445
  )
446
 
447
+ with gr.Accordion("Advanced Parameters", open=False):
448
  temperature = gr.Slider(
449
  minimum=0.0,
450
  maximum=1.0,
451
  value=0.2,
452
  step=0.05,
453
+ label="Temperature",
454
  info="0 = Deterministic, 1 = Creative"
455
  )
456
 
 
459
  maximum=1024,
460
  value=512,
461
  step=64,
462
+ label="Maximum Output Tokens"
463
  )
464
 
465
+ submit_btn = gr.Button("Run Analysis", variant="primary", size="lg")
466
+ clear_btn = gr.Button("Clear", size="sm")
467
 
468
  with gr.Column(scale=1):
469
  # Output
470
+ gr.Markdown("### 3. Results")
471
 
472
  with gr.Group():
473
+ gr.Markdown("#### 3.1 Step 1: Region Detection")
474
  bbox_output = gr.Textbox(
475
+ label="Detected Bounding Box Coordinates",
476
  lines=2,
477
  show_copy_button=True,
478
  )
479
 
480
  with gr.Group():
481
+ gr.Markdown("#### 3.2 Step 2: Answer Generation")
482
  answer_output = gr.Textbox(
483
  label="Final Answer",
484
  lines=6,
 
486
  )
487
 
488
  with gr.Group():
489
+ gr.Markdown("#### 3.3 Visualization")
490
  image_output = gr.Image(
491
+ label="Image with Bounding Box Overlay",
492
  type="pil",
493
  height=350,
494
  )