Spaces:

tuandunghcmut
/

viscot-demo

Running on Zero

dung-vpt-uney committed on Oct 12

Commit

3564f62

1 Parent(s): 83428d7

Update Visual-CoT demo - 2025-10-12 23:15:20

Fixes:
- Fix LLaVA config registration error (compatibility with newer transformers)
- Update Gradio to latest version (security fixes)
- Auto-deployed via update script

Files changed (1) hide show

app.py +31 -26

app.py CHANGED Viewed

@@ -387,17 +387,18 @@ def create_demo():
         # Introduction
         gr.Markdown("""
-        ## 🎯 What is Visual-CoT?
-        **Visual Chain-of-Thought (VisCoT)** enables AI models to:
-        - 🎯 **Identify important regions** in images using bounding boxes
-        - 💭 **Reason step-by-step** like humans (Chain-of-Thought)
-        - 💡 **Answer questions** about visual content with interpretable explanations
-        ### 📊 Dataset & Model
-        - **438K** Q&A pairs with bounding box annotations
-        - **13 diverse benchmarks** (DocVQA, GQA, TextVQA, etc.)
-        - **LLaVA-1.5 based** architecture with CLIP ViT-L/14
         """)
         # Authentication notice for Zero GPU
@@ -417,11 +418,15 @@ def create_demo():
             # ============================================================
             with gr.Tab("Interactive Demo"):
                 gr.Markdown("""
-                ### Try Visual-CoT with Your Own Images!
-                Upload an image and ask a question. The model will:
-                1. **Detect** the region of interest (ROI) → Output bounding box
-                2. **Analyze** the ROI and full image → Generate answer
                 """)
                 with gr.Row():
@@ -429,23 +434,23 @@ def create_demo():
                         # Input
                         image_input = gr.Image(
                             type="pil",
-                            label="📸 Upload Image",
                             height=400,
                         )
                         question_input = gr.Textbox(
-                            label="❓ Your Question",
                             placeholder="Example: What is unusual about this image?",
                             lines=3,
                         )
-                        with gr.Accordion("⚙️ Advanced Settings", open=False):
                             temperature = gr.Slider(
                                 minimum=0.0,
                                 maximum=1.0,
                                 value=0.2,
                                 step=0.05,
-                                label="🌡️ Temperature",
                                 info="0 = Deterministic, 1 = Creative"
                             )
@@ -454,26 +459,26 @@ def create_demo():
                                 maximum=1024,
                                 value=512,
                                 step=64,
-                                label="📝 Max Output Tokens"
                             )
-                        submit_btn = gr.Button("🚀 Analyze Image", variant="primary", size="lg")
-                        clear_btn = gr.Button("🗑️ Clear", size="sm")
                     with gr.Column(scale=1):
                         # Output
-                        gr.Markdown("### 📤 Results")
                         with gr.Group():
-                            gr.Markdown("#### 🎯 Step 1: Region Detection")
                             bbox_output = gr.Textbox(
-                                label="Detected Bounding Box",
                                 lines=2,
                                 show_copy_button=True,
                             )
                         with gr.Group():
-                            gr.Markdown("#### 💡 Step 2: Answer")
                             answer_output = gr.Textbox(
                                 label="Final Answer",
                                 lines=6,
@@ -481,9 +486,9 @@ def create_demo():
                             )
                         with gr.Group():
-                            gr.Markdown("#### Visualization")
                             image_output = gr.Image(
-                                label="Image with Bounding Box",
                                 type="pil",
                                 height=350,
                             )

         # Introduction
         gr.Markdown("""
+        ## 1. Introduction to Visual-CoT
+        **Visual Chain-of-Thought (VisCoT)** is a multi-modal language model that enables:
+        1. **Region Identification**: Detect key regions in images using bounding boxes
+        2. **Step-by-Step Reasoning**: Apply Chain-of-Thought methodology for visual understanding
+        3. **Question Answering**: Provide interpretable explanations for visual content
+        ### 1.1 Dataset Statistics
+        - 438,000 question-answer pairs with bounding box annotations
+        - 13 diverse benchmarks (DocVQA, GQA, TextVQA, etc.)
+        - Based on LLaVA-1.5 architecture with CLIP ViT-L/14 vision encoder
         """)
         # Authentication notice for Zero GPU
             # ============================================================
             with gr.Tab("Interactive Demo"):
                 gr.Markdown("""
+                ### 2. Interactive Demonstration
+                **Procedure**:
+                1. Upload an image
+                2. Enter a question about the image
+                3. The model will:
+                   - Step 1: Detect region of interest (ROI) and output bounding box
+                   - Step 2: Analyze the ROI and generate answer
                 """)
                 with gr.Row():
                         # Input
                         image_input = gr.Image(
                             type="pil",
+                            label="Input Image",
                             height=400,
                         )
                         question_input = gr.Textbox(
+                            label="Question",
                             placeholder="Example: What is unusual about this image?",
                             lines=3,
                         )
+                        with gr.Accordion("Advanced Parameters", open=False):
                             temperature = gr.Slider(
                                 minimum=0.0,
                                 maximum=1.0,
                                 value=0.2,
                                 step=0.05,
+                                label="Temperature",
                                 info="0 = Deterministic, 1 = Creative"
                             )
                                 maximum=1024,
                                 value=512,
                                 step=64,
+                                label="Maximum Output Tokens"
                             )
+                        submit_btn = gr.Button("Run Analysis", variant="primary", size="lg")
+                        clear_btn = gr.Button("Clear", size="sm")
                     with gr.Column(scale=1):
                         # Output
+                        gr.Markdown("### 3. Results")
                         with gr.Group():
+                            gr.Markdown("#### 3.1 Step 1: Region Detection")
                             bbox_output = gr.Textbox(
+                                label="Detected Bounding Box Coordinates",
                                 lines=2,
                                 show_copy_button=True,
                             )
                         with gr.Group():
+                            gr.Markdown("#### 3.2 Step 2: Answer Generation")
                             answer_output = gr.Textbox(
                                 label="Final Answer",
                                 lines=6,
                             )
                         with gr.Group():
+                            gr.Markdown("#### 3.3 Visualization")
                             image_output = gr.Image(
+                                label="Image with Bounding Box Overlay",
                                 type="pil",
                                 height=350,
                             )