Spaces: Running on Zero
dung-vpt-uney committed
Commit · 69afdf8
Parent(s): 31a530c
Update Visual-CoT demo - 2025-10-12 23:40:03
Fixes:
- Fix LLaVA config registration error (compatibility with newer transformers)
- Update Gradio to latest version (security fixes)
- Auto-deployed via update script
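
The first fix refers to the model-type registration clash with newer transformers releases, which bundle their own "llava" config. That fix is not shown in the diff below; as a rough sketch of the usual guard (import path and placement are assumptions, not the Space's actual code):

```python
# Hypothetical guard, not the committed fix: recent transformers versions
# already register the "llava" model type, so re-registering raises ValueError.
from transformers import AutoConfig
from llava.model.language_model.llava_llama import LlavaConfig  # assumed import path

try:
    AutoConfig.register("llava", LlavaConfig)
except ValueError:
    pass  # "llava" already registered by transformers itself
```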
app.py CHANGED

@@ -34,20 +34,7 @@ from llava.mm_utils import (
     get_model_name_from_path,
 )
 
-#
-try:
-    from benchmark_loader import (
-        get_all_dataset_names,
-        load_benchmark_example_for_gradio,
-        get_random_examples_for_gradio,
-        get_dataset_info,
-        get_dataset_stats,
-    )
-    BENCHMARK_LOADER_AVAILABLE = True
-    print("✅ Benchmark loader module imported successfully")
-except ImportError as e:
-    BENCHMARK_LOADER_AVAILABLE = False
-    print(f"⚠️ Benchmark loader not available: {e}")
+# No need for local benchmark loader - using HF datasets directly
 
 # =============================================================================
 # Authentication
@@ -81,16 +68,48 @@ MODEL_PATH = "deepcs233/VisCoT-7b-224"  # Default: smallest/fastest
 CURRENT_MODEL_NAME = "VisCoT-7B-224 (Fastest)"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Benchmark datasets
-
-
-
-
-
-
-
-
-
+# Benchmark datasets from Visual Chain-of-Thought Reasoning Benchmarks Collection
+# https://huggingface.co/collections/tuandunghcmut/visual-chain-of-thought-reasoning-benchmarks
+BENCHMARK_DATASETS = {
+    "Visual-CoT": {
+        "path": "deepcs233/Visual-CoT",
+        "description": "Main Visual-CoT dataset with 438K question-answer pairs",
+    },
+    "GQA": {
+        "path": "lmms-lab/GQA",
+        "description": "Scene graph question answering (24.2M examples)",
+    },
+    "RefCOCO": {
+        "path": "lmms-lab/RefCOCO",
+        "description": "Referring expression comprehension (17.6K examples)",
+    },
+    "RefCOCO+": {
+        "path": "lmms-lab/RefCOCOplus",
+        "description": "RefCOCO with no location words (7.58K examples)",
+    },
+    "RefCOCOg": {
+        "path": "lmms-lab/RefCOCOg",
+        "description": "RefCOCO with longer expressions (12.6K examples)",
+    },
+    "POPE": {
+        "path": "lmms-lab/POPE",
+        "description": "Polling-based Object Probing Evaluation (18K examples)",
+    },
+    "ScienceQA": {
+        "path": "lmms-lab/ScienceQA",
+        "description": "Science question answering (12.6K examples)",
+    },
+    "MM-GCoT": {
+        "path": "AQUA6/MM-GCoT",
+        "description": "Multi-Modal Graph Chain-of-Thought (64.9K examples)",
+    },
+    "VGR": {
+        "path": "BytedanceDouyinContent/VGR",
+        "description": "Visual Grounding & Reasoning (90K examples)",
+    },
+}
+
+print(f"✅ Configured {len(BENCHMARK_DATASETS)} benchmark datasets from HF collection")
 
 # =============================================================================
 # Model Loading (Global - bfloat16)
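Each entry in the table above can be smoke-tested directly with the `datasets` library. A minimal sketch (not part of the commit; it assumes the repo exposes a default config with a `train` split, mirroring what the loader below expects, which may not hold for every dataset listed):

```python
from datasets import load_dataset

# Streaming inspects the schema without downloading full image archives.
ds = load_dataset("deepcs233/Visual-CoT", split="train", streaming=True)
first = next(iter(ds))
print(sorted(first.keys()))  # field names vary per dataset
```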
@@ -173,12 +192,51 @@ def switch_model(model_choice):
 # =============================================================================
 
 def load_benchmark_example(dataset_name, index=0):
-    """Load an example from benchmark dataset
-
-
-
-
-
+    """Load an example from HF benchmark dataset"""
+    try:
+        from datasets import load_dataset
+
+        dataset_info = BENCHMARK_DATASETS.get(dataset_name)
+        if not dataset_info:
+            return None, "Dataset not found", "", "", ""
+
+        dataset_path = dataset_info["path"]
+
+        # Load dataset
+        print(f"Loading {dataset_name} from {dataset_path}...")
+        dataset = load_dataset(dataset_path, split="train", streaming=True)
+
+        # Get specific index (for streaming, we need to iterate)
+        for i, example in enumerate(dataset):
+            if i == index:
+                # Extract fields (structure varies by dataset)
+                image = example.get("image")
+                question = example.get("question", example.get("text", ""))
+
+                # Try to get bounding box in various formats
+                bbox = example.get("bbox", example.get("bboxes", ""))
+                if isinstance(bbox, list) and bbox:
+                    bbox_str = str(bbox)
+                else:
+                    bbox_str = "No bounding box available"
+
+                answer = example.get("answer", example.get("label", ""))
+
+                status = f"📊 Dataset: {dataset_name} | Example {index + 1}\n{dataset_info['description']}"
+
+                return image, question, bbox_str, answer, status
+
+            # Stop after a few iterations for efficiency
+            if i > index + 10:
+                break
+
+        return None, "Index out of range", "", "", "Could not find example at this index"
+
+    except Exception as e:
+        error_msg = f"Error loading {dataset_name}: {str(e)}"
+        print(error_msg)
+        import traceback
+        traceback.print_exc()
         return None, error_msg, "", "", error_msg
 
 # =============================================================================
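The `enumerate` loop above walks a streaming dataset until it reaches the requested index. `itertools.islice` expresses the same skip-ahead more compactly; a sketch of the equivalent (not the committed code, same `train`-split assumption):

```python
from itertools import islice

from datasets import load_dataset

def nth_streaming_example(path: str, index: int):
    """Return the index-th record of a streamed split, or None if the stream is shorter."""
    ds = load_dataset(path, split="train", streaming=True)
    return next(islice(ds, index, index + 1), None)
```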
@@ -405,7 +463,7 @@ def create_demo():
     .header {
         text-align: center;
         padding: 20px;
-        background: linear-gradient(135deg, #
+        background: linear-gradient(135deg, #1e3a8a 0%, #1e40af 100%);
         color: white;
         border-radius: 10px;
         margin-bottom: 20px;
@@ -437,8 +495,8 @@ def create_demo():
 
     with gr.Blocks(
         theme=gr.themes.Soft(
-            primary_hue="
-            secondary_hue="
+            primary_hue="blue",
+            secondary_hue="indigo",
             neutral_hue="slate",
         ),
         css=custom_css,
@@ -604,42 +662,16 @@ def create_demo():
                     visible=False,
                 )
 
-                # Example images
-                gr.Markdown("### 📋 Try These Examples")
-
-
-
-
-
-
-
-
-                        inputs=[image_input, question_input],
-                        label="Click to load random benchmark examples",
-                    )
-                else:
-                    gr.Markdown("*Benchmark examples loading failed. Check if images are available.*")
-            except Exception as e:
-                gr.Markdown(f"*Could not load benchmark examples: {e}*")
-                # Fallback to default examples
-                gr.Examples(
-                    examples=[
-                        ["examples/extreme_ironing.jpg", "What is unusual about this image?"],
-                        ["examples/waterview.jpg", "What are the things I should be cautious about when I visit here?"],
-                    ],
-                    inputs=[image_input, question_input],
-                    label="Click to load example",
-                )
-        else:
-            # Fallback examples when benchmark loader not available
-            gr.Examples(
-                examples=[
-                    ["examples/extreme_ironing.jpg", "What is unusual about this image?"],
-                    ["examples/waterview.jpg", "What are the things I should be cautious about when I visit here?"],
-                ],
-                inputs=[image_input, question_input],
-                label="Click to load example",
-            )
+            # Example images
+            gr.Markdown("### 📋 Try These Examples")
+            gr.Examples(
+                examples=[
+                    ["examples/extreme_ironing.jpg", "What is unusual about this image?"],
+                    ["examples/waterview.jpg", "What are the things I should be cautious about when I visit here?"],
+                ],
+                inputs=[image_input, question_input],
+                label="Click to load example",
+            )
 
             # Event handlers
             submit_btn.click(
@@ -668,9 +700,9 @@ def create_demo():
             with gr.Column(scale=2):
                 dataset_dropdown = gr.Dropdown(
                     choices=list(BENCHMARK_DATASETS.keys()),
-                    value="
+                    value="Visual-CoT",
                     label="Select Benchmark Dataset",
-                    info="Choose from
+                    info="Choose from 9 visual reasoning benchmarks"
                 )
             with gr.Column(scale=1):
                 example_index = gr.Number(
@@ -718,35 +750,16 @@ def create_demo():
                     interactive=False,
                 )
 
-            # Dataset information - dynamically generated
-
-
-
-
-
-
-
-
-
-                total_examples = sum(s.get("total_examples", 0) for s in stats.values() if "error" not in s)
-                dataset_info_md += f"\n**Total:** {total_examples:,} annotated examples across {len(stats)} benchmarks\n"
-                dataset_info_md += "\n**Source:** Local JSONL files from Visual-CoT dataset"
-
-                gr.Markdown(dataset_info_md)
-            else:
-                gr.Markdown("""
-                ---
-
-                ### Dataset Information
-
-                1. **GQA** - Scene graph question answering with compositional reasoning
-                2. **TextVQA** - Questions requiring reading and understanding text in images
-                3. **DocVQA** - Document understanding and information extraction
-                4. **Visual7W** - Visual question answering with pointing and telling tasks
-                5. **Flickr30k** - Image captioning and visual grounding
-
-                **Note:** Benchmark loader module not available.
-                """)
+            # Dataset information - dynamically generated from BENCHMARK_DATASETS
+            dataset_info_md = "---\n\n### Available Benchmark Datasets\n\n"
+            for i, (name, info) in enumerate(BENCHMARK_DATASETS.items(), 1):
+                dataset_info_md += f"{i}. **{name}**: {info['description']}\n"
+                dataset_info_md += f"   - Path: `{info['path']}`\n"
+
+            dataset_info_md += f"\n**Total:** {len(BENCHMARK_DATASETS)} benchmarks from Visual Chain-of-Thought Reasoning Collection\n"
+            dataset_info_md += "\n**Source:** [Hugging Face Collection](https://huggingface.co/collections/tuandunghcmut/visual-chain-of-thought-reasoning-benchmarks)"
+
+            gr.Markdown(dataset_info_md)
 
             # Event handlers
             def load_and_update(dataset_name, index):
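For reference, the loop above renders the first dictionary entry as two markdown lines; an illustrative check (not part of the commit):

```python
# Prints: 1. **Visual-CoT**: Main Visual-CoT dataset with 438K question-answer pairs
#            - Path: `deepcs233/Visual-CoT`
info = BENCHMARK_DATASETS["Visual-CoT"]
print(f"1. **Visual-CoT**: {info['description']}\n   - Path: `{info['path']}`")
```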