Spaces: dung-vpt-uney
Running on Zero

Commit f39b78a (1 parent: ba64608), committed by dung-vpt-uney
Update Visual-CoT demo - 2025-10-12 23:23:25

Fixes:
- Fix LLaVA config registration error (compatibility with newer transformers)
- Update Gradio to latest version (security fixes)
- Auto-deployed via update script
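The registration fix itself is not visible in the hunks below. As a rough idea of the kind of guard that resolves this error, here is a minimal sketch, assuming the Space registers the upstream LLaVA classes by hand (the `llava.model` import path and class names come from the original LLaVA repository, not from this commit):

```python
# Hypothetical sketch (not the commit's actual code): newer transformers releases
# ship their own "llava" model type, so re-registering the original LLaVA classes
# raises ValueError. Catching it keeps the demo working on old and new versions.
from transformers import AutoConfig, AutoModelForCausalLM
from llava.model.language_model.llava_llama import LlavaConfig, LlavaLlamaForCausalLM

try:
    AutoConfig.register("llava", LlavaConfig)
    AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM)
except ValueError:
    # "llava" is already registered (built into newer transformers); safe to skip.
    pass
```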
app.py CHANGED

@@ -162,7 +162,7 @@ def load_benchmark_example(dataset_name, index=0):
 
     dataset_path = BENCHMARK_DATASETS.get(dataset_name)
    if not dataset_path:
-        return None, "Dataset not found", "", ""
+        return None, "Dataset not found", "", "", ""
 
     # Load dataset
     dataset = load_dataset(dataset_path, split="train")
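The extra empty string matches the five output components wired up later in this commit: in Gradio, every return path of a handler must yield exactly one value per component listed in `outputs`. A minimal, self-contained illustration of that contract (the component and function names below are invented for the example, not taken from app.py):

```python
import gradio as gr

def load_example(dataset_name, index):
    # Error path and success path both return 5 values: one per output component.
    if dataset_name not in ("GQA", "TextVQA"):   # hypothetical dataset registry
        return None, "Dataset not found", "", "", ""
    return None, f"Question #{int(index)}", "[0, 0, 1, 1]", "Answer", "Loaded"

with gr.Blocks() as demo:
    name = gr.Dropdown(["GQA", "TextVQA"], value="GQA", label="Dataset")
    idx = gr.Number(value=0, precision=0, label="Example Index")
    image = gr.Image(type="pil", label="Image")
    question = gr.Textbox(label="Question")
    bbox = gr.Textbox(label="Ground Truth Bounding Box")
    answer = gr.Textbox(label="Ground Truth Answer")
    status = gr.Textbox(label="Status")
    gr.Button("Load Example").click(
        fn=load_example,
        inputs=[name, idx],
        outputs=[image, question, bbox, answer, status],
    )
```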
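For context on the explorer added in the next hunk: it relies on a module-level `BENCHMARK_DATASETS` mapping and the `datasets` call shown in the first hunk. Neither the mapping's contents nor the example field names appear in this diff, so the repo IDs below are placeholders that only illustrate the loading pattern:

```python
from datasets import load_dataset

# Placeholder repo IDs -- the real mapping lives in app.py and is not shown in this diff.
BENCHMARK_DATASETS = {
    "GQA": "your-org/viscot-gqa",
    "TextVQA": "your-org/viscot-textvqa",
}

def peek_example(dataset_name, index=0):
    dataset_path = BENCHMARK_DATASETS.get(dataset_name)
    if not dataset_path:
        return None
    # Same call as in the first hunk; returns a datasets.Dataset indexable by row.
    dataset = load_dataset(dataset_path, split="train")
    return dataset[int(index)]
```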
@@ -640,45 +640,117 @@ def create_demo():
         gr.Markdown("""
         ### Explore Visual-CoT Benchmark Examples
 
+        Load and browse real examples from the Visual-CoT benchmark datasets.
+        Each example includes: image, question, ground-truth bounding box, and answer.
         """)
 
         with gr.Row():
+            with gr.Column(scale=2):
+                dataset_dropdown = gr.Dropdown(
+                    choices=list(BENCHMARK_DATASETS.keys()),
+                    value="GQA",
+                    label="Select Benchmark Dataset",
+                    info="Choose from 5 core benchmarks"
+                )
+            with gr.Column(scale=1):
+                example_index = gr.Number(
+                    value=0,
+                    label="Example Index",
+                    precision=0,
+                    minimum=0,
+                )
 
+        with gr.Row():
+            load_btn = gr.Button("Load Example", variant="primary")
+            prev_btn = gr.Button("◀ Previous")
+            next_btn = gr.Button("Next ▶")
+
+        benchmark_status = gr.Textbox(
+            label="Status",
+            value="Select a dataset and click 'Load Example'",
+            interactive=False,
         )
 
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("#### Image")
+                benchmark_image = gr.Image(
+                    label="Input Image",
+                    type="pil",
+                    height=400,
+                )
+
+            with gr.Column():
+                gr.Markdown("#### Annotations")
+                benchmark_question = gr.Textbox(
+                    label="Question",
+                    lines=2,
+                    interactive=False,
+                )
+                benchmark_bbox = gr.Textbox(
+                    label="Ground Truth Bounding Box",
+                    lines=1,
+                    interactive=False,
+                )
+                benchmark_answer = gr.Textbox(
+                    label="Ground Truth Answer",
+                    lines=3,
+                    interactive=False,
+                )
+
+        gr.Markdown("""
+        ---
+
+        ### Dataset Information
+
+        1. **GQA** - Scene graph question answering with compositional reasoning
+        2. **TextVQA** - Questions requiring reading and understanding text in images
+        3. **DocVQA** - Document understanding and information extraction
+        4. **Visual7W** - Visual question answering with pointing and telling tasks
+        5. **Flickr30k** - Image captioning and visual grounding
+
+        **Note:** Examples are loaded directly from the [Visual-CoT Hugging Face Collection](https://huggingface.co/collections/tuandunghcmut/visual-chain-of-thought-reasoning-benchmarks-68e25b22c3c095c6f87baba0).
         """)
 
+        # Event handlers
+        def load_and_update(dataset_name, index):
+            result = load_benchmark_example(dataset_name, int(index))
+            if len(result) == 5:
+                return result
+            else:
+                # Error case
+                return None, result, "", "", ""
+
+        def increment_index(current_index):
+            return int(current_index) + 1
+
+        def decrement_index(current_index):
+            return max(0, int(current_index) - 1)
+
+        load_btn.click(
+            fn=load_and_update,
+            inputs=[dataset_dropdown, example_index],
+            outputs=[benchmark_image, benchmark_question, benchmark_bbox, benchmark_answer, benchmark_status],
+        )
+
+        next_btn.click(
+            fn=increment_index,
+            inputs=[example_index],
+            outputs=[example_index],
+        ).then(
+            fn=load_and_update,
+            inputs=[dataset_dropdown, example_index],
+            outputs=[benchmark_image, benchmark_question, benchmark_bbox, benchmark_answer, benchmark_status],
+        )
+
+        prev_btn.click(
+            fn=decrement_index,
+            inputs=[example_index],
+            outputs=[example_index],
+        ).then(
+            fn=load_and_update,
+            inputs=[dataset_dropdown, example_index],
+            outputs=[benchmark_image, benchmark_question, benchmark_bbox, benchmark_answer, benchmark_status],
         )
 
         # ============================================================
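The Next/Previous wiring above relies on Gradio event chaining: the first event writes the new index back into the `example_index` component, and `.then()` runs a second event afterwards, which therefore reads the already-updated index. A stripped-down sketch of the same pattern (component and function names here are illustrative, not from app.py):

```python
import gradio as gr

def bump(index):
    # Step 1: advance the index; the returned value updates the Number component.
    return int(index) + 1

def load(index):
    # Step 2: runs only after step 1 finishes, so it sees the incremented index.
    return f"Loaded example {int(index)}"

with gr.Blocks() as demo:
    idx = gr.Number(value=0, precision=0, label="Example Index")
    status = gr.Textbox(label="Status")
    next_btn = gr.Button("Next ▶")
    next_btn.click(fn=bump, inputs=idx, outputs=idx).then(fn=load, inputs=idx, outputs=status)
```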
@@ -704,31 +776,137 @@ def create_demo():
 
 ## Model Architecture
 
-Visual-CoT Pipeline:
+### Components
 
+1. **Vision Encoder**: CLIP ViT-L/14
+   - Input resolution: 224px or 336px
+   - Output: 577 visual tokens (336px) or 196 tokens (224px)
+   - Feature dimension: 1024
+
+2. **Multi-modal Projector**: 2-layer MLP with GELU
+   - Maps vision features (1024D) to LLM embedding space (4096D)
+   - Trainable parameters: ~8.4M
+
+3. **Language Model**: Vicuna v1.5 (instruction-tuned LLaMA)
+   - Variants: 7B or 13B parameters
+   - Context length: 2048 tokens
+   - Base: LLaMA architecture
+
+### Multi-Turn Processing Pipeline
+
+```
+Image + Question
 ↓
+[Turn 1] ROI Detection
+  → Outputs: Bounding box coordinates [x1, y1, x2, y2]
+  → Purpose: Identify key regions for reasoning
 ↓
+[Turn 2] Question Answering
+  → Input: Image + Question + Detected bbox
+  → Output: Final answer grounded in visual evidence
 ```
 
 ---
 
+## Training Strategy
+
+### Stage 1: Feature Alignment (Pretrain)
+
+- **Dataset**: 558K LAION-CC-SBU subset with BLIP captions
+- **Objective**: Connect frozen CLIP encoder to frozen LLM
+- **Trainable**: Only the MLP projector (~8.4M params)
+- **Duration**: 3.5 hours (7B) to 5.5 hours (13B) on 8×A100 GPUs
+- **Hyperparameters**:
+  - Batch size: 256
+  - Learning rate: 1e-3
+  - Epochs: 1
+  - Max sequence length: 2048
+
+### Stage 2: Visual Instruction Tuning
+
+- **Dataset Mix**:
+  - 665K multimodal instruction-following (LLaVA-1.5)
+  - 1.4M positional annotation data (Shikra)
+  - 373K Visual-CoT data (ours)
+  - **Total**: ~2.4M training instances
+
+- **Training Details**:
+  - Duration: ~60 hours (7B-224) on 8×A100 GPUs
+  - Batch size: 128
+  - Learning rate: 2e-5 (backbone), 2e-6 (vision encoder)
+  - Epochs: 1
+  - DeepSpeed ZeRO-3 for memory efficiency
+
+---
+
+## Dataset Construction
+
+### Visual-CoT Dataset (438K examples)
+
+**13 Diverse Benchmarks:**
+
+1. **Document Understanding** (4 datasets):
+   - DocVQA: Document visual QA
+   - InfographicsVQA: Infographic comprehension
+   - DUDE: Document understanding
+   - SROIE: Scanned receipt information extraction
+
+2. **Scene Understanding** (3 datasets):
+   - GQA: Scene graph compositional reasoning
+   - Visual7W: Pointing and telling tasks
+   - VSR: Visual spatial reasoning
+
+3. **Text in Images** (2 datasets):
+   - TextVQA: Reading text in natural images
+   - OCR-VQA: OCR-based question answering
+
+4. **General VQA** (2 datasets):
+   - Visual Genome: Dense annotations
+   - COCO: Common objects in context
+
+5. **Specialized** (2 datasets):
+   - CUB: Fine-grained bird classification
+   - Flickr30k: Image captioning & grounding
+
+**Annotation Details:**
+- Each example includes: image, question, answer, bounding box
+- Bounding boxes highlight key regions essential for reasoning
+- 98K examples have detailed reasoning steps
+- Train/val splits maintained from original benchmarks
+
+---
+
+## Evaluation & Results
+
+### Visual-CoT Benchmark Metrics
+
+1. **Answer Accuracy**: GPT-3.5-based evaluation
+   - Compares generated answer with ground truth
+   - Accounts for semantic equivalence
+   - Results: 82.7% average accuracy
+
+2. **Detection Accuracy**: IoU-based bounding box evaluation
+   - IoU > 0.5 threshold for correct detection
+   - Results: 75.3% detection accuracy
+   - Validates spatial grounding ability
+
+3. **Reasoning Quality**: Chain-of-thought coherence
+   - Multi-turn consistency
+   - Interpretability of intermediate steps
+
+### Model Comparison
+
+| Model | Resolution | Params | Answer Acc | Detection Acc |
+|-------|-----------|---------|-----------|---------------|
+| VisCoT-7B-224 | 224px | 7B | 80.1% | 72.5% |
+| VisCoT-7B-336 | 336px | 7B | 81.8% | 74.2% |
+| VisCoT-13B-224 | 224px | 13B | 81.5% | 73.8% |
+| VisCoT-13B-336 | 336px | 13B | 82.7% | 75.3% |
 
-- **Resolutions**: 224px and 336px
+**Trade-offs:**
+- Higher resolution → Better detail recognition, slower inference
+- Larger model → Better reasoning, more memory
+- 336px + 13B = Best quality but highest compute cost
 
 ---
 
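The Detection Accuracy metric described above counts a predicted box as correct when its IoU with the ground-truth box exceeds 0.5. A small illustrative implementation of that criterion (not the project's evaluation script; boxes are assumed to be `[x1, y1, x2, y2]`):

```python
# Intersection-over-Union and the IoU > 0.5 hit criterion, as a standalone sketch.
def iou(box_a, box_b):
    ax1, ay1, ax2, ay2 = box_a
    bx1, by1, bx2, by2 = box_b
    ix1, iy1 = max(ax1, bx1), max(ay1, by1)
    ix2, iy2 = min(ax2, bx2), min(ay2, by2)
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
    return inter / union if union > 0 else 0.0

def detection_accuracy(pred_boxes, gt_boxes, threshold=0.5):
    # Fraction of examples whose predicted box overlaps the ground truth enough.
    hits = sum(iou(p, g) > threshold for p, g in zip(pred_boxes, gt_boxes))
    return hits / len(gt_boxes)

print(detection_accuracy([[0, 0, 10, 10]], [[1, 1, 9, 9]]))  # 1.0 (IoU is 0.64)
```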