dung-vpt-uney committed on 2025-10-12
Commit 0e3c28d · 1 Parent(s): 69afdf8

Update Visual-CoT demo - 2025-10-12 23:45:35


Fixes:
- Fix LLaVA config registration error (compatibility with newer transformers; see the sketch below)
- Update Gradio to latest version (security fixes)
- Auto-deployed via update script
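
The LLaVA registration fix itself is not in the hunks shown below, so here is a minimal sketch of the usual guard, assuming the repo-local LlavaConfig import path from the LLaVA codebase that VisCoT builds on: recent transformers releases bundle a built-in "llava" model type, so an unconditional AutoConfig.register() raises a ValueError.

from transformers import AutoConfig
# Assumed import path; the LLaVA repo defines its own LlavaConfig class.
from llava.model.language_model.llava_llama import LlavaConfig

try:
    AutoConfig.register("llava", LlavaConfig)
except ValueError:
    # Newer transformers already registers the "llava" model type;
    # keep the built-in registration instead of failing at import time.
    pass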

Files changed (1): app.py +32 −9
app.py CHANGED
@@ -64,8 +64,8 @@ AVAILABLE_MODELS = {
     "VisCoT-13B-336 (Best)": "deepcs233/VisCoT-13b-336",
 }
 
-MODEL_PATH = "deepcs233/VisCoT-7b-224"  # Default: smallest/fastest
-CURRENT_MODEL_NAME = "VisCoT-7B-224 (Fastest)"
+MODEL_PATH = "deepcs233/VisCoT-7b-336"  # Default: balanced quality/speed
+CURRENT_MODEL_NAME = "VisCoT-7B-336 (Balanced)"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Benchmark datasets from Visual Chain-of-Thought Reasoning Benchmarks Collection
@@ -73,38 +73,56 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 BENCHMARK_DATASETS = {
     "Visual-CoT": {
         "path": "deepcs233/Visual-CoT",
+        "config": None,
+        "split": "train",
         "description": "Main Visual-CoT dataset with 438K question-answer pairs",
     },
     "GQA": {
         "path": "lmms-lab/GQA",
-        "description": "Scene graph question answering (24.2M examples)",
+        "config": "train_balanced_images",
+        "split": "train",
+        "description": "Scene graph question answering (balanced training set)",
     },
     "RefCOCO": {
         "path": "lmms-lab/RefCOCO",
+        "config": None,
+        "split": "train",
         "description": "Referring expression comprehension (17.6K examples)",
     },
     "RefCOCO+": {
         "path": "lmms-lab/RefCOCOplus",
+        "config": None,
+        "split": "train",
         "description": "RefCOCO with no location words (7.58K examples)",
     },
     "RefCOCOg": {
         "path": "lmms-lab/RefCOCOg",
+        "config": None,
+        "split": "train",
         "description": "RefCOCO with longer expressions (12.6K examples)",
     },
     "POPE": {
         "path": "lmms-lab/POPE",
-        "description": "Polling-based Object Probing Evaluation (18K examples)",
+        "config": None,
+        "split": "test",
+        "description": "Polling-based Object Probing Evaluation (18K test examples)",
     },
     "ScienceQA": {
         "path": "lmms-lab/ScienceQA",
+        "config": None,
+        "split": "train",
         "description": "Science question answering (12.6K examples)",
     },
     "MM-GCoT": {
         "path": "AQUA6/MM-GCoT",
+        "config": None,
+        "split": "train",
         "description": "Multi-Modal Graph Chain-of-Thought (64.9K examples)",
     },
     "VGR": {
         "path": "BytedanceDouyinContent/VGR",
+        "config": None,
+        "split": "train",
         "description": "Visual Grounding & Reasoning (90K examples)",
     },
 }
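
GQA is the only entry that needs a named config: lmms-lab/GQA publishes several configurations on the Hub, which is presumably why the old unqualified load_dataset call failed for it. A quick way to check what a dataset exposes before wiring it into the table above:

from datasets import get_dataset_config_names

# Lists the configurations a Hub dataset offers, e.g. GQA's balanced/unbalanced variants.
print(get_dataset_config_names("lmms-lab/GQA"))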
@@ -201,10 +219,15 @@ def load_benchmark_example(dataset_name, index=0):
         return None, "Dataset not found", "", "", ""
 
     dataset_path = dataset_info["path"]
+    dataset_config = dataset_info.get("config")
+    dataset_split = dataset_info.get("split", "train")
 
-    # Load dataset
-    print(f"Loading {dataset_name} from {dataset_path}...")
-    dataset = load_dataset(dataset_path, split="train", streaming=True)
+    # Load dataset with config and split
+    print(f"Loading {dataset_name} from {dataset_path} (config={dataset_config}, split={dataset_split})...")
+    if dataset_config:
+        dataset = load_dataset(dataset_path, dataset_config, split=dataset_split, streaming=True)
+    else:
+        dataset = load_dataset(dataset_path, split=dataset_split, streaming=True)
 
     # Get specific index (for streaming, we need to iterate)
     for i, example in enumerate(dataset):
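
The streaming path can be exercised standalone; a minimal sketch mirroring the POPE entry above, with itertools.islice standing in for the manual enumerate loop to fetch one record at a given index without downloading the whole split:

from itertools import islice
from datasets import load_dataset

# Stream the POPE test split and pull the example at index 5.
ds = load_dataset("lmms-lab/POPE", split="test", streaming=True)
example = next(islice(ds, 5, 6))
print(sorted(example))

Since load_dataset's second positional argument (name) defaults to None, the new if/else could also collapse into a single load_dataset(dataset_path, dataset_config, split=dataset_split, streaming=True) call.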
@@ -506,8 +529,8 @@ def create_demo():
     # Header
     gr.HTML("""
         <div class="header">
-            <h1>🌋 Visual-CoT: Chain-of-Thought Reasoning</h1>
-            <p style="font-size: 18px; margin: 10px 0;">
+            <h1 style="color: white;">🌋 Visual-CoT: Chain-of-Thought Reasoning</h1>
+            <p style="font-size: 18px; margin: 10px 0; color: white;">
                 Advancing Multi-Modal Language Models with Visual Chain-of-Thought
             </p>
             <p style="font-size: 14px; opacity: 0.9;">
 