Spaces · Running on Zero
dung-vpt-uney committed · Commit 0e3c28d
1 Parent(s): 69afdf8
Update Visual-CoT demo - 2025-10-12 23:45:35
Fixes:
- Fix LLaVA config registration error (compatibility with newer transformers)
- Update Gradio to latest version (security fixes)
- Auto-deployed via update script
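For context on the first fix: recent transformers releases register a built-in "llava" model type, so a vendored LLaVA codebase that calls AutoConfig.register("llava", ...) at import time now fails with a ValueError. A minimal sketch of the usual guard, inferred from the commit message alone (the llava import path is an assumption, not taken from this commit):

# Sketch of the registration guard; newer transformers already register
# the "llava" model type, so a second registration raises ValueError.
from transformers import AutoConfig
from llava.model import LlavaConfig  # vendored config class (assumed import path)

try:
    AutoConfig.register("llava", LlavaConfig)
except ValueError:
    pass  # already registered by the installed transformers version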
app.py CHANGED
@@ -64,8 +64,8 @@ AVAILABLE_MODELS = {
     "VisCoT-13B-336 (Best)": "deepcs233/VisCoT-13b-336",
 }
 
-MODEL_PATH = "deepcs233/VisCoT-7b-
-CURRENT_MODEL_NAME = "VisCoT-7B-
+MODEL_PATH = "deepcs233/VisCoT-7b-336"  # Default: balanced quality/speed
+CURRENT_MODEL_NAME = "VisCoT-7B-336 (Balanced)"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Benchmark datasets from Visual Chain-of-Thought Reasoning Benchmarks Collection
@@ -73,38 +73,56 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 BENCHMARK_DATASETS = {
     "Visual-CoT": {
         "path": "deepcs233/Visual-CoT",
+        "config": None,
+        "split": "train",
         "description": "Main Visual-CoT dataset with 438K question-answer pairs",
     },
     "GQA": {
         "path": "lmms-lab/GQA",
-        "
+        "config": "train_balanced_images",
+        "split": "train",
+        "description": "Scene graph question answering (balanced training set)",
     },
     "RefCOCO": {
         "path": "lmms-lab/RefCOCO",
+        "config": None,
+        "split": "train",
         "description": "Referring expression comprehension (17.6K examples)",
     },
     "RefCOCO+": {
         "path": "lmms-lab/RefCOCOplus",
+        "config": None,
+        "split": "train",
         "description": "RefCOCO with no location words (7.58K examples)",
     },
     "RefCOCOg": {
         "path": "lmms-lab/RefCOCOg",
+        "config": None,
+        "split": "train",
         "description": "RefCOCO with longer expressions (12.6K examples)",
     },
     "POPE": {
         "path": "lmms-lab/POPE",
-        "
+        "config": None,
+        "split": "test",
+        "description": "Polling-based Object Probing Evaluation (18K test examples)",
     },
     "ScienceQA": {
         "path": "lmms-lab/ScienceQA",
+        "config": None,
+        "split": "train",
         "description": "Science question answering (12.6K examples)",
     },
     "MM-GCoT": {
         "path": "AQUA6/MM-GCoT",
+        "config": None,
+        "split": "train",
         "description": "Multi-Modal Graph Chain-of-Thought (64.9K examples)",
     },
     "VGR": {
         "path": "BytedanceDouyinContent/VGR",
+        "config": None,
+        "split": "train",
         "description": "Visual Grounding & Reasoning (90K examples)",
     },
 }
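The new "config" and "split" fields above drive the load_dataset call in the next hunk. A quick offline sanity check for a table like this (a sketch, not part of the commit; assumes the datasets library and Hub access):

# Verify each declared Hub config exists before the demo tries to stream it.
from datasets import get_dataset_config_names

for name, info in BENCHMARK_DATASETS.items():
    cfg = info.get("config")
    if cfg is not None:
        available = get_dataset_config_names(info["path"])
        if cfg not in available:
            print(f"{name}: config {cfg!r} not found; available: {available}")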
@@ -201,10 +219,15 @@ def load_benchmark_example(dataset_name, index=0):
         return None, "Dataset not found", "", "", ""
 
     dataset_path = dataset_info["path"]
+    dataset_config = dataset_info.get("config")
+    dataset_split = dataset_info.get("split", "train")
 
-    # Load dataset
-    print(f"Loading {dataset_name} from {dataset_path}...")
-
+    # Load dataset with config and split
+    print(f"Loading {dataset_name} from {dataset_path} (config={dataset_config}, split={dataset_split})...")
+    if dataset_config:
+        dataset = load_dataset(dataset_path, dataset_config, split=dataset_split, streaming=True)
+    else:
+        dataset = load_dataset(dataset_path, split=dataset_split, streaming=True)
 
     # Get specific index (for streaming, we need to iterate)
     for i, example in enumerate(dataset):
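Since the datasets are opened with streaming=True there is no random access: the loop above walks the stream until it reaches the requested index, costing O(index) per lookup. A self-contained sketch of the same pattern (dataset name and index are examples):

# Standalone version of the streaming lookup used above.
from datasets import load_dataset

def nth_streaming_example(path, index, config=None, split="train"):
    """Walk a streaming dataset to entry `index` without downloading the split."""
    ds = (load_dataset(path, config, split=split, streaming=True)
          if config else
          load_dataset(path, split=split, streaming=True))
    for i, example in enumerate(ds):
        if i == index:
            return example
    return None  # index beyond the end of the split

example = nth_streaming_example("lmms-lab/POPE", 5, split="test")

For repeated lookups, the library's IterableDataset.skip(index) achieves the same positioning without the manual loop.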
@@ -506,8 +529,8 @@ def create_demo():
     # Header
     gr.HTML("""
         <div class="header">
-            <h1>🌋 Visual-CoT: Chain-of-Thought Reasoning</h1>
-            <p style="font-size: 18px; margin: 10px 0;">
+            <h1 style="color: white;">🌋 Visual-CoT: Chain-of-Thought Reasoning</h1>
+            <p style="font-size: 18px; margin: 10px 0; color: white;">
                 Advancing Multi-Modal Language Models with Visual Chain-of-Thought
             </p>
             <p style="font-size: 14px; opacity: 0.9;">