Spaces: Running on Zero
dung-vpt-uney committed
Commit · 6d0eaf9
1 Parent(s): 3461177
Update Visual-CoT demo - 2025-10-12 22:53:04
Fixes:
- Fix LLaVA config registration error (compatibility with newer transformers; see the sketch after this list)
- Update Gradio to latest version (security fixes)
- Auto-deployed via update script
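The first fix refers to the common clash where newer transformers releases already ship a built-in "llava" model type, so the LLaVA repo's own `AutoConfig.register("llava", ...)` call raises a ValueError at import time. The commit message only names the fix; the snippet below is a hedged sketch of the usual guard, with module paths taken from the upstream LLaVA codebase rather than from this Space's code.

```python
# Sketch of the usual workaround: only register the custom LLaVA classes if the installed
# transformers version has not already claimed the "llava" model type.
# Module paths follow the upstream LLaVA repo (llava/model/language_model/llava_llama.py);
# the Space's actual fix may differ.
from transformers import AutoConfig, AutoModelForCausalLM
from llava.model.language_model.llava_llama import LlavaConfig, LlavaLlamaForCausalLM

try:
    AutoConfig.register("llava", LlavaConfig)
    AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM)
except ValueError:
    # "llava" is already registered by this transformers version; keep the existing mapping.
    pass
```

An alternative with the same effect is pinning transformers below the release that introduced the built-in llava model type.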
app.py CHANGED
@@ -70,36 +70,35 @@ BENCHMARK_DATASETS = [
     "cub",
 ]

-# Global model variables (lazy loading)
-tokenizer, model, image_processor, context_len = None, None, None, None
-
 # =============================================================================
-# Model Loading (
+# Model Loading (Global - bfloat16)
 # =============================================================================

-… (23 removed lines not shown)
+print("🔄 Loading Visual-CoT model in bfloat16...")
+disable_torch_init()
+
+model_name = get_model_name_from_path(MODEL_PATH)
+
+# Load model globally with bfloat16 precision
+tokenizer, model, image_processor, context_len = load_pretrained_model(
+    MODEL_PATH,
+    None,
+    model_name,
+    load_8bit=False,
+    load_4bit=False,
+    device=DEVICE,
+)
+
+# Ensure model is in bfloat16
+if DEVICE == "cuda":
+    model = model.to(dtype=torch.bfloat16)
+    print(f"✓ Model loaded in bfloat16 on {DEVICE}")
+else:
+    print(f"✓ Model loaded on {DEVICE} (CPU mode)")
+
+print(f"✓ Model: {model_name}")
+print(f"✓ Context length: {context_len}")
+print(f"✓ Device: {DEVICE}")


 # =============================================================================
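The new global-loading block above calls disable_torch_init, get_model_name_from_path, and load_pretrained_model, and reads the module-level constants MODEL_PATH and DEVICE, none of which appear in this hunk. A minimal sketch of the context it presumably assumes, following the upstream LLaVA package layout (the model path is a placeholder, not the Space's actual value):

```python
# Assumed context for the global-loading hunk above; not shown in this diff, so this is a
# best-effort sketch based on the upstream LLaVA package layout rather than the Space's file.
import torch

from llava.utils import disable_torch_init
from llava.mm_utils import get_model_name_from_path
from llava.model.builder import load_pretrained_model

MODEL_PATH = "path/to/viscot-checkpoint"                 # placeholder; the real value is defined earlier in app.py
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # plausible definition of the DEVICE constant
```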
@@ -217,9 +216,7 @@ def generate_viscot_response(image, question, temperature=0.2, max_tokens=512):
         return "❌ Please enter a question!", "", None, ""

     try:
-        #
-        tokenizer, model, image_processor, context_len = load_model_once()
-
+        # Model is already loaded globally - use it directly
         # Initialize conversation
         conv_mode = "llava_v1"
         conv = conv_templates[conv_mode].copy()
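The hunk above shows only the start of the conversation setup; the rest of generate_viscot_response is unchanged and not displayed. For orientation, this is how a conv_templates["llava_v1"] conversation is typically turned into a prompt in LLaVA-based demos (a sketch with placeholder values, not the Space's actual code):

```python
# Typical LLaVA prompt construction from a conversation template (sketch only).
from llava.constants import DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates

question = "Where is the bird in this image?"  # placeholder user question
conv = conv_templates["llava_v1"].copy()
conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + question)  # user turn, image token first
conv.append_message(conv.roles[1], None)       # empty assistant turn to be generated
prompt = conv.get_prompt()                     # Vicuna-style prompt string fed to the tokenizer
```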
@@ -593,26 +590,26 @@ def create_demo():

 ```
 ┌─────────────────────────────────────┐
-│ Visual-CoT Pipeline
+│         Visual-CoT Pipeline         │
 ├─────────────────────────────────────┤
 │                                     │
 │  📸 Image Input                     │
 │         ↓                           │
-│  🔍 CLIP ViT-L/14 (Vision Encoder)
+│  🔍 CLIP ViT-L/14 (Vision Encoder)  │
 │         ↓                           │
 │  🔗 MLP Projector (2-layer)         │
-│         ↓
+│         ↓                           │
 │  🧠 LLaMA/Vicuna (Language Model)   │
-│         ↓
+│         ↓                           │
 │  ┌──────────────┐                   │
 │  │ Step 1: ROI  │ → Bounding Box    │
 │  └──────────────┘                   │
-│         ↓
+│         ↓                           │
 │  ┌──────────────┐                   │
 │  │ Step 2: QA   │ → Final Answer    │
 │  └──────────────┘                   │
-│
-
+│                                     │
+└─────────────────────────────────────┘
 ```

 ---
|