Spaces: Running on Zero
dung-vpt-uney committed
Commit · 6d0eaf9
1 Parent(s): 3461177
Update Visual-CoT demo - 2025-10-12 22:53:04
Fixes:
- Fix LLaVA config registration error (compatibility with newer transformers; see the sketch after this list)
- Update Gradio to latest version (security fixes)
- Auto-deployed via update script
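The first fix refers to the common clash where newer transformers releases already ship a built-in "llava" model type, so the LLaVA repo's own `AutoConfig.register("llava", ...)` call raises a ValueError at import time. The commit message only names the fix; the snippet below is a hedged sketch of the usual guard, with module paths taken from the upstream LLaVA codebase rather than from this Space's code.

```python
# Sketch of the usual workaround: only register the custom LLaVA classes if the installed
# transformers version has not already claimed the "llava" model type.
# Module paths follow the upstream LLaVA repo (llava/model/language_model/llava_llama.py);
# the Space's actual fix may differ.
from transformers import AutoConfig, AutoModelForCausalLM
from llava.model.language_model.llava_llama import LlavaConfig, LlavaLlamaForCausalLM

try:
    AutoConfig.register("llava", LlavaConfig)
    AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM)
except ValueError:
    # "llava" is already registered by this transformers version; keep the existing mapping.
    pass
```

An alternative with the same effect is pinning transformers below the release that introduced the built-in llava model type.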
app.py CHANGED
@@ -70,36 +70,35 @@ BENCHMARK_DATASETS = [
     "cub",
 ]

-# Global model variables (lazy loading)
-tokenizer, model, image_processor, context_len = None, None, None, None
-
 # =============================================================================
-# Model Loading (
+# Model Loading (Global - bfloat16)
 # =============================================================================

-… (23 removed lines not shown)
+print("🔄 Loading Visual-CoT model in bfloat16...")
+disable_torch_init()
+
+model_name = get_model_name_from_path(MODEL_PATH)
+
+# Load model globally with bfloat16 precision
+tokenizer, model, image_processor, context_len = load_pretrained_model(
+    MODEL_PATH,
+    None,
+    model_name,
+    load_8bit=False,
+    load_4bit=False,
+    device=DEVICE,
+)
+
+# Ensure model is in bfloat16
+if DEVICE == "cuda":
+    model = model.to(dtype=torch.bfloat16)
+    print(f"✓ Model loaded in bfloat16 on {DEVICE}")
+else:
+    print(f"✓ Model loaded on {DEVICE} (CPU mode)")
+
+print(f"✓ Model: {model_name}")
+print(f"✓ Context length: {context_len}")
+print(f"✓ Device: {DEVICE}")


 # =============================================================================
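The new global-loading block above calls disable_torch_init, get_model_name_from_path, and load_pretrained_model, and reads the module-level constants MODEL_PATH and DEVICE, none of which appear in this hunk. A minimal sketch of the context it presumably assumes, following the upstream LLaVA package layout (the model path is a placeholder, not the Space's actual value):

```python
# Assumed context for the global-loading hunk above; not shown in this diff, so this is a
# best-effort sketch based on the upstream LLaVA package layout rather than the Space's file.
import torch

from llava.utils import disable_torch_init
from llava.mm_utils import get_model_name_from_path
from llava.model.builder import load_pretrained_model

MODEL_PATH = "path/to/viscot-checkpoint"                 # placeholder; the real value is defined earlier in app.py
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # plausible definition of the DEVICE constant
```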
@@ -217,9 +216,7 @@ def generate_viscot_response(image, question, temperature=0.2, max_tokens=512):
         return "❌ Please enter a question!", "", None, ""

     try:
-        #
-        tokenizer, model, image_processor, context_len = load_model_once()
-
+        # Model is already loaded globally - use it directly
         # Initialize conversation
         conv_mode = "llava_v1"
         conv = conv_templates[conv_mode].copy()
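The hunk above shows only the start of the conversation setup; the rest of generate_viscot_response is unchanged and not displayed. For orientation, this is how a conv_templates["llava_v1"] conversation is typically turned into a prompt in LLaVA-based demos (a sketch with placeholder values, not the Space's actual code):

```python
# Typical LLaVA prompt construction from a conversation template (sketch only).
from llava.constants import DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates

question = "Where is the bird in this image?"  # placeholder user question
conv = conv_templates["llava_v1"].copy()
conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + question)  # user turn, image token first
conv.append_message(conv.roles[1], None)       # empty assistant turn to be generated
prompt = conv.get_prompt()                     # Vicuna-style prompt string fed to the tokenizer
```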
@@ -593,26 +590,26 @@ def create_demo():

 ```
 ┌─────────────────────────────────────┐
-│ Visual-CoT Pipeline
+│         Visual-CoT Pipeline         │
 ├─────────────────────────────────────┤
 │                                     │
 │  📸 Image Input                     │
 │         ↓                           │
-│  🔍 CLIP ViT-L/14 (Vision Encoder)
+│  🔍 CLIP ViT-L/14 (Vision Encoder)  │
 │         ↓                           │
 │  🔗 MLP Projector (2-layer)         │
-│         ↓
+│         ↓                           │
 │  🧠 LLaMA/Vicuna (Language Model)   │
-│         ↓
+│         ↓                           │
 │  ┌──────────────┐                   │
 │  │ Step 1: ROI  │ → Bounding Box    │
 │  └──────────────┘                   │
-│         ↓
+│         ↓                           │
 │  ┌──────────────┐                   │
 │  │ Step 2: QA   │ → Final Answer    │
 │  └──────────────┘                   │
-│
-
+│                                     │
+└─────────────────────────────────────┘
 ```

 ---
|