dung-vpt-uney committed
Commit 6d0eaf9 · Parent: 3461177

Update Visual-CoT demo - 2025-10-12 22:53:04

Fixes:
- Fix LLaVA config registration error (compatibility with newer transformers; see the registration-guard sketch below)
- Update Gradio to latest version (security fixes)
- Auto-deployed via update script
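Background on the first fix: recent transformers releases ship a built-in `llava` model type, so code that also registers the LLaVA repo's own config under that name hits a duplicate-registration error. Below is a minimal sketch of the kind of guard that avoids it, assuming the upstream `llava.model.LlavaConfig` import; the concrete fix applied in app.py is not visible in this diff.

```python
# Hypothetical registration guard - the actual fix in app.py is not shown here.
from transformers import AutoConfig
from transformers.models.auto.configuration_auto import CONFIG_MAPPING

from llava.model import LlavaConfig  # assumed import path (haotian-liu/LLaVA)

# Newer transformers already map the "llava" key to their built-in config,
# so registering it a second time raises a ValueError. Register only when
# the key is genuinely absent.
if "llava" not in CONFIG_MAPPING:
    AutoConfig.register("llava", LlavaConfig)
```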

Files changed (1):
  app.py +34 -37
app.py CHANGED
@@ -70,36 +70,35 @@ BENCHMARK_DATASETS = [
     "cub",
 ]
 
-# Global model variables (lazy loading)
-tokenizer, model, image_processor, context_len = None, None, None, None
-
 # =============================================================================
-# Model Loading (with Zero GPU optimization)
+# Model Loading (Global - bfloat16)
 # =============================================================================
 
-def load_model_once():
-    """Load model once and cache it"""
-    global tokenizer, model, image_processor, context_len
-
-    if model is not None:
-        return tokenizer, model, image_processor, context_len
-
-    print("🔄 Loading Visual-CoT model...")
-    disable_torch_init()
-
-    model_name = get_model_name_from_path(MODEL_PATH)
-
-    tokenizer, model, image_processor, context_len = load_pretrained_model(
-        MODEL_PATH,
-        None,
-        model_name,
-        load_8bit=False,
-        load_4bit=False,
-        device=DEVICE,
-    )
-
-    print("✓ Model loaded successfully!")
-    return tokenizer, model, image_processor, context_len
+print("🔄 Loading Visual-CoT model in bfloat16...")
+disable_torch_init()
+
+model_name = get_model_name_from_path(MODEL_PATH)
+
+# Load model globally with bfloat16 precision
+tokenizer, model, image_processor, context_len = load_pretrained_model(
+    MODEL_PATH,
+    None,
+    model_name,
+    load_8bit=False,
+    load_4bit=False,
+    device=DEVICE,
+)
+
+# Ensure model is in bfloat16
+if DEVICE == "cuda":
+    model = model.to(dtype=torch.bfloat16)
+    print(f"✓ Model loaded in bfloat16 on {DEVICE}")
+else:
+    print(f"✓ Model loaded on {DEVICE} (CPU mode)")
+
+print(f"✓ Model: {model_name}")
+print(f"✓ Context length: {context_len}")
+print(f"✓ Device: {DEVICE}")
 
 
 # =============================================================================
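A note on the dtype choice in this hunk: bfloat16 keeps float32's 8-bit exponent range while halving per-parameter storage, which is why the cast is applied only on CUDA, where bf16 kernels are available. A quick illustration, not part of the commit:

```python
import torch

# bfloat16 stores 2 bytes per element vs 4 for float32, so casting model
# weights roughly halves memory (e.g. a 7B-parameter model: ~28 GB -> ~14 GB).
x = torch.randn(1024, dtype=torch.float32)
print(x.element_size(), x.to(torch.bfloat16).element_size())  # prints: 4 2
```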
@@ -217,9 +216,7 @@ def generate_viscot_response(image, question, temperature=0.2, max_tokens=512):
         return "❌ Please enter a question!", "", None, ""
 
     try:
-        # Load model (lazy loading)
-        tokenizer, model, image_processor, context_len = load_model_once()
-
+        # Model is already loaded globally - use it directly
         # Initialize conversation
         conv_mode = "llava_v1"
         conv = conv_templates[conv_mode].copy()
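The deleted lazy-loading call was the hook behind the old "Zero GPU optimization" comment. On Hugging Face ZeroGPU Spaces, the usual counterpart to global loading is to decorate the inference entry point so a GPU is attached only for the duration of a call. A hedged sketch, assuming the Space runs on ZeroGPU hardware; no such decorator appears in this diff:

```python
import spaces  # provided on Hugging Face ZeroGPU Spaces

@spaces.GPU(duration=120)  # attach a GPU only while this call runs
def generate_viscot_response(image, question, temperature=0.2, max_tokens=512):
    # ...two-step Visual-CoT inference (ROI, then QA) as in app.py...
    ...
```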
@@ -593,26 +590,26 @@ def create_demo():
 
 ```
 ┌─────────────────────────────────────┐
-│ Visual-CoT Pipeline
+│ Visual-CoT Pipeline
 ├─────────────────────────────────────┤
 │                                     │
 │  📸 Image Input                     │
 │      ↓                              │
-│  🔍 CLIP ViT-L/14 (Vision Encoder)
+│  🔍 CLIP ViT-L/14 (Vision Encoder)
 │      ↓                              │
 │  🔗 MLP Projector (2-layer)         │
-│      ↓
+│      ↓
 │  🧠 LLaMA/Vicuna (Language Model)   │
-│      ↓
+│      ↓
 │  ┌──────────────┐                   │
 │  │ Step 1: ROI  │ → Bounding Box    │
 │  └──────────────┘                   │
-│      ↓
+│      ↓
 │  ┌──────────────┐                   │
 │  │ Step 2: QA   │ → Final Answer    │
 │  └──────────────┘                   │
-
-└─────────────────────────────────────┘
+
+└────────────────────────────────────┘
 ```
 
 ---
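The two boxes in the diagram are the heart of the pipeline: Step 1 names a region of interest as a bounding box, Step 2 answers with the cropped region alongside the full image. A self-contained sketch of the glue between the steps, assuming the model emits normalized `[x1, y1, x2, y2]` coordinates; the exact output format is an assumption, not something this diff shows:

```python
import re
from PIL import Image

def parse_bbox(text: str, width: int, height: int) -> tuple[int, int, int, int]:
    """Pull the first four numbers out of Step-1 output such as
    '[0.12, 0.30, 0.56, 0.74]' and scale them to pixel coordinates.
    (Hypothetical helper; the output format is an assumption.)"""
    x1, y1, x2, y2 = [float(n) for n in re.findall(r"\d*\.?\d+", text)[:4]]
    return int(x1 * width), int(y1 * height), int(x2 * width), int(y2 * height)

def crop_roi(image: Image.Image, step1_output: str) -> Image.Image:
    """Crop the Step-1 region so Step 2 can attend to it at full resolution."""
    return image.crop(parse_bbox(step1_output, *image.size))
```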
 