Spaces:
Runtime error
Runtime error
File size: 15,311 Bytes
1314bf5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 |
import gc
import os
from typing import Any, Callable, Dict, Optional, Union

import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer

from ..base_model import BaseModel
from ...config.config_manager import ConfigManager
from ...utils.image_processing import load_image
class InternVLModel(BaseModel):
    """InternVL3 vision-language model implementation.

    Wraps download, (multi-GPU) loading with optional 8-bit quantization,
    image+text inference, and teardown for an InternVL checkpoint hosted
    on the Hugging Face Hub.
    """

    def __init__(self, model_name: str, model_config: Dict[str, Any], config_manager: ConfigManager):
        """
        Initialize the InternVL model.

        Args:
            model_name: Name of the model
            model_config: Configuration dictionary for the model
            config_manager: Configuration manager instance
        """
        super().__init__(model_name, model_config)
        self.config_manager = config_manager
        # Let the CUDA caching allocator grow segments on demand instead of
        # fragmenting, which reduces OOMs when loading large checkpoints.
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

    def check_model_exists_locally(self) -> bool:
        """Check whether the model already exists in the local HF cache.

        Returns:
            True when the model's config.json is cached locally.
        """
        try:
            from transformers.utils import cached_file
            cached_file(self.model_id, "config.json", local_files_only=True)
            return True
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # SystemExit/KeyboardInterrupt). Any lookup failure simply
            # means "not cached locally".
            return False

    def download_model_with_progress(self, progress_callback: Optional[Callable] = None) -> bool:
        """
        Download the model with progress tracking.

        Args:
            progress_callback: Callback function for progress updates

        Returns:
            True if successful, False otherwise
        """
        try:
            if progress_callback:
                progress_callback("📥 Downloading tokenizer...")
            # Download tokenizer first (smaller). The call is made for its
            # cache-populating side effect; the object itself is not kept.
            AutoTokenizer.from_pretrained(
                self.model_id,
                trust_remote_code=True,
                use_fast=False
            )
            if progress_callback:
                progress_callback("📥 Downloading model weights... This may take several minutes...")
            # Fetch the model config; the weight shards are pulled lazily by
            # from_pretrained() during load_model().
            AutoConfig.from_pretrained(self.model_id, trust_remote_code=True)
            if progress_callback:
                progress_callback("✅ Model downloaded successfully!")
            return True
        except Exception as e:
            if progress_callback:
                progress_callback(f"❌ Download failed: {str(e)}")
            return False

    def split_model(self) -> Union[Dict[str, int], str]:
        """
        Distribute LLM layers across GPUs, keeping the vision encoder on GPU 0.

        Returns:
            Device map dictionary, or the string "auto" when fewer than two
            GPUs are available (transformers then decides placement itself).
        """
        device_map: Dict[str, int] = {}
        world_size = torch.cuda.device_count()
        if world_size < 2:
            return "auto"  # let transformers decide
        cfg = AutoConfig.from_pretrained(self.model_id, trust_remote_code=True)
        num_layers = cfg.llm_config.num_hidden_layers  # type: ignore[attr-defined]
        # More aggressive distribution - treat GPU 0 as ~0.3 GPU of capacity
        # because it also hosts the vision encoder.
        effective_gpus = world_size - 0.7
        layers_per_gpu = num_layers / effective_gpus
        # Calculate the per-GPU layer counts.
        gpu_layers = []
        for i in range(world_size):
            if i == 0:
                # GPU 0 gets fewer layers due to the vision model.
                gpu_layers.append(max(1, int(layers_per_gpu * 0.3)))
            else:
                gpu_layers.append(int(layers_per_gpu))
        # Adjust if the total doesn't match num_layers.
        total_assigned = sum(gpu_layers)
        diff = num_layers - total_assigned
        if diff > 0:
            # Spread the remainder round-robin over the non-vision GPUs so
            # EVERY decoder layer gets a device assignment. (The previous
            # single pass added at most world_size - 1 layers, leaving
            # layers unmapped when the shortfall was larger, which
            # transformers rejects for a dict device_map.)
            i = 0
            while diff > 0:
                gpu_layers[1 + (i % (world_size - 1))] += 1
                i += 1
                diff -= 1
        elif diff < 0:
            # Remove excess layers from GPU 0; the assignment loop below
            # also guards against overshooting num_layers.
            gpu_layers[0] = max(1, gpu_layers[0] + diff)
        # Assign layers to devices.
        layer_cnt = 0
        for gpu_id, num_layers_on_gpu in enumerate(gpu_layers):
            for _ in range(num_layers_on_gpu):
                if layer_cnt < num_layers:
                    device_map[f'language_model.model.layers.{layer_cnt}'] = gpu_id
                    layer_cnt += 1
        # Non-layer components: the vision tower must stay on GPU 0; the
        # norm/head sit on the last GPU next to the final decoder layers.
        last_gpu = world_size - 1
        device_map['vision_model'] = 0
        device_map['mlp1'] = 0
        device_map['language_model.model.tok_embeddings'] = 0
        device_map['language_model.model.embed_tokens'] = 0
        device_map['language_model.model.norm'] = last_gpu
        device_map['language_model.model.rotary_emb'] = 1 if world_size > 1 else 0
        device_map['language_model.output'] = last_gpu
        device_map['language_model.lm_head'] = last_gpu
        # Keep the last layer on the same GPU as the output layers for
        # compatibility (avoids a cross-device hop before the head).
        device_map[f'language_model.model.layers.{num_layers - 1}'] = last_gpu
        print(f"Layer distribution: {gpu_layers}")
        print(f"Total layers: {num_layers}, Assigned: {sum(gpu_layers)}")
        return device_map

    def load_model(self, quantization_type: str, progress_callback: Optional[Callable] = None) -> bool:
        """
        Load the model with the specified quantization.

        Args:
            quantization_type: Type of quantization to use
            progress_callback: Callback function for progress updates

        Returns:
            True if successful, False otherwise

        Raises:
            ValueError: If the quantization type is unsupported for this model.
            Exception: If downloading or loading fails.
        """
        if not self.validate_quantization(quantization_type):
            raise ValueError(f"Quantization type '{quantization_type}' not supported for {self.model_name}")
        # If the model is already loaded with the same quantization, return early.
        if (self.model is not None and self.tokenizer is not None and
                self.current_quantization == quantization_type):
            if progress_callback:
                progress_callback(f"✅ {self.model_name} already loaded!")
            return True
        print(f"Loading {self.model_name} with {quantization_type} quantization...")
        if progress_callback:
            progress_callback(f"🔄 Loading {self.model_name} with {quantization_type} quantization...")
        try:
            # Ensure the weights are present locally before loading.
            model_exists = self.check_model_exists_locally()
            if not model_exists:
                if progress_callback:
                    progress_callback(f"📥 {self.model_name} not found locally. Starting download...")
                print(f"Model {self.model_name} not found locally. Starting download...")
                success = self.download_model_with_progress(progress_callback)
                if not success:
                    raise Exception(f"Failed to download {self.model_name}")
            else:
                if progress_callback:
                    progress_callback(f"✅ {self.model_name} found locally.")
            # Clear any existing model (e.g. loaded with another quantization).
            if self.model is not None:
                self.unload_model()
            self._print_gpu_memory("before loading")
            if progress_callback:
                progress_callback(f"🔄 Loading {self.model_name} tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_id,
                trust_remote_code=True,
                use_fast=False
            )
            # Load model weights according to the requested quantization.
            if "non-quantized" in quantization_type:
                if progress_callback:
                    progress_callback(f"🔄 Loading {self.model_name} model in 16-bit precision...")
                device_map = self.split_model()
                print(f"Device map for multi-GPU: {device_map}")
                # Try the custom device_map first, falling back to "auto".
                # Some InternVL models (e.g. InternVL3_5) don't support a
                # custom device_map due to the missing
                # 'all_tied_weights_keys' attribute.
                try:
                    self.model = AutoModel.from_pretrained(
                        self.model_id,
                        torch_dtype=torch.bfloat16,
                        low_cpu_mem_usage=True,
                        use_flash_attn=True,
                        trust_remote_code=True,
                        device_map=device_map,
                    ).eval()
                except (AttributeError, TypeError, RuntimeError, ValueError) as e:
                    error_str = str(e).lower()
                    # Only fall back for device_map-related failures,
                    # especially the known all_tied_weights_keys issue;
                    # anything else is re-raised unchanged.
                    if ("all_tied_weights_keys" in error_str or
                            "tied_weights" in error_str or
                            ("device_map" in error_str and "attribute" in error_str)):
                        print(f"⚠️ Custom device_map failed ({str(e)}), falling back to 'auto' device_map...")
                        if progress_callback:
                            progress_callback(f"⚠️ Using automatic device mapping...")
                        self.model = AutoModel.from_pretrained(
                            self.model_id,
                            torch_dtype=torch.bfloat16,
                            low_cpu_mem_usage=True,
                            use_flash_attn=True,
                            trust_remote_code=True,
                            device_map="auto",
                        ).eval()
                    else:
                        raise
            else:  # quantized (8bit)
                if progress_callback:
                    progress_callback(f"🔄 Loading {self.model_name} model with 8-bit quantization...")
                print("Loading with 8-bit quantization to reduce memory usage...")
                # NOTE(review): load_in_8bit= is deprecated in newer
                # transformers releases in favor of BitsAndBytesConfig;
                # kept as-is for compatibility with the pinned version.
                self.model = AutoModel.from_pretrained(
                    self.model_id,
                    torch_dtype=torch.bfloat16,
                    load_in_8bit=True,
                    low_cpu_mem_usage=True,
                    use_flash_attn=True,
                    trust_remote_code=True,
                    device_map="auto"  # let transformers place the quantized model
                ).eval()
            # Verify model and tokenizer are properly loaded.
            if self.model is None:
                raise Exception(f"Model failed to load for {self.model_name}")
            if self.tokenizer is None:
                raise Exception(f"Tokenizer failed to load for {self.model_name}")
            self.current_quantization = quantization_type
            self.is_loaded = True
            success_msg = f"✅ {self.model_name} loaded successfully with {quantization_type} quantization!"
            print(success_msg)
            if progress_callback:
                progress_callback(success_msg)
            self._print_gpu_memory("after loading")
            return True
        except Exception as e:
            error_msg = f"Failed to load model {self.model_name}: {str(e)}"
            print(error_msg)
            if progress_callback:
                progress_callback(f"❌ {error_msg}")
            # Reset partially-initialized state before propagating.
            self.unload_model()
            raise Exception(error_msg) from e

    def unload_model(self) -> None:
        """Unload the model and tokenizer and release GPU memory."""
        if self.model is not None:
            print("🧹 Clearing model from memory...")
            del self.model
            self.model = None
        if self.tokenizer is not None:
            del self.tokenizer
            self.tokenizer = None
        self.current_quantization = None
        self.is_loaded = False
        # Clear the GPU cache, force garbage collection, then clear again so
        # blocks freed by gc are actually returned to the allocator.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        print("✅ Model unloaded successfully")

    def inference(self, image_path: str, prompt: str, **kwargs) -> str:
        """
        Perform inference on an image with a text prompt.

        Args:
            image_path: Path to the image file
            prompt: Text prompt for the model
            **kwargs: Additional inference parameters (currently unused)

        Returns:
            Model's text response, or an error message string on failure.

        Raises:
            RuntimeError: If the model has not been loaded yet.
        """
        if not self.is_loaded:
            raise RuntimeError(f"Model {self.model_name} is not loaded. Call load_model() first.")
        try:
            # Load and preprocess image using default settings from original app.py
            pixel_values = load_image(image_path, input_size=448, max_num=12).to(torch.bfloat16)
            if torch.cuda.is_available():
                # Move pixel values to the device of the first model parameter
                # (the vision encoder, on GPU 0 under the custom device map).
                model_device = next(self.model.parameters()).device
                pixel_values = pixel_values.to(model_device)
            else:
                # Fallback to CPU if no CUDA is available.
                pixel_values = pixel_values.cpu()
            # InternVL's chat API expects an "<image>" placeholder in the prompt.
            formatted_prompt = f"<image>\n{prompt}" if prompt else "<image>\n"
            # Generation configuration - same settings as the original app.py
            gen_cfg = dict(max_new_tokens=1024, do_sample=True)
            response = self.model.chat(self.tokenizer, pixel_values, formatted_prompt, gen_cfg)
            return response
        except Exception as e:
            # Best-effort: surface the failure as the returned text rather
            # than crashing the caller/UI.
            error_msg = f"Error processing image: {str(e)}"
            print(error_msg)
            return error_msg

    def _print_gpu_memory(self, stage: str) -> None:
        """Print per-GPU allocated/reserved memory (in GiB) for debugging."""
        if torch.cuda.is_available():
            print(f"Memory {stage}:")
            for i in range(torch.cuda.device_count()):
                allocated = torch.cuda.memory_allocated(i) / 1024**3
                reserved = torch.cuda.memory_reserved(i) / 1024**3
                print(f"GPU {i}: Allocated {allocated:.2f} GB, Reserved {reserved:.2f} GB")