Spaces:
Sleeping
Sleeping
Commit
·
41f50c5
1
Parent(s):
a76dbfd
Fix vLLM device detection and AWQ model loading
Browse files
app.py
CHANGED
|
@@ -173,14 +173,14 @@ def load_vllm_model(model_name: str):
|
|
| 173 |
repo = model_config["repo_id"]
|
| 174 |
quantization = model_config.get("quantization", None)
|
| 175 |
|
| 176 |
-
# For AWQ models,
|
| 177 |
-
# vLLM
|
| 178 |
-
#
|
| 179 |
if quantization == "awq":
|
| 180 |
-
#
|
| 181 |
-
#
|
| 182 |
-
model_path =
|
| 183 |
-
print(f"Loading {model_path} with vLLM (AWQ quantization, files in default/
|
| 184 |
else:
|
| 185 |
model_path = repo
|
| 186 |
print(f"Loading {model_path} with vLLM (quantization: {quantization})...")
|
|
@@ -214,12 +214,21 @@ def load_vllm_model(model_name: str):
|
|
| 214 |
|
| 215 |
# Ensure CUDA_VISIBLE_DEVICES is set correctly for vLLM device detection
|
| 216 |
# ZeroGPU uses MIG UUIDs, but vLLM needs numeric device index
|
|
|
|
| 217 |
cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
|
| 218 |
if not cuda_visible or not cuda_visible.isdigit():
|
| 219 |
# If CUDA_VISIBLE_DEVICES is a MIG UUID or empty, use "0" for single GPU
|
| 220 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 221 |
print(f" β Set CUDA_VISIBLE_DEVICES=0 (was: {cuda_visible})")
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
# Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
|
| 224 |
if quantization == "awq":
|
| 225 |
llm_kwargs["quantization"] = "awq"
|
|
@@ -327,15 +336,28 @@ def load_pipeline(model_name: str):
|
|
| 327 |
print(f"✅ Using cached Transformers pipeline for {model_name}")
|
| 328 |
return PIPELINES[model_name]
|
| 329 |
|
| 330 |
-
|
| 331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
tokenizer = get_tokenizer(repo, tokenizer_repo=tokenizer_repo)
|
| 333 |
|
| 334 |
# Try AWQ first if available (Transformers fallback path)
|
| 335 |
if AWQ_AVAILABLE:
|
| 336 |
try:
|
| 337 |
-
print(f"π Loading {
|
| 338 |
-
pipe = load_awq_pipeline(
|
| 339 |
PIPELINES[model_name] = pipe
|
| 340 |
_schedule_background_warm(model_name)
|
| 341 |
# Warm kernels immediately after loading
|
|
@@ -343,13 +365,13 @@ def load_pipeline(model_name: str):
|
|
| 343 |
print(f"✅ Transformers + AutoAWQ pipeline loaded: {model_name}")
|
| 344 |
return pipe
|
| 345 |
except Exception as exc:
|
| 346 |
-
print(f"⚠️ AutoAWQ load failed for {
|
| 347 |
print(f" → Falling back to BitsAndBytes 8-bit...")
|
| 348 |
|
| 349 |
# Fallback to BitsAndBytes 8-bit
|
| 350 |
if BITSANDBYTES_AVAILABLE:
|
| 351 |
try:
|
| 352 |
-
print(f"π Loading {
|
| 353 |
quant_config = BitsAndBytesConfig(load_in_8bit=True)
|
| 354 |
model_kwargs = {"quantization_config": quant_config}
|
| 355 |
if FLASH_ATTN_AVAILABLE:
|
|
@@ -357,7 +379,7 @@ def load_pipeline(model_name: str):
|
|
| 357 |
|
| 358 |
pipe = pipeline(
|
| 359 |
task="text-generation",
|
| 360 |
-
model=
|
| 361 |
tokenizer=tokenizer,
|
| 362 |
trust_remote_code=True,
|
| 363 |
device_map="auto",
|
|
|
|
| 173 |
repo = model_config["repo_id"]
|
| 174 |
quantization = model_config.get("quantization", None)
|
| 175 |
|
| 176 |
+
# For AWQ models, vLLM should point to repo root (not default/ subfolder)
|
| 177 |
+
# vLLM will find quantization_config.json at root, which points to default/ subfolder
|
| 178 |
+
# The quantization_config.json tells vLLM where the actual model files are
|
| 179 |
if quantization == "awq":
|
| 180 |
+
# Point to repo root - vLLM will auto-detect AWQ via quantization_config.json
|
| 181 |
+
# The config file at root tells vLLM the model files are in default/ subfolder
|
| 182 |
+
model_path = repo # Use repo root, not repo/default
|
| 183 |
+
print(f"Loading {model_path} with vLLM (AWQ quantization, vLLM will find files in default/ via quantization_config.json)...")
|
| 184 |
else:
|
| 185 |
model_path = repo
|
| 186 |
print(f"Loading {model_path} with vLLM (quantization: {quantization})...")
|
|
|
|
| 214 |
|
| 215 |
# Ensure CUDA_VISIBLE_DEVICES is set correctly for vLLM device detection
|
| 216 |
# ZeroGPU uses MIG UUIDs, but vLLM needs numeric device index
|
| 217 |
+
# IMPORTANT: Set this BEFORE creating LLM() instance, as vLLM checks device during init
|
| 218 |
cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
|
| 219 |
if not cuda_visible or not cuda_visible.isdigit():
|
| 220 |
# If CUDA_VISIBLE_DEVICES is a MIG UUID or empty, use "0" for single GPU
|
| 221 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 222 |
print(f" β Set CUDA_VISIBLE_DEVICES=0 (was: {cuda_visible})")
|
| 223 |
|
| 224 |
+
# Force torch to see the correct device after setting CUDA_VISIBLE_DEVICES
|
| 225 |
+
# This ensures vLLM's device detection works correctly
|
| 226 |
+
import torch
|
| 227 |
+
if torch.cuda.is_available():
|
| 228 |
+
# Verify device is accessible
|
| 229 |
+
device_name = torch.cuda.get_device_name(0)
|
| 230 |
+
print(f" β Verified CUDA device accessible: {device_name}")
|
| 231 |
+
|
| 232 |
# Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
|
| 233 |
if quantization == "awq":
|
| 234 |
llm_kwargs["quantization"] = "awq"
|
|
|
|
| 336 |
print(f"✅ Using cached Transformers pipeline for {model_name}")
|
| 337 |
return PIPELINES[model_name]
|
| 338 |
|
| 339 |
+
model_config = MODELS[model_name]
|
| 340 |
+
repo = model_config["repo_id"]
|
| 341 |
+
tokenizer_repo = model_config.get("tokenizer_repo", None)
|
| 342 |
+
quantization = model_config.get("quantization", None)
|
| 343 |
+
|
| 344 |
+
# For AWQ models, the AWQ repo doesn't have standard model files (they're in default/)
|
| 345 |
+
# Use the original repo for Transformers fallback, not the AWQ repo
|
| 346 |
+
if quantization == "awq" and tokenizer_repo:
|
| 347 |
+
# AWQ repos have files in default/ subfolder which Transformers can't load directly
|
| 348 |
+
# Use the original repo for Transformers fallback
|
| 349 |
+
transformers_repo = tokenizer_repo # Use original repo for Transformers
|
| 350 |
+
print(f"⚠️ AWQ model detected - Transformers fallback will use original repo: {transformers_repo}")
|
| 351 |
+
else:
|
| 352 |
+
transformers_repo = repo
|
| 353 |
+
|
| 354 |
tokenizer = get_tokenizer(repo, tokenizer_repo=tokenizer_repo)
|
| 355 |
|
| 356 |
# Try AWQ first if available (Transformers fallback path)
|
| 357 |
if AWQ_AVAILABLE:
|
| 358 |
try:
|
| 359 |
+
print(f"π Loading {transformers_repo} with Transformers + AutoAWQ (fallback path)...")
|
| 360 |
+
pipe = load_awq_pipeline(transformers_repo, tokenizer)
|
| 361 |
PIPELINES[model_name] = pipe
|
| 362 |
_schedule_background_warm(model_name)
|
| 363 |
# Warm kernels immediately after loading
|
|
|
|
| 365 |
print(f"✅ Transformers + AutoAWQ pipeline loaded: {model_name}")
|
| 366 |
return pipe
|
| 367 |
except Exception as exc:
|
| 368 |
+
print(f"⚠️ AutoAWQ load failed for {transformers_repo}: {exc}")
|
| 369 |
print(f" → Falling back to BitsAndBytes 8-bit...")
|
| 370 |
|
| 371 |
# Fallback to BitsAndBytes 8-bit
|
| 372 |
if BITSANDBYTES_AVAILABLE:
|
| 373 |
try:
|
| 374 |
+
print(f"π Loading {transformers_repo} with BitsAndBytes 8-bit quantization...")
|
| 375 |
quant_config = BitsAndBytesConfig(load_in_8bit=True)
|
| 376 |
model_kwargs = {"quantization_config": quant_config}
|
| 377 |
if FLASH_ATTN_AVAILABLE:
|
|
|
|
| 379 |
|
| 380 |
pipe = pipeline(
|
| 381 |
task="text-generation",
|
| 382 |
+
model=transformers_repo,
|
| 383 |
tokenizer=tokenizer,
|
| 384 |
trust_remote_code=True,
|
| 385 |
device_map="auto",
|