Alikestocode committed on
Commit
41f50c5
·
1 Parent(s): a76dbfd

Fix vLLM device detection and AWQ model loading

Browse files
Files changed (1) hide show
  1. app.py +36 -14
app.py CHANGED
@@ -173,14 +173,14 @@ def load_vllm_model(model_name: str):
173
  repo = model_config["repo_id"]
174
  quantization = model_config.get("quantization", None)
175
 
176
- # For AWQ models, files are in the 'default' subfolder
177
- # vLLM needs to point to the actual model location
178
- # Since files are in default/, we need to use the full path: repo/default
179
  if quantization == "awq":
180
- # AWQ models from LLM Compressor have files in default/ subfolder
181
- # Point vLLM directly to the default/ subfolder where model files are located
182
- model_path = f"{repo}/default"
183
- print(f"Loading {model_path} with vLLM (AWQ quantization, files in default/ subfolder)...")
184
  else:
185
  model_path = repo
186
  print(f"Loading {model_path} with vLLM (quantization: {quantization})...")
@@ -214,12 +214,21 @@ def load_vllm_model(model_name: str):
214
 
215
  # Ensure CUDA_VISIBLE_DEVICES is set correctly for vLLM device detection
216
  # ZeroGPU uses MIG UUIDs, but vLLM needs numeric device index
 
217
  cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
218
  if not cuda_visible or not cuda_visible.isdigit():
219
  # If CUDA_VISIBLE_DEVICES is a MIG UUID or empty, use "0" for single GPU
220
  os.environ["CUDA_VISIBLE_DEVICES"] = "0"
221
  print(f" β†’ Set CUDA_VISIBLE_DEVICES=0 (was: {cuda_visible})")
222
 
 
 
 
 
 
 
 
 
223
  # Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
224
  if quantization == "awq":
225
  llm_kwargs["quantization"] = "awq"
@@ -327,15 +336,28 @@ def load_pipeline(model_name: str):
327
  print(f"βœ… Using cached Transformers pipeline for {model_name}")
328
  return PIPELINES[model_name]
329
 
330
- repo = MODELS[model_name]["repo_id"]
331
- tokenizer_repo = MODELS[model_name].get("tokenizer_repo", None)
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  tokenizer = get_tokenizer(repo, tokenizer_repo=tokenizer_repo)
333
 
334
  # Try AWQ first if available (Transformers fallback path)
335
  if AWQ_AVAILABLE:
336
  try:
337
- print(f"πŸ”„ Loading {repo} with Transformers + AutoAWQ (fallback path)...")
338
- pipe = load_awq_pipeline(repo, tokenizer)
339
  PIPELINES[model_name] = pipe
340
  _schedule_background_warm(model_name)
341
  # Warm kernels immediately after loading
@@ -343,13 +365,13 @@ def load_pipeline(model_name: str):
343
  print(f"βœ… Transformers + AutoAWQ pipeline loaded: {model_name}")
344
  return pipe
345
  except Exception as exc:
346
- print(f"⚠️ AutoAWQ load failed for {repo}: {exc}")
347
  print(f" β†’ Falling back to BitsAndBytes 8-bit...")
348
 
349
  # Fallback to BitsAndBytes 8-bit
350
  if BITSANDBYTES_AVAILABLE:
351
  try:
352
- print(f"πŸ”„ Loading {repo} with BitsAndBytes 8-bit quantization...")
353
  quant_config = BitsAndBytesConfig(load_in_8bit=True)
354
  model_kwargs = {"quantization_config": quant_config}
355
  if FLASH_ATTN_AVAILABLE:
@@ -357,7 +379,7 @@ def load_pipeline(model_name: str):
357
 
358
  pipe = pipeline(
359
  task="text-generation",
360
- model=repo,
361
  tokenizer=tokenizer,
362
  trust_remote_code=True,
363
  device_map="auto",
 
173
  repo = model_config["repo_id"]
174
  quantization = model_config.get("quantization", None)
175
 
176
+ # For AWQ models, vLLM should point to repo root (not default/ subfolder)
177
+ # vLLM will find quantization_config.json at root, which points to default/ subfolder
178
+ # The quantization_config.json tells vLLM where the actual model files are
179
  if quantization == "awq":
180
+ # Point to repo root - vLLM will auto-detect AWQ via quantization_config.json
181
+ # The config file at root tells vLLM the model files are in default/ subfolder
182
+ model_path = repo # Use repo root, not repo/default
183
+ print(f"Loading {model_path} with vLLM (AWQ quantization, vLLM will find files in default/ via quantization_config.json)...")
184
  else:
185
  model_path = repo
186
  print(f"Loading {model_path} with vLLM (quantization: {quantization})...")
 
214
 
215
  # Ensure CUDA_VISIBLE_DEVICES is set correctly for vLLM device detection
216
  # ZeroGPU uses MIG UUIDs, but vLLM needs numeric device index
217
+ # IMPORTANT: Set this BEFORE creating LLM() instance, as vLLM checks device during init
218
  cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
219
  if not cuda_visible or not cuda_visible.isdigit():
220
  # If CUDA_VISIBLE_DEVICES is a MIG UUID or empty, use "0" for single GPU
221
  os.environ["CUDA_VISIBLE_DEVICES"] = "0"
222
  print(f" β†’ Set CUDA_VISIBLE_DEVICES=0 (was: {cuda_visible})")
223
 
224
+ # Force torch to see the correct device after setting CUDA_VISIBLE_DEVICES
225
+ # This ensures vLLM's device detection works correctly
226
+ import torch
227
+ if torch.cuda.is_available():
228
+ # Verify device is accessible
229
+ device_name = torch.cuda.get_device_name(0)
230
+ print(f" β†’ Verified CUDA device accessible: {device_name}")
231
+
232
  # Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
233
  if quantization == "awq":
234
  llm_kwargs["quantization"] = "awq"
 
336
  print(f"βœ… Using cached Transformers pipeline for {model_name}")
337
  return PIPELINES[model_name]
338
 
339
+ model_config = MODELS[model_name]
340
+ repo = model_config["repo_id"]
341
+ tokenizer_repo = model_config.get("tokenizer_repo", None)
342
+ quantization = model_config.get("quantization", None)
343
+
344
+ # For AWQ models, the AWQ repo doesn't have standard model files (they're in default/)
345
+ # Use the original repo for Transformers fallback, not the AWQ repo
346
+ if quantization == "awq" and tokenizer_repo:
347
+ # AWQ repos have files in default/ subfolder which Transformers can't load directly
348
+ # Use the original repo for Transformers fallback
349
+ transformers_repo = tokenizer_repo # Use original repo for Transformers
350
+ print(f"⚠️ AWQ model detected - Transformers fallback will use original repo: {transformers_repo}")
351
+ else:
352
+ transformers_repo = repo
353
+
354
  tokenizer = get_tokenizer(repo, tokenizer_repo=tokenizer_repo)
355
 
356
  # Try AWQ first if available (Transformers fallback path)
357
  if AWQ_AVAILABLE:
358
  try:
359
+ print(f"πŸ”„ Loading {transformers_repo} with Transformers + AutoAWQ (fallback path)...")
360
+ pipe = load_awq_pipeline(transformers_repo, tokenizer)
361
  PIPELINES[model_name] = pipe
362
  _schedule_background_warm(model_name)
363
  # Warm kernels immediately after loading
 
365
  print(f"βœ… Transformers + AutoAWQ pipeline loaded: {model_name}")
366
  return pipe
367
  except Exception as exc:
368
+ print(f"⚠️ AutoAWQ load failed for {transformers_repo}: {exc}")
369
  print(f" β†’ Falling back to BitsAndBytes 8-bit...")
370
 
371
  # Fallback to BitsAndBytes 8-bit
372
  if BITSANDBYTES_AVAILABLE:
373
  try:
374
+ print(f"πŸ”„ Loading {transformers_repo} with BitsAndBytes 8-bit quantization...")
375
  quant_config = BitsAndBytesConfig(load_in_8bit=True)
376
  model_kwargs = {"quantization_config": quant_config}
377
  if FLASH_ATTN_AVAILABLE:
 
379
 
380
  pipe = pipeline(
381
  task="text-generation",
382
+ model=transformers_repo,
383
  tokenizer=tokenizer,
384
  trust_remote_code=True,
385
  device_map="auto",