Alikestocode committed on
Commit
0e2f6c4
·
1 Parent(s): 6fb2aa6

Load vLLM from local snapshot to support default subfolders

Browse files
Files changed (1) hide show
  1. app.py +32 -7
app.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
3
  import json
4
  import os
5
  import re
6
- from typing import Any, Dict, List, Tuple
7
 
8
  import gradio as gr
9
  import spaces
@@ -57,6 +57,8 @@ PREFETCH_DISABLED = os.environ.get("DISABLE_PREFETCH", "0") == "1"
57
  PREFETCH_THREADS = int(os.environ.get("PREFETCH_THREADS", "4"))
58
  PREFETCH_EXECUTOR = None
59
 
 
 
60
 
61
  def _prefetch_repo(repo_id: str) -> None:
62
  if not HF_HUB_AVAILABLE:
@@ -73,6 +75,26 @@ def _prefetch_repo(repo_id: str) -> None:
73
  print(f"Prefetch skipped for {repo_id}: {exc}")
74
 
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  def _start_prefetch_workers(model_names: list[str]):
77
  global PREFETCH_EXECUTOR
78
  if PREFETCH_DISABLED or not HF_HUB_AVAILABLE:
@@ -255,13 +277,16 @@ def load_vllm_model(model_name: str):
255
  quantization = model_config.get("quantization", None)
256
 
257
  # For AWQ models, vLLM should point to repo root (not default/ subfolder)
258
- # vLLM will find quantization_config.json at root, which points to default/ subfolder
259
- # The quantization_config.json tells vLLM where the actual model files are
260
  if quantization == "awq":
261
- # Point to repo root - vLLM will auto-detect AWQ via quantization_config.json
262
- # The config file at root tells vLLM the model files are in default/ subfolder
263
- model_path = repo # Use repo root, not repo/default
264
- print(f"Loading {model_path} with vLLM (AWQ quantization, vLLM will find files in default/ via quantization_config.json)...")
 
 
 
 
265
  else:
266
  model_path = repo
267
  print(f"Loading {model_path} with vLLM (quantization: {quantization})...")
 
3
  import json
4
  import os
5
  import re
6
+ from typing import Any, Dict, List, Tuple, Optional
7
 
8
  import gradio as gr
9
  import spaces
 
57
  PREFETCH_THREADS = int(os.environ.get("PREFETCH_THREADS", "4"))
58
  PREFETCH_EXECUTOR = None
59
 
60
+ LOCAL_REPO_CACHE: Dict[str, str] = {}
61
+
62
 
63
  def _prefetch_repo(repo_id: str) -> None:
64
  if not HF_HUB_AVAILABLE:
 
75
  print(f"Prefetch skipped for {repo_id}: {exc}")
76
 
77
 
78
def _ensure_local_repo(repo_id: str) -> Optional[str]:
    """Return a local snapshot directory for *repo_id*, downloading if needed.

    Results are memoized in ``LOCAL_REPO_CACHE`` so repeated calls for the
    same repo reuse the already-downloaded snapshot.  Returns ``None`` when
    the hub client is unavailable or the download fails, so callers can fall
    back to passing the remote repo id directly.
    """
    if not HF_HUB_AVAILABLE:
        return None
    # Reuse a previously resolved snapshot, but only if it still exists on
    # disk (the cache directory may have been evicted between calls).
    cached = LOCAL_REPO_CACHE.get(repo_id)
    if cached and os.path.isdir(cached):
        return cached
    try:
        # NOTE(review): `resume_download` was dropped — it is deprecated in
        # recent huggingface_hub releases (downloads always resume) and only
        # emitted a FutureWarning.
        local_path = snapshot_download(
            repo_id=repo_id,
            etag_timeout=10,
            local_files_only=False,
        )
        LOCAL_REPO_CACHE[repo_id] = local_path
        return local_path
    except Exception as exc:  # pragma: no cover - network/hub failure path
        # Best-effort: log and signal failure rather than crash the caller.
        print(f"Local snapshot failed for {repo_id}: {exc}")
        return None
97
+
98
  def _start_prefetch_workers(model_names: list[str]):
99
  global PREFETCH_EXECUTOR
100
  if PREFETCH_DISABLED or not HF_HUB_AVAILABLE:
 
277
  quantization = model_config.get("quantization", None)
278
 
279
  # For AWQ models, vLLM should point to repo root (not default/ subfolder)
280
+ # If repo is stored with AWQ artifacts inside a default/ directory, fall back to local snapshot
 
281
  if quantization == "awq":
282
+ model_path = repo
283
+ local_repo = _ensure_local_repo(repo)
284
+ if local_repo:
285
+ default_dir = os.path.join(local_repo, "default")
286
+ model_path = default_dir if os.path.isdir(default_dir) else local_repo
287
+ print(f"Loading {model_path} (local snapshot) with vLLM (AWQ quantization)...")
288
+ else:
289
+ print(f"Loading {model_path} with vLLM (AWQ quantization, vLLM will find files in default/ via quantization_config.json)...")
290
  else:
291
  model_path = repo
292
  print(f"Loading {model_path} with vLLM (quantization: {quantization})...")