Alikestocode committed on
Commit
a76dbfd
·
1 Parent(s): dd11bd9

Fix AWQ model loading: point to default/ subfolder and fix tokenizer loading

Browse files
Files changed (1) hide show
  1. app.py +41 -17
app.py CHANGED
@@ -16,9 +16,12 @@ torch.backends.cuda.matmul.allow_tf32 = True
16
 
17
  # Ensure CUDA is visible to vLLM on ZeroGPU
18
  # vLLM needs explicit CUDA device configuration
 
19
  if torch.cuda.is_available():
20
- # Set CUDA_VISIBLE_DEVICES if not already set (helps vLLM detect GPU)
21
- if "CUDA_VISIBLE_DEVICES" not in os.environ:
 
 
22
  os.environ["CUDA_VISIBLE_DEVICES"] = "0"
23
  print(f"CUDA detected: {torch.cuda.get_device_name(0)}")
24
  print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
@@ -106,12 +109,14 @@ ROUTER_SYSTEM_PROMPT = """You are the Router Agent coordinating Math, Code, and
106
  MODELS = {
107
  "Router-Qwen3-32B-AWQ": {
108
  "repo_id": "Alovestocode/router-qwen3-32b-merged-awq", # AWQ quantized model
 
109
  "description": "Router checkpoint on Qwen3 32B merged, optimized with AWQ quantization via vLLM.",
110
  "params_b": 32.0,
111
  "quantization": "awq", # vLLM will auto-detect AWQ
112
  },
113
  "Router-Gemma3-27B-AWQ": {
114
  "repo_id": "Alovestocode/router-gemma3-merged-awq", # AWQ quantized model
 
115
  "description": "Router checkpoint on Gemma3 27B merged, optimized with AWQ quantization via vLLM.",
116
  "params_b": 27.0,
117
  "quantization": "awq", # vLLM will auto-detect AWQ
@@ -138,12 +143,15 @@ WARMED_REMAINING = False
138
  TOOL_PATTERN = re.compile(r"^/[a-z0-9_-]+\(.*\)$", re.IGNORECASE)
139
 
140
 
141
- def get_tokenizer(repo: str):
142
- tok = TOKENIZER_CACHE.get(repo)
 
 
 
143
  if tok is not None:
144
  return tok
145
  tok = AutoTokenizer.from_pretrained(
146
- repo,
147
  token=HF_TOKEN,
148
  use_fast=True,
149
  trust_remote_code=True
@@ -152,7 +160,7 @@ def get_tokenizer(repo: str):
152
  tok.truncation_side = "left"
153
  if tok.pad_token_id is None and tok.eos_token_id is not None:
154
  tok.pad_token_id = tok.eos_token_id
155
- TOKENIZER_CACHE[repo] = tok
156
  return tok
157
 
158
 
@@ -161,11 +169,21 @@ def load_vllm_model(model_name: str):
161
  if model_name in VLLM_MODELS:
162
  return VLLM_MODELS[model_name]
163
 
164
- repo = MODELS[model_name]["repo_id"]
165
  model_config = MODELS[model_name]
 
166
  quantization = model_config.get("quantization", None)
167
 
168
- print(f"Loading {repo} with vLLM (quantization: {quantization})...")
 
 
 
 
 
 
 
 
 
 
169
 
170
  try:
171
  # Detect device explicitly for vLLM
@@ -181,8 +199,9 @@ def load_vllm_model(model_name: str):
181
  # vLLM natively supports AWQ via llm-compressor (replaces deprecated AutoAWQ)
182
  # Note: HF_TOKEN is passed via environment variable, not as a parameter
183
  # vLLM auto-detects CUDA from torch.cuda.is_available() and CUDA_VISIBLE_DEVICES
 
184
  llm_kwargs = {
185
- "model": repo,
186
  "trust_remote_code": True,
187
  "dtype": "bfloat16", # Prefer bf16 over int8 for speed
188
  "gpu_memory_utilization": 0.90, # Leave headroom for KV cache
@@ -193,27 +212,31 @@ def load_vllm_model(model_name: str):
193
  "enable_prefix_caching": True, # Cache prompts for faster TTFT
194
  }
195
 
196
- # Ensure CUDA_VISIBLE_DEVICES is set for vLLM device detection
197
- if "CUDA_VISIBLE_DEVICES" not in os.environ:
 
 
 
198
  os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 
199
 
200
  # Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
201
  if quantization == "awq":
202
  llm_kwargs["quantization"] = "awq"
203
- # vLLM will auto-detect AWQ weights from quantization_config.json at repo root
204
- # Weights may be in a 'default' subfolder (LLM Compressor stage structure)
205
- # vLLM handles this automatically via the quantization config
206
  # Enable FP8 KV cache for 50% memory reduction (allows longer contexts)
207
  # FP8 KV cache is compatible with AWQ quantization
208
  try:
209
  llm_kwargs["kv_cache_dtype"] = "fp8"
210
  print(f" β†’ AWQ quantization + FP8 KV cache enabled (vLLM native support)")
211
  print(f" β†’ FP8 KV cache reduces memory by ~50%, enabling longer contexts")
212
- print(f" β†’ Loading AWQ model from: {repo}")
213
  except Exception:
214
  # Fallback if FP8 KV cache not supported
215
  print(f" β†’ AWQ quantization enabled (FP8 KV cache not available)")
216
- print(f" β†’ Loading AWQ model from: {repo}")
217
  elif quantization == "fp8":
218
  # Try FP8 quantization if available (faster than AWQ)
219
  try:
@@ -305,7 +328,8 @@ def load_pipeline(model_name: str):
305
  return PIPELINES[model_name]
306
 
307
  repo = MODELS[model_name]["repo_id"]
308
- tokenizer = get_tokenizer(repo)
 
309
 
310
  # Try AWQ first if available (Transformers fallback path)
311
  if AWQ_AVAILABLE:
 
16
 
17
  # Ensure CUDA is visible to vLLM on ZeroGPU
18
  # vLLM needs explicit CUDA device configuration
19
+ # ZeroGPU uses MIG UUIDs, but vLLM needs numeric device index
20
  if torch.cuda.is_available():
21
+ # Set CUDA_VISIBLE_DEVICES if not already set or if it's a MIG UUID
22
+ cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
23
+ if not cuda_visible or not cuda_visible.isdigit():
24
+ # If CUDA_VISIBLE_DEVICES is a MIG UUID or empty, use "0" for single GPU
25
  os.environ["CUDA_VISIBLE_DEVICES"] = "0"
26
  print(f"CUDA detected: {torch.cuda.get_device_name(0)}")
27
  print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
 
109
  MODELS = {
110
  "Router-Qwen3-32B-AWQ": {
111
  "repo_id": "Alovestocode/router-qwen3-32b-merged-awq", # AWQ quantized model
112
+ "tokenizer_repo": "Alovestocode/router-qwen3-32b-merged", # Tokenizer from original repo
113
  "description": "Router checkpoint on Qwen3 32B merged, optimized with AWQ quantization via vLLM.",
114
  "params_b": 32.0,
115
  "quantization": "awq", # vLLM will auto-detect AWQ
116
  },
117
  "Router-Gemma3-27B-AWQ": {
118
  "repo_id": "Alovestocode/router-gemma3-merged-awq", # AWQ quantized model
119
+ "tokenizer_repo": "Alovestocode/router-gemma3-merged", # Tokenizer from original repo
120
  "description": "Router checkpoint on Gemma3 27B merged, optimized with AWQ quantization via vLLM.",
121
  "params_b": 27.0,
122
  "quantization": "awq", # vLLM will auto-detect AWQ
 
143
  TOOL_PATTERN = re.compile(r"^/[a-z0-9_-]+\(.*\)$", re.IGNORECASE)
144
 
145
 
146
+ def get_tokenizer(repo: str, tokenizer_repo: str = None):
147
+ """Get tokenizer, preferring tokenizer_repo if provided (for AWQ models)."""
148
+ # Use tokenizer_repo if provided (for AWQ models where tokenizer is in original repo)
149
+ actual_repo = tokenizer_repo if tokenizer_repo else repo
150
+ tok = TOKENIZER_CACHE.get(actual_repo)
151
  if tok is not None:
152
  return tok
153
  tok = AutoTokenizer.from_pretrained(
154
+ actual_repo,
155
  token=HF_TOKEN,
156
  use_fast=True,
157
  trust_remote_code=True
 
160
  tok.truncation_side = "left"
161
  if tok.pad_token_id is None and tok.eos_token_id is not None:
162
  tok.pad_token_id = tok.eos_token_id
163
+ TOKENIZER_CACHE[actual_repo] = tok
164
  return tok
165
 
166
 
 
169
  if model_name in VLLM_MODELS:
170
  return VLLM_MODELS[model_name]
171
 
 
172
  model_config = MODELS[model_name]
173
+ repo = model_config["repo_id"]
174
  quantization = model_config.get("quantization", None)
175
 
176
+ # For AWQ models, files are in the 'default' subfolder
177
+ # vLLM needs to point to the actual model location
178
+ # Since files are in default/, we need to use the full path: repo/default
179
+ if quantization == "awq":
180
+ # AWQ models from LLM Compressor have files in default/ subfolder
181
+ # Point vLLM directly to the default/ subfolder where model files are located
182
+ model_path = f"{repo}/default"
183
+ print(f"Loading {model_path} with vLLM (AWQ quantization, files in default/ subfolder)...")
184
+ else:
185
+ model_path = repo
186
+ print(f"Loading {model_path} with vLLM (quantization: {quantization})...")
187
 
188
  try:
189
  # Detect device explicitly for vLLM
 
199
  # vLLM natively supports AWQ via llm-compressor (replaces deprecated AutoAWQ)
200
  # Note: HF_TOKEN is passed via environment variable, not as a parameter
201
  # vLLM auto-detects CUDA from torch.cuda.is_available() and CUDA_VISIBLE_DEVICES
202
+ # For AWQ models with files in default/ subfolder, vLLM should auto-detect via quantization_config.json
203
  llm_kwargs = {
204
+ "model": model_path, # Use model_path which may point to default/ subfolder
205
  "trust_remote_code": True,
206
  "dtype": "bfloat16", # Prefer bf16 over int8 for speed
207
  "gpu_memory_utilization": 0.90, # Leave headroom for KV cache
 
212
  "enable_prefix_caching": True, # Cache prompts for faster TTFT
213
  }
214
 
215
+ # Ensure CUDA_VISIBLE_DEVICES is set correctly for vLLM device detection
216
+ # ZeroGPU uses MIG UUIDs, but vLLM needs numeric device index
217
+ cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
218
+ if not cuda_visible or not cuda_visible.isdigit():
219
+ # If CUDA_VISIBLE_DEVICES is a MIG UUID or empty, use "0" for single GPU
220
  os.environ["CUDA_VISIBLE_DEVICES"] = "0"
221
+ print(f" β†’ Set CUDA_VISIBLE_DEVICES=0 (was: {cuda_visible})")
222
 
223
  # Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
224
  if quantization == "awq":
225
  llm_kwargs["quantization"] = "awq"
226
+ # AWQ model files are in the 'default' subfolder
227
+ # vLLM should auto-detect this via quantization_config.json at repo root
228
+ # If auto-detection fails, we can explicitly point to default/ subfolder
229
  # Enable FP8 KV cache for 50% memory reduction (allows longer contexts)
230
  # FP8 KV cache is compatible with AWQ quantization
231
  try:
232
  llm_kwargs["kv_cache_dtype"] = "fp8"
233
  print(f" β†’ AWQ quantization + FP8 KV cache enabled (vLLM native support)")
234
  print(f" β†’ FP8 KV cache reduces memory by ~50%, enabling longer contexts")
235
+ print(f" β†’ Loading AWQ model from: {model_path} (files in default/ subfolder)")
236
  except Exception:
237
  # Fallback if FP8 KV cache not supported
238
  print(f" β†’ AWQ quantization enabled (FP8 KV cache not available)")
239
+ print(f" β†’ Loading AWQ model from: {model_path} (files in default/ subfolder)")
240
  elif quantization == "fp8":
241
  # Try FP8 quantization if available (faster than AWQ)
242
  try:
 
328
  return PIPELINES[model_name]
329
 
330
  repo = MODELS[model_name]["repo_id"]
331
+ tokenizer_repo = MODELS[model_name].get("tokenizer_repo", None)
332
+ tokenizer = get_tokenizer(repo, tokenizer_repo=tokenizer_repo)
333
 
334
  # Try AWQ first if available (Transformers fallback path)
335
  if AWQ_AVAILABLE: