Spaces:
Sleeping
Sleeping
Commit
·
a76dbfd
1
Parent(s):
dd11bd9
Fix AWQ model loading: point to default/ subfolder and fix tokenizer loading
Browse files
app.py
CHANGED
|
@@ -16,9 +16,12 @@ torch.backends.cuda.matmul.allow_tf32 = True
|
|
| 16 |
|
| 17 |
# Ensure CUDA is visible to vLLM on ZeroGPU
|
| 18 |
# vLLM needs explicit CUDA device configuration
|
|
|
|
| 19 |
if torch.cuda.is_available():
|
| 20 |
-
# Set CUDA_VISIBLE_DEVICES if not already set
|
| 21 |
-
|
|
|
|
|
|
|
| 22 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 23 |
print(f"CUDA detected: {torch.cuda.get_device_name(0)}")
|
| 24 |
print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
|
|
@@ -106,12 +109,14 @@ ROUTER_SYSTEM_PROMPT = """You are the Router Agent coordinating Math, Code, and
|
|
| 106 |
MODELS = {
|
| 107 |
"Router-Qwen3-32B-AWQ": {
|
| 108 |
"repo_id": "Alovestocode/router-qwen3-32b-merged-awq", # AWQ quantized model
|
|
|
|
| 109 |
"description": "Router checkpoint on Qwen3 32B merged, optimized with AWQ quantization via vLLM.",
|
| 110 |
"params_b": 32.0,
|
| 111 |
"quantization": "awq", # vLLM will auto-detect AWQ
|
| 112 |
},
|
| 113 |
"Router-Gemma3-27B-AWQ": {
|
| 114 |
"repo_id": "Alovestocode/router-gemma3-merged-awq", # AWQ quantized model
|
|
|
|
| 115 |
"description": "Router checkpoint on Gemma3 27B merged, optimized with AWQ quantization via vLLM.",
|
| 116 |
"params_b": 27.0,
|
| 117 |
"quantization": "awq", # vLLM will auto-detect AWQ
|
|
@@ -138,12 +143,15 @@ WARMED_REMAINING = False
|
|
| 138 |
TOOL_PATTERN = re.compile(r"^/[a-z0-9_-]+\(.*\)$", re.IGNORECASE)
|
| 139 |
|
| 140 |
|
| 141 |
-
def get_tokenizer(repo: str):
|
| 142 |
-
|
|
|
|
|
|
|
|
|
|
| 143 |
if tok is not None:
|
| 144 |
return tok
|
| 145 |
tok = AutoTokenizer.from_pretrained(
|
| 146 |
-
|
| 147 |
token=HF_TOKEN,
|
| 148 |
use_fast=True,
|
| 149 |
trust_remote_code=True
|
|
@@ -152,7 +160,7 @@ def get_tokenizer(repo: str):
|
|
| 152 |
tok.truncation_side = "left"
|
| 153 |
if tok.pad_token_id is None and tok.eos_token_id is not None:
|
| 154 |
tok.pad_token_id = tok.eos_token_id
|
| 155 |
-
TOKENIZER_CACHE[
|
| 156 |
return tok
|
| 157 |
|
| 158 |
|
|
@@ -161,11 +169,21 @@ def load_vllm_model(model_name: str):
|
|
| 161 |
if model_name in VLLM_MODELS:
|
| 162 |
return VLLM_MODELS[model_name]
|
| 163 |
|
| 164 |
-
repo = MODELS[model_name]["repo_id"]
|
| 165 |
model_config = MODELS[model_name]
|
|
|
|
| 166 |
quantization = model_config.get("quantization", None)
|
| 167 |
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
try:
|
| 171 |
# Detect device explicitly for vLLM
|
|
@@ -181,8 +199,9 @@ def load_vllm_model(model_name: str):
|
|
| 181 |
# vLLM natively supports AWQ via llm-compressor (replaces deprecated AutoAWQ)
|
| 182 |
# Note: HF_TOKEN is passed via environment variable, not as a parameter
|
| 183 |
# vLLM auto-detects CUDA from torch.cuda.is_available() and CUDA_VISIBLE_DEVICES
|
|
|
|
| 184 |
llm_kwargs = {
|
| 185 |
-
"model":
|
| 186 |
"trust_remote_code": True,
|
| 187 |
"dtype": "bfloat16", # Prefer bf16 over int8 for speed
|
| 188 |
"gpu_memory_utilization": 0.90, # Leave headroom for KV cache
|
|
@@ -193,27 +212,31 @@ def load_vllm_model(model_name: str):
|
|
| 193 |
"enable_prefix_caching": True, # Cache prompts for faster TTFT
|
| 194 |
}
|
| 195 |
|
| 196 |
-
# Ensure CUDA_VISIBLE_DEVICES is set for vLLM device detection
|
| 197 |
-
|
|
|
|
|
|
|
|
|
|
| 198 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
|
|
|
| 199 |
|
| 200 |
# Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
|
| 201 |
if quantization == "awq":
|
| 202 |
llm_kwargs["quantization"] = "awq"
|
| 203 |
-
#
|
| 204 |
-
#
|
| 205 |
-
#
|
| 206 |
# Enable FP8 KV cache for 50% memory reduction (allows longer contexts)
|
| 207 |
# FP8 KV cache is compatible with AWQ quantization
|
| 208 |
try:
|
| 209 |
llm_kwargs["kv_cache_dtype"] = "fp8"
|
| 210 |
print(f" β AWQ quantization + FP8 KV cache enabled (vLLM native support)")
|
| 211 |
print(f" β FP8 KV cache reduces memory by ~50%, enabling longer contexts")
|
| 212 |
-
print(f" β Loading AWQ model from: {
|
| 213 |
except Exception:
|
| 214 |
# Fallback if FP8 KV cache not supported
|
| 215 |
print(f" β AWQ quantization enabled (FP8 KV cache not available)")
|
| 216 |
-
print(f" β Loading AWQ model from: {
|
| 217 |
elif quantization == "fp8":
|
| 218 |
# Try FP8 quantization if available (faster than AWQ)
|
| 219 |
try:
|
|
@@ -305,7 +328,8 @@ def load_pipeline(model_name: str):
|
|
| 305 |
return PIPELINES[model_name]
|
| 306 |
|
| 307 |
repo = MODELS[model_name]["repo_id"]
|
| 308 |
-
|
|
|
|
| 309 |
|
| 310 |
# Try AWQ first if available (Transformers fallback path)
|
| 311 |
if AWQ_AVAILABLE:
|
|
|
|
| 16 |
|
| 17 |
# Ensure CUDA is visible to vLLM on ZeroGPU
|
| 18 |
# vLLM needs explicit CUDA device configuration
|
| 19 |
+
# ZeroGPU uses MIG UUIDs, but vLLM needs numeric device index
|
| 20 |
if torch.cuda.is_available():
|
| 21 |
+
# Set CUDA_VISIBLE_DEVICES if not already set or if it's a MIG UUID
|
| 22 |
+
cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
|
| 23 |
+
if not cuda_visible or not cuda_visible.isdigit():
|
| 24 |
+
# If CUDA_VISIBLE_DEVICES is a MIG UUID or empty, use "0" for single GPU
|
| 25 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 26 |
print(f"CUDA detected: {torch.cuda.get_device_name(0)}")
|
| 27 |
print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
|
|
|
|
| 109 |
MODELS = {
|
| 110 |
"Router-Qwen3-32B-AWQ": {
|
| 111 |
"repo_id": "Alovestocode/router-qwen3-32b-merged-awq", # AWQ quantized model
|
| 112 |
+
"tokenizer_repo": "Alovestocode/router-qwen3-32b-merged", # Tokenizer from original repo
|
| 113 |
"description": "Router checkpoint on Qwen3 32B merged, optimized with AWQ quantization via vLLM.",
|
| 114 |
"params_b": 32.0,
|
| 115 |
"quantization": "awq", # vLLM will auto-detect AWQ
|
| 116 |
},
|
| 117 |
"Router-Gemma3-27B-AWQ": {
|
| 118 |
"repo_id": "Alovestocode/router-gemma3-merged-awq", # AWQ quantized model
|
| 119 |
+
"tokenizer_repo": "Alovestocode/router-gemma3-merged", # Tokenizer from original repo
|
| 120 |
"description": "Router checkpoint on Gemma3 27B merged, optimized with AWQ quantization via vLLM.",
|
| 121 |
"params_b": 27.0,
|
| 122 |
"quantization": "awq", # vLLM will auto-detect AWQ
|
|
|
|
| 143 |
TOOL_PATTERN = re.compile(r"^/[a-z0-9_-]+\(.*\)$", re.IGNORECASE)
|
| 144 |
|
| 145 |
|
| 146 |
+
def get_tokenizer(repo: str, tokenizer_repo: str = None):
|
| 147 |
+
"""Get tokenizer, preferring tokenizer_repo if provided (for AWQ models)."""
|
| 148 |
+
# Use tokenizer_repo if provided (for AWQ models where tokenizer is in original repo)
|
| 149 |
+
actual_repo = tokenizer_repo if tokenizer_repo else repo
|
| 150 |
+
tok = TOKENIZER_CACHE.get(actual_repo)
|
| 151 |
if tok is not None:
|
| 152 |
return tok
|
| 153 |
tok = AutoTokenizer.from_pretrained(
|
| 154 |
+
actual_repo,
|
| 155 |
token=HF_TOKEN,
|
| 156 |
use_fast=True,
|
| 157 |
trust_remote_code=True
|
|
|
|
| 160 |
tok.truncation_side = "left"
|
| 161 |
if tok.pad_token_id is None and tok.eos_token_id is not None:
|
| 162 |
tok.pad_token_id = tok.eos_token_id
|
| 163 |
+
TOKENIZER_CACHE[actual_repo] = tok
|
| 164 |
return tok
|
| 165 |
|
| 166 |
|
|
|
|
| 169 |
if model_name in VLLM_MODELS:
|
| 170 |
return VLLM_MODELS[model_name]
|
| 171 |
|
|
|
|
| 172 |
model_config = MODELS[model_name]
|
| 173 |
+
repo = model_config["repo_id"]
|
| 174 |
quantization = model_config.get("quantization", None)
|
| 175 |
|
| 176 |
+
# For AWQ models, files are in the 'default' subfolder
|
| 177 |
+
# vLLM needs to point to the actual model location
|
| 178 |
+
# Since files are in default/, we need to use the full path: repo/default
|
| 179 |
+
if quantization == "awq":
|
| 180 |
+
# AWQ models from LLM Compressor have files in default/ subfolder
|
| 181 |
+
# Point vLLM directly to the default/ subfolder where model files are located
|
| 182 |
+
model_path = f"{repo}/default"
|
| 183 |
+
print(f"Loading {model_path} with vLLM (AWQ quantization, files in default/ subfolder)...")
|
| 184 |
+
else:
|
| 185 |
+
model_path = repo
|
| 186 |
+
print(f"Loading {model_path} with vLLM (quantization: {quantization})...")
|
| 187 |
|
| 188 |
try:
|
| 189 |
# Detect device explicitly for vLLM
|
|
|
|
| 199 |
# vLLM natively supports AWQ via llm-compressor (replaces deprecated AutoAWQ)
|
| 200 |
# Note: HF_TOKEN is passed via environment variable, not as a parameter
|
| 201 |
# vLLM auto-detects CUDA from torch.cuda.is_available() and CUDA_VISIBLE_DEVICES
|
| 202 |
+
# For AWQ models with files in default/ subfolder, vLLM should auto-detect via quantization_config.json
|
| 203 |
llm_kwargs = {
|
| 204 |
+
"model": model_path, # Use model_path which may point to default/ subfolder
|
| 205 |
"trust_remote_code": True,
|
| 206 |
"dtype": "bfloat16", # Prefer bf16 over int8 for speed
|
| 207 |
"gpu_memory_utilization": 0.90, # Leave headroom for KV cache
|
|
|
|
| 212 |
"enable_prefix_caching": True, # Cache prompts for faster TTFT
|
| 213 |
}
|
| 214 |
|
| 215 |
+
# Ensure CUDA_VISIBLE_DEVICES is set correctly for vLLM device detection
|
| 216 |
+
# ZeroGPU uses MIG UUIDs, but vLLM needs numeric device index
|
| 217 |
+
cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
|
| 218 |
+
if not cuda_visible or not cuda_visible.isdigit():
|
| 219 |
+
# If CUDA_VISIBLE_DEVICES is a MIG UUID or empty, use "0" for single GPU
|
| 220 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 221 |
+
print(f" β Set CUDA_VISIBLE_DEVICES=0 (was: {cuda_visible})")
|
| 222 |
|
| 223 |
# Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
|
| 224 |
if quantization == "awq":
|
| 225 |
llm_kwargs["quantization"] = "awq"
|
| 226 |
+
# AWQ model files are in the 'default' subfolder
|
| 227 |
+
# vLLM should auto-detect this via quantization_config.json at repo root
|
| 228 |
+
# If auto-detection fails, we can explicitly point to default/ subfolder
|
| 229 |
# Enable FP8 KV cache for 50% memory reduction (allows longer contexts)
|
| 230 |
# FP8 KV cache is compatible with AWQ quantization
|
| 231 |
try:
|
| 232 |
llm_kwargs["kv_cache_dtype"] = "fp8"
|
| 233 |
print(f" β AWQ quantization + FP8 KV cache enabled (vLLM native support)")
|
| 234 |
print(f" β FP8 KV cache reduces memory by ~50%, enabling longer contexts")
|
| 235 |
+
print(f" β Loading AWQ model from: {model_path} (files in default/ subfolder)")
|
| 236 |
except Exception:
|
| 237 |
# Fallback if FP8 KV cache not supported
|
| 238 |
print(f" β AWQ quantization enabled (FP8 KV cache not available)")
|
| 239 |
+
print(f" β Loading AWQ model from: {model_path} (files in default/ subfolder)")
|
| 240 |
elif quantization == "fp8":
|
| 241 |
# Try FP8 quantization if available (faster than AWQ)
|
| 242 |
try:
|
|
|
|
| 328 |
return PIPELINES[model_name]
|
| 329 |
|
| 330 |
repo = MODELS[model_name]["repo_id"]
|
| 331 |
+
tokenizer_repo = MODELS[model_name].get("tokenizer_repo", None)
|
| 332 |
+
tokenizer = get_tokenizer(repo, tokenizer_repo=tokenizer_repo)
|
| 333 |
|
| 334 |
# Try AWQ first if available (Transformers fallback path)
|
| 335 |
if AWQ_AVAILABLE:
|