Alikestocode committed on
Commit
ae07f77
·
1 Parent(s): 808203f

Replace AutoAWQ with LLM Compressor (vLLM native) in Colab notebook

Browse files

- Use llm-compressor instead of autoawq for quantization
- LLM Compressor is vLLM's native tool with better integration
- Simplified quantization pipeline using oneshot() function
- Updated verification to prefer vLLM over Transformers
- Better compatibility with vLLM inference engine

Files changed (1) hide show
  1. quantize_to_awq_colab.ipynb +176 -108
quantize_to_awq_colab.ipynb CHANGED
@@ -4,15 +4,21 @@
4
  "cell_type": "markdown",
5
  "metadata": {},
6
  "source": [
7
- "# Router Models AWQ Quantization\n",
8
  "\n",
9
- "This notebook quantizes the CourseGPT-Pro router models to AWQ (Activation-aware Weight Quantization) format for efficient inference.\n",
10
  "\n",
11
  "**Models to quantize:**\n",
12
  "- `Alovestocode/router-gemma3-merged` (27B)\n",
13
  "- `Alovestocode/router-qwen3-32b-merged` (33B)\n",
14
  "\n",
15
- "**Output:** AWQ-quantized models ready for vLLM or Transformers inference.\n"
 
 
 
 
 
 
16
  ]
17
  },
18
  {
@@ -29,7 +35,8 @@
29
  "outputs": [],
30
  "source": [
31
  "# Install required packages\n",
32
- "%pip install -q autoawq transformers accelerate huggingface_hub\n",
 
33
  "%pip install -q torch --index-url https://download.pytorch.org/whl/cu118\n",
34
  "\n",
35
  "# Utility function to check disk space\n",
@@ -117,12 +124,15 @@
117
  "metadata": {},
118
  "outputs": [],
119
  "source": [
120
- "from awq import AutoAWQForCausalLM\n",
 
 
121
  "from transformers import AutoTokenizer\n",
122
- "from huggingface_hub import HfApi, scan_cache_dir, delete_revisions\n",
123
  "import torch\n",
124
  "import shutil\n",
125
  "import gc\n",
 
126
  "\n",
127
  "def quantize_model_to_awq(\n",
128
  " model_name: str,\n",
@@ -132,7 +142,7 @@
132
  " awq_config: dict,\n",
133
  " calibration_dataset_size: int = 128\n",
134
  "):\n",
135
- " \"\"\"Quantize a model to AWQ format.\n",
136
  " \n",
137
  " Args:\n",
138
  " model_name: Display name for the model\n",
@@ -143,42 +153,26 @@
143
  " calibration_dataset_size: Number of calibration samples\n",
144
  " \"\"\"\n",
145
  " print(f\"\\n{'='*60}\")\n",
146
- " print(f\"Quantizing {model_name}\")\n",
147
  " print(f\"Source: {repo_id}\")\n",
148
  " print(f\"Destination: {output_repo}\")\n",
149
  " print(f\"{'='*60}\\n\")\n",
150
  " \n",
151
- " # Step 1: Load tokenizer\n",
152
- " print(f\"[1/5] Loading tokenizer from {repo_id}...\")\n",
153
- " tokenizer = AutoTokenizer.from_pretrained(\n",
154
- " repo_id,\n",
155
- " trust_remote_code=True,\n",
156
- " token=os.environ.get(\"HF_TOKEN\")\n",
157
- " )\n",
158
- " print(f\"✅ Tokenizer loaded\")\n",
159
- " \n",
160
- " # Step 2: Load model\n",
161
- " print(f\"\\n[2/5] Loading model from {repo_id}...\")\n",
162
- " print(\"⚠️ This may take several minutes and requires significant GPU memory...\")\n",
163
- " \n",
164
- " # Check disk space before loading\n",
165
  " free_space_before = check_disk_space()\n",
166
  " if free_space_before < 30:\n",
167
- " print(f\"⚠️ WARNING: Low disk space ({free_space_before:.2f} GB). Model loading may fail.\")\n",
168
  " \n",
169
- " model = AutoAWQForCausalLM.from_pretrained(\n",
170
- " repo_id,\n",
171
- " device_map=\"auto\",\n",
172
- " trust_remote_code=True,\n",
173
- " token=os.environ.get(\"HF_TOKEN\")\n",
174
- " )\n",
175
- " print(f\"✅ Model loaded\")\n",
176
  " \n",
177
- " # Step 3: Prepare calibration dataset\n",
178
- " print(f\"\\n[3/5] Preparing calibration dataset ({calibration_dataset_size} samples)...\")\n",
179
  " \n",
180
- " # Create a simple calibration dataset\n",
181
- " # You can customize this based on your use case\n",
182
  " calibration_texts = [\n",
183
  " \"You are the Router Agent coordinating Math, Code, and General-Search specialists.\",\n",
184
  " \"Emit EXACTLY ONE strict JSON object with keys route_plan, route_rationale, expected_artifacts,\",\n",
@@ -195,34 +189,43 @@
195
  " calibration_texts.extend(calibration_texts[:calibration_dataset_size - len(calibration_texts)])\n",
196
  " \n",
197
  " calibration_texts = calibration_texts[:calibration_dataset_size]\n",
198
- " \n",
199
- " # Tokenize calibration data\n",
200
- " def tokenize_function(texts):\n",
201
- " return tokenizer(\n",
202
- " texts,\n",
203
- " return_tensors=\"pt\",\n",
204
- " padding=True,\n",
205
- " truncation=True,\n",
206
- " max_length=512\n",
207
- " )\n",
208
- " \n",
209
- " calibration_data = tokenize_function(calibration_texts)\n",
210
  " print(f\"✅ Calibration dataset prepared: {len(calibration_texts)} samples\")\n",
211
  " \n",
212
- " # Step 4: Quantize model\n",
213
- " print(f\"\\n[4/5] Quantizing model to AWQ (this may take 30-60 minutes)...\")\n",
214
  " print(f\"Config: {awq_config}\")\n",
 
215
  " \n",
216
- " model.quantize(\n",
217
- " tokenizer,\n",
218
- " quant_config=awq_config,\n",
219
- " calib_data=calibration_data\n",
220
- " )\n",
221
- " \n",
222
- " print(f\"✅ Model quantized to AWQ\")\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  " \n",
224
- " # Step 5: Save quantized model\n",
225
- " print(f\"\\n[5/5] Saving quantized model to {output_repo}...\")\n",
226
  " \n",
227
  " # Create repo if it doesn't exist\n",
228
  " api = HfApi()\n",
@@ -233,28 +236,38 @@
233
  " exist_ok=True,\n",
234
  " token=os.environ.get(\"HF_TOKEN\")\n",
235
  " )\n",
 
236
  " except Exception as e:\n",
237
  " print(f\"Note: Repo may already exist: {e}\")\n",
238
  " \n",
239
- " # Save model\n",
240
- " model.save_quantized(\n",
241
- " output_repo,\n",
242
- " safetensors=True,\n",
243
- " shard_size=\"10GB\" # Shard large models\n",
244
- " )\n",
245
- " \n",
246
- " # Upload tokenizer\n",
247
- " tokenizer.save_pretrained(output_repo)\n",
 
 
 
 
 
 
248
  " \n",
249
- " print(f\"✅ Quantized model saved to {output_repo}\")\n",
 
250
  " \n",
251
- " # Step 6: Clean up to free disk space (critical for Colab)\n",
252
- " print(f\"\\n[6/6] Cleaning up local files to free disk space...\")\n",
 
 
 
 
 
253
  " \n",
254
  " # Free GPU memory\n",
255
- " del model\n",
256
- " del tokenizer\n",
257
- " del calibration_data\n",
258
  " torch.cuda.empty_cache()\n",
259
  " gc.collect()\n",
260
  " \n",
@@ -282,7 +295,8 @@
282
  " \n",
283
  " print(f\"\\n✅ {model_name} quantization complete!\")\n",
284
  " print(f\"Model available at: https://huggingface.co/{output_repo}\")\n",
285
- " print(f\"💾 Local model files deleted to save disk space\")\n"
 
286
  ]
287
  },
288
  {
@@ -342,63 +356,117 @@
342
  "metadata": {},
343
  "outputs": [],
344
  "source": [
 
345
  "from transformers import AutoTokenizer\n",
346
- "from awq import AutoAWQForCausalLM\n",
347
  "\n",
348
- "def verify_awq_model(repo_id: str):\n",
349
- " \"\"\"Verify that an AWQ model can be loaded correctly.\"\"\"\n",
350
- " print(f\"\\nVerifying {repo_id}...\")\n",
351
  " \n",
352
  " try:\n",
353
- " # Load tokenizer\n",
354
- " tokenizer = AutoTokenizer.from_pretrained(\n",
355
- " repo_id,\n",
356
- " trust_remote_code=True,\n",
357
- " token=os.environ.get(\"HF_TOKEN\")\n",
358
- " )\n",
359
  " \n",
360
- " # Load AWQ model\n",
361
- " model = AutoAWQForCausalLM.from_quantized(\n",
362
- " repo_id,\n",
363
- " fuse_layers=True,\n",
364
  " trust_remote_code=True,\n",
365
- " device_map=\"auto\",\n",
366
- " token=os.environ.get(\"HF_TOKEN\")\n",
367
  " )\n",
368
  " \n",
369
  " # Test generation\n",
370
- " test_prompt = \"You are the Router Agent. Test prompt.\"\n",
371
- " inputs = tokenizer(test_prompt, return_tensors=\"pt\").to(model.device)\n",
 
 
372
  " \n",
373
- " with torch.inference_mode():\n",
374
- " outputs = model.generate(\n",
375
- " **inputs,\n",
376
- " max_new_tokens=10,\n",
377
- " do_sample=False\n",
378
- " )\n",
379
  " \n",
380
- " generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
381
- " print(f\"✅ Model loads and generates correctly\")\n",
382
  " print(f\"Generated: {generated_text[:100]}...\")\n",
383
  " \n",
384
- " # Check model size\n",
385
- " total_params = sum(p.numel() for p in model.parameters())\n",
386
- " print(f\"Total parameters: {total_params / 1e9:.2f}B\")\n",
387
- " \n",
388
- " del model\n",
389
- " del tokenizer\n",
390
  " torch.cuda.empty_cache()\n",
391
  " \n",
392
  " return True\n",
393
  " except Exception as e:\n",
394
- " print(f\"❌ Verification failed: {e}\")\n",
395
  " import traceback\n",
396
  " traceback.print_exc()\n",
397
  " return False\n",
398
  "\n",
399
- "# Verify both models\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  "for model_key, model_info in MODELS_TO_QUANTIZE.items():\n",
401
- " verify_awq_model(model_info[\"output_repo\"])\n"
 
 
 
 
 
 
 
 
 
402
  ]
403
  },
404
  {
 
4
  "cell_type": "markdown",
5
  "metadata": {},
6
  "source": [
7
+ "# Router Models AWQ Quantization with LLM Compressor (vLLM Native)\n",
8
  "\n",
9
+ "This notebook quantizes the CourseGPT-Pro router models to AWQ (Activation-aware Weight Quantization) format using **LLM Compressor** - vLLM's native quantization tool.\n",
10
  "\n",
11
  "**Models to quantize:**\n",
12
  "- `Alovestocode/router-gemma3-merged` (27B)\n",
13
  "- `Alovestocode/router-qwen3-32b-merged` (33B)\n",
14
  "\n",
15
+ "**Output:** AWQ-quantized models ready for vLLM inference with optimal performance.\n",
16
+ "\n",
17
+ "**Why LLM Compressor?**\n",
18
+ "- Native vLLM integration (better compatibility)\n",
19
+ "- Supports advanced features (pruning, combined modifiers)\n",
20
+ "- Actively maintained by vLLM team\n",
21
+ "- Optimized for vLLM inference engine\n"
22
  ]
23
  },
24
  {
 
35
  "outputs": [],
36
  "source": [
37
  "# Install required packages\n",
38
+ "# LLM Compressor is vLLM's native quantization tool\n",
39
+ "%pip install -q llm-compressor transformers accelerate huggingface_hub\n",
40
  "%pip install -q torch --index-url https://download.pytorch.org/whl/cu118\n",
41
  "\n",
42
  "# Utility function to check disk space\n",
 
124
  "metadata": {},
125
  "outputs": [],
126
  "source": [
127
+ "# LLM Compressor (vLLM native quantization tool)\n",
128
+ "from llmcompressor import oneshot\n",
129
+ "from llmcompressor.modifiers.quantization import AWQModifier\n",
130
  "from transformers import AutoTokenizer\n",
131
+ "from huggingface_hub import HfApi, scan_cache_dir, delete_revisions, upload_folder\n",
132
  "import torch\n",
133
  "import shutil\n",
134
  "import gc\n",
135
+ "import os\n",
136
  "\n",
137
  "def quantize_model_to_awq(\n",
138
  " model_name: str,\n",
 
142
  " awq_config: dict,\n",
143
  " calibration_dataset_size: int = 128\n",
144
  "):\n",
145
+ " \"\"\"Quantize a model to AWQ format using LLM Compressor (vLLM native).\n",
146
  " \n",
147
  " Args:\n",
148
  " model_name: Display name for the model\n",
 
153
  " calibration_dataset_size: Number of calibration samples\n",
154
  " \"\"\"\n",
155
  " print(f\"\\n{'='*60}\")\n",
156
+ " print(f\"Quantizing {model_name} with LLM Compressor (vLLM native)\")\n",
157
  " print(f\"Source: {repo_id}\")\n",
158
  " print(f\"Destination: {output_repo}\")\n",
159
  " print(f\"{'='*60}\\n\")\n",
160
  " \n",
161
+ " # Check disk space before starting\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  " free_space_before = check_disk_space()\n",
163
  " if free_space_before < 30:\n",
164
+ " print(f\"⚠️ WARNING: Low disk space ({free_space_before:.2f} GB). Quantization may fail.\")\n",
165
  " \n",
166
+ " # Step 1: Create temporary output directory\n",
167
+ " import tempfile\n",
168
+ " temp_output_dir = f\"./temp_{model_name.replace('-', '_')}_awq\"\n",
169
+ " print(f\"[1/4] Creating temporary output directory: {temp_output_dir}\")\n",
170
+ " os.makedirs(temp_output_dir, exist_ok=True)\n",
 
 
171
  " \n",
172
+ " # Step 2: Prepare calibration dataset\n",
173
+ " print(f\"\\n[2/4] Preparing calibration dataset ({calibration_dataset_size} samples)...\")\n",
174
  " \n",
175
+ " # Create calibration dataset for router agent\n",
 
176
  " calibration_texts = [\n",
177
  " \"You are the Router Agent coordinating Math, Code, and General-Search specialists.\",\n",
178
  " \"Emit EXACTLY ONE strict JSON object with keys route_plan, route_rationale, expected_artifacts,\",\n",
 
189
  " calibration_texts.extend(calibration_texts[:calibration_dataset_size - len(calibration_texts)])\n",
190
  " \n",
191
  " calibration_texts = calibration_texts[:calibration_dataset_size]\n",
 
 
 
 
 
 
 
 
 
 
 
 
192
  " print(f\"✅ Calibration dataset prepared: {len(calibration_texts)} samples\")\n",
193
  " \n",
194
+ " # Step 3: Quantize model using LLM Compressor\n",
195
+ " print(f\"\\n[3/4] Quantizing model to AWQ with LLM Compressor (this may take 30-60 minutes)...\")\n",
196
  " print(f\"Config: {awq_config}\")\n",
197
+ " print(\"⚠️ LLM Compressor will load the model, quantize it, and save to local directory\")\n",
198
  " \n",
199
+ " try:\n",
200
+ " # LLM Compressor's oneshot function handles everything:\n",
201
+ " # - Loading the model\n",
202
+ " # - Quantization with calibration data\n",
203
+ " # - Saving quantized model\n",
204
+ " oneshot(\n",
205
+ " model=repo_id,\n",
206
+ " output_dir=temp_output_dir,\n",
207
+ " modifiers=[\n",
208
+ " AWQModifier(\n",
209
+ " w_bit=awq_config.get(\"w_bit\", 4),\n",
210
+ " q_group_size=awq_config.get(\"q_group_size\", 128),\n",
211
+ " zero_point=awq_config.get(\"zero_point\", True),\n",
212
+ " version=awq_config.get(\"version\", \"GEMM\")\n",
213
+ " )\n",
214
+ " ],\n",
215
+ " token=os.environ.get(\"HF_TOKEN\"),\n",
216
+ " # Calibration data can be passed as a list of strings\n",
217
+ " calibration_data=calibration_texts[:min(calibration_dataset_size, 128)] # Limit for efficiency\n",
218
+ " )\n",
219
+ " \n",
220
+ " print(f\"✅ Model quantized to AWQ\")\n",
221
+ " except Exception as e:\n",
222
+ " print(f\"❌ Quantization failed: {e}\")\n",
223
+ " import traceback\n",
224
+ " traceback.print_exc()\n",
225
+ " raise\n",
226
  " \n",
227
+ " # Step 4: Upload to Hugging Face\n",
228
+ " print(f\"\\n[4/4] Uploading quantized model to {output_repo}...\")\n",
229
  " \n",
230
  " # Create repo if it doesn't exist\n",
231
  " api = HfApi()\n",
 
236
  " exist_ok=True,\n",
237
  " token=os.environ.get(\"HF_TOKEN\")\n",
238
  " )\n",
239
+ " print(f\"✅ Repository ready: {output_repo}\")\n",
240
  " except Exception as e:\n",
241
  " print(f\"Note: Repo may already exist: {e}\")\n",
242
  " \n",
243
+ " # Upload the quantized model directory\n",
244
+ " try:\n",
245
+ " upload_folder(\n",
246
+ " folder_path=temp_output_dir,\n",
247
+ " repo_id=output_repo,\n",
248
+ " repo_type=\"model\",\n",
249
+ " token=os.environ.get(\"HF_TOKEN\"),\n",
250
+ " ignore_patterns=[\"*.pt\", \"*.bin\"] # Only upload safetensors\n",
251
+ " )\n",
252
+ " print(f\"✅ Quantized model uploaded to {output_repo}\")\n",
253
+ " except Exception as e:\n",
254
+ " print(f\"❌ Upload failed: {e}\")\n",
255
+ " import traceback\n",
256
+ " traceback.print_exc()\n",
257
+ " raise\n",
258
  " \n",
259
+ " # Step 5: Clean up to free disk space (critical for Colab)\n",
260
+ " print(f\"\\n[5/5] Cleaning up local files to free disk space...\")\n",
261
  " \n",
262
+ " # Delete temporary output directory\n",
263
+ " try:\n",
264
+ " import shutil\n",
265
+ " shutil.rmtree(temp_output_dir)\n",
266
+ " print(f\" ✅ Deleted temporary directory: {temp_output_dir}\")\n",
267
+ " except Exception as e:\n",
268
+ " print(f\" ⚠️ Could not delete temp directory: {e}\")\n",
269
  " \n",
270
  " # Free GPU memory\n",
 
 
 
271
  " torch.cuda.empty_cache()\n",
272
  " gc.collect()\n",
273
  " \n",
 
295
  " \n",
296
  " print(f\"\\n✅ {model_name} quantization complete!\")\n",
297
  " print(f\"Model available at: https://huggingface.co/{output_repo}\")\n",
298
+ " print(f\"💾 Local model files deleted to save disk space\")\n",
299
+ " print(f\"🚀 Model is ready for vLLM inference with optimal performance!\")\n"
300
  ]
301
  },
302
  {
 
356
  "metadata": {},
357
  "outputs": [],
358
  "source": [
359
+ "# Verify quantized models with vLLM (recommended) or Transformers\n",
360
  "from transformers import AutoTokenizer\n",
 
361
  "\n",
362
+ "def verify_awq_model_vllm(repo_id: str):\n",
363
+ " \"\"\"Verify AWQ model can be loaded with vLLM (recommended).\"\"\"\n",
364
+ " print(f\"\\nVerifying {repo_id} with vLLM...\")\n",
365
  " \n",
366
  " try:\n",
367
+ " # Try importing vLLM\n",
368
+ " try:\n",
369
+ " from vllm import LLM, SamplingParams\n",
370
+ " except ImportError:\n",
371
+ " print(\"⚠️ vLLM not available, skipping vLLM verification\")\n",
372
+ " return False\n",
373
  " \n",
374
+ " # Load with vLLM (auto-detects AWQ)\n",
375
+ " llm = LLM(\n",
376
+ " model=repo_id,\n",
377
+ " quantization=\"awq\",\n",
378
  " trust_remote_code=True,\n",
379
+ " token=os.environ.get(\"HF_TOKEN\"),\n",
380
+ " gpu_memory_utilization=0.5 # Lower for verification\n",
381
  " )\n",
382
  " \n",
383
  " # Test generation\n",
384
+ " sampling_params = SamplingParams(\n",
385
+ " temperature=0.0,\n",
386
+ " max_tokens=10\n",
387
+ " )\n",
388
  " \n",
389
+ " test_prompt = \"You are the Router Agent. Test prompt.\"\n",
390
+ " outputs = llm.generate([test_prompt], sampling_params)\n",
 
 
 
 
391
  " \n",
392
+ " generated_text = outputs[0].outputs[0].text\n",
393
+ " print(f\"✅ vLLM loads and generates correctly\")\n",
394
  " print(f\"Generated: {generated_text[:100]}...\")\n",
395
  " \n",
396
+ " del llm\n",
 
 
 
 
 
397
  " torch.cuda.empty_cache()\n",
398
  " \n",
399
  " return True\n",
400
  " except Exception as e:\n",
401
+ " print(f\"❌ vLLM verification failed: {e}\")\n",
402
  " import traceback\n",
403
  " traceback.print_exc()\n",
404
  " return False\n",
405
  "\n",
406
+ "def verify_awq_model_transformers(repo_id: str):\n",
407
+ " \"\"\"Verify AWQ model can be loaded with Transformers (fallback).\"\"\"\n",
408
+ " print(f\"\\nVerifying {repo_id} with Transformers...\")\n",
409
+ " \n",
410
+ " try:\n",
411
+ " # Load tokenizer\n",
412
+ " tokenizer = AutoTokenizer.from_pretrained(\n",
413
+ " repo_id,\n",
414
+ " trust_remote_code=True,\n",
415
+ " token=os.environ.get(\"HF_TOKEN\")\n",
416
+ " )\n",
417
+ " \n",
418
+ " # Try loading with AutoAWQ (if available)\n",
419
+ " try:\n",
420
+ " from awq import AutoAWQForCausalLM\n",
421
+ " model = AutoAWQForCausalLM.from_quantized(\n",
422
+ " repo_id,\n",
423
+ " fuse_layers=True,\n",
424
+ " trust_remote_code=True,\n",
425
+ " device_map=\"auto\",\n",
426
+ " token=os.environ.get(\"HF_TOKEN\")\n",
427
+ " )\n",
428
+ " \n",
429
+ " # Test generation\n",
430
+ " test_prompt = \"You are the Router Agent. Test prompt.\"\n",
431
+ " inputs = tokenizer(test_prompt, return_tensors=\"pt\").to(model.device)\n",
432
+ " \n",
433
+ " with torch.inference_mode():\n",
434
+ " outputs = model.generate(\n",
435
+ " **inputs,\n",
436
+ " max_new_tokens=10,\n",
437
+ " do_sample=False\n",
438
+ " )\n",
439
+ " \n",
440
+ " generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
441
+ " print(f\"✅ Transformers loads and generates correctly\")\n",
442
+ " print(f\"Generated: {generated_text[:100]}...\")\n",
443
+ " \n",
444
+ " del model\n",
445
+ " del tokenizer\n",
446
+ " torch.cuda.empty_cache()\n",
447
+ " \n",
448
+ " return True\n",
449
+ " except ImportError:\n",
450
+ " print(\"⚠️ AutoAWQ not available, skipping Transformers verification\")\n",
451
+ " return False\n",
452
+ " except Exception as e:\n",
453
+ " print(f\"❌ Transformers verification failed: {e}\")\n",
454
+ " import traceback\n",
455
+ " traceback.print_exc()\n",
456
+ " return False\n",
457
+ "\n",
458
+ "# Verify both models (prefer vLLM)\n",
459
  "for model_key, model_info in MODELS_TO_QUANTIZE.items():\n",
460
+ " print(f\"\\n{'='*60}\")\n",
461
+ " print(f\"Verifying {model_key}\")\n",
462
+ " print(f\"{'='*60}\")\n",
463
+ " \n",
464
+ " # Try vLLM first (recommended)\n",
465
+ " vllm_ok = verify_awq_model_vllm(model_info[\"output_repo\"])\n",
466
+ " \n",
467
+ " # Fallback to Transformers if vLLM not available\n",
468
+ " if not vllm_ok:\n",
469
+ " verify_awq_model_transformers(model_info[\"output_repo\"])\n"
470
  ]
471
  },
472
  {