Commit ae07f77 · Parent: 808203f
Replace AutoAWQ with LLM Compressor (vLLM native) in Colab notebook

- Use llm-compressor instead of autoawq for quantization
- LLM Compressor is vLLM's native tool with better integration
- Simplified quantization pipeline using oneshot() function (see the standalone sketch below)
- Updated verification to prefer vLLM over Transformers
- Better compatibility with vLLM inference engine

Files changed: quantize_to_awq_colab.ipynb CHANGED (+176 -108)
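For quick reference, the sketch below condenses the new pipeline into a standalone script. Everything in it is copied from the diff that follows; note that the llm-compressor import path and the oneshot() argument names (modifiers=, calibration_data=) are taken from this commit and should be treated as assumptions, since the library's API has shifted across releases.

    # Condensed sketch of the quantization flow as this commit writes it.
    # Import path and oneshot() argument names follow the notebook's own
    # cells; llm-compressor's API varies by release, so treat modifiers=
    # and calibration_data= as assumptions and check the installed version.
    import os
    from llmcompressor import oneshot
    from llmcompressor.modifiers.quantization import AWQModifier

    # A real run would use ~128 task-representative calibration samples.
    calibration_texts = [
        "You are the Router Agent coordinating Math, Code, and General-Search specialists.",
    ]

    oneshot(
        model="Alovestocode/router-gemma3-merged",  # source checkpoint
        output_dir="./temp_router_gemma3_awq",      # where the AWQ weights land
        modifiers=[AWQModifier(w_bit=4, q_group_size=128, zero_point=True, version="GEMM")],
        token=os.environ.get("HF_TOKEN"),
        calibration_data=calibration_texts,
    )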
@@ -4,15 +4,21 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-  "# Router Models AWQ Quantization\n",
+  "# Router Models AWQ Quantization with LLM Compressor (vLLM Native)\n",
   "\n",
-  "This notebook quantizes the CourseGPT-Pro router models to AWQ (Activation-aware Weight Quantization) format …
+  "This notebook quantizes the CourseGPT-Pro router models to AWQ (Activation-aware Weight Quantization) format using **LLM Compressor** - vLLM's native quantization tool.\n",
   "\n",
   "**Models to quantize:**\n",
   "- `Alovestocode/router-gemma3-merged` (27B)\n",
   "- `Alovestocode/router-qwen3-32b-merged` (33B)\n",
   "\n",
-  "**Output:** AWQ-quantized models ready for vLLM …
+  "**Output:** AWQ-quantized models ready for vLLM inference with optimal performance.\n",
+  "\n",
+  "**Why LLM Compressor?**\n",
+  "- Native vLLM integration (better compatibility)\n",
+  "- Supports advanced features (pruning, combined modifiers)\n",
+  "- Actively maintained by vLLM team\n",
+  "- Optimized for vLLM inference engine\n"
   ]
  },
  {
@@ -29,7 +35,8 @@
   "outputs": [],
   "source": [
   "# Install required packages\n",
-  " …
+  "# LLM Compressor is vLLM's native quantization tool\n",
+  "%pip install -q llm-compressor transformers accelerate huggingface_hub\n",
   "%pip install -q torch --index-url https://download.pytorch.org/whl/cu118\n",
   "\n",
   "# Utility function to check disk space\n",
@@ -117,12 +124,15 @@
   "metadata": {},
   "outputs": [],
   "source": [
-  " …
+  "# LLM Compressor (vLLM native quantization tool)\n",
+  "from llmcompressor import oneshot\n",
+  "from llmcompressor.modifiers.quantization import AWQModifier\n",
   "from transformers import AutoTokenizer\n",
-  "from huggingface_hub import HfApi, scan_cache_dir, delete_revisions\n",
+  "from huggingface_hub import HfApi, scan_cache_dir, delete_revisions, upload_folder\n",
   "import torch\n",
   "import shutil\n",
   "import gc\n",
+  "import os\n",
   "\n",
   "def quantize_model_to_awq(\n",
   " model_name: str,\n",
@@ -132,7 +142,7 @@
   " awq_config: dict,\n",
   " calibration_dataset_size: int = 128\n",
   "):\n",
-  " \"\"\"Quantize a model to AWQ format.\n",
+  " \"\"\"Quantize a model to AWQ format using LLM Compressor (vLLM native).\n",
   " \n",
   " Args:\n",
   " model_name: Display name for the model\n",
@@ -143,42 +153,26 @@
   " calibration_dataset_size: Number of calibration samples\n",
   " \"\"\"\n",
   " print(f\"\\n{'='*60}\")\n",
-  " print(f\"Quantizing {model_name}\")\n",
+  " print(f\"Quantizing {model_name} with LLM Compressor (vLLM native)\")\n",
   " print(f\"Source: {repo_id}\")\n",
   " print(f\"Destination: {output_repo}\")\n",
   " print(f\"{'='*60}\\n\")\n",
   " \n",
-  " # …
-  " print(f\"[1/5] Loading tokenizer from {repo_id}...\")\n",
-  " tokenizer = AutoTokenizer.from_pretrained(\n",
-  " repo_id,\n",
-  " trust_remote_code=True,\n",
-  " token=os.environ.get(\"HF_TOKEN\")\n",
-  " )\n",
-  " print(f\"✅ Tokenizer loaded\")\n",
-  " \n",
-  " # Step 2: Load model\n",
-  " print(f\"\\n[2/5] Loading model from {repo_id}...\")\n",
-  " print(\"⚠️ This may take several minutes and requires significant GPU memory...\")\n",
-  " \n",
-  " # Check disk space before loading\n",
+  " # Check disk space before starting\n",
   " free_space_before = check_disk_space()\n",
   " if free_space_before < 30:\n",
-  " print(f\"⚠️ WARNING: Low disk space ({free_space_before:.2f} GB). …
+  " print(f\"⚠️ WARNING: Low disk space ({free_space_before:.2f} GB). Quantization may fail.\")\n",
   " \n",
-  […5 lines truncated in this view…]
-  " )\n",
-  " print(f\"✅ Model loaded\")\n",
+  " # Step 1: Create temporary output directory\n",
+  " import tempfile\n",
+  " temp_output_dir = f\"./temp_{model_name.replace('-', '_')}_awq\"\n",
+  " print(f\"[1/4] Creating temporary output directory: {temp_output_dir}\")\n",
+  " os.makedirs(temp_output_dir, exist_ok=True)\n",
   " \n",
-  " # Step …
-  " print(f\"\\n[ …
+  " # Step 2: Prepare calibration dataset\n",
+  " print(f\"\\n[2/4] Preparing calibration dataset ({calibration_dataset_size} samples)...\")\n",
   " \n",
-  " # Create …
-  " # You can customize this based on your use case\n",
+  " # Create calibration dataset for router agent\n",
   " calibration_texts = [\n",
   " \"You are the Router Agent coordinating Math, Code, and General-Search specialists.\",\n",
   " \"Emit EXACTLY ONE strict JSON object with keys route_plan, route_rationale, expected_artifacts,\",\n",
@@ -195,34 +189,43 @@
   " calibration_texts.extend(calibration_texts[:calibration_dataset_size - len(calibration_texts)])\n",
   " \n",
   " calibration_texts = calibration_texts[:calibration_dataset_size]\n",
-  " \n",
-  " # Tokenize calibration data\n",
-  " def tokenize_function(texts):\n",
-  " return tokenizer(\n",
-  " texts,\n",
-  " return_tensors=\"pt\",\n",
-  " padding=True,\n",
-  " truncation=True,\n",
-  " max_length=512\n",
-  " )\n",
-  " \n",
-  " calibration_data = tokenize_function(calibration_texts)\n",
   " print(f\"✅ Calibration dataset prepared: {len(calibration_texts)} samples\")\n",
   " \n",
-  " # Step …
-  " print(f\"\\n[4 …
+  " # Step 3: Quantize model using LLM Compressor\n",
+  " print(f\"\\n[3/4] Quantizing model to AWQ with LLM Compressor (this may take 30-60 minutes)...\")\n",
   " print(f\"Config: {awq_config}\")\n",
+  " print(\"⚠️ LLM Compressor will load the model, quantize it, and save to local directory\")\n",
   " \n",
-  […7 lines truncated in this view…]
+  " try:\n",
+  " # LLM Compressor's oneshot function handles everything:\n",
+  " # - Loading the model\n",
+  " # - Quantization with calibration data\n",
+  " # - Saving quantized model\n",
+  " oneshot(\n",
+  " model=repo_id,\n",
+  " output_dir=temp_output_dir,\n",
+  " modifiers=[\n",
+  " AWQModifier(\n",
+  " w_bit=awq_config.get(\"w_bit\", 4),\n",
+  " q_group_size=awq_config.get(\"q_group_size\", 128),\n",
+  " zero_point=awq_config.get(\"zero_point\", True),\n",
+  " version=awq_config.get(\"version\", \"GEMM\")\n",
+  " )\n",
+  " ],\n",
+  " token=os.environ.get(\"HF_TOKEN\"),\n",
+  " # Calibration data can be passed as a list of strings\n",
+  " calibration_data=calibration_texts[:min(calibration_dataset_size, 128)] # Limit for efficiency\n",
+  " )\n",
+  " \n",
+  " print(f\"✅ Model quantized to AWQ\")\n",
+  " except Exception as e:\n",
+  " print(f\"❌ Quantization failed: {e}\")\n",
+  " import traceback\n",
+  " traceback.print_exc()\n",
+  " raise\n",
   " \n",
-  " # Step …
-  " print(f\"\\n[ …
+  " # Step 4: Upload to Hugging Face\n",
+  " print(f\"\\n[4/4] Uploading quantized model to {output_repo}...\")\n",
   " \n",
   " # Create repo if it doesn't exist\n",
   " api = HfApi()\n",
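The .get() defaults in the hunk above imply the shape of the awq_config dict each model entry carries. For reference, a config matching those defaults; the actual dict is defined in a cell outside this diff, so treat this as an illustrative reconstruction rather than the committed values.

    # awq_config matching the .get() defaults read in the hunk above.
    # The real dict lives in a cell outside this diff; this is an
    # illustrative reconstruction, not the committed values.
    awq_config = {
        "w_bit": 4,           # 4-bit weights
        "q_group_size": 128,  # per-group quantization granularity
        "zero_point": True,   # asymmetric (zero-point) quantization
        "version": "GEMM",    # AWQ kernel variant
    }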
@@ -233,28 +236,38 @@
   " exist_ok=True,\n",
   " token=os.environ.get(\"HF_TOKEN\")\n",
   " )\n",
+  " print(f\"✅ Repository ready: {output_repo}\")\n",
   " except Exception as e:\n",
   " print(f\"Note: Repo may already exist: {e}\")\n",
   " \n",
-  " # …
-  […8 lines truncated in this view…]
+  " # Upload the quantized model directory\n",
+  " try:\n",
+  " upload_folder(\n",
+  " folder_path=temp_output_dir,\n",
+  " repo_id=output_repo,\n",
+  " repo_type=\"model\",\n",
+  " token=os.environ.get(\"HF_TOKEN\"),\n",
+  " ignore_patterns=[\"*.pt\", \"*.bin\"] # Only upload safetensors\n",
+  " )\n",
+  " print(f\"✅ Quantized model uploaded to {output_repo}\")\n",
+  " except Exception as e:\n",
+  " print(f\"❌ Upload failed: {e}\")\n",
+  " import traceback\n",
+  " traceback.print_exc()\n",
+  " raise\n",
   " \n",
-  " …
   " \n",
-  " # …
-  " …
+  " # Step 5: Clean up to free disk space (critical for Colab)\n",
+  " print(f\"\\n[5/5] Cleaning up local files to free disk space...\")\n",
   " \n",
+  " # Delete temporary output directory\n",
+  " try:\n",
+  " import shutil\n",
+  " shutil.rmtree(temp_output_dir)\n",
+  " print(f\" ✅ Deleted temporary directory: {temp_output_dir}\")\n",
+  " except Exception as e:\n",
+  " print(f\" ⚠️ Could not delete temp directory: {e}\")\n",
   " \n",
   " # Free GPU memory\n",
-  " del model\n",
-  " del tokenizer\n",
-  " del calibration_data\n",
   " torch.cuda.empty_cache()\n",
   " gc.collect()\n",
   " \n",
@@ -282,7 +295,8 @@
   " \n",
   " print(f\"\\n✅ {model_name} quantization complete!\")\n",
   " print(f\"Model available at: https://huggingface.co/{output_repo}\")\n",
-  " print(f\"💾 Local model files deleted to save disk space\")\n"
+  " print(f\"💾 Local model files deleted to save disk space\")\n",
+  " print(f\"🚀 Model is ready for vLLM inference with optimal performance!\")\n"
   ]
  },
  {
@@ -342,63 +356,117 @@
   "metadata": {},
   "outputs": [],
   "source": [
+  "# Verify quantized models with vLLM (recommended) or Transformers\n",
   "from transformers import AutoTokenizer\n",
-  "from awq import AutoAWQForCausalLM\n",
   "\n",
-  "def …
-  " \"\"\"Verify …
-  " print(f\"\\nVerifying {repo_id}...\")\n",
+  "def verify_awq_model_vllm(repo_id: str):\n",
+  " \"\"\"Verify AWQ model can be loaded with vLLM (recommended).\"\"\"\n",
+  " print(f\"\\nVerifying {repo_id} with vLLM...\")\n",
   " \n",
   " try:\n",
-  " # …
-  […5 lines truncated in this view…]
+  " # Try importing vLLM\n",
+  " try:\n",
+  " from vllm import LLM, SamplingParams\n",
+  " except ImportError:\n",
+  " print(\"⚠️ vLLM not available, skipping vLLM verification\")\n",
+  " return False\n",
   " \n",
-  " # Load AWQ …
-  " …
-  " repo_id,\n",
-  " …
+  " # Load with vLLM (auto-detects AWQ)\n",
+  " llm = LLM(\n",
+  " model=repo_id,\n",
+  " quantization=\"awq\",\n",
   " trust_remote_code=True,\n",
-  " …
-  " …
+  " token=os.environ.get(\"HF_TOKEN\"),\n",
+  " gpu_memory_utilization=0.5 # Lower for verification\n",
   " )\n",
   " \n",
   " # Test generation\n",
-  […2 lines truncated in this view…]
+  " sampling_params = SamplingParams(\n",
+  " temperature=0.0,\n",
+  " max_tokens=10\n",
+  " )\n",
   " \n",
-  " …
-  " …
-  " **inputs,\n",
-  " max_new_tokens=10,\n",
-  " do_sample=False\n",
-  " )\n",
+  " test_prompt = \"You are the Router Agent. Test prompt.\"\n",
+  " outputs = llm.generate([test_prompt], sampling_params)\n",
   " \n",
-  " generated_text = …
-  " print(f\"✅ …
+  " generated_text = outputs[0].outputs[0].text\n",
+  " print(f\"✅ vLLM loads and generates correctly\")\n",
   " print(f\"Generated: {generated_text[:100]}...\")\n",
   " \n",
-  " …
-  " total_params = sum(p.numel() for p in model.parameters())\n",
-  " print(f\"Total parameters: {total_params / 1e9:.2f}B\")\n",
-  " \n",
-  " del model\n",
-  " del tokenizer\n",
+  " del llm\n",
   " torch.cuda.empty_cache()\n",
   " \n",
   " return True\n",
   " except Exception as e:\n",
-  " print(f\"❌ …
+  " print(f\"❌ vLLM verification failed: {e}\")\n",
   " import traceback\n",
   " traceback.print_exc()\n",
   " return False\n",
   "\n",
+  "def verify_awq_model_transformers(repo_id: str):\n",
+  " \"\"\"Verify AWQ model can be loaded with Transformers (fallback).\"\"\"\n",
+  " print(f\"\\nVerifying {repo_id} with Transformers...\")\n",
+  " \n",
+  " try:\n",
+  " # Load tokenizer\n",
+  " tokenizer = AutoTokenizer.from_pretrained(\n",
+  " repo_id,\n",
+  " trust_remote_code=True,\n",
+  " token=os.environ.get(\"HF_TOKEN\")\n",
+  " )\n",
+  " \n",
+  " # Try loading with AutoAWQ (if available)\n",
+  " try:\n",
+  " from awq import AutoAWQForCausalLM\n",
+  " model = AutoAWQForCausalLM.from_quantized(\n",
+  " repo_id,\n",
+  " fuse_layers=True,\n",
+  " trust_remote_code=True,\n",
+  " device_map=\"auto\",\n",
+  " token=os.environ.get(\"HF_TOKEN\")\n",
+  " )\n",
+  " \n",
+  " # Test generation\n",
+  " test_prompt = \"You are the Router Agent. Test prompt.\"\n",
+  " inputs = tokenizer(test_prompt, return_tensors=\"pt\").to(model.device)\n",
+  " \n",
+  " with torch.inference_mode():\n",
+  " outputs = model.generate(\n",
+  " **inputs,\n",
+  " max_new_tokens=10,\n",
+  " do_sample=False\n",
+  " )\n",
+  " \n",
+  " generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
+  " print(f\"✅ Transformers loads and generates correctly\")\n",
+  " print(f\"Generated: {generated_text[:100]}...\")\n",
+  " \n",
+  " del model\n",
+  " del tokenizer\n",
+  " torch.cuda.empty_cache()\n",
+  " \n",
+  " return True\n",
+  " except ImportError:\n",
+  " print(\"⚠️ AutoAWQ not available, skipping Transformers verification\")\n",
+  " return False\n",
+  " except Exception as e:\n",
+  " print(f\"❌ Transformers verification failed: {e}\")\n",
+  " import traceback\n",
+  " traceback.print_exc()\n",
+  " return False\n",
+  "\n",
-  " …
+  "# Verify both models (prefer vLLM)\n",
   "for model_key, model_info in MODELS_TO_QUANTIZE.items():\n",
-  " …
+  " print(f\"\\n{'='*60}\")\n",
+  " print(f\"Verifying {model_key}\")\n",
+  " print(f\"{'='*60}\")\n",
+  " \n",
+  " # Try vLLM first (recommended)\n",
+  " vllm_ok = verify_awq_model_vllm(model_info[\"output_repo\"])\n",
+  " \n",
+  " # Fallback to Transformers if vLLM not available\n",
+  " if not vllm_ok:\n",
+  " verify_awq_model_transformers(model_info[\"output_repo\"])\n"
   ]
  },
  {
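The verification loop above iterates over MODELS_TO_QUANTIZE, which is defined in an earlier cell that this diff does not touch. For readers of the diff, here is a hypothetical sketch of the shape that loop assumes; only the output_repo key is actually referenced in the visible hunks, and the *-awq repo names are illustrative guesses.

    # Hypothetical shape of MODELS_TO_QUANTIZE, inferred from the loop above.
    # Only "output_repo" is referenced in the visible diff; the other keys
    # and the *-awq repo names are illustrative guesses.
    MODELS_TO_QUANTIZE = {
        "router-gemma3": {
            "repo_id": "Alovestocode/router-gemma3-merged",
            "output_repo": "Alovestocode/router-gemma3-awq",  # hypothetical
        },
        "router-qwen3-32b": {
            "repo_id": "Alovestocode/router-qwen3-32b-merged",
            "output_repo": "Alovestocode/router-qwen3-32b-awq",  # hypothetical
        },
    }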
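Once the quantized checkpoints are uploaded, they can be consumed straight from the Hub with vLLM, exactly as the verification cell does. A minimal inference sketch follows; the repo id is hypothetical, so substitute the actual output_repo.

    # Minimal vLLM inference against an uploaded AWQ checkpoint,
    # mirroring the verification cell. The repo id is hypothetical;
    # substitute the real output_repo.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="Alovestocode/router-gemma3-awq",  # hypothetical output repo
        quantization="awq",
        trust_remote_code=True,
    )
    params = SamplingParams(temperature=0.0, max_tokens=64)
    outputs = llm.generate(["You are the Router Agent. Route this query."], params)
    print(outputs[0].outputs[0].text)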