Spaces:

Alovestocode
/

ZeroGPU-LLM-Inference

Sleeping

Alikestocode committed on Nov 10, 2025

Commit

24107f3

1 Parent(s): 2dff966

Add disk space cleanup after quantization in Colab notebook

- Delete source model from Hugging Face cache after quantization
- Free GPU memory and force garbage collection
- Check disk space before/after operations
- Add warnings for low disk space
- Critical for Colab's limited disk space

Files changed (1) hide show

quantize_to_awq_colab.ipynb +60 -6

quantize_to_awq_colab.ipynb CHANGED Viewed

@@ -30,7 +30,18 @@
       "source": [
         "# Install required packages\n",
         "%pip install -q autoawq transformers accelerate huggingface_hub\n",
-        "%pip install -q torch --index-url https://download.pytorch.org/whl/cu118\n"
       ]
     },
     {
@@ -108,8 +119,10 @@
       "source": [
         "from awq import AutoAWQForCausalLM\n",
         "from transformers import AutoTokenizer\n",
-        "from huggingface_hub import HfApi\n",
         "import torch\n",
         "\n",
         "def quantize_model_to_awq(\n",
         "    model_name: str,\n",
@@ -148,6 +161,11 @@
         "    print(f\"\\n[2/5] Loading model from {repo_id}...\")\n",
         "    print(\"⚠️ This may take several minutes and requires significant GPU memory...\")\n",
         "    \n",
         "    model = AutoAWQForCausalLM.from_pretrained(\n",
         "        repo_id,\n",
         "        device_map=\"auto\",\n",
@@ -230,13 +248,41 @@
         "    \n",
         "    print(f\"✅ Quantized model saved to {output_repo}\")\n",
         "    \n",
-        "    # Clean up memory\n",
         "    del model\n",
         "    del tokenizer\n",
         "    torch.cuda.empty_cache()\n",
         "    \n",
         "    print(f\"\\n✅ {model_name} quantization complete!\")\n",
-        "    print(f\"Model available at: https://huggingface.co/{output_repo}\")\n"
       ]
     },
     {
@@ -363,8 +409,16 @@
         "\n",
         "- **GPU Required**: This quantization requires a GPU with at least 40GB VRAM (A100/H100 recommended)\n",
         "- **Time**: Each model takes approximately 30-60 minutes to quantize\n",
-        "- **Memory**: Ensure you have enough disk space (models are ~20-30GB each)\n",
-        "- **Output Repos**: You can either create new repos (with `-awq` suffix) or upload to existing repos\n",
         "- **Usage**: After quantization, update your `app.py` to use the AWQ repos:\n",
         "  ```python\n",
         "  MODELS = {\n",

       "source": [
         "# Install required packages\n",
         "%pip install -q autoawq transformers accelerate huggingface_hub\n",
+        "%pip install -q torch --index-url https://download.pytorch.org/whl/cu118\n",
+        "\n",
+        "# Utility function to check disk space\n",
+        "import shutil\n",
+        "def check_disk_space():\n",
+        "    \"\"\"Check available disk space.\"\"\"\n",
+        "    total, used, free = shutil.disk_usage(\"/\")\n",
+        "    print(f\"Disk Space: {free / (1024**3):.2f} GB free out of {total / (1024**3):.2f} GB total\")\n",
+        "    return free / (1024**3)  # Return free space in GB\n",
+        "\n",
+        "print(\"Initial disk space:\")\n",
+        "check_disk_space()\n"
       ]
     },
     {
       "source": [
         "from awq import AutoAWQForCausalLM\n",
         "from transformers import AutoTokenizer\n",
+        "from huggingface_hub import HfApi, scan_cache_dir, delete_revisions\n",
         "import torch\n",
+        "import shutil\n",
+        "import gc\n",
         "\n",
         "def quantize_model_to_awq(\n",
         "    model_name: str,\n",
         "    print(f\"\\n[2/5] Loading model from {repo_id}...\")\n",
         "    print(\"⚠️ This may take several minutes and requires significant GPU memory...\")\n",
         "    \n",
+        "    # Check disk space before loading\n",
+        "    free_space_before = check_disk_space()\n",
+        "    if free_space_before < 30:\n",
+        "        print(f\"⚠️ WARNING: Low disk space ({free_space_before:.2f} GB). Model loading may fail.\")\n",
+        "    \n",
         "    model = AutoAWQForCausalLM.from_pretrained(\n",
         "        repo_id,\n",
         "        device_map=\"auto\",\n",
         "    \n",
         "    print(f\"✅ Quantized model saved to {output_repo}\")\n",
         "    \n",
+        "    # Step 6: Clean up to free disk space (critical for Colab)\n",
+        "    print(f\"\\n[6/6] Cleaning up local files to free disk space...\")\n",
+        "    \n",
+        "    # Free GPU memory\n",
         "    del model\n",
         "    del tokenizer\n",
+        "    del calibration_data\n",
         "    torch.cuda.empty_cache()\n",
+        "    gc.collect()\n",
+        "    \n",
+        "    # Clear Hugging Face cache for the source model (frees ~50-70GB)\n",
+        "    print(f\"  → Clearing Hugging Face cache for {repo_id}...\")\n",
+        "    try:\n",
+        "        cache_info = scan_cache_dir()\n",
+        "        # Find and delete revisions for the source model\n",
+        "        revisions_to_delete = []\n",
+        "        for repo in cache_info.revisions:\n",
+        "            if repo.repo_id == repo_id:\n",
+        "                revisions_to_delete.append(repo)\n",
+        "        \n",
+        "        if revisions_to_delete:\n",
+        "            delete_revisions(revisions_to_delete)\n",
+        "            print(f\"  ✅ Deleted {len(revisions_to_delete)} cached revision(s) for {repo_id}\")\n",
+        "        else:\n",
+        "            print(f\"  ℹ️ No cached revisions found for {repo_id}\")\n",
+        "    except Exception as e:\n",
+        "        print(f\"  ⚠️ Cache cleanup warning: {e} (continuing...)\")\n",
+        "    \n",
+        "    # Check disk space after cleanup\n",
+        "    free_space_after = check_disk_space()\n",
+        "    print(f\"\\n✅ Cleanup complete! Free space: {free_space_after:.2f} GB\")\n",
         "    \n",
         "    print(f\"\\n✅ {model_name} quantization complete!\")\n",
+        "    print(f\"Model available at: https://huggingface.co/{output_repo}\")\n",
+        "    print(f\"💾 Local model files deleted to save disk space\")\n"
       ]
     },
     {
         "\n",
         "- **GPU Required**: This quantization requires a GPU with at least 40GB VRAM (A100/H100 recommended)\n",
         "- **Time**: Each model takes approximately 30-60 minutes to quantize\n",
+        "- **Disk Space**: \n",
+        "  - Colab has limited disk space (~80GB free)\n",
+        "  - Each source model is ~50-70GB (BF16)\n",
+        "  - Quantized models are ~15-20GB (AWQ 4-bit)\n",
+        "  - **The notebook automatically deletes source models after quantization to save space**\n",
+        "- **Cleanup**: After each model is quantized and uploaded:\n",
+        "  - GPU memory is freed\n",
+        "  - The Hugging Face cache for the source model is cleared\n",
+        "  - Disk space is checked before loading and after cleanup\n",
+        "- **Output Repos**: Models are saved to new repos with `-awq` suffix\n",
         "- **Usage**: After quantization, update your `app.py` to use the AWQ repos:\n",
         "  ```python\n",
         "  MODELS = {\n",