Spaces:
Sleeping
Sleeping
Commit
·
24107f3
1
Parent(s):
2dff966
Add disk space cleanup after quantization in Colab notebook
Browse files
- Delete source model from Hugging Face cache after quantization
- Free GPU memory and force garbage collection
- Check disk space before/after operations
- Add warnings for low disk space
- Critical for Colab's limited disk space
- quantize_to_awq_colab.ipynb +60 -6
quantize_to_awq_colab.ipynb
CHANGED
|
@@ -30,7 +30,18 @@
|
|
| 30 |
"source": [
|
| 31 |
"# Install required packages\n",
|
| 32 |
"%pip install -q autoawq transformers accelerate huggingface_hub\n",
|
| 33 |
-
"%pip install -q torch --index-url https://download.pytorch.org/whl/cu118\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
]
|
| 35 |
},
|
| 36 |
{
|
|
@@ -108,8 +119,10 @@
|
|
| 108 |
"source": [
|
| 109 |
"from awq import AutoAWQForCausalLM\n",
|
| 110 |
"from transformers import AutoTokenizer\n",
|
| 111 |
-
"from huggingface_hub import HfApi\n",
|
| 112 |
"import torch\n",
|
|
|
|
|
|
|
| 113 |
"\n",
|
| 114 |
"def quantize_model_to_awq(\n",
|
| 115 |
" model_name: str,\n",
|
|
@@ -148,6 +161,11 @@
|
|
| 148 |
" print(f\"\\n[2/5] Loading model from {repo_id}...\")\n",
|
| 149 |
" print(\"⚠️ This may take several minutes and requires significant GPU memory...\")\n",
|
| 150 |
" \n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
" model = AutoAWQForCausalLM.from_pretrained(\n",
|
| 152 |
" repo_id,\n",
|
| 153 |
" device_map=\"auto\",\n",
|
|
@@ -230,13 +248,41 @@
|
|
| 230 |
" \n",
|
| 231 |
" print(f\"✅ Quantized model saved to {output_repo}\")\n",
|
| 232 |
" \n",
|
| 233 |
-
" # Clean up
|
|
|
|
|
|
|
|
|
|
| 234 |
" del model\n",
|
| 235 |
" del tokenizer\n",
|
|
|
|
| 236 |
" torch.cuda.empty_cache()\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
" \n",
|
| 238 |
" print(f\"\\n✅ {model_name} quantization complete!\")\n",
|
| 239 |
-
" print(f\"Model available at: https://huggingface.co/{output_repo}\")\n"
|
|
|
|
| 240 |
]
|
| 241 |
},
|
| 242 |
{
|
|
@@ -363,8 +409,16 @@
|
|
| 363 |
"\n",
|
| 364 |
"- **GPU Required**: This quantization requires a GPU with at least 40GB VRAM (A100/H100 recommended)\n",
|
| 365 |
"- **Time**: Each model takes approximately 30-60 minutes to quantize\n",
|
| 366 |
-
"- **
|
| 367 |
-
"-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
"- **Usage**: After quantization, update your `app.py` to use the AWQ repos:\n",
|
| 369 |
" ```python\n",
|
| 370 |
" MODELS = {\n",
|
|
|
|
| 30 |
"source": [
|
| 31 |
"# Install required packages\n",
|
| 32 |
"%pip install -q autoawq transformers accelerate huggingface_hub\n",
|
| 33 |
+
"%pip install -q torch --index-url https://download.pytorch.org/whl/cu118\n",
|
| 34 |
+
"\n",
|
| 35 |
+
"# Utility function to check disk space\n",
|
| 36 |
+
"import shutil\n",
|
| 37 |
+
"def check_disk_space():\n",
|
| 38 |
+
" \"\"\"Check available disk space.\"\"\"\n",
|
| 39 |
+
" total, used, free = shutil.disk_usage(\"/\")\n",
|
| 40 |
+
" print(f\"Disk Space: {free / (1024**3):.2f} GB free out of {total / (1024**3):.2f} GB total\")\n",
|
| 41 |
+
" return free / (1024**3) # Return free space in GB\n",
|
| 42 |
+
"\n",
|
| 43 |
+
"print(\"Initial disk space:\")\n",
|
| 44 |
+
"check_disk_space()\n"
|
| 45 |
]
|
| 46 |
},
|
| 47 |
{
|
|
|
|
| 119 |
"source": [
|
| 120 |
"from awq import AutoAWQForCausalLM\n",
|
| 121 |
"from transformers import AutoTokenizer\n",
|
| 122 |
+
"from huggingface_hub import HfApi, scan_cache_dir, delete_revisions\n",
|
| 123 |
"import torch\n",
|
| 124 |
+
"import shutil\n",
|
| 125 |
+
"import gc\n",
|
| 126 |
"\n",
|
| 127 |
"def quantize_model_to_awq(\n",
|
| 128 |
" model_name: str,\n",
|
|
|
|
| 161 |
" print(f\"\\n[2/5] Loading model from {repo_id}...\")\n",
|
| 162 |
" print(\"⚠️ This may take several minutes and requires significant GPU memory...\")\n",
|
| 163 |
" \n",
|
| 164 |
+
" # Check disk space before loading\n",
|
| 165 |
+
" free_space_before = check_disk_space()\n",
|
| 166 |
+
" if free_space_before < 30:\n",
|
| 167 |
+
" print(f\"⚠️ WARNING: Low disk space ({free_space_before:.2f} GB). Model loading may fail.\")\n",
|
| 168 |
+
" \n",
|
| 169 |
" model = AutoAWQForCausalLM.from_pretrained(\n",
|
| 170 |
" repo_id,\n",
|
| 171 |
" device_map=\"auto\",\n",
|
|
|
|
| 248 |
" \n",
|
| 249 |
" print(f\"✅ Quantized model saved to {output_repo}\")\n",
|
| 250 |
" \n",
|
| 251 |
+
" # Step 6: Clean up to free disk space (critical for Colab)\n",
|
| 252 |
+
" print(f\"\\n[6/6] Cleaning up local files to free disk space...\")\n",
|
| 253 |
+
" \n",
|
| 254 |
+
" # Free GPU memory\n",
|
| 255 |
" del model\n",
|
| 256 |
" del tokenizer\n",
|
| 257 |
+
" del calibration_data\n",
|
| 258 |
" torch.cuda.empty_cache()\n",
|
| 259 |
+
" gc.collect()\n",
|
| 260 |
+
" \n",
|
| 261 |
+
" # Clear Hugging Face cache for the source model (frees ~50-70GB)\n",
|
| 262 |
+
" print(f\" → Clearing Hugging Face cache for {repo_id}...\")\n",
|
| 263 |
+
" try:\n",
|
| 264 |
+
" cache_info = scan_cache_dir()\n",
|
| 265 |
+
" # Find and delete revisions for the source model\n",
|
| 266 |
+
" revisions_to_delete = []\n",
|
| 267 |
+
" for repo in cache_info.revisions:\n",
|
| 268 |
+
" if repo.repo_id == repo_id:\n",
|
| 269 |
+
" revisions_to_delete.append(repo)\n",
|
| 270 |
+
" \n",
|
| 271 |
+
" if revisions_to_delete:\n",
|
| 272 |
+
" delete_revisions(revisions_to_delete)\n",
|
| 273 |
+
" print(f\" ✅ Deleted {len(revisions_to_delete)} cached revision(s) for {repo_id}\")\n",
|
| 274 |
+
" else:\n",
|
| 275 |
+
" print(f\" ℹ️ No cached revisions found for {repo_id}\")\n",
|
| 276 |
+
" except Exception as e:\n",
|
| 277 |
+
" print(f\" ⚠️ Cache cleanup warning: {e} (continuing...)\")\n",
|
| 278 |
+
" \n",
|
| 279 |
+
" # Check disk space after cleanup\n",
|
| 280 |
+
" free_space_after = check_disk_space()\n",
|
| 281 |
+
" print(f\"\\n✅ Cleanup complete! Free space: {free_space_after:.2f} GB\")\n",
|
| 282 |
" \n",
|
| 283 |
" print(f\"\\n✅ {model_name} quantization complete!\")\n",
|
| 284 |
+
" print(f\"Model available at: https://huggingface.co/{output_repo}\")\n",
|
| 285 |
+
" print(f\"💾 Local model files deleted to save disk space\")\n"
|
| 286 |
]
|
| 287 |
},
|
| 288 |
{
|
|
|
|
| 409 |
"\n",
|
| 410 |
"- **GPU Required**: This quantization requires a GPU with at least 40GB VRAM (A100/H100 recommended)\n",
|
| 411 |
"- **Time**: Each model takes approximately 30-60 minutes to quantize\n",
|
| 412 |
+
"- **Disk Space**: \n",
|
| 413 |
+
" - Colab has limited disk space (~80GB free)\n",
|
| 414 |
+
" - Each source model is ~50-70GB (BF16)\n",
|
| 415 |
+
" - Quantized models are ~15-20GB (AWQ 4-bit)\n",
|
| 416 |
+
" - **The notebook automatically deletes source models after quantization to save space**\n",
|
| 417 |
+
"- **Cleanup**: After each model is quantized and uploaded:\n",
|
| 418 |
+
" - GPU memory is freed\n",
|
| 419 |
+
" - Hugging Face cache for source model is cleared\n",
|
| 420 |
+
" - Disk space is checked before/after\n",
|
| 421 |
+
"- **Output Repos**: Models are saved to new repos with `-awq` suffix\n",
|
| 422 |
"- **Usage**: After quantization, update your `app.py` to use the AWQ repos:\n",
|
| 423 |
" ```python\n",
|
| 424 |
" MODELS = {\n",
|