Alikestocode committed on
Commit
24107f3
·
1 Parent(s): 2dff966

Add disk space cleanup after quantization in Colab notebook

Browse files

- Delete source model from Hugging Face cache after quantization
- Free GPU memory and force garbage collection
- Check disk space before/after operations
- Add warnings for low disk space
- Critical for Colab's limited disk space

Files changed (1) hide show
  1. quantize_to_awq_colab.ipynb +60 -6
quantize_to_awq_colab.ipynb CHANGED
@@ -30,7 +30,18 @@
30
  "source": [
31
  "# Install required packages\n",
32
  "%pip install -q autoawq transformers accelerate huggingface_hub\n",
33
- "%pip install -q torch --index-url https://download.pytorch.org/whl/cu118\n"
 
 
 
 
 
 
 
 
 
 
 
34
  ]
35
  },
36
  {
@@ -108,8 +119,10 @@
108
  "source": [
109
  "from awq import AutoAWQForCausalLM\n",
110
  "from transformers import AutoTokenizer\n",
111
- "from huggingface_hub import HfApi\n",
112
  "import torch\n",
 
 
113
  "\n",
114
  "def quantize_model_to_awq(\n",
115
  " model_name: str,\n",
@@ -148,6 +161,11 @@
148
  " print(f\"\\n[2/5] Loading model from {repo_id}...\")\n",
149
  " print(\"⚠️ This may take several minutes and requires significant GPU memory...\")\n",
150
  " \n",
 
 
 
 
 
151
  " model = AutoAWQForCausalLM.from_pretrained(\n",
152
  " repo_id,\n",
153
  " device_map=\"auto\",\n",
@@ -230,13 +248,41 @@
230
  " \n",
231
  " print(f\"✅ Quantized model saved to {output_repo}\")\n",
232
  " \n",
233
- " # Clean up memory\n",
 
 
 
234
  " del model\n",
235
  " del tokenizer\n",
 
236
  " torch.cuda.empty_cache()\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  " \n",
238
  " print(f\"\\n✅ {model_name} quantization complete!\")\n",
239
- " print(f\"Model available at: https://huggingface.co/{output_repo}\")\n"
 
240
  ]
241
  },
242
  {
@@ -363,8 +409,16 @@
363
  "\n",
364
  "- **GPU Required**: This quantization requires a GPU with at least 40GB VRAM (A100/H100 recommended)\n",
365
  "- **Time**: Each model takes approximately 30-60 minutes to quantize\n",
366
- "- **Memory**: Ensure you have enough disk space (models are ~20-30GB each)\n",
367
- "- **Output Repos**: You can either create new repos (with `-awq` suffix) or upload to existing repos\n",
 
 
 
 
 
 
 
 
368
  "- **Usage**: After quantization, update your `app.py` to use the AWQ repos:\n",
369
  " ```python\n",
370
  " MODELS = {\n",
 
30
  "source": [
31
  "# Install required packages\n",
32
  "%pip install -q autoawq transformers accelerate huggingface_hub\n",
33
+ "%pip install -q torch --index-url https://download.pytorch.org/whl/cu118\n",
34
+ "\n",
35
+ "# Utility function to check disk space\n",
36
+ "import shutil\n",
37
+ "def check_disk_space():\n",
38
+ " \"\"\"Check available disk space.\"\"\"\n",
39
+ " total, used, free = shutil.disk_usage(\"/\")\n",
40
+ " print(f\"Disk Space: {free / (1024**3):.2f} GB free out of {total / (1024**3):.2f} GB total\")\n",
41
+ " return free / (1024**3) # Return free space in GB\n",
42
+ "\n",
43
+ "print(\"Initial disk space:\")\n",
44
+ "check_disk_space()\n"
45
  ]
46
  },
47
  {
 
119
  "source": [
120
  "from awq import AutoAWQForCausalLM\n",
121
  "from transformers import AutoTokenizer\n",
122
+ "from huggingface_hub import HfApi, scan_cache_dir\n",
123
  "import torch\n",
124
+ "import shutil\n",
125
+ "import gc\n",
126
  "\n",
127
  "def quantize_model_to_awq(\n",
128
  " model_name: str,\n",
 
161
  " print(f\"\\n[2/5] Loading model from {repo_id}...\")\n",
162
  " print(\"⚠️ This may take several minutes and requires significant GPU memory...\")\n",
163
  " \n",
164
+ " # Check disk space before loading\n",
165
+ " free_space_before = check_disk_space()\n",
166
+ " if free_space_before < 30:\n",
167
+ " print(f\"⚠️ WARNING: Low disk space ({free_space_before:.2f} GB). Model loading may fail.\")\n",
168
+ " \n",
169
  " model = AutoAWQForCausalLM.from_pretrained(\n",
170
  " repo_id,\n",
171
  " device_map=\"auto\",\n",
 
248
  " \n",
249
  " print(f\"✅ Quantized model saved to {output_repo}\")\n",
250
  " \n",
251
+ " # Step 6: Clean up to free disk space (critical for Colab)\n",
252
+ " print(f\"\\n[6/6] Cleaning up local files to free disk space...\")\n",
253
+ " \n",
254
+ " # Free GPU memory\n",
255
  " del model\n",
256
  " del tokenizer\n",
257
+ " del calibration_data\n",
258
  " torch.cuda.empty_cache()\n",
259
+ " gc.collect()\n",
260
+ " \n",
261
+ " # Clear Hugging Face cache for the source model (frees ~50-70GB)\n",
262
+ " print(f\" → Clearing Hugging Face cache for {repo_id}...\")\n",
263
+ " try:\n",
264
+ " cache_info = scan_cache_dir()\n",
265
+ " # Find and delete revisions for the source model\n",
266
+ " revisions_to_delete = []\n",
267
+ " for repo in cache_info.repos:\n",
268
+ " if repo.repo_id == repo_id:\n",
269
+ " revisions_to_delete.extend(rev.commit_hash for rev in repo.revisions)\n",
270
+ " \n",
271
+ " if revisions_to_delete:\n",
272
+ " cache_info.delete_revisions(*revisions_to_delete).execute()\n",
273
+ " print(f\" ✅ Deleted {len(revisions_to_delete)} cached revision(s) for {repo_id}\")\n",
274
+ " else:\n",
275
+ " print(f\" ℹ️ No cached revisions found for {repo_id}\")\n",
276
+ " except Exception as e:\n",
277
+ " print(f\" ⚠️ Cache cleanup warning: {e} (continuing...)\")\n",
278
+ " \n",
279
+ " # Check disk space after cleanup\n",
280
+ " free_space_after = check_disk_space()\n",
281
+ " print(f\"\\n✅ Cleanup complete! Free space: {free_space_after:.2f} GB\")\n",
282
  " \n",
283
  " print(f\"\\n✅ {model_name} quantization complete!\")\n",
284
+ " print(f\"Model available at: https://huggingface.co/{output_repo}\")\n",
285
+ " print(f\"💾 Local model files deleted to save disk space\")\n"
286
  ]
287
  },
288
  {
 
409
  "\n",
410
  "- **GPU Required**: This quantization requires a GPU with at least 40GB VRAM (A100/H100 recommended)\n",
411
  "- **Time**: Each model takes approximately 30-60 minutes to quantize\n",
412
+ "- **Disk Space**: \n",
413
+ " - Colab has limited disk space (~80GB free)\n",
414
+ " - Each source model is ~50-70GB (BF16)\n",
415
+ " - Quantized models are ~15-20GB (AWQ 4-bit)\n",
416
+ " - **The notebook automatically deletes source models after quantization to save space**\n",
417
+ "- **Cleanup**: After each model is quantized and uploaded:\n",
418
+ " - GPU memory is freed\n",
419
+ " - Hugging Face cache for source model is cleared\n",
420
+ " - Disk space is checked before/after\n",
421
+ "- **Output Repos**: Models are saved to new repos with `-awq` suffix\n",
422
  "- **Usage**: After quantization, update your `app.py` to use the AWQ repos:\n",
423
  " ```python\n",
424
  " MODELS = {\n",