remiai3 commited on
Commit
9acfa5a
·
verified ·
1 Parent(s): a1d71de

Update code/codellama_7b_instruct_gguf_q4_k_m.ipynb

Browse files
code/codellama_7b_instruct_gguf_q4_k_m.ipynb CHANGED
@@ -1,264 +1,153 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "d5c1cbb7",
6
- "metadata": {},
7
- "source": [
8
- "# 🚀 CodeLlama 7B Instruct (GGUF Q4_K_M) — Colab (GGUF via llama.cpp)\n",
9
- "\n",
10
- "**One-click notebook** to run `TheBloke/CodeLlama-7B-Instruct-GGUF` (`codellama-7b-instruct.Q4_K_M.gguf`) in Google Colab using **llama-cpp-python**.\n",
11
- "\n",
12
- "**Features**\n",
13
- "- Hugging Face login (optional for gated repos)\n",
14
- "- Automatic GPU offload (T4/A100) with CPU fallback\n",
15
- "- Download GGUF to Colab temp disk (no Drive required)\n",
16
- "- Prompt templates optimized for **code generation**\n",
17
- "- Interactive chat UI (code-focused)\n",
18
- "- Optional local API server\n",
19
- "\n",
20
- "Best for general coding tasks (Python/JS/C++).\n",
21
- "\n",
22
- "> Tip: In Colab use **Runtime → Change runtime type → GPU (T4)** for speed.\n"
23
- ]
24
- },
25
- {
26
- "cell_type": "code",
27
- "execution_count": null,
28
- "id": "5b152f1d",
29
- "metadata": {},
30
- "outputs": [],
31
- "source": [
32
- "#@title 🔧 Check environment\n",
33
- "!nvidia-smi || echo \"No NVIDIA GPU detected (CPU mode will be used)\"\n",
34
- "!python --version"
35
- ]
36
- },
37
- {
38
- "cell_type": "code",
39
- "execution_count": null,
40
- "id": "5285c04d",
41
- "metadata": {},
42
- "outputs": [],
43
- "source": [
44
- "#@title ⬇️ Install dependencies (GPU wheel if possible; fallback to CPU)\n",
45
- "import sys, subprocess\n",
46
- "\n",
47
- "def pip_install(args):\n",
48
- " print(\"pip install\", \" \".join(args))\n",
49
- " return subprocess.call([sys.executable, \"-m\", \"pip\", \"install\", \"-qU\"] + args)\n",
50
- "\n",
51
- "cuda_spec = \"cu121\"\n",
52
- "gpu_index = f\"https://abetlen.github.io/llama-cpp-python/whl/{cuda_spec}\"\n",
53
- "# Try GPU wheel first\n",
54
- "rc = pip_install([f\"--extra-index-url={gpu_index}\", \"llama-cpp-python>=0.2.90\", \"huggingface_hub>=0.23.0\",\n",
55
- " \"ipywidgets\", \"pydantic<3\", \"uvicorn\", \"fastapi\"])\n",
56
- "if rc != 0:\n",
57
- " print(\"⚠️ GPU wheel failed, trying CPU wheel...\")\n",
58
- " rc2 = pip_install([\"llama-cpp-python>=0.2.90\", \"huggingface_hub>=0.23.0\",\n",
59
- " \"ipywidgets\", \"pydantic<3\", \"uvicorn\", \"fastapi\"])\n",
60
- " if rc2 != 0:\n",
61
- " raise RuntimeError(\"Failed to install llama-cpp-python\")\n",
62
- "print(\"✅ Installation complete\")"
63
- ]
64
- },
65
- {
66
- "cell_type": "code",
67
- "execution_count": null,
68
- "id": "80157423",
69
- "metadata": {},
70
- "outputs": [],
71
- "source": [
72
- "#@title 🔐 (Optional) Hugging Face login\n",
73
- "HF_TOKEN = \"\" #@param {type:\"string\"}\n",
74
- "from huggingface_hub import login\n",
75
- "if HF_TOKEN.strip():\n",
76
- " login(token=HF_TOKEN.strip(), add_to_git_credential=True)\n",
77
- " print(\"Logged in to Hugging Face\")\n",
78
- "else:\n",
79
- " print(\"Skipping login (no token provided)\")"
80
- ]
81
- },
82
- {
83
- "cell_type": "code",
84
- "execution_count": null,
85
- "id": "82dd88aa",
86
- "metadata": {},
87
- "outputs": [],
88
- "source": [
89
- "#@title 📦 Download model (GGUF) from Hugging Face\n",
90
- "from huggingface_hub import hf_hub_download\n",
91
- "\n",
92
- "REPO_ID = \"TheBloke/CodeLlama-7B-Instruct-GGUF\" #@param [\"TheBloke/CodeLlama-7B-Instruct-GGUF\"] {allow-input: true}\n",
93
- "FILENAME = \"codellama-7b-instruct.Q4_K_M.gguf\" #@param [\"codellama-7b-instruct.Q4_K_M.gguf\"] {allow-input: true}\n",
94
- "\n",
95
- "model_path = hf_hub_download(\n",
96
- " repo_id=REPO_ID,\n",
97
- " filename=FILENAME,\n",
98
- " local_dir=\"models\",\n",
99
- " local_dir_use_symlinks=False\n",
100
- ")\n",
101
- "print(\"✅ Downloaded:\", model_path)"
102
- ]
103
- },
104
- {
105
- "cell_type": "code",
106
- "execution_count": null,
107
- "id": "7862b7f1",
108
- "metadata": {},
109
- "outputs": [],
110
- "source": [
111
- "#@title ⚙️ Load model with llama.cpp (auto GPU offload)\n",
112
- "from llama_cpp import Llama\n",
113
- "\n",
114
- "def try_load(n_gpu_layers):\n",
115
- " print(f\"Trying n_gpu_layers={n_gpu_layers} ...\")\n",
116
- " return Llama(\n",
117
- " model_path=model_path,\n",
118
- " n_ctx=4096,\n",
119
- " n_threads=None,\n",
120
- " n_gpu_layers=n_gpu_layers, # -1 = all layers on GPU (if possible)\n",
121
- " logits_all=False,\n",
122
- " verbose=False,\n",
123
- " )\n",
124
- "\n",
125
- "llm = None\n",
126
- "for attempt in (-1, 40, 20, 0):\n",
127
- " try:\n",
128
- " llm = try_load(attempt)\n",
129
- " print(\"✅ Loaded with n_gpu_layers =\", attempt)\n",
130
- " break\n",
131
- " except Exception as e:\n",
132
- " print(\"Load failed:\", e)\n",
133
- "\n",
134
- "if llm is None:\n",
135
- " raise RuntimeError(\"Could not load the model. Try a smaller quant or reduce context.\")"
136
- ]
137
- },
138
- {
139
- "cell_type": "code",
140
- "execution_count": null,
141
- "id": "ab41cbde",
142
- "metadata": {},
143
- "outputs": [],
144
- "source": [
145
- "#@title 🧩 Prompt builder (code-first templates)\n",
146
- "from textwrap import dedent\n",
147
- "\n",
148
- "def build_prompt(user_query, system=\"You are an expert software engineer. Output concise, correct code. If possible, return code only.\"):\n",
149
- " instruct = dedent(f\"\"\"\n",
150
- " <|system|>\n",
151
- " {system}\n",
152
- " <|user|>\n",
153
- " {user_query}\n",
154
- " <|assistant|>\n",
155
- " \"\"\").strip()\n",
156
- " return instruct\n",
157
- "\n",
158
- "print(build_prompt(\"Write a Python function `is_prime(n)`.\"))"
159
- ]
160
- },
161
- {
162
- "cell_type": "code",
163
- "execution_count": null,
164
- "id": "c71af4c4",
165
- "metadata": {},
166
- "outputs": [],
167
- "source": [
168
- "#@title 🧪 Generate (single turn)\n",
169
- "user_request = \"Write a Python function `two_sum(nums, target)` returning indices.\" #@param {type:\"string\"}\n",
170
- "max_tokens = 512 #@param {type:\"slider\", min:64, max:2048, step:32}\n",
171
- "temperature = 0.2 #@param {type:\"number\"}\n",
172
- "code_only = True #@param {type:\"boolean\"}\n",
173
- "\n",
174
- "sys_prompt = \"You are an expert programmer. Prefer minimal, correct code. If possible, output only code.\"\n",
175
- "prompt = build_prompt(user_request, system=sys_prompt)\n",
176
- "\n",
177
- "stops = [\"<|user|>\", \"<|system|>\", \"</s>\", \"```\"] if code_only else [\"<|user|>\", \"<|system|>\", \"</s>\"]\n",
178
- "out = llm(prompt, max_tokens=max_tokens, temperature=temperature, stop=stops)\n",
179
- "text = out[\"choices\"][0][\"text\"]\n",
180
- "\n",
181
- "if code_only and \"```\" not in text:\n",
182
- " text = \"```python\\n\" + text.strip() + \"\\n```\"\n",
183
- "\n",
184
- "print(text)"
185
- ]
186
- },
187
- {
188
- "cell_type": "code",
189
- "execution_count": null,
190
- "id": "2701cdb8",
191
- "metadata": {},
192
- "outputs": [],
193
- "source": [
194
- "#@title 💬 Interactive code chat (UI)\n",
195
- "import ipywidgets as widgets\n",
196
- "from IPython.display import display, Markdown\n",
197
- "\n",
198
- "sys_area = widgets.Textarea(\n",
199
- " value=\"You are an expert programmer. Prefer minimal, correct code. If possible, output only code.\",\n",
200
- " description=\"System\",\n",
201
- " layout=widgets.Layout(width=\"100%\", height=\"80px\")\n",
202
- ")\n",
203
- "user_area = widgets.Textarea(\n",
204
- " value=\"Write a Python function to parse a CSV file and compute average of a column named 'score'.\",\n",
205
- " description=\"Prompt\",\n",
206
- " layout=widgets.Layout(width=\"100%\", height=\"100px\")\n",
207
- ")\n",
208
- "temp = widgets.FloatSlider(value=0.2, min=0.0, max=1.2, step=0.05, description=\"Temperature\")\n",
209
- "maxtok = widgets.IntSlider(value=512, min=64, max=2048, step=32, description=\"Max tokens\")\n",
210
- "code_only_box = widgets.Checkbox(value=True, description=\"Code only\")\n",
211
- "run_btn = widgets.Button(description=\"Generate\", button_style=\"success\")\n",
212
- "out_area = widgets.Output()\n",
213
- "\n",
214
- "def on_run(_):\n",
215
- " out_area.clear_output()\n",
216
- " with out_area:\n",
217
- " prompt = build_prompt(user_area.value, system=sys_area.value)\n",
218
- " stops = [\"<|user|>\", \"<|system|>\", \"</s>\", \"```\"] if code_only_box.value else [\"<|user|>\", \"<|system|>\", \"</s>\"]\n",
219
- " result = llm(prompt, max_tokens=maxtok.value, temperature=temp.value, stop=stops)\n",
220
- " text = result[\"choices\"][0][\"text\"]\n",
221
- " if code_only_box.value and \"```\" not in text:\n",
222
- " text = \"```python\\n\" + text.strip() + \"\\n```\"\n",
223
- " display(Markdown(text))\n",
224
- "\n",
225
- "run_btn.on_click(on_run)\n",
226
- "display(widgets.VBox([sys_area, user_area, temp, maxtok, code_only_box, run_btn, out_area]))"
227
- ]
228
- },
229
- {
230
- "cell_type": "code",
231
- "execution_count": null,
232
- "id": "37a7a7f9",
233
- "metadata": {},
234
- "outputs": [],
235
- "source": [
236
- "#@title 🌐 Optional: start local API server (OpenAI-like)\n",
237
- "# After running, open http://127.0.0.1:8000/docs inside Colab to test.\n",
238
- "import threading\n",
239
- "from llama_cpp.server.app import create_app\n",
240
- "from fastapi.middleware.cors import CORSMiddleware\n",
241
- "import uvicorn\n",
242
- "\n",
243
- "app = create_app(llm)\n",
244
- "app.add_middleware(\n",
245
- " CORSMiddleware,\n",
246
- " allow_origins=[\"*\"],\n",
247
- " allow_credentials=True,\n",
248
- " allow_methods=[\"*\"],\n",
249
- " allow_headers=[\"*\"],\n",
250
- ")\n",
251
- "\n",
252
- "def run_server():\n",
253
- " uvicorn.run(app, host=\"0.0.0.0\", port=8000, log_level=\"info\")\n",
254
- "\n",
255
- "thread = threading.Thread(target=run_server, daemon=True)\n",
256
- "thread.start()\n",
257
- "print(\"Server starting on http://127.0.0.1:8000\")"
258
- ]
259
- }
260
- ],
261
- "metadata": {},
262
- "nbformat": 4,
263
- "nbformat_minor": 5
264
- }
 
1
#
# 🚀 CodeLlama 7B Instruct (GGUF Q4_K_M) — Colab Notebook
# This notebook runs TheBloke/CodeLlama-7B-Instruct-GGUF in Google Colab
# using llama-cpp-python with automatic GPU offloading.
#

# Cell 1: Check environment
#@title 🔧 Check environment
# IPython shell escapes (Colab-only syntax): show GPU status if an NVIDIA
# driver is present, otherwise fall back to an informational echo, then
# report the Python version the runtime is using.
!nvidia-smi || echo "No NVIDIA GPU detected (CPU mode will be used)"
!python --version
11
+
12
# Cell 2: Install dependencies
#@title ⬇️ Install dependencies (GPU wheel if possible; fallback to CPU)
import sys
import subprocess

def pip_install(args):
    """Run `pip install -qU <args>` in this interpreter; return pip's exit code."""
    print("pip install", " ".join(args))
    cmd = [sys.executable, "-m", "pip", "install", "-qU", *args]
    return subprocess.call(cmd)

cuda_spec = "cu121"
gpu_index = f"https://abetlen.github.io/llama-cpp-python/whl/{cuda_spec}"

# Same package set for both install paths; only the extra index differs.
packages = ["llama-cpp-python>=0.2.90", "huggingface_hub>=0.23.0", "ipywidgets"]

# Try GPU wheel first
rc = pip_install([f"--extra-index-url={gpu_index}", *packages])
if rc != 0:
    print("⚠️ GPU wheel failed, trying CPU wheel...")
    rc2 = pip_install(list(packages))
    if rc2 != 0:
        raise RuntimeError("Failed to install llama-cpp-python")
print("✅ Installation complete")
30
+
31
# Cell 3: (Optional) Hugging Face login
#@title 🔐 (Optional) Hugging Face login
from google.colab import userdata
from huggingface_hub import login

# Use Colab secrets (key icon in the left sidebar) to store your HF token
# under the name HF_TOKEN. Login is optional: public GGUF repos download
# without it, so every failure path just prints and continues.
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
    login(token=HF_TOKEN, add_to_git_credential=True)
    # Restored the ✅ prefix (the old message had a stray leading space where
    # the emoji was dropped), matching the other status messages in this file.
    print("✅ Logged in to Hugging Face")
except userdata.SecretNotFoundError:
    # Secret simply not configured — proceed anonymously.
    print("Skipping login (HF_TOKEN secret not found)")
except Exception as e:
    # Covers denied secret access and invalid tokens; login stays best-effort.
    print(f"Login failed: {e}")
45
+
46
+
47
# Cell 4: Download model (GGUF) from Hugging Face
#@title 📦 Download model (GGUF) from Hugging Face
from huggingface_hub import hf_hub_download

REPO_ID = "TheBloke/CodeLlama-7B-Instruct-GGUF"   # source repository
FILENAME = "codellama-7b-instruct.Q4_K_M.gguf"    # Q4_K_M quantization

print(f"Downloading {FILENAME} from {REPO_ID}...")
# `local_dir_use_symlinks` is deprecated and ignored in huggingface_hub>=0.23
# (which Cell 2 pins), so it is dropped here: with `local_dir` set, the file
# is written directly into ./models either way, without the warning.
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME,
    local_dir="models",
)
# ✅ restored (the previous message had a stray leading space where the
# emoji was dropped), matching the other status messages in this file.
print("✅ Downloaded:", model_path)
62
+
63
+
64
# Cell 5: Load model with llama.cpp (auto GPU offload)
#@title ⚙️ Load model with llama.cpp (auto GPU offload)
from llama_cpp import Llama

def try_load(n_gpu_layers):
    """Construct a Llama instance with the given GPU-layer offload count.

    n_gpu_layers: -1 offloads all layers to the GPU, 0 keeps everything on
    the CPU. Raises if llama.cpp cannot allocate the model this way.
    """
    print(f"Trying to load model with n_gpu_layers={n_gpu_layers} ...")
    return Llama(
        model_path=model_path,
        n_ctx=4096,
        n_threads=None,  # Auto-detect
        n_gpu_layers=n_gpu_layers,  # -1 = all layers on GPU (if possible)
        verbose=False,
    )

llm = None
# Attempt to load with max GPU layers, then progressively fewer, then CPU only.
# The extra 20-layer step lets partial offload succeed on small-VRAM GPUs
# (e.g. a T4 shared with other processes) before falling back to pure CPU.
for attempt in (-1, 40, 20, 0):
    try:
        llm = try_load(attempt)
        # ✅ restored (the previous message had a stray leading space where
        # the emoji was dropped), matching the other status messages.
        print(f"✅ Model loaded successfully with n_gpu_layers = {attempt}")
        break
    except Exception as e:
        print(f"Load failed with {attempt} GPU layers: {e}")

if llm is None:
    raise RuntimeError("Could not load the model. Ensure you have enough RAM/VRAM.")
90
+
91
+
92
# Cell 6: Prompt builder
#@title 🧩 Prompt builder (code-first templates)
from textwrap import dedent

def build_prompt(user_query, system="You are an expert software engineer. Output concise, correct code. If possible, return code only."):
    """Wrap *user_query* and a system instruction in the chat template.

    Returns the template with surrounding whitespace removed, ending in the
    `<|assistant|>` tag so generation continues from the assistant turn.
    """
    template = f"""
    <|system|>
    {system}
    <|user|>
    {user_query}
    <|assistant|>
    """
    # dedent strips the literal's indentation; strip drops the outer newlines.
    return dedent(template).strip()

# Example of a built prompt
print("--- Example Prompt ---")
print(build_prompt("Write a Python function `is_prime(n)`."))
print("----------------------")
109
+
110
+
111
# Cell 7: Interactive code chat UI
#@title 💬 Interactive code chat (UI)
from google.colab import output
output.enable_custom_widget_manager() # Enable widgets in Colab

import ipywidgets as widgets
from IPython.display import display, Markdown

# System-prompt editor, pre-filled with a code-only instruction.
sys_area = widgets.Textarea(
    value="You are an expert programmer. Prefer minimal, correct code. If possible, output only code.",
    description="System",
    layout=widgets.Layout(width="100%", height="80px")
)
# User request editor.
user_area = widgets.Textarea(
    value="Write a Python function to parse a CSV file and compute the average of a column named 'score'.",
    description="Prompt",
    layout=widgets.Layout(width="100%", height="100px")
)
# Sampling controls and output toggle.
temp = widgets.FloatSlider(value=0.2, min=0.0, max=1.2, step=0.05, description="Temperature")
maxtok = widgets.IntSlider(value=512, min=64, max=2048, step=32, description="Max tokens")
code_only_box = widgets.Checkbox(value=True, description="Code only")
run_btn = widgets.Button(description="Generate", button_style="success")
out_area = widgets.Output()

def on_run(_):
    """Button callback: build the prompt, run `llm`, render the result.

    Relies on `build_prompt` (Cell 6) and the loaded `llm` (Cell 5) being
    present at module scope; run those cells first.
    """
    out_area.clear_output()
    with out_area:
        print("Generating response...")
        prompt = build_prompt(user_area.value, system=sys_area.value)
        # In code-only mode, also stop at a closing code fence so the model
        # doesn't ramble past the snippet.
        stops = ["<|user|>", "<|system|>", "</s>", "```"] if code_only_box.value else ["<|user|>", "<|system|>", "</s>"]

        result = llm(prompt, max_tokens=maxtok.value, temperature=temp.value, stop=stops)
        text = result["choices"][0]["text"]

        # Post-processing to ensure it's a clean code block
        if code_only_box.value and "```" not in text:
            text = "```python\n" + text.strip() + "\n```"

        out_area.clear_output() # Clear "Generating..." message
        display(Markdown(text))

run_btn.on_click(on_run)
display(widgets.VBox([sys_area, user_area, temp, maxtok, code_only_box, run_btn, out_area]))