remiai3 commited on
Commit
9acfa5a
·
verified ·
1 Parent(s): a1d71de

Update code/codellama_7b_instruct_gguf_q4_k_m.ipynb

Browse files
code/codellama_7b_instruct_gguf_q4_k_m.ipynb CHANGED
@@ -1,264 +1,153 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "d5c1cbb7",
6
- "metadata": {},
7
- "source": [
8
- "# 🚀 CodeLlama 7B Instruct (GGUF Q4_K_M) — Colab (GGUF via llama.cpp)\n",
9
- "\n",
10
- "**One-click notebook** to run `TheBloke/CodeLlama-7B-Instruct-GGUF` (`codellama-7b-instruct.Q4_K_M.gguf`) in Google Colab using **llama-cpp-python**.\n",
11
- "\n",
12
- "**Features**\n",
13
- "- Hugging Face login (optional for gated repos)\n",
14
- "- Automatic GPU offload (T4/A100) with CPU fallback\n",
15
- "- Download GGUF to Colab temp disk (no Drive required)\n",
16
- "- Prompt templates optimized for **code generation**\n",
17
- "- Interactive chat UI (code-focused)\n",
18
- "- Optional local API server\n",
19
- "\n",
20
- "Best for general coding tasks (Python/JS/C++).\n",
21
- "\n",
22
- "> Tip: In Colab use **Runtime → Change runtime type → GPU (T4)** for speed.\n"
23
- ]
24
- },
25
- {
26
- "cell_type": "code",
27
- "execution_count": null,
28
- "id": "5b152f1d",
29
- "metadata": {},
30
- "outputs": [],
31
- "source": [
32
- "#@title 🔧 Check environment\n",
33
- "!nvidia-smi || echo \"No NVIDIA GPU detected (CPU mode will be used)\"\n",
34
- "!python --version"
35
- ]
36
- },
37
- {
38
- "cell_type": "code",
39
- "execution_count": null,
40
- "id": "5285c04d",
41
- "metadata": {},
42
- "outputs": [],
43
- "source": [
44
- "#@title ⬇️ Install dependencies (GPU wheel if possible; fallback to CPU)\n",
45
- "import sys, subprocess\n",
46
- "\n",
47
- "def pip_install(args):\n",
48
- " print(\"pip install\", \" \".join(args))\n",
49
- " return subprocess.call([sys.executable, \"-m\", \"pip\", \"install\", \"-qU\"] + args)\n",
50
- "\n",
51
- "cuda_spec = \"cu121\"\n",
52
- "gpu_index = f\"https://abetlen.github.io/llama-cpp-python/whl/{cuda_spec}\"\n",
53
- "# Try GPU wheel first\n",
54
- "rc = pip_install([f\"--extra-index-url={gpu_index}\", \"llama-cpp-python>=0.2.90\", \"huggingface_hub>=0.23.0\",\n",
55
- " \"ipywidgets\", \"pydantic<3\", \"uvicorn\", \"fastapi\"])\n",
56
- "if rc != 0:\n",
57
- " print(\"⚠️ GPU wheel failed, trying CPU wheel...\")\n",
58
- " rc2 = pip_install([\"llama-cpp-python>=0.2.90\", \"huggingface_hub>=0.23.0\",\n",
59
- " \"ipywidgets\", \"pydantic<3\", \"uvicorn\", \"fastapi\"])\n",
60
- " if rc2 != 0:\n",
61
- " raise RuntimeError(\"Failed to install llama-cpp-python\")\n",
62
- "print(\"✅ Installation complete\")"
63
- ]
64
- },
65
- {
66
- "cell_type": "code",
67
- "execution_count": null,
68
- "id": "80157423",
69
- "metadata": {},
70
- "outputs": [],
71
- "source": [
72
- "#@title 🔐 (Optional) Hugging Face login\n",
73
- "HF_TOKEN = \"\" #@param {type:\"string\"}\n",
74
- "from huggingface_hub import login\n",
75
- "if HF_TOKEN.strip():\n",
76
- " login(token=HF_TOKEN.strip(), add_to_git_credential=True)\n",
77
- " print(\"Logged in to Hugging Face\")\n",
78
- "else:\n",
79
- " print(\"Skipping login (no token provided)\")"
80
- ]
81
- },
82
- {
83
- "cell_type": "code",
84
- "execution_count": null,
85
- "id": "82dd88aa",
86
- "metadata": {},
87
- "outputs": [],
88
- "source": [
89
- "#@title 📦 Download model (GGUF) from Hugging Face\n",
90
- "from huggingface_hub import hf_hub_download\n",
91
- "\n",
92
- "REPO_ID = \"TheBloke/CodeLlama-7B-Instruct-GGUF\" #@param [\"TheBloke/CodeLlama-7B-Instruct-GGUF\"] {allow-input: true}\n",
93
- "FILENAME = \"codellama-7b-instruct.Q4_K_M.gguf\" #@param [\"codellama-7b-instruct.Q4_K_M.gguf\"] {allow-input: true}\n",
94
- "\n",
95
- "model_path = hf_hub_download(\n",
96
- " repo_id=REPO_ID,\n",
97
- " filename=FILENAME,\n",
98
- " local_dir=\"models\",\n",
99
- " local_dir_use_symlinks=False\n",
100
- ")\n",
101
- "print(\"✅ Downloaded:\", model_path)"
102
- ]
103
- },
104
- {
105
- "cell_type": "code",
106
- "execution_count": null,
107
- "id": "7862b7f1",
108
- "metadata": {},
109
- "outputs": [],
110
- "source": [
111
- "#@title ⚙️ Load model with llama.cpp (auto GPU offload)\n",
112
- "from llama_cpp import Llama\n",
113
- "\n",
114
- "def try_load(n_gpu_layers):\n",
115
- " print(f\"Trying n_gpu_layers={n_gpu_layers} ...\")\n",
116
- " return Llama(\n",
117
- " model_path=model_path,\n",
118
- " n_ctx=4096,\n",
119
- " n_threads=None,\n",
120
- " n_gpu_layers=n_gpu_layers, # -1 = all layers on GPU (if possible)\n",
121
- " logits_all=False,\n",
122
- " verbose=False,\n",
123
- " )\n",
124
- "\n",
125
- "llm = None\n",
126
- "for attempt in (-1, 40, 20, 0):\n",
127
- " try:\n",
128
- " llm = try_load(attempt)\n",
129
- " print(\"✅ Loaded with n_gpu_layers =\", attempt)\n",
130
- " break\n",
131
- " except Exception as e:\n",
132
- " print(\"Load failed:\", e)\n",
133
- "\n",
134
- "if llm is None:\n",
135
- " raise RuntimeError(\"Could not load the model. Try a smaller quant or reduce context.\")"
136
- ]
137
- },
138
- {
139
- "cell_type": "code",
140
- "execution_count": null,
141
- "id": "ab41cbde",
142
- "metadata": {},
143
- "outputs": [],
144
- "source": [
145
- "#@title 🧩 Prompt builder (code-first templates)\n",
146
- "from textwrap import dedent\n",
147
- "\n",
148
- "def build_prompt(user_query, system=\"You are an expert software engineer. Output concise, correct code. If possible, return code only.\"):\n",
149
- " instruct = dedent(f\"\"\"\n",
150
- " <|system|>\n",
151
- " {system}\n",
152
- " <|user|>\n",
153
- " {user_query}\n",
154
- " <|assistant|>\n",
155
- " \"\"\").strip()\n",
156
- " return instruct\n",
157
- "\n",
158
- "print(build_prompt(\"Write a Python function `is_prime(n)`.\"))"
159
- ]
160
- },
161
- {
162
- "cell_type": "code",
163
- "execution_count": null,
164
- "id": "c71af4c4",
165
- "metadata": {},
166
- "outputs": [],
167
- "source": [
168
- "#@title 🧪 Generate (single turn)\n",
169
- "user_request = \"Write a Python function `two_sum(nums, target)` returning indices.\" #@param {type:\"string\"}\n",
170
- "max_tokens = 512 #@param {type:\"slider\", min:64, max:2048, step:32}\n",
171
- "temperature = 0.2 #@param {type:\"number\"}\n",
172
- "code_only = True #@param {type:\"boolean\"}\n",
173
- "\n",
174
- "sys_prompt = \"You are an expert programmer. Prefer minimal, correct code. If possible, output only code.\"\n",
175
- "prompt = build_prompt(user_request, system=sys_prompt)\n",
176
- "\n",
177
- "stops = [\"<|user|>\", \"<|system|>\", \"</s>\", \"```\"] if code_only else [\"<|user|>\", \"<|system|>\", \"</s>\"]\n",
178
- "out = llm(prompt, max_tokens=max_tokens, temperature=temperature, stop=stops)\n",
179
- "text = out[\"choices\"][0][\"text\"]\n",
180
- "\n",
181
- "if code_only and \"```\" not in text:\n",
182
- " text = \"```python\\n\" + text.strip() + \"\\n```\"\n",
183
- "\n",
184
- "print(text)"
185
- ]
186
- },
187
- {
188
- "cell_type": "code",
189
- "execution_count": null,
190
- "id": "2701cdb8",
191
- "metadata": {},
192
- "outputs": [],
193
- "source": [
194
- "#@title 💬 Interactive code chat (UI)\n",
195
- "import ipywidgets as widgets\n",
196
- "from IPython.display import display, Markdown\n",
197
- "\n",
198
- "sys_area = widgets.Textarea(\n",
199
- " value=\"You are an expert programmer. Prefer minimal, correct code. If possible, output only code.\",\n",
200
- " description=\"System\",\n",
201
- " layout=widgets.Layout(width=\"100%\", height=\"80px\")\n",
202
- ")\n",
203
- "user_area = widgets.Textarea(\n",
204
- " value=\"Write a Python function to parse a CSV file and compute average of a column named 'score'.\",\n",
205
- " description=\"Prompt\",\n",
206
- " layout=widgets.Layout(width=\"100%\", height=\"100px\")\n",
207
- ")\n",
208
- "temp = widgets.FloatSlider(value=0.2, min=0.0, max=1.2, step=0.05, description=\"Temperature\")\n",
209
- "maxtok = widgets.IntSlider(value=512, min=64, max=2048, step=32, description=\"Max tokens\")\n",
210
- "code_only_box = widgets.Checkbox(value=True, description=\"Code only\")\n",
211
- "run_btn = widgets.Button(description=\"Generate\", button_style=\"success\")\n",
212
- "out_area = widgets.Output()\n",
213
- "\n",
214
- "def on_run(_):\n",
215
- " out_area.clear_output()\n",
216
- " with out_area:\n",
217
- " prompt = build_prompt(user_area.value, system=sys_area.value)\n",
218
- " stops = [\"<|user|>\", \"<|system|>\", \"</s>\", \"```\"] if code_only_box.value else [\"<|user|>\", \"<|system|>\", \"</s>\"]\n",
219
- " result = llm(prompt, max_tokens=maxtok.value, temperature=temp.value, stop=stops)\n",
220
- " text = result[\"choices\"][0][\"text\"]\n",
221
- " if code_only_box.value and \"```\" not in text:\n",
222
- " text = \"```python\\n\" + text.strip() + \"\\n```\"\n",
223
- " display(Markdown(text))\n",
224
- "\n",
225
- "run_btn.on_click(on_run)\n",
226
- "display(widgets.VBox([sys_area, user_area, temp, maxtok, code_only_box, run_btn, out_area]))"
227
- ]
228
- },
229
- {
230
- "cell_type": "code",
231
- "execution_count": null,
232
- "id": "37a7a7f9",
233
- "metadata": {},
234
- "outputs": [],
235
- "source": [
236
- "#@title 🌐 Optional: start local API server (OpenAI-like)\n",
237
- "# After running, open http://127.0.0.1:8000/docs inside Colab to test.\n",
238
- "import threading\n",
239
- "from llama_cpp.server.app import create_app\n",
240
- "from fastapi.middleware.cors import CORSMiddleware\n",
241
- "import uvicorn\n",
242
- "\n",
243
- "app = create_app(llm)\n",
244
- "app.add_middleware(\n",
245
- " CORSMiddleware,\n",
246
- " allow_origins=[\"*\"],\n",
247
- " allow_credentials=True,\n",
248
- " allow_methods=[\"*\"],\n",
249
- " allow_headers=[\"*\"],\n",
250
- ")\n",
251
- "\n",
252
- "def run_server():\n",
253
- " uvicorn.run(app, host=\"0.0.0.0\", port=8000, log_level=\"info\")\n",
254
- "\n",
255
- "thread = threading.Thread(target=run_server, daemon=True)\n",
256
- "thread.start()\n",
257
- "print(\"Server starting on http://127.0.0.1:8000\")"
258
- ]
259
- }
260
- ],
261
- "metadata": {},
262
- "nbformat": 4,
263
- "nbformat_minor": 5
264
- }
 
1
#
# 🚀 CodeLlama 7B Instruct (GGUF Q4_K_M) — Colab Notebook
# This notebook runs TheBloke/CodeLlama-7B-Instruct-GGUF in Google Colab
# using llama-cpp-python with automatic GPU offloading.
#

# Cell 1: Check environment
#@title 🔧 Check environment
# IPython shell escapes (Colab-only syntax): show GPU status if an NVIDIA
# driver is present, otherwise fall back to an informational echo, then
# report the Python version the runtime is using.
!nvidia-smi || echo "No NVIDIA GPU detected (CPU mode will be used)"
!python --version
11
+
12
# Cell 2: Install dependencies
#@title ⬇️ Install dependencies (GPU wheel if possible; fallback to CPU)
import sys
import subprocess

def pip_install(args):
    """Run `pip install -qU <args>` in this interpreter; return pip's exit code."""
    print("pip install", " ".join(args))
    cmd = [sys.executable, "-m", "pip", "install", "-qU", *args]
    return subprocess.call(cmd)

cuda_spec = "cu121"
gpu_index = f"https://abetlen.github.io/llama-cpp-python/whl/{cuda_spec}"

# Same package set for both install paths; only the extra index differs.
packages = ["llama-cpp-python>=0.2.90", "huggingface_hub>=0.23.0", "ipywidgets"]

# Try GPU wheel first
rc = pip_install([f"--extra-index-url={gpu_index}", *packages])
if rc != 0:
    print("⚠️ GPU wheel failed, trying CPU wheel...")
    rc2 = pip_install(list(packages))
    if rc2 != 0:
        raise RuntimeError("Failed to install llama-cpp-python")
print("✅ Installation complete")
30
+
31
# Cell 3: (Optional) Hugging Face login
#@title 🔐 (Optional) Hugging Face login
from google.colab import userdata
from huggingface_hub import login

# Use Colab secrets (key icon in the left sidebar) to store your HF token
# under the name HF_TOKEN. Login is optional: public GGUF repos download
# without it, so every failure path just prints and continues.
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
    login(token=HF_TOKEN, add_to_git_credential=True)
    # Restored the ✅ prefix (the old message had a stray leading space where
    # the emoji was dropped), matching the other status messages in this file.
    print("✅ Logged in to Hugging Face")
except userdata.SecretNotFoundError:
    # Secret simply not configured — proceed anonymously.
    print("Skipping login (HF_TOKEN secret not found)")
except Exception as e:
    # Covers denied secret access and invalid tokens; login stays best-effort.
    print(f"Login failed: {e}")
45
+
46
+
47
# Cell 4: Download model (GGUF) from Hugging Face
#@title 📦 Download model (GGUF) from Hugging Face
from huggingface_hub import hf_hub_download

REPO_ID = "TheBloke/CodeLlama-7B-Instruct-GGUF"   # source repository
FILENAME = "codellama-7b-instruct.Q4_K_M.gguf"    # Q4_K_M quantization

print(f"Downloading {FILENAME} from {REPO_ID}...")
# `local_dir_use_symlinks` is deprecated and ignored in huggingface_hub>=0.23
# (which Cell 2 pins), so it is dropped here: with `local_dir` set, the file
# is written directly into ./models either way, without the warning.
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME,
    local_dir="models",
)
# ✅ restored (the previous message had a stray leading space where the
# emoji was dropped), matching the other status messages in this file.
print("✅ Downloaded:", model_path)
62
+
63
+
64
# Cell 5: Load model with llama.cpp (auto GPU offload)
#@title ⚙️ Load model with llama.cpp (auto GPU offload)
from llama_cpp import Llama

def try_load(n_gpu_layers):
    """Construct a Llama instance with the given GPU-layer offload count.

    n_gpu_layers: -1 offloads all layers to the GPU, 0 keeps everything on
    the CPU. Raises if llama.cpp cannot allocate the model this way.
    """
    print(f"Trying to load model with n_gpu_layers={n_gpu_layers} ...")
    return Llama(
        model_path=model_path,
        n_ctx=4096,
        n_threads=None,  # Auto-detect
        n_gpu_layers=n_gpu_layers,  # -1 = all layers on GPU (if possible)
        verbose=False,
    )

llm = None
# Attempt to load with max GPU layers, then progressively fewer, then CPU only.
# The extra 20-layer step lets partial offload succeed on small-VRAM GPUs
# (e.g. a T4 shared with other processes) before falling back to pure CPU.
for attempt in (-1, 40, 20, 0):
    try:
        llm = try_load(attempt)
        # ✅ restored (the previous message had a stray leading space where
        # the emoji was dropped), matching the other status messages.
        print(f"✅ Model loaded successfully with n_gpu_layers = {attempt}")
        break
    except Exception as e:
        print(f"Load failed with {attempt} GPU layers: {e}")

if llm is None:
    raise RuntimeError("Could not load the model. Ensure you have enough RAM/VRAM.")
90
+
91
+
92
# Cell 6: Prompt builder
#@title 🧩 Prompt builder (code-first templates)
from textwrap import dedent

def build_prompt(user_query, system="You are an expert software engineer. Output concise, correct code. If possible, return code only."):
    """Wrap *user_query* and a system instruction in the chat template.

    Returns the template with surrounding whitespace removed, ending in the
    `<|assistant|>` tag so generation continues from the assistant turn.
    """
    template = f"""
    <|system|>
    {system}
    <|user|>
    {user_query}
    <|assistant|>
    """
    # dedent strips the literal's indentation; strip drops the outer newlines.
    return dedent(template).strip()

# Example of a built prompt
print("--- Example Prompt ---")
print(build_prompt("Write a Python function `is_prime(n)`."))
print("----------------------")
109
+
110
+
111
# Cell 7: Interactive code chat UI
#@title 💬 Interactive code chat (UI)
from google.colab import output
output.enable_custom_widget_manager() # Enable widgets in Colab

import ipywidgets as widgets
from IPython.display import display, Markdown

# System-prompt editor, pre-filled with a code-only instruction.
sys_area = widgets.Textarea(
    value="You are an expert programmer. Prefer minimal, correct code. If possible, output only code.",
    description="System",
    layout=widgets.Layout(width="100%", height="80px")
)
# User request editor.
user_area = widgets.Textarea(
    value="Write a Python function to parse a CSV file and compute the average of a column named 'score'.",
    description="Prompt",
    layout=widgets.Layout(width="100%", height="100px")
)
# Sampling controls and output toggle.
temp = widgets.FloatSlider(value=0.2, min=0.0, max=1.2, step=0.05, description="Temperature")
maxtok = widgets.IntSlider(value=512, min=64, max=2048, step=32, description="Max tokens")
code_only_box = widgets.Checkbox(value=True, description="Code only")
run_btn = widgets.Button(description="Generate", button_style="success")
out_area = widgets.Output()

def on_run(_):
    """Button callback: build the prompt, run `llm`, render the result.

    Relies on `build_prompt` (Cell 6) and the loaded `llm` (Cell 5) being
    present at module scope; run those cells first.
    """
    out_area.clear_output()
    with out_area:
        print("Generating response...")
        prompt = build_prompt(user_area.value, system=sys_area.value)
        # In code-only mode, also stop at a closing code fence so the model
        # doesn't ramble past the snippet.
        stops = ["<|user|>", "<|system|>", "</s>", "```"] if code_only_box.value else ["<|user|>", "<|system|>", "</s>"]

        result = llm(prompt, max_tokens=maxtok.value, temperature=temp.value, stop=stops)
        text = result["choices"][0]["text"]

        # Post-processing to ensure it's a clean code block
        if code_only_box.value and "```" not in text:
            text = "```python\n" + text.strip() + "\n```"

        out_area.clear_output() # Clear "Generating..." message
        display(Markdown(text))

run_btn.on_click(on_run)
display(widgets.VBox([sys_area, user_area, temp, maxtok, code_only_box, run_btn, out_area]))