juliaturc commited on
Commit
73702d3
·
1 Parent(s): a1892e6

Fix IpynbChunker and add unit tests.

Browse files
repo2vec/chunker.py CHANGED
@@ -261,10 +261,13 @@ class IpynbFileChunker(Chunker):
261
 
262
  notebook = nbformat.reads(content, as_version=nbformat.NO_CONVERT)
263
  python_code = "\n".join([cell.source for cell in notebook.cells if cell.cell_type == "code"])
264
- chunks = self.code_chunker.chunk(filename.replace(".ipynb", ".py"), python_code)
265
- # Change back the filenames to .ipynb.
 
 
266
  for chunk in chunks:
267
- chunk.filename = chunk.filename.replace(".py", ".ipynb")
 
268
  return chunks
269
 
270
 
 
261
 
262
  notebook = nbformat.reads(content, as_version=nbformat.NO_CONVERT)
263
  python_code = "\n".join([cell.source for cell in notebook.cells if cell.cell_type == "code"])
264
+
265
+ tmp_metadata = {"file_path": filename.replace(".ipynb", ".py")}
266
+ chunks = self.code_chunker.chunk(python_code, tmp_metadata)
267
+
268
  for chunk in chunks:
269
+ # Update filenames back to .ipynb
270
+ chunk.metadata = metadata
271
  return chunks
272
 
273
 
tests/__init__.py ADDED
File without changes
tests/assets/sample-notebook.ipynb ADDED
@@ -0,0 +1,751 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "id": "5norOZI0mA6s"
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
+ "# Copyright 2023 Google LLC\n",
12
+ "#\n",
13
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
14
+ "# you may not use this file except in compliance with the License.\n",
15
+ "# You may obtain a copy of the License at\n",
16
+ "#\n",
17
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
18
+ "#\n",
19
+ "# Unless required by applicable law or agreed to in writing, software\n",
20
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
21
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
22
+ "# See the License for the specific language governing permissions and\n",
23
+ "# limitations under the License."
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "markdown",
28
+ "metadata": {
29
+ "id": "XNPE46X8mJj4"
30
+ },
31
+ "source": [
32
+ "# Use Retrieval Augmented Generation (RAG) with Codey APIs\n",
33
+ "\n",
34
+ "<table align=\"left\">\n",
35
+ "\n",
36
+ " <td style=\"text-align: center\">\n",
37
+ " <a href=\"https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/language/code/code_retrieval_augmented_generation.ipynb\">\n",
38
+ " <img src=\"https://cloud.google.com/ml-engine/images/colab-logo-32px.png\" alt=\"Google Colaboratory logo\"><br> Open in Colab\n",
39
+ " </a>\n",
40
+ " </td>\n",
41
+ "\n",
42
+ " <td style=\"text-align: center\">\n",
43
+ " <a href=\"https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Flanguage%2Fcode%2Fcode_retrieval_augmented_generation.ipynb\">\n",
44
+ " <img width=\"32px\" src=\"https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png\" alt=\"Google Cloud Colab Enterprise logo\"><br> Open in Colab Enterprise\n",
45
+ " </a>\n",
46
+ " </td>\n",
47
+ " <td style=\"text-align: center\">\n",
48
+ " <a href=\"https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/code/code_retrieval_augmented_generation.ipynb\">\n",
49
+ " <img src=\"https://cloud.google.com/ml-engine/images/github-logo-32px.png\" alt=\"GitHub logo\"><br> View on GitHub\n",
50
+ " </a>\n",
51
+ " </td>\n",
52
+ " <td style=\"text-align: center\">\n",
53
+ " <a href=\"https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/language/code/code_retrieval_augmented_generation.ipynb\">\n",
54
+ " <img src=\"https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32\" alt=\"Vertex AI logo\"><br> Open in Workbench\n",
55
+ " </a>\n",
56
+ " </td>\n",
57
+ "</table>"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "markdown",
62
+ "metadata": {
63
+ "id": "VrLtlKPFqSxB"
64
+ },
65
+ "source": [
66
+ "| | |\n",
67
+ "|-|-|\n",
68
+ "|Author(s) | [Lavi Nigam](https://github.com/lavinigam-gcp), [Polong Lin](https://github.com/polong-lin) |"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "markdown",
73
+ "metadata": {
74
+ "id": "zNAEdYNFmQcP"
75
+ },
76
+ "source": [
77
+ "### Objective\n",
78
+ "\n",
79
+ "This notebook demonstrates how you augment output from Gemini APIs by bringing in external knowledge. An example is provided using Code Retrieval Augmented Generation(RAG) pattern using [Google Cloud's Generative AI github repository](https://github.com/GoogleCloudPlatform/generative-ai) as external knowledge. The notebook uses [Vertex AI Gemini API](https://ai.google.dev/gemini-api), [Embeddings for Text API](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings), FAISS vector store and [LangChain 🦜️🔗](https://python.langchain.com/en/latest/).\n",
80
+ "\n",
81
+ "### Overview\n",
82
+ "\n",
83
+ "Here is overview of what we'll go over.\n",
84
+ "\n",
85
+ "Index Creation:\n",
86
+ "\n",
87
+ "1. Recursively list the files(.ipynb) in github repo\n",
88
+ "2. Extract code and markdown from the files\n",
89
+ "3. Chunk & generate embeddings for each code strings and add initialize the vector store\n",
90
+ "\n",
91
+ "Runtime:\n",
92
+ "\n",
93
+ "4. User enters a prompt or asks a question as a prompt\n",
94
+ "5. Try zero-shot prompt\n",
95
+ "6. Run prompt using RAG Chain & compare results.To generate response we use **gemini-1.5-pro**\n",
96
+ "\n",
97
+ "### Cost\n",
98
+ "\n",
99
+ "This tutorial uses billable components of Google Cloud:\n",
100
+ "\n",
101
+ "- Vertex AI Gemini APIs offered by Google Cloud\n",
102
+ "\n",
103
+ "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.\n",
104
+ "\n",
105
+ "**Note:** We are using local vector store(FAISS) for this example however recommend managed highly scalable vector store for production usage such as [Vertex AI Vector Search](https://cloud.google.com/vertex-ai/docs/vector-search/overview) or [AlloyDB for PostgreSQL](https://cloud.google.com/alloydb/docs/ai/work-with-embeddings) or [Cloud SQL for PostgreSQL](https://cloud.google.com/sql/docs/postgres/features) using pgvector extension."
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "markdown",
110
+ "metadata": {
111
+ "id": "2cab0c8509c9"
112
+ },
113
+ "source": [
114
+ "## Get started"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "markdown",
119
+ "metadata": {
120
+ "id": "b56b5a5d28c1"
121
+ },
122
+ "source": [
123
+ "### Install Vertex AI SDK for Python and other required packages\n"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": null,
129
+ "metadata": {
130
+ "id": "QHaqV20Csqkt"
131
+ },
132
+ "outputs": [],
133
+ "source": [
134
+ "!pip3 install --upgrade --user -q google-cloud-aiplatform \\\n",
135
+ " langchain \\\n",
136
+ " langchain_google_vertexai \\\n",
137
+ " langchain-community \\\n",
138
+ " faiss-cpu \\\n",
139
+ " nbformat"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "markdown",
144
+ "metadata": {
145
+ "id": "-VUWOgz6M1rZ"
146
+ },
147
+ "source": [
148
+ "### Restart runtime (Colab only)\n",
149
+ "\n",
150
+ "To use the newly installed packages, you must restart the runtime on Google Colab."
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": null,
156
+ "metadata": {
157
+ "id": "BIS8EYgkMy8T"
158
+ },
159
+ "outputs": [],
160
+ "source": [
161
+ "import sys\n",
162
+ "\n",
163
+ "if \"google.colab\" in sys.modules:\n",
164
+ " import IPython\n",
165
+ "\n",
166
+ " app = IPython.Application.instance()\n",
167
+ " app.kernel.do_shutdown(True)"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "markdown",
172
+ "metadata": {
173
+ "id": "0af13c10a26a"
174
+ },
175
+ "source": [
176
+ "<div class=\"alert alert-block alert-warning\">\n",
177
+ "<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>\n",
178
+ "</div>\n"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "markdown",
183
+ "metadata": {
184
+ "id": "uZcP9WBENG0e"
185
+ },
186
+ "source": [
187
+ "### Authenticate your notebook environment (Colab only)\n",
188
+ "\n",
189
+ "Authenticate your environment on Google Colab.\n"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": null,
195
+ "metadata": {
196
+ "id": "1S_HgQXQNcbz"
197
+ },
198
+ "outputs": [],
199
+ "source": [
200
+ "import sys\n",
201
+ "\n",
202
+ "if \"google.colab\" in sys.modules:\n",
203
+ " from google.colab import auth\n",
204
+ "\n",
205
+ " auth.authenticate_user()"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "markdown",
210
+ "metadata": {
211
+ "id": "rVmxMr43Nhoo"
212
+ },
213
+ "source": [
214
+ "### Import libraries"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": null,
220
+ "metadata": {
221
+ "id": "L-Tljm5asMBc"
222
+ },
223
+ "outputs": [],
224
+ "source": [
225
+ "import time\n",
226
+ "from typing import List, Optional\n",
227
+ "\n",
228
+ "from google.cloud import aiplatform\n",
229
+ "from langchain.chains import RetrievalQA\n",
230
+ "from langchain.prompts import PromptTemplate\n",
231
+ "from langchain.schema.document import Document\n",
232
+ "from langchain.text_splitter import Language, RecursiveCharacterTextSplitter\n",
233
+ "from langchain.vectorstores import FAISS\n",
234
+ "\n",
235
+ "# LangChain\n",
236
+ "from langchain_google_vertexai import VertexAI, VertexAIEmbeddings\n",
237
+ "import nbformat\n",
238
+ "import requests\n",
239
+ "\n",
240
+ "# Vertex AI\n",
241
+ "import vertexai\n",
242
+ "\n",
243
+ "# Print the version of Vertex AI SDK for Python\n",
244
+ "print(f\"Vertex AI SDK version: {aiplatform.__version__}\")"
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "markdown",
249
+ "metadata": {
250
+ "id": "4f872cd812d0"
251
+ },
252
+ "source": [
253
+ "### Set Google Cloud project information and initialize Vertex AI SDK for Python\n",
254
+ "\n",
255
+ "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)."
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": null,
261
+ "metadata": {
262
+ "id": "eNGEcBKG0iK-"
263
+ },
264
+ "outputs": [],
265
+ "source": [
266
+ "# Initialize project\n",
267
+ "# Define project information\n",
268
+ "PROJECT_ID = \"YOUR_PROJECT_ID\" # @param {type:\"string\"}\n",
269
+ "LOCATION = \"us-central1\" # @param {type:\"string\"}\n",
270
+ "\n",
271
+ "vertexai.init(project=PROJECT_ID, location=LOCATION)\n",
272
+ "\n",
273
+ "# Code Generation\n",
274
+ "code_llm = VertexAI(\n",
275
+ " model_name=\"gemini-1.5-pro\",\n",
276
+ " max_output_tokens=2048,\n",
277
+ " temperature=0.1,\n",
278
+ " verbose=False,\n",
279
+ ")"
280
+ ]
281
+ },
282
+ {
283
+ "cell_type": "markdown",
284
+ "metadata": {
285
+ "id": "o537exyZk9DI"
286
+ },
287
+ "source": [
288
+ "Next we need to create a GitHub personal token to be able to list all files in a repository.\n",
289
+ "\n",
290
+ "- Follow [this link](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) to create GitHub token with repo->public_repo scope and update `GITHUB_TOKEN` variable below."
291
+ ]
292
+ },
293
+ {
294
+ "cell_type": "code",
295
+ "execution_count": null,
296
+ "metadata": {
297
+ "id": "Bt9IVDSqk7y4"
298
+ },
299
+ "outputs": [],
300
+ "source": [
301
+ "# provide GitHub personal access token\n",
302
+ "GITHUB_TOKEN = \"YOUR_GITHUB_TOKEN\" # @param {type:\"string\"}\n",
303
+ "GITHUB_REPO = \"GoogleCloudPlatform/generative-ai\" # @param {type:\"string\"}"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "markdown",
308
+ "metadata": {
309
+ "id": "dqq3GeEbOJbU"
310
+ },
311
+ "source": [
312
+ "# Index Creation\n",
313
+ "\n",
314
+ "We use the Google Cloud Generative AI github repository as the data source. First list all Jupyter Notebook files in the repo and store it in a text file.\n",
315
+ "\n",
316
+ "You can skip this step(#1) if you have executed it once and generated the output text file.\n",
317
+ "\n",
318
+ "### 1. Recursively list the files(.ipynb) in the github repository"
319
+ ]
320
+ },
321
+ {
322
+ "cell_type": "code",
323
+ "execution_count": null,
324
+ "metadata": {
325
+ "id": "eTA1Jt0uOX8y"
326
+ },
327
+ "outputs": [],
328
+ "source": [
329
+ "# Crawls a GitHub repository and returns a list of all ipynb files in the repository\n",
330
+ "def crawl_github_repo(url: str, is_sub_dir: bool, access_token: str = GITHUB_TOKEN):\n",
331
+ " ignore_list = [\"__init__.py\"]\n",
332
+ "\n",
333
+ " if not is_sub_dir:\n",
334
+ " api_url = f\"https://api.github.com/repos/{url}/contents\"\n",
335
+ "\n",
336
+ " else:\n",
337
+ " api_url = url\n",
338
+ "\n",
339
+ " headers = {\n",
340
+ " \"Accept\": \"application/vnd.github.v3+json\",\n",
341
+ " \"Authorization\": f\"Bearer {access_token}\",\n",
342
+ " }\n",
343
+ "\n",
344
+ " response = requests.get(api_url, headers=headers)\n",
345
+ " response.raise_for_status() # Check for any request errors\n",
346
+ "\n",
347
+ " files = []\n",
348
+ "\n",
349
+ " contents = response.json()\n",
350
+ "\n",
351
+ " for item in contents:\n",
352
+ " if (\n",
353
+ " item[\"type\"] == \"file\"\n",
354
+ " and item[\"name\"] not in ignore_list\n",
355
+ " and (item[\"name\"].endswith(\".py\") or item[\"name\"].endswith(\".ipynb\"))\n",
356
+ " ):\n",
357
+ " files.append(item[\"html_url\"])\n",
358
+ " elif item[\"type\"] == \"dir\" and not item[\"name\"].startswith(\".\"):\n",
359
+ " sub_files = crawl_github_repo(item[\"url\"], True)\n",
360
+ " time.sleep(0.1)\n",
361
+ " files.extend(sub_files)\n",
362
+ "\n",
363
+ " return files"
364
+ ]
365
+ },
366
+ {
367
+ "cell_type": "code",
368
+ "execution_count": null,
369
+ "metadata": {
370
+ "id": "5vaKaxcGO_R6"
371
+ },
372
+ "outputs": [],
373
+ "source": [
374
+ "code_files_urls = crawl_github_repo(GITHUB_REPO, False, GITHUB_TOKEN)\n",
375
+ "\n",
376
+ "# Write list to a file so you do not have to download each time\n",
377
+ "with open(\"code_files_urls.txt\", \"w\") as f:\n",
378
+ " for item in code_files_urls:\n",
379
+ " f.write(item + \"\\n\")\n",
380
+ "\n",
381
+ "len(code_files_urls)"
382
+ ]
383
+ },
384
+ {
385
+ "cell_type": "code",
386
+ "execution_count": null,
387
+ "metadata": {
388
+ "id": "c5hoNYJ5byMJ"
389
+ },
390
+ "outputs": [],
391
+ "source": [
392
+ "code_files_urls[0:10]"
393
+ ]
394
+ },
395
+ {
396
+ "cell_type": "markdown",
397
+ "metadata": {
398
+ "id": "mFNVieLnR8Ie"
399
+ },
400
+ "source": [
401
+ "### 2. Extract code from the Jupyter notebooks.\n",
402
+ "\n",
403
+ "You could also include .py file, shell scripts etc."
404
+ ]
405
+ },
406
+ {
407
+ "cell_type": "code",
408
+ "execution_count": null,
409
+ "metadata": {
410
+ "id": "ZsM1M4hn4cBu"
411
+ },
412
+ "outputs": [],
413
+ "source": [
414
+ "# Extracts the python code from an ipynb file from github\n",
415
+ "def extract_python_code_from_ipynb(github_url, cell_type=\"code\"):\n",
416
+ " raw_url = github_url.replace(\"github.com\", \"raw.githubusercontent.com\").replace(\n",
417
+ " \"/blob/\", \"/\"\n",
418
+ " )\n",
419
+ "\n",
420
+ " response = requests.get(raw_url)\n",
421
+ " response.raise_for_status() # Check for any request errors\n",
422
+ "\n",
423
+ " notebook_content = response.text\n",
424
+ "\n",
425
+ " notebook = nbformat.reads(notebook_content, as_version=nbformat.NO_CONVERT)\n",
426
+ "\n",
427
+ " python_code = None\n",
428
+ "\n",
429
+ " for cell in notebook.cells:\n",
430
+ " if cell.cell_type == cell_type:\n",
431
+ " if not python_code:\n",
432
+ " python_code = cell.source\n",
433
+ " else:\n",
434
+ " python_code += \"\\n\" + cell.source\n",
435
+ "\n",
436
+ " return python_code\n",
437
+ "\n",
438
+ "\n",
439
+ "def extract_python_code_from_py(github_url):\n",
440
+ " raw_url = github_url.replace(\"github.com\", \"raw.githubusercontent.com\").replace(\n",
441
+ " \"/blob/\", \"/\"\n",
442
+ " )\n",
443
+ "\n",
444
+ " response = requests.get(raw_url)\n",
445
+ " response.raise_for_status() # Check for any request errors\n",
446
+ "\n",
447
+ " python_code = response.text\n",
448
+ "\n",
449
+ " return python_code"
450
+ ]
451
+ },
452
+ {
453
+ "cell_type": "code",
454
+ "execution_count": null,
455
+ "metadata": {
456
+ "id": "WCRp5Xtb48is"
457
+ },
458
+ "outputs": [],
459
+ "source": [
460
+ "with open(\"code_files_urls.txt\") as f:\n",
461
+ " code_files_urls = f.read().splitlines()\n",
462
+ "len(code_files_urls)"
463
+ ]
464
+ },
465
+ {
466
+ "cell_type": "code",
467
+ "execution_count": null,
468
+ "metadata": {
469
+ "id": "4Y9SMO7H4xgF"
470
+ },
471
+ "outputs": [],
472
+ "source": [
473
+ "code_strings = []\n",
474
+ "\n",
475
+ "for i in range(0, len(code_files_urls)):\n",
476
+ " if code_files_urls[i].endswith(\".ipynb\"):\n",
477
+ " content = extract_python_code_from_ipynb(code_files_urls[i], \"code\")\n",
478
+ " doc = Document(\n",
479
+ " page_content=content, metadata={\"url\": code_files_urls[i], \"file_index\": i}\n",
480
+ " )\n",
481
+ " code_strings.append(doc)"
482
+ ]
483
+ },
484
+ {
485
+ "cell_type": "markdown",
486
+ "metadata": {
487
+ "id": "T1AF3fhBSLOm"
488
+ },
489
+ "source": [
490
+ "### 3. Chunk & generate embeddings for each code strings & initialize the vector store\n",
491
+ "\n",
492
+ "We need to split code into usable chunks that the LLM can use for code generation. Therefore it's crucial to use the right chunking approach and chunk size."
493
+ ]
494
+ },
495
+ {
496
+ "cell_type": "code",
497
+ "execution_count": null,
498
+ "metadata": {
499
+ "id": "Rj1cCA2fqx64"
500
+ },
501
+ "outputs": [],
502
+ "source": [
503
+ "# Utility functions for Embeddings API with rate limiting\n",
504
+ "def rate_limit(max_per_minute):\n",
505
+ " period = 60 / max_per_minute\n",
506
+ " print(\"Waiting\")\n",
507
+ " while True:\n",
508
+ " before = time.time()\n",
509
+ " yield\n",
510
+ " after = time.time()\n",
511
+ " elapsed = after - before\n",
512
+ " sleep_time = max(0, period - elapsed)\n",
513
+ " if sleep_time > 0:\n",
514
+ " print(\".\", end=\"\")\n",
515
+ " time.sleep(sleep_time)\n",
516
+ "\n",
517
+ "\n",
518
+ "class CustomVertexAIEmbeddings(VertexAIEmbeddings):\n",
519
+ " requests_per_minute: int\n",
520
+ " num_instances_per_batch: int\n",
521
+ " model_name: str\n",
522
+ "\n",
523
+ " # Overriding embed_documents method\n",
524
+ " def embed_documents(\n",
525
+ " self, texts: List[str], batch_size: Optional[int] = None\n",
526
+ " ) -> List[List[float]]:\n",
527
+ " limiter = rate_limit(self.requests_per_minute)\n",
528
+ " results = []\n",
529
+ " docs = list(texts)\n",
530
+ "\n",
531
+ " while docs:\n",
532
+ " # Working in batches because the API accepts maximum 5\n",
533
+ " # documents per request to get embeddings\n",
534
+ " head, docs = (\n",
535
+ " docs[: self.num_instances_per_batch],\n",
536
+ " docs[self.num_instances_per_batch :],\n",
537
+ " )\n",
538
+ " chunk = self.client.get_embeddings(head)\n",
539
+ " results.extend(chunk)\n",
540
+ " next(limiter)\n",
541
+ "\n",
542
+ " return [r.values for r in results]"
543
+ ]
544
+ },
545
+ {
546
+ "cell_type": "code",
547
+ "execution_count": null,
548
+ "metadata": {
549
+ "id": "oae37l-pvzZ6"
550
+ },
551
+ "outputs": [],
552
+ "source": [
553
+ "# Chunk code strings\n",
554
+ "text_splitter = RecursiveCharacterTextSplitter.from_language(\n",
555
+ " language=Language.PYTHON, chunk_size=2000, chunk_overlap=200\n",
556
+ ")\n",
557
+ "\n",
558
+ "\n",
559
+ "texts = text_splitter.split_documents(code_strings)\n",
560
+ "print(len(texts))\n",
561
+ "\n",
562
+ "# Initialize Embedding API\n",
563
+ "EMBEDDING_QPM = 100\n",
564
+ "EMBEDDING_NUM_BATCH = 5\n",
565
+ "embeddings = CustomVertexAIEmbeddings(\n",
566
+ " requests_per_minute=EMBEDDING_QPM,\n",
567
+ " num_instances_per_batch=EMBEDDING_NUM_BATCH,\n",
568
+ " model_name=\"textembedding-gecko@latest\",\n",
569
+ ")\n",
570
+ "\n",
571
+ "# Create Index from embedded code chunks\n",
572
+ "db = FAISS.from_documents(texts, embeddings)\n",
573
+ "\n",
574
+ "# Init your retriever.\n",
575
+ "retriever = db.as_retriever(\n",
576
+ " search_type=\"similarity\", # Also test \"similarity\", \"mmr\"\n",
577
+ " search_kwargs={\"k\": 5},\n",
578
+ ")\n",
579
+ "\n",
580
+ "retriever"
581
+ ]
582
+ },
583
+ {
584
+ "cell_type": "markdown",
585
+ "metadata": {
586
+ "id": "Q_gn89IyuHIT"
587
+ },
588
+ "source": [
589
+ "# Runtime\n",
590
+ "### 4. User enters a prompt or asks a question as a prompt"
591
+ ]
592
+ },
593
+ {
594
+ "cell_type": "code",
595
+ "execution_count": null,
596
+ "metadata": {
597
+ "id": "1vrvTkO7uFNi"
598
+ },
599
+ "outputs": [],
600
+ "source": [
601
+ "user_question = \"Create a Python function that takes a prompt and predicts using langchain.llms interface with Vertex AI text-bison model\""
602
+ ]
603
+ },
604
+ {
605
+ "cell_type": "code",
606
+ "execution_count": null,
607
+ "metadata": {
608
+ "id": "azbvOUFRvEp5"
609
+ },
610
+ "outputs": [],
611
+ "source": [
612
+ "# Define prompt templates\n",
613
+ "\n",
614
+ "# Zero Shot prompt template\n",
615
+ "prompt_zero_shot = \"\"\"\n",
616
+ " You are a proficient python developer. Respond with the syntactically correct & concise code for to the question below.\n",
617
+ "\n",
618
+ " Question:\n",
619
+ " {question}\n",
620
+ "\n",
621
+ " Output Code :\n",
622
+ " \"\"\"\n",
623
+ "\n",
624
+ "prompt_prompt_zero_shot = PromptTemplate(\n",
625
+ " input_variables=[\"question\"],\n",
626
+ " template=prompt_zero_shot,\n",
627
+ ")\n",
628
+ "\n",
629
+ "\n",
630
+ "# RAG template\n",
631
+ "prompt_RAG = \"\"\"\n",
632
+ " You are a proficient python developer. Respond with the syntactically correct code for to the question below. Make sure you follow these rules:\n",
633
+ " 1. Use context to understand the APIs and how to use it & apply.\n",
634
+ " 2. Do not add license information to the output code.\n",
635
+ " 3. Do not include Colab code in the output.\n",
636
+ " 4. Ensure all the requirements in the question are met.\n",
637
+ "\n",
638
+ " Question:\n",
639
+ " {question}\n",
640
+ "\n",
641
+ " Context:\n",
642
+ " {context}\n",
643
+ "\n",
644
+ " Helpful Response :\n",
645
+ " \"\"\"\n",
646
+ "\n",
647
+ "prompt_RAG_template = PromptTemplate(\n",
648
+ " template=prompt_RAG, input_variables=[\"context\", \"question\"]\n",
649
+ ")\n",
650
+ "\n",
651
+ "qa_chain = RetrievalQA.from_llm(\n",
652
+ " llm=code_llm,\n",
653
+ " prompt=prompt_RAG_template,\n",
654
+ " retriever=retriever,\n",
655
+ " return_source_documents=True,\n",
656
+ ")"
657
+ ]
658
+ },
659
+ {
660
+ "cell_type": "markdown",
661
+ "metadata": {
662
+ "id": "3NBaObAQSlIv"
663
+ },
664
+ "source": [
665
+ "### 5. Try zero-shot prompt"
666
+ ]
667
+ },
668
+ {
669
+ "cell_type": "code",
670
+ "execution_count": null,
671
+ "metadata": {
672
+ "id": "1svTVwtBS0zP"
673
+ },
674
+ "outputs": [],
675
+ "source": [
676
+ "response = code_llm.invoke(input=user_question, max_output_tokens=2048, temperature=0.1)\n",
677
+ "print(response)"
678
+ ]
679
+ },
680
+ {
681
+ "cell_type": "markdown",
682
+ "metadata": {
683
+ "id": "JPm8qdxzwPM0"
684
+ },
685
+ "source": [
686
+ "### 6. Run prompt using RAG Chain & compare results\n",
687
+ "To generate the response we use **gemini-1.5-pro**, as in the earlier steps"
688
+ ]
689
+ },
690
+ {
691
+ "cell_type": "code",
692
+ "execution_count": null,
693
+ "metadata": {
694
+ "id": "ZMz3nPMyVoj_"
695
+ },
696
+ "outputs": [],
697
+ "source": [
698
+ "results = qa_chain.invoke(input={\"query\": user_question})\n",
699
+ "print(results[\"result\"])"
700
+ ]
701
+ },
702
+ {
703
+ "cell_type": "markdown",
704
+ "metadata": {
705
+ "id": "HF3lVWK1wjxe"
706
+ },
707
+ "source": [
708
+ "### Let's try another prompt"
709
+ ]
710
+ },
711
+ {
712
+ "cell_type": "code",
713
+ "execution_count": null,
714
+ "metadata": {
715
+ "id": "jel0ON68XiU7"
716
+ },
717
+ "outputs": [],
718
+ "source": [
719
+ "user_question = \"Create python function that takes text input and returns embeddings using LangChain with Vertex AI textembedding-gecko model\"\n",
720
+ "\n",
721
+ "\n",
722
+ "response = code_llm.invoke(input=user_question, max_output_tokens=2048, temperature=0.1)\n",
723
+ "print(response)"
724
+ ]
725
+ },
726
+ {
727
+ "cell_type": "code",
728
+ "execution_count": null,
729
+ "metadata": {
730
+ "id": "G9bIkqE8sO6P"
731
+ },
732
+ "outputs": [],
733
+ "source": [
734
+ "results = qa_chain.invoke(input={\"query\": user_question})\n",
735
+ "print(results[\"result\"])"
736
+ ]
737
+ }
738
+ ],
739
+ "metadata": {
740
+ "colab": {
741
+ "name": "code_retrieval_augmented_generation.ipynb",
742
+ "toc_visible": true
743
+ },
744
+ "kernelspec": {
745
+ "display_name": "Python 3",
746
+ "name": "python3"
747
+ }
748
+ },
749
+ "nbformat": 4,
750
+ "nbformat_minor": 0
751
+ }
tests/conftest.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
import os
import sys

# Prepend the repo2vec package directory to sys.path so the test modules can
# `import repo2vec.chunker` when pytest is launched from the repository root.
_REPO2VEC_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "repo2vec"))
sys.path.insert(0, _REPO2VEC_DIR)
tests/test_chunker.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for the classes under chunker.py.
2
+
3
+ These are minimal happy-path tests to ensure that the chunkers don't crash.
4
+
5
+ Dependencies:
6
+ pip install pytest
7
+ pip install pytest-mock
8
+ """
9
+
10
+ import os
11
+
12
+ import repo2vec.chunker
13
+
14
+
15
def test_text_chunker_happy_path():
    """Happy path: TextFileChunker yields at least one chunk for the repo README."""
    text_chunker = repo2vec.chunker.TextFileChunker(max_tokens=100)

    readme_path = os.path.join(os.path.dirname(__file__), "../README.md")
    with open(readme_path, "r") as readme_file:
        readme_text = readme_file.read()

    chunks = text_chunker.chunk(readme_text, {"file_path": readme_path})
    assert len(chunks) >= 1
26
+
27
+
28
def test_code_chunker_happy_path():
    """Happy path: CodeFileChunker yields at least one chunk for a Python source file."""
    code_chunker = repo2vec.chunker.CodeFileChunker(max_tokens=100)

    source_path = os.path.join(os.path.dirname(__file__), "../repo2vec/chunker.py")
    with open(source_path, "r") as source_file:
        source_text = source_file.read()

    chunks = code_chunker.chunk(source_text, {"file_path": source_path})
    assert len(chunks) >= 1
39
+
40
+
41
def test_ipynb_chunker_happy_path():
    """Happy path: IpynbFileChunker yields at least one chunk for a sample notebook."""
    inner_chunker = repo2vec.chunker.CodeFileChunker(max_tokens=100)
    notebook_chunker = repo2vec.chunker.IpynbFileChunker(inner_chunker)

    notebook_path = os.path.join(os.path.dirname(__file__), "assets/sample-notebook.ipynb")
    with open(notebook_path, "r") as notebook_file:
        notebook_json = notebook_file.read()

    chunks = notebook_chunker.chunk(notebook_json, {"file_path": notebook_path})
    assert len(chunks) >= 1