juliaturc committed on
Commit
40b4763
·
1 Parent(s): 9397e33

Add .ipynb chunker

Browse files
Files changed (4) hide show
  1. .gitignore +2 -1
  2. requirements.txt +1 -0
  3. src/chunker.py +28 -0
  4. src/index.py +1 -0
.gitignore CHANGED
@@ -1,3 +1,4 @@
1
  .env
2
  __pycache__
3
- *.cpython.*
 
 
1
  .env
2
  __pycache__
3
+ *.cpython.*
4
+ repos/
requirements.txt CHANGED
@@ -4,6 +4,7 @@ gradio==4.42.0
4
  langchain==0.2.14
5
  langchain-community==0.2.12
6
  langchain-openai==0.1.22
 
7
  openai==1.42.0
8
  pinecone==5.0.1
9
  python-dotenv==1.0.1
 
4
  langchain==0.2.14
5
  langchain-community==0.2.12
6
  langchain-openai==0.1.22
7
+ nbformat==5.10.4
8
  openai==1.42.0
9
  pinecone==5.0.1
10
  python-dotenv==1.0.1
src/chunker.py CHANGED
@@ -1,6 +1,7 @@
1
  """Chunker abstraction and implementations."""
2
 
3
  import logging
 
4
  from abc import ABC, abstractmethod
5
  from dataclasses import dataclass
6
  from functools import lru_cache
@@ -172,6 +173,9 @@ class CodeChunker(Chunker):
172
 
173
  def chunk(self, file_path: str, file_content: str) -> List[Chunk]:
174
  """Chunks a code file into smaller pieces."""
 
 
 
175
  tree = self.parse_tree(file_path, file_content)
176
  if tree is None:
177
  return []
@@ -226,6 +230,28 @@ class TextChunker(Chunker):
226
  return chunks
227
 
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  class UniversalChunker(Chunker):
230
  """Chunks a file into smaller pieces, regardless of whether it's code or text."""
231
 
@@ -234,6 +260,8 @@ class UniversalChunker(Chunker):
234
  self.text_chunker = TextChunker(max_tokens)
235
 
236
  def chunk(self, file_path: str, file_content: str) -> List[Chunk]:
 
 
237
  if CodeChunker.is_code_file(file_path):
238
  return self.code_chunker.chunk(file_path, file_content)
239
  return self.text_chunker.chunk(file_path, file_content)
 
1
  """Chunker abstraction and implementations."""
2
 
3
  import logging
4
+ import nbformat
5
  from abc import ABC, abstractmethod
6
  from dataclasses import dataclass
7
  from functools import lru_cache
 
173
 
174
  def chunk(self, file_path: str, file_content: str) -> List[Chunk]:
175
  """Chunks a code file into smaller pieces."""
176
+ if not file_content.strip():
177
+ return []
178
+
179
  tree = self.parse_tree(file_path, file_content)
180
  if tree is None:
181
  return []
 
230
  return chunks
231
 
232
 
233
class IPYNBChunker(Chunker):
    """Extracts the Python code from a Jupyter notebook, removing all the boilerplate.

    Based on https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/code/code_retrieval_augmented_generation.ipynb
    """

    def __init__(self, code_chunker: CodeChunker):
        # Actual chunking of the extracted code is delegated to a CodeChunker.
        self.code_chunker = code_chunker

    def chunk(self, filename: str, content: str) -> List[Chunk]:
        """Concatenates the notebook's code cells and chunks them as Python code.

        Returns [] for non-notebook filenames and for notebooks that fail to parse.
        The returned chunks carry the original .ipynb filename.
        """
        if not filename.lower().endswith(".ipynb"):
            # logging.warn is a deprecated alias; logging.warning is the supported API.
            logging.warning("IPYNBChunker is only for .ipynb files.")
            return []

        try:
            notebook = nbformat.reads(content, as_version=nbformat.NO_CONVERT)
        except Exception:
            # Malformed notebook JSON: skip this file rather than crash the whole run.
            logging.warning("Failed to parse notebook %s; skipping.", filename)
            return []

        python_code = "\n".join(
            cell.source for cell in notebook.cells if cell.cell_type == "code"
        )
        # Chunk under a .py name so the code chunker selects the Python parser.
        # Swap only the trailing suffix: str.replace would also rewrite ".ipynb"
        # or ".py" occurring elsewhere in the path.
        py_filename = filename[: -len(".ipynb")] + ".py"
        chunks = self.code_chunker.chunk(py_filename, python_code)
        # Restore the original .ipynb suffix on the resulting chunks.
        for chunk in chunks:
            if chunk.filename.endswith(".py"):
                chunk.filename = chunk.filename[: -len(".py")] + ".ipynb"
        return chunks
253
+
254
+
255
  class UniversalChunker(Chunker):
256
  """Chunks a file into smaller pieces, regardless of whether it's code or text."""
257
 
 
260
  self.text_chunker = TextChunker(max_tokens)
261
 
262
  def chunk(self, file_path: str, file_content: str) -> List[Chunk]:
263
+ if file_path.lower().endswith(".ipynb"):
264
+ return IPYNBChunker(self.code_chunker).chunk(file_path, file_content)
265
  if CodeChunker.is_code_file(file_path):
266
  return self.code_chunker.chunk(file_path, file_content)
267
  return self.text_chunker.chunk(file_path, file_content)
src/index.py CHANGED
@@ -52,6 +52,7 @@ def main():
52
  )
53
  parser.add_argument(
54
  "--exclude",
 
55
  help="Path to a file containing a list of extensions to exclude. One extension per line.",
56
  )
57
  parser.add_argument(
 
52
  )
53
  parser.add_argument(
54
  "--exclude",
55
+ default="src/sample-exclude.txt",
56
  help="Path to a file containing a list of extensions to exclude. One extension per line.",
57
  )
58
  parser.add_argument(