Spaces:

Asish22
/

code-crawler

Running

App Files Files Community

juliaturc committed on Aug 29, 2024

Commit

40b4763

1 Parent(s): 9397e33

Add .ipynb chunker

Browse files

Files changed (4) hide show

.gitignore +2 -1
requirements.txt +1 -0
src/chunker.py +28 -0
src/index.py +1 -0

.gitignore CHANGED Viewed

@@ -1,3 +1,4 @@
 .env
 __pycache__
-*.cpython.*

 .env
 __pycache__
+*.cpython.*
+repos/

requirements.txt CHANGED Viewed

@@ -4,6 +4,7 @@ gradio==4.42.0
 langchain==0.2.14
 langchain-community==0.2.12
 langchain-openai==0.1.22
 openai==1.42.0
 pinecone==5.0.1
 python-dotenv==1.0.1

 langchain==0.2.14
 langchain-community==0.2.12
 langchain-openai==0.1.22
+nbformat==5.10.4
 openai==1.42.0
 pinecone==5.0.1
 python-dotenv==1.0.1

src/chunker.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """Chunker abstraction and implementations."""
 import logging
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from functools import lru_cache
@@ -172,6 +173,9 @@ class CodeChunker(Chunker):
     def chunk(self, file_path: str, file_content: str) -> List[Chunk]:
         """Chunks a code file into smaller pieces."""
         tree = self.parse_tree(file_path, file_content)
         if tree is None:
             return []
@@ -226,6 +230,28 @@ class TextChunker(Chunker):
         return chunks
 class UniversalChunker(Chunker):
     """Chunks a file into smaller pieces, regardless of whether it's code or text."""
@@ -234,6 +260,8 @@ class UniversalChunker(Chunker):
         self.text_chunker = TextChunker(max_tokens)
     def chunk(self, file_path: str, file_content: str) -> List[Chunk]:
         if CodeChunker.is_code_file(file_path):
             return self.code_chunker.chunk(file_path, file_content)
         return self.text_chunker.chunk(file_path, file_content)

 """Chunker abstraction and implementations."""
 import logging
+import nbformat
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from functools import lru_cache
     def chunk(self, file_path: str, file_content: str) -> List[Chunk]:
         """Chunks a code file into smaller pieces."""
+        if not file_content.strip():
+            return []
         tree = self.parse_tree(file_path, file_content)
         if tree is None:
             return []
         return chunks
+class IPYNBChunker(Chunker):
+    """Extracts the python code from a Jupyter notebook, removing all the boilerplate.
+    Based on https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/code/code_retrieval_augmented_generation.ipynb
+    """
+    def __init__(self, code_chunker: CodeChunker):
+        self.code_chunker = code_chunker
+    def chunk(self, filename: str, content: str) -> List[Chunk]:
+        if not filename.lower().endswith(".ipynb"):
+            logging.warn("IPYNBChunker is only for .ipynb files.")
+            return []
+        notebook = nbformat.reads(content, as_version=nbformat.NO_CONVERT)
+        python_code = "\n".join([cell.source for cell in notebook.cells if cell.cell_type == "code"])
+        chunks = self.code_chunker.chunk(filename.replace(".ipynb", ".py"), python_code)
+        # Change back the filenames to .ipynb.
+        for chunk in chunks:
+            chunk.filename = chunk.filename.replace(".py", ".ipynb")
+        return chunks
 class UniversalChunker(Chunker):
     """Chunks a file into smaller pieces, regardless of whether it's code or text."""
         self.text_chunker = TextChunker(max_tokens)
     def chunk(self, file_path: str, file_content: str) -> List[Chunk]:
+        if file_path.lower().endswith(".ipynb"):
+            return IPYNBChunker(self.code_chunker).chunk(file_path, file_content)
         if CodeChunker.is_code_file(file_path):
             return self.code_chunker.chunk(file_path, file_content)
         return self.text_chunker.chunk(file_path, file_content)

src/index.py CHANGED Viewed

@@ -52,6 +52,7 @@ def main():
     )
     parser.add_argument(
         "--exclude",
         help="Path to a file containing a list of extensions to exclude. One extension per line.",
     )
     parser.add_argument(

     )
     parser.add_argument(
         "--exclude",
+        default="src/sample-exclude.txt",
         help="Path to a file containing a list of extensions to exclude. One extension per line.",
     )
     parser.add_argument(