Spaces:
Runtime error
Runtime error
Update chunk_python_code.py
Browse files- chunk_python_code.py +25 -25
chunk_python_code.py
CHANGED
|
@@ -63,7 +63,7 @@ def _iterate_ast(python_code, documents, file_path):
|
|
| 63 |
_chunk_first_level_assign_node(first_level_node, documents, python_code))
|
| 64 |
else:
|
| 65 |
documents.extend(
|
| 66 |
-
|
| 67 |
|
| 68 |
|
| 69 |
def _chunk_import_only_python_code(python_code, file_path):
|
|
@@ -89,34 +89,12 @@ def _chunk_import_only_python_code(python_code, file_path):
|
|
| 89 |
|
| 90 |
|
| 91 |
|
| 92 |
-
def
|
| 93 |
documents = []
|
| 94 |
documents.extend(
|
| 95 |
-
_chunk_python_code_by_character)
|
| 96 |
return documents
|
| 97 |
|
| 98 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
def _chunk_python_code_by_character(python_code):
|
| 102 |
-
documents = []
|
| 103 |
-
text_splitter = RecursiveCharacterTextSplitter(
|
| 104 |
-
chunk_size=512,
|
| 105 |
-
chunk_overlap=128,
|
| 106 |
-
separators=[]
|
| 107 |
-
)
|
| 108 |
-
|
| 109 |
-
chunks = text_splitter.split_text(python_code)
|
| 110 |
-
|
| 111 |
-
for chunk in chunks:
|
| 112 |
-
doc = Document(
|
| 113 |
-
page_content=chunk
|
| 114 |
-
)
|
| 115 |
-
documents.append(doc)
|
| 116 |
-
|
| 117 |
-
return documents
|
| 118 |
-
|
| 119 |
-
|
| 120 |
|
| 121 |
def _chunk_nodeless_python_code(python_code, file_path):
|
| 122 |
"""
|
|
@@ -245,4 +223,26 @@ def _chunk_first_level_func_node(ast_node, python_code):
|
|
| 245 |
)
|
| 246 |
documents.append(doc)
|
| 247 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
return documents
|
|
|
|
| 63 |
_chunk_first_level_assign_node(first_level_node, documents, python_code))
|
| 64 |
else:
|
| 65 |
documents.extend(
|
| 66 |
+
_handle_not_defined_case(python_code))
|
| 67 |
|
| 68 |
|
| 69 |
def _chunk_import_only_python_code(python_code, file_path):
|
|
|
|
| 89 |
|
| 90 |
|
| 91 |
|
| 92 |
+
def _handle_not_defined_case(python_code):
|
| 93 |
documents = []
|
| 94 |
documents.extend(
|
| 95 |
+
_chunk_python_code_by_character(python_code)
|
| 96 |
return documents
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
def _chunk_nodeless_python_code(python_code, file_path):
|
| 100 |
"""
|
|
|
|
| 223 |
)
|
| 224 |
documents.append(doc)
|
| 225 |
|
| 226 |
+
return documents
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def _chunk_python_code_by_character(python_code):
|
| 233 |
+
documents = []
|
| 234 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
| 235 |
+
chunk_size=512,
|
| 236 |
+
chunk_overlap=128,
|
| 237 |
+
separators=[]
|
| 238 |
+
)
|
| 239 |
+
|
| 240 |
+
chunks = text_splitter.split_text(python_code)
|
| 241 |
+
|
| 242 |
+
for chunk in chunks:
|
| 243 |
+
doc = Document(
|
| 244 |
+
page_content=chunk
|
| 245 |
+
)
|
| 246 |
+
documents.append(doc)
|
| 247 |
+
|
| 248 |
return documents
|