Spaces:
Runtime error
Runtime error
Commit
·
fdba7fc
1
Parent(s):
ada5c0c
update splitter
Browse files
app.py
CHANGED
|
@@ -322,7 +322,8 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
|
|
| 322 |
print("No new documents to load")
|
| 323 |
exit(0)
|
| 324 |
print(f"Loaded {len(documents)} new documents from {source_directory}")
|
| 325 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
|
|
|
| 326 |
texts = text_splitter.split_documents(documents)
|
| 327 |
print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
|
| 328 |
return texts
|
|
@@ -339,7 +340,8 @@ def process_documents_2(ignored_files: List[str] = []) -> List[Document]:
|
|
| 339 |
print("No new documents to load")
|
| 340 |
exit(0)
|
| 341 |
print(f"Loaded {len(documents)} new documents from {source_directory}")
|
| 342 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
|
|
|
| 343 |
texts = text_splitter.split_documents(documents)
|
| 344 |
print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
|
| 345 |
return texts
|
|
|
|
| 322 |
print("No new documents to load")
|
| 323 |
exit(0)
|
| 324 |
print(f"Loaded {len(documents)} new documents from {source_directory}")
|
| 325 |
+
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
| 326 |
+
text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
| 327 |
texts = text_splitter.split_documents(documents)
|
| 328 |
print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
|
| 329 |
return texts
|
|
|
|
| 340 |
print("No new documents to load")
|
| 341 |
exit(0)
|
| 342 |
print(f"Loaded {len(documents)} new documents from {source_directory}")
|
| 343 |
+
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
| 344 |
+
text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
| 345 |
texts = text_splitter.split_documents(documents)
|
| 346 |
print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
|
| 347 |
return texts
|