Spaces:
Runtime error
Runtime error
Commit
·
ada5c0c
1
Parent(s):
dfc2f0a
update splitter
Browse files
app.py
CHANGED
|
@@ -60,6 +60,7 @@ from langchain.document_loaders import (
|
|
| 60 |
UnstructuredExcelLoader
|
| 61 |
)
|
| 62 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
| 63 |
from langchain.docstore.document import Document
|
| 64 |
import langchain
|
| 65 |
import asyncio
|
|
@@ -355,7 +356,8 @@ def process_documents_3(ignored_files: List[str] = []) -> List[Document]:
|
|
| 355 |
print("No new documents to load")
|
| 356 |
exit(0)
|
| 357 |
print(f"Loaded {len(documents)} new documents from {source_directory}")
|
| 358 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=1000)
|
|
|
|
| 359 |
texts = text_splitter.split_documents(documents)
|
| 360 |
print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
|
| 361 |
return texts
|
|
|
|
| 60 |
UnstructuredExcelLoader
|
| 61 |
)
|
| 62 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 63 |
+
from langchain.text_splitter import TokenTextSplitter
|
| 64 |
from langchain.docstore.document import Document
|
| 65 |
import langchain
|
| 66 |
import asyncio
|
|
|
|
| 356 |
print("No new documents to load")
|
| 357 |
exit(0)
|
| 358 |
print(f"Loaded {len(documents)} new documents from {source_directory}")
|
| 359 |
+
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=1000)
|
| 360 |
+
text_splitter = TokenTextSplitter(chunk_size=4000, chunk_overlap=500)
|
| 361 |
texts = text_splitter.split_documents(documents)
|
| 362 |
print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
|
| 363 |
return texts
|