|
|
|
|
|
|
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
|
|
|
|
# Splitter settings. split_documents() passes these to
# RecursiveCharacterTextSplitter and records them in each chunk's metadata.
CHUNK_SIZE: int = 1500  # value passed as ``chunk_size``
CHUNK_OVERLAP: int = 200  # value passed as ``chunk_overlap``
|
|
|
|
|
def split_documents(docs, *, chunk_size=None, chunk_overlap=None):
    """Split documents into chunks and tag each chunk with the split settings.

    Args:
        docs: Documents to split; handed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value passed to the splitter as ``chunk_size``. Falls back
            to the module-level ``CHUNK_SIZE`` when ``None``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``. Falls
            back to the module-level ``CHUNK_OVERLAP`` when ``None``.

    Returns:
        The chunks produced by the splitter. Each chunk's ``metadata`` gains
        ``"chunk_size"`` and ``"chunk_overlap"`` entries holding the values
        that were actually used for this call.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth when the caller does not override them.
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_overlap is None:
        chunk_overlap = CHUNK_OVERLAP

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Separators listed from coarsest (blank line) to finest (empty string).
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    chunks = splitter.split_documents(docs)

    # Record the settings on every chunk so downstream consumers can tell how
    # a given chunk was produced.
    for chunk in chunks:
        chunk.metadata["chunk_size"] = chunk_size
        chunk.metadata["chunk_overlap"] = chunk_overlap

    return chunks
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: load the documents, split them, and print a summary.
    # The loader is imported here so that importing this module as a library
    # does not require load_documents.
    from load_documents import load_documents

    docs = load_documents()
    chunks = split_documents(docs)
    print("Docs:", len(docs), "Chunks:", len(chunks))

    # Preview the first chunk only when there is one; indexing an empty
    # result would raise IndexError.
    if chunks:
        first = chunks[0]
        print(first.page_content[:300], first.metadata)
    else:
        print("No chunks produced.")
|
|
|