File size: 743 Bytes
ed084d7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Default splitter settings. split_documents passes these to
# RecursiveCharacterTextSplitter as chunk_size / chunk_overlap and also
# records them in every chunk's metadata.
# NOTE(review): units are presumably characters (the splitter's default
# length function) — confirm against the library version in use.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 200
def split_documents(docs, *, chunk_size=None, chunk_overlap=None):
    """Split documents into chunks and tag each chunk with the settings used.

    Args:
        docs: Documents accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Chunk size handed to the splitter. When ``None`` the
            module-level ``CHUNK_SIZE`` is used.
        chunk_overlap: Overlap handed to the splitter. When ``None`` the
            module-level ``CHUNK_OVERLAP`` is used.

    Returns:
        The chunks returned by the splitter. Each chunk's ``metadata`` gains
        ``"chunk_size"`` and ``"chunk_overlap"`` entries holding the values
        that were actually passed to the splitter for this call.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for callers that do not override anything.
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=overlap,
        # Paragraph break, line break, sentence end, space, then the empty
        # string as the last entry.
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    chunks = splitter.split_documents(docs)

    # Record the effective settings on every chunk so downstream consumers
    # can tell how a given chunk was produced.
    for chunk in chunks:
        chunk.metadata["chunk_size"] = size
        chunk.metadata["chunk_overlap"] = overlap
    return chunks
if __name__ == "__main__":
    # Manual smoke run: load the project's documents, split them, and print
    # the counts plus a short preview of the first chunk.
    from load_documents import load_documents

    docs = load_documents()
    chunks = split_documents(docs)
    print("Docs:", len(docs), "Chunks:", len(chunks))
    # Guard the preview: indexing chunks[0] would raise IndexError when the
    # loader returned nothing or the splitter produced no chunks.
    if chunks:
        print(chunks[0].page_content[:300], chunks[0].metadata)
|