File size: 769 Bytes
6548bf5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# split_documents.py – v2

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings used by split_documents() and copied into each chunk's
# metadata. NOTE(review): units are presumably characters (the splitter's
# default length function) — confirm against the langchain docs.
CHUNK_SIZE = 1500  # passed to RecursiveCharacterTextSplitter as chunk_size
CHUNK_OVERLAP = 200  # passed to RecursiveCharacterTextSplitter as chunk_overlap

def split_documents(docs, *, chunk_size=None, chunk_overlap=None):
    """Split documents into chunks and tag each chunk with the split settings.

    Args:
        docs: Documents to split; handed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value passed to the splitter as ``chunk_size``. ``None``
            (the default) means the module-level ``CHUNK_SIZE``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            ``None`` (the default) means the module-level ``CHUNK_OVERLAP``.

    Returns:
        The chunks returned by the splitter. Each chunk's ``metadata`` gains
        ``"chunk_size"`` and ``"chunk_overlap"`` entries holding the settings
        used for this call (the configured values, not the chunk's own
        length).
    """
    # Resolve defaults at call time so the module constants are read when the
    # function runs, exactly as the original hard-coded version did.
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=overlap,
        # Coarse-to-fine: paragraph break, line break, sentence end, space,
        # then the empty string.
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    chunks = splitter.split_documents(docs)

    # Record the settings on every chunk so downstream consumers can tell how
    # it was produced.
    for chunk in chunks:
        chunk.metadata["chunk_size"] = size
        chunk.metadata["chunk_overlap"] = overlap

    return chunks

if __name__ == "__main__":
    # Manual smoke test: load documents via load_documents(), split them, and
    # print a short summary.
    from load_documents import load_documents

    docs = load_documents()
    chunks = split_documents(docs)
    print("Docs:", len(docs), "Chunks:", len(chunks))
    # Only preview the first chunk when there is one; with no chunks,
    # chunks[0] would raise IndexError.
    if chunks:
        print(chunks[0].page_content[:300], chunks[0].metadata)