commit
Browse files- load_documents.py +0 -30
load_documents.py
CHANGED
|
@@ -128,33 +128,3 @@ if __name__ == "__main__":
|
|
| 128 |
if len(docs):
|
| 129 |
print("\nExample metadata from 1st document:")
|
| 130 |
print(docs[0].metadata)
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
- split_documents.py:
|
| 134 |
-
# split_documents.py – v2
|
| 135 |
-
|
| 136 |
-
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 137 |
-
|
| 138 |
-
CHUNK_SIZE = 1500
|
| 139 |
-
CHUNK_OVERLAP = 200
|
| 140 |
-
|
| 141 |
-
def split_documents(docs):
|
| 142 |
-
splitter = RecursiveCharacterTextSplitter(
|
| 143 |
-
chunk_size=CHUNK_SIZE,
|
| 144 |
-
chunk_overlap=CHUNK_OVERLAP,
|
| 145 |
-
separators=["\n\n", "\n", ". ", " ", ""],
|
| 146 |
-
)
|
| 147 |
-
chunks = splitter.split_documents(docs)
|
| 148 |
-
|
| 149 |
-
for c in chunks:
|
| 150 |
-
c.metadata["chunk_size"] = CHUNK_SIZE
|
| 151 |
-
c.metadata["chunk_overlap"] = CHUNK_OVERLAP
|
| 152 |
-
|
| 153 |
-
return chunks
|
| 154 |
-
|
| 155 |
-
if __name__ == "__main__":
|
| 156 |
-
from load_documents import load_documents
|
| 157 |
-
docs = load_documents()
|
| 158 |
-
chunks = split_documents(docs)
|
| 159 |
-
print("Docs:", len(docs), "Chunks:", len(chunks))
|
| 160 |
-
print(chunks[0].page_content[:300], chunks[0].metadata)
|
|
|
|
| 128 |
if len(docs):
|
| 129 |
print("\nExample metadata from 1st document:")
|
| 130 |
print(docs[0].metadata)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|