Nguyen5 committed on
Commit
40ce2a9
·
1 Parent(s): e640fc1
Files changed (1) hide show
  1. load_documents.py +0 -29
load_documents.py CHANGED
@@ -114,32 +114,3 @@ if __name__ == "__main__":
114
  docs = load_documents()
115
  print(docs[0])
116
  print("Total:", len(docs))
117
-
118
- - split_documents.py:
119
- # split_documents.py – v2
120
-
121
- from langchain_text_splitters import RecursiveCharacterTextSplitter
122
-
123
- CHUNK_SIZE = 1500
124
- CHUNK_OVERLAP = 200
125
-
126
- def split_documents(docs):
127
- splitter = RecursiveCharacterTextSplitter(
128
- chunk_size=CHUNK_SIZE,
129
- chunk_overlap=CHUNK_OVERLAP,
130
- separators=["\n\n", "\n", ". ", " ", ""],
131
- )
132
- chunks = splitter.split_documents(docs)
133
-
134
- for c in chunks:
135
- c.metadata["chunk_size"] = CHUNK_SIZE
136
- c.metadata["chunk_overlap"] = CHUNK_OVERLAP
137
-
138
- return chunks
139
-
140
- if __name__ == "__main__":
141
- from load_documents import load_documents
142
- docs = load_documents()
143
- chunks = split_documents(docs)
144
- print("Docs:", len(docs), "Chunks:", len(chunks))
145
- print(chunks[0].page_content[:300], chunks[0].metadata)
 
114
  docs = load_documents()
115
  print(docs[0])
116
  print("Total:", len(docs))