Update utils.py
Browse files
utils.py
CHANGED
|
@@ -303,7 +303,7 @@ def load_word_with_metadata(file_path):
|
|
| 303 |
################################################
|
| 304 |
# Split the contents into chunks so they can be loaded into the vector database
|
| 305 |
# Function that splits the documents and assigns the doc_id
|
| 306 |
-
def split_documents_with_id(docs):
|
| 307 |
splits = []
|
| 308 |
for doc in docs:
|
| 309 |
doc_splits = text_splitter.split_text(doc.page_content)
|
|
@@ -366,11 +366,11 @@ def document_loading_splitting():
|
|
| 366 |
################################
|
| 367 |
# Document splitting
|
| 368 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
| 369 |
-
splits = text_splitter.split_documents(preprocessed_docs)
|
| 370 |
|
| 371 |
# Split both the original and the preprocessed documents
|
| 372 |
-
original_splits = split_documents_with_id(original_docs)
|
| 373 |
-
preprocessed_splits = split_documents_with_id(preprocessed_docs)
|
| 374 |
|
| 375 |
# Map each preprocessed split's doc_id to the original split at the same position
|
| 376 |
split_to_original_mapping = {p_split.metadata["doc_id"]: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
|
|
|
|
| 303 |
################################################
|
| 304 |
# Split the contents into chunks so they can be loaded into the vector database
|
| 305 |
# Function that splits the documents and assigns the doc_id
|
| 306 |
+
def split_documents_with_id(docs, text_splitter):
|
| 307 |
splits = []
|
| 308 |
for doc in docs:
|
| 309 |
doc_splits = text_splitter.split_text(doc.page_content)
|
|
|
|
| 366 |
################################
|
| 367 |
# Document splitting
|
| 368 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
| 369 |
+
#splits = text_splitter.split_documents(preprocessed_docs)
|
| 370 |
|
| 371 |
# Split both the original and the preprocessed documents
|
| 372 |
+
original_splits = split_documents_with_id(original_docs, text_splitter)
|
| 373 |
+
preprocessed_splits = split_documents_with_id(preprocessed_docs, text_splitter)
|
| 374 |
|
| 375 |
# Map each preprocessed split's doc_id to the original split at the same position
|
| 376 |
split_to_original_mapping = {p_split.metadata["doc_id"]: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
|