Update utils.py
Browse files
utils.py
CHANGED
|
@@ -302,6 +302,17 @@ def load_word_with_metadata(file_path):
|
|
| 302 |
|
| 303 |
################################################
|
| 304 |
#die Inhalte splitten, um in Vektordatenbank entsprechend zu laden als Splits
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
def document_loading_splitting():
|
| 306 |
##############################
|
| 307 |
# Document loading
|
|
@@ -335,6 +346,7 @@ def document_loading_splitting():
|
|
| 335 |
# Vorverarbeitung der Dokumente
|
| 336 |
preprocessed_docs = []
|
| 337 |
original_docs = []
|
|
|
|
| 338 |
for doc in docs:
|
| 339 |
doc_id = str(uuid.uuid4()) # Erzeuge eine eindeutige ID
|
| 340 |
preprocessed_content = preprocess_text(doc.page_content)
|
|
@@ -342,7 +354,8 @@ def document_loading_splitting():
|
|
| 342 |
preprocessed_metadata = {
|
| 343 |
"title": preprocessed_title,
|
| 344 |
"page": doc.metadata["page"],
|
| 345 |
-
"path": doc.metadata["path"]
|
|
|
|
| 346 |
}
|
| 347 |
preprocessed_doc = Document(content=preprocessed_content, title=preprocessed_metadata["title"], page=preprocessed_metadata["page"], path=preprocessed_metadata["path"], doc_id=doc_id)
|
| 348 |
original_doc = Document(content=doc.page_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"], doc_id=doc_id)
|
|
@@ -354,12 +367,13 @@ def document_loading_splitting():
|
|
| 354 |
# Document splitting
|
| 355 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
| 356 |
splits = text_splitter.split_documents(preprocessed_docs)
|
|
|
|
| 357 |
# Split sowohl für originale als auch für vorverarbeitete Dokumente
|
| 358 |
-
original_splits =
|
| 359 |
-
preprocessed_splits =
|
| 360 |
|
| 361 |
# Mapping von vorverarbeiteten Splits zu Originalsplits anhand der IDs
|
| 362 |
-
split_to_original_mapping = {p_split.
|
| 363 |
|
| 364 |
|
| 365 |
print("Splits...........................")
|
|
@@ -720,11 +734,9 @@ class Document:
|
|
| 720 |
self.metadata = {
|
| 721 |
"title": title,
|
| 722 |
"page": page,
|
| 723 |
-
"path": path
|
|
|
|
| 724 |
}
|
| 725 |
-
self.doc_id = doc_id
|
| 726 |
-
|
| 727 |
-
|
| 728 |
|
| 729 |
|
| 730 |
def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
|
|
|
|
| 302 |
|
| 303 |
################################################
|
| 304 |
#die Inhalte splitten, um in Vektordatenbank entsprechend zu laden als Splits
|
# Split documents into chunks while propagating each source document's doc_id,
# so preprocessed splits can later be matched back to their original splits.
def split_documents_with_id(docs, splitter=None):
    """Split each document's text into chunks, carrying over metadata and id.

    Args:
        docs: iterable of Document objects exposing ``page_content``,
            ``metadata`` (with ``title``, ``page``, ``path`` keys) and a
            ``doc_id`` attribute.
        splitter: optional object with a ``split_text(str) -> list[str]``
            method. When None, a RecursiveCharacterTextSplitter with the same
            settings used in document_loading_splitting() is created.

    Returns:
        A list of Document chunks, each tagged with its source doc_id.
    """
    # NOTE(review): the original body referenced the bare name
    # ``text_splitter``, which is a local variable of
    # document_loading_splitting() and therefore not in scope here
    # (NameError at call time unless a module-level text_splitter exists
    # elsewhere — none is visible in this file view). Passing the splitter
    # in, with a compatible default, removes that hidden coupling.
    if splitter is None:
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = []
    for doc in docs:
        for split_content in splitter.split_text(doc.page_content):
            # Each chunk inherits the parent's metadata and doc_id so the
            # preprocessed/original split pairs can be re-associated later.
            splits.append(
                Document(
                    content=split_content,
                    title=doc.metadata["title"],
                    page=doc.metadata["page"],
                    path=doc.metadata["path"],
                    doc_id=doc.doc_id,
                )
            )
    return splits
|
| 314 |
+
|
| 315 |
+
#finally die Splits erzeugen und laden.....
|
| 316 |
def document_loading_splitting():
|
| 317 |
##############################
|
| 318 |
# Document loading
|
|
|
|
| 346 |
# Vorverarbeitung der Dokumente
|
| 347 |
preprocessed_docs = []
|
| 348 |
original_docs = []
|
| 349 |
+
|
| 350 |
for doc in docs:
|
| 351 |
doc_id = str(uuid.uuid4()) # Erzeuge eine eindeutige ID
|
| 352 |
preprocessed_content = preprocess_text(doc.page_content)
|
|
|
|
| 354 |
preprocessed_metadata = {
|
| 355 |
"title": preprocessed_title,
|
| 356 |
"page": doc.metadata["page"],
|
| 357 |
+
"path": doc.metadata["path"],
|
| 358 |
+
"doc_id": doc_id # Füge die ID in die Metadaten ein
|
| 359 |
}
|
| 360 |
preprocessed_doc = Document(content=preprocessed_content, title=preprocessed_metadata["title"], page=preprocessed_metadata["page"], path=preprocessed_metadata["path"], doc_id=doc_id)
|
| 361 |
original_doc = Document(content=doc.page_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"], doc_id=doc_id)
|
|
|
|
| 367 |
# Document splitting
|
| 368 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
| 369 |
splits = text_splitter.split_documents(preprocessed_docs)
|
| 370 |
+
|
| 371 |
# Split sowohl für originale als auch für vorverarbeitete Dokumente
|
| 372 |
+
original_splits = split_documents_with_id(original_docs)
|
| 373 |
+
preprocessed_splits = split_documents_with_id(preprocessed_docs)
|
| 374 |
|
| 375 |
# Mapping von vorverarbeiteten Splits zu Originalsplits anhand der IDs
|
| 376 |
+
split_to_original_mapping = {p_split.metadata["doc_id"]: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
|
| 377 |
|
| 378 |
|
| 379 |
print("Splits...........................")
|
|
|
|
| 734 |
self.metadata = {
|
| 735 |
"title": title,
|
| 736 |
"page": page,
|
| 737 |
+
"path": path,
|
| 738 |
+
"doc_id": doc_id # Füge die ID in die Metadaten ein
|
| 739 |
}
|
|
|
|
|
|
|
|
|
|
| 740 |
|
| 741 |
|
| 742 |
def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
|