Update utils.py
Browse files
utils.py
CHANGED
|
@@ -306,9 +306,10 @@ def load_word_with_metadata(file_path):
|
|
| 306 |
def split_documents_with_id(docs, text_splitter):
|
| 307 |
splits = []
|
| 308 |
for doc in docs:
|
| 309 |
-
doc_splits = text_splitter.split_text(doc.page_content)
|
| 310 |
for split_content in doc_splits:
|
| 311 |
-
|
|
|
|
| 312 |
splits.append(split_doc)
|
| 313 |
return splits
|
| 314 |
|
|
@@ -342,38 +343,26 @@ def document_loading_splitting():
|
|
| 342 |
#loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,YOUTUBE_URL_2], PATH_WORK + YOUTUBE_DIR), OpenAIWhisperParser())
|
| 343 |
#docs.extend(loader.load())
|
| 344 |
|
| 345 |
-
|
| 346 |
-
# Vorverarbeitung der Dokumente
|
| 347 |
-
preprocessed_docs = []
|
| 348 |
-
original_docs = []
|
| 349 |
-
|
| 350 |
-
for doc in docs:
|
| 351 |
-
doc_id = str(uuid.uuid4()) # Erzeuge eine eindeutige ID
|
| 352 |
-
preprocessed_content = preprocess_text(doc.page_content)
|
| 353 |
-
preprocessed_title = preprocess_text(doc.metadata["title"])
|
| 354 |
-
preprocessed_metadata = {
|
| 355 |
-
"title": preprocessed_title,
|
| 356 |
-
"page": doc.metadata["page"],
|
| 357 |
-
"path": doc.metadata["path"],
|
| 358 |
-
"doc_id": doc_id # Füge die ID in die Metadaten ein
|
| 359 |
-
}
|
| 360 |
-
preprocessed_doc = Document(content=preprocessed_content, title=preprocessed_metadata["title"], page=preprocessed_metadata["page"], path=preprocessed_metadata["path"], doc_id=doc_id)
|
| 361 |
-
original_doc = Document(content=doc.page_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"], doc_id=doc_id)
|
| 362 |
-
preprocessed_docs.append(preprocessed_doc)
|
| 363 |
-
original_docs.append(original_doc)
|
| 364 |
-
print("orgin doc....................................."+str(original_doc))
|
| 365 |
|
| 366 |
################################
|
| 367 |
# Document splitting
|
| 368 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
| 369 |
#splits = text_splitter.split_documents(preprocessed_docs)
|
| 370 |
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
|
| 375 |
-
# Mapping von vorverarbeiteten Splits zu Originalsplits anhand der
|
| 376 |
-
split_to_original_mapping = {p_split.metadata["
|
| 377 |
|
| 378 |
|
| 379 |
print("Splits...........................")
|
|
@@ -381,7 +370,7 @@ def document_loading_splitting():
|
|
| 381 |
if 'divis' in split.page_content:
|
| 382 |
print("DIVIS found in chunk:", split)
|
| 383 |
|
| 384 |
-
return preprocessed_splits, split_to_original_mapping
|
| 385 |
|
| 386 |
###########################################
|
| 387 |
#Chroma DB die splits ablegen - vektorisiert...
|
|
@@ -580,8 +569,6 @@ def extract_document_info(documents):
|
|
| 580 |
else:
|
| 581 |
download_link = doc_path
|
| 582 |
|
| 583 |
-
# Prüfe, ob doc_id existiert und weise einen Standardwert zu, falls nicht
|
| 584 |
-
id = getattr(doc, 'doc_id', None)
|
| 585 |
|
| 586 |
info = {
|
| 587 |
'content': doc.page_content,
|
|
@@ -590,7 +577,6 @@ def extract_document_info(documents):
|
|
| 590 |
'seite': doc.metadata.get("page", "Unbekannte Seite"),
|
| 591 |
'pfad': doc_path,
|
| 592 |
'download_link': download_link,
|
| 593 |
-
'id': id
|
| 594 |
}
|
| 595 |
extracted_info.append(info)
|
| 596 |
return extracted_info
|
|
@@ -735,7 +721,7 @@ class Document:
|
|
| 735 |
"title": title,
|
| 736 |
"page": page,
|
| 737 |
"path": path,
|
| 738 |
-
"
|
| 739 |
}
|
| 740 |
|
| 741 |
|
|
|
|
def split_documents_with_id(docs, text_splitter):
    """Split each document into chunks, tagging every chunk with a unique id.

    The document's title is prepended to its page content before splitting,
    so each chunk retains title context. Each resulting chunk is wrapped in a
    new Document carrying the source document's title/page/path metadata plus
    a freshly generated ``split_id``.

    Args:
        docs: iterable of Document-like objects exposing ``page_content`` and
            a ``metadata`` dict with ``title``, ``page`` and ``path`` keys.
        text_splitter: object with a ``split_text(str) -> list[str]`` method
            (e.g. a RecursiveCharacterTextSplitter).

    Returns:
        list of Document chunks, one per produced split.
    """
    chunks = []
    for source_doc in docs:
        meta = source_doc.metadata
        # Prepend the title so every chunk keeps its document context.
        combined_text = f"{meta['title']} {source_doc.page_content}"
        for piece in text_splitter.split_text(combined_text):
            chunks.append(
                Document(
                    content=piece,
                    title=meta["title"],
                    page=meta["page"],
                    path=meta["path"],
                    # One unique id per chunk, used later to map
                    # preprocessed splits back to original splits.
                    split_id=str(uuid.uuid4()),
                )
            )
    return chunks
|
|
|
|
| 343 |
#loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,YOUTUBE_URL_2], PATH_WORK + YOUTUBE_DIR), OpenAIWhisperParser())
|
| 344 |
#docs.extend(loader.load())
|
| 345 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
|
| 347 |
################################
|
| 348 |
# Document splitting
|
| 349 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
| 350 |
#splits = text_splitter.split_documents(preprocessed_docs)
|
| 351 |
|
| 352 |
+
# Vorverarbeitung der Dokumente
|
| 353 |
+
# Split der Originaldokumente
|
| 354 |
+
original_splits = split_documents_with_id(docs, text_splitter)
|
| 355 |
+
|
| 356 |
+
# Vorverarbeitung der Originalsplits
|
| 357 |
+
preprocessed_splits = []
|
| 358 |
+
for split in original_splits:
|
| 359 |
+
preprocessed_content = preprocess_text(split.page_content)
|
| 360 |
+
preprocessed_split = Document(content=preprocessed_content, title=split.metadata["title"], page=split.metadata["page"], path=split.metadata["path"], split_id=split.metadata["split_id"])
|
| 361 |
+
preprocessed_splits.append(preprocessed_split)
|
| 362 |
+
|
| 363 |
|
| 364 |
+
# Mapping von vorverarbeiteten Splits zu Originalsplits anhand der split_ids
|
| 365 |
+
split_to_original_mapping = {p_split.metadata["split_id"]: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
|
| 366 |
|
| 367 |
|
| 368 |
print("Splits...........................")
|
|
|
|
| 370 |
if 'divis' in split.page_content:
|
| 371 |
print("DIVIS found in chunk:", split)
|
| 372 |
|
| 373 |
+
return preprocessed_splits, original_splits, split_to_original_mapping
|
| 374 |
|
| 375 |
###########################################
|
| 376 |
#Chroma DB die splits ablegen - vektorisiert...
|
|
|
|
| 569 |
else:
|
| 570 |
download_link = doc_path
|
| 571 |
|
|
|
|
|
|
|
| 572 |
|
| 573 |
info = {
|
| 574 |
'content': doc.page_content,
|
|
|
|
| 577 |
'seite': doc.metadata.get("page", "Unbekannte Seite"),
|
| 578 |
'pfad': doc_path,
|
| 579 |
'download_link': download_link,
|
|
|
|
| 580 |
}
|
| 581 |
extracted_info.append(info)
|
| 582 |
return extracted_info
|
|
|
|
| 721 |
"title": title,
|
| 722 |
"page": page,
|
| 723 |
"path": path,
|
| 724 |
+
"split_id": split_id # Füge die ID in die Metadaten ein
|
| 725 |
}
|
| 726 |
|
| 727 |
|