Update utils.py
Browse files
utils.py
CHANGED
|
@@ -443,6 +443,35 @@ def extract_document_info(documents):
|
|
| 443 |
}
|
| 444 |
extracted_info.append(info)
|
| 445 |
return extracted_info
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 446 |
|
| 447 |
|
| 448 |
|
|
|
|
| 443 |
}
|
| 444 |
extracted_info.append(info)
|
| 445 |
return extracted_info
|
| 446 |
+
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
def extract_document_info(documents):
    """Summarise each document as a plain dictionary with a download link.

    Args:
        documents: Iterable of objects exposing a ``metadata`` mapping and a
            ``page_content`` attribute.

    Returns:
        A list with one dict per document, in input order, with the keys
        ``content``, ``metadata``, ``titel``, ``seite``, ``pfad`` and
        ``download_link``.
    """
    # Imported locally so this block does not rely on a file-level import.
    from urllib.parse import quote

    extracted_info = []
    for doc in documents:
        # Read the path once; it drives the title, the link and 'pfad'.
        doc_path = doc.metadata.get("path", "")

        # Use the file name from the path as the title.
        filename = os.path.basename(doc_path)
        title = filename if filename else "Keine Überschrift"

        # Percent-encode the file name so that spaces, '#', '?' and non-ASCII
        # characters cannot break or truncate the generated URL.
        url_name = quote(title, safe="")

        # NOTE(review): "hf_token" is outside the f-string braces, so it is
        # emitted as literal text rather than a token value -- confirm intent.
        # Pick the repository sub-folder that matches the document type.
        if doc_path.endswith('.pdf'):
            download_link = f"https://huggingface.co/spaces/alexkueck/SucheRAG/resolve/main/chroma/kkg/pdf/{url_name}?token=hf_token"
        elif doc_path.endswith('.docx'):
            download_link = f"https://huggingface.co/spaces/alexkueck/SucheRAG/resolve/main/chroma/kkg/word/{url_name}?token=hf_token"
        else:
            # Other types: hand back the stored path unchanged.
            download_link = doc_path

        info = {
            'content': doc.page_content,
            'metadata': doc.metadata,
            'titel': title,
            'seite': doc.metadata.get("page", "Unbekannte Seite"),
            'pfad': doc_path,
            'download_link': download_link,
        }
        extracted_info.append(info)
    return extracted_info
|
| 475 |
|
| 476 |
|
| 477 |
|