Update utils.py
Browse files
utils.py
CHANGED
|
@@ -244,6 +244,31 @@ def create_directory_loader(file_type, directory_path):
|
|
| 244 |
################################################
|
| 245 |
# Custom loader functions for the DirectoryLoader
|
| 246 |
# Custom loader functions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
def load_pdf_with_metadata(file_path):
|
| 248 |
document = fitz.open(file_path)
|
| 249 |
documents = []
|
|
@@ -269,7 +294,7 @@ def load_word_with_metadata(file_path):
|
|
| 269 |
content = para.text
|
| 270 |
contents.append({"page_content": content, "metadata": {**metadata, "page": 1}})
|
| 271 |
return contents
|
| 272 |
-
|
| 273 |
|
| 274 |
|
| 275 |
################################################
|
|
|
|
| 244 |
################################################
|
| 245 |
# Custom loader functions for the DirectoryLoader
|
| 246 |
# Custom loader functions
|
| 247 |
+
def load_pdf_with_metadata(file_path):
    """Load a PDF and return one ``Document`` per page.

    Args:
        file_path: Path of the PDF file; passed to ``fitz.open`` and stored
            on every returned ``Document`` as ``path``.

    Returns:
        A list of ``Document`` objects in page order. Each one carries the
        page's plain text (``content``), the PDF title (``title``), the
        1-based page number (``page``) and ``file_path`` (``path``).
    """
    # Open via the context manager so the PDF handle is closed even if text
    # extraction raises; previously the handle was never closed.
    with fitz.open(file_path) as document:
        # The title is identical for every page, so look it up once.
        # NOTE(review): per the PyMuPDF docs the metadata dict can be None and
        # its values can be None/empty, so fall back to the placeholder in all
        # of those cases rather than only when the key is missing.
        title = (document.metadata or {}).get("title") or "Unbekannt"

        documents = []
        for page_num in range(len(document)):
            page = document.load_page(page_num)
            documents.append(
                Document(
                    content=page.get_text("text"),
                    title=title,
                    page=page_num + 1,  # page numbers are reported 1-based
                    path=file_path,
                )
            )
    return documents
|
| 257 |
+
|
| 258 |
+
def load_word_with_metadata(file_path):
    """Load a Word file and return one ``Document`` per paragraph.

    Args:
        file_path: Path of the Word file; passed to ``docx.Document`` and
            stored on every returned ``Document`` as ``path``.

    Returns:
        A list of ``Document`` objects in paragraph order. Each one carries
        the paragraph text (``content``), the fixed title ``"Dokument"``,
        the page number ``1`` and ``file_path`` (``path``).
    """
    word_file = docx.Document(file_path)
    # Word documents have no page numbers in this context, so every paragraph
    # is reported as page 1.
    return [
        Document(content=paragraph.text, title="Dokument", page=1, path=file_path)
        for paragraph in word_file.paragraphs
    ]
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
"""
|
| 271 |
+
# Custom loader functions
|
| 272 |
def load_pdf_with_metadata(file_path):
|
| 273 |
document = fitz.open(file_path)
|
| 274 |
documents = []
|
|
|
|
| 294 |
content = para.text
|
| 295 |
contents.append({"page_content": content, "metadata": {**metadata, "page": 1}})
|
| 296 |
return contents
|
| 297 |
+
"""
|
| 298 |
|
| 299 |
|
| 300 |
################################################
|