Spaces:
Sleeping
Sleeping
| import json | |
| from pathlib import Path | |
| from typing import List , Dict | |
| from src.ingestion.pdf_to_markdown import MarkdownLoader | |
| from pathlib import Path | |
| def load_documents(path: Path): | |
| documents: List[Dict] = [] | |
| loader = MarkdownLoader() | |
| if path.is_file(): | |
| if path.suffix.lower() == ".pdf" : | |
| documents.extend(loader.load_pdf(path)) | |
| else : | |
| raise ValueError(f"Unsupported file type: {path.suffix}") | |
| elif path.is_dir(): | |
| pdf_files = sorted(path.rglob("*.pdf")) | |
| if not pdf_files: | |
| raise ValueError("No PDF files found in the directory") | |
| for pdf_path in pdf_files : | |
| documents.extend(loader.load_pdf(pdf_path)) | |
| else: | |
| raise ValueError(f"Invalid path: {path}") | |
| return documents | |