File size: 853 Bytes
4e7e4c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import json
from pathlib import Path
from typing import List , Dict
from src.ingestion.pdf_to_markdown import MarkdownLoader
from pathlib import Path

def load_documents(path: Path) -> List[Dict]:
    """Load every PDF reachable from *path* through ``MarkdownLoader``.

    *path* may be a single PDF file or a directory. A directory is searched
    recursively and its PDFs are loaded in sorted path order. The extension
    check is case-insensitive in both cases, so ``report.PDF`` is treated the
    same whether it is passed directly or found inside a directory.

    Args:
        path: A PDF file, or a directory containing PDF files.

    Returns:
        The concatenation of whatever ``MarkdownLoader.load_pdf`` yields for
        each PDF (annotated as a list of dicts; the exact schema is defined
        by the loader, not here).

    Raises:
        ValueError: If *path* is a file that is not a PDF, a directory with
            no PDF files in it, or neither an existing file nor directory.
    """
    if path.is_file():
        if path.suffix.lower() != ".pdf":
            raise ValueError(f"Unsupported file type: {path.suffix}")
        pdf_files = [path]
    elif path.is_dir():
        # Filter on the lowered suffix instead of globbing "*.pdf": glob
        # matching is case-sensitive on POSIX, which would skip "X.PDF" even
        # though the single-file branch accepts it. is_file() keeps
        # directories that merely end in ".pdf" away from the loader.
        pdf_files = sorted(
            candidate
            for candidate in path.rglob("*")
            if candidate.is_file() and candidate.suffix.lower() == ".pdf"
        )
        if not pdf_files:
            raise ValueError("No PDF files found in the directory")
    else:
        raise ValueError(f"Invalid path: {path}")

    # Build the loader only after the input has been validated, so bad paths
    # fail fast without paying for loader construction.
    loader = MarkdownLoader()

    documents: List[Dict] = []
    for pdf_path in pdf_files:
        documents.extend(loader.load_pdf(pdf_path))
    return documents