File size: 1,474 Bytes
c7f747d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from pathlib import Path
from typing import IO, Dict, List, Optional, Union

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class PDFLoader(BaseReader):
    """Reader that converts each page of a PDF into a ``Document``."""

    def load_data(
        self, file: Union[IO[bytes], str, Path], extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Extract the text of every page of a PDF.

        Args:
            file: Location of the PDF as a ``str`` or ``Path``, or an
                already-open binary stream. Whatever is read from is closed
                before this method returns, including a stream supplied by
                the caller.
            extra_info: Optional metadata merged into the metadata of every
                page. Its keys override the ``page_label`` and ``file_name``
                entries generated here.

        Returns:
            One ``Document`` per page, in page order. Each carries the page's
            extracted text and a metadata dict holding ``page_label`` and,
            when the source has a name, ``file_name``.
        """
        import pypdf

        # Normalise string paths so that paths are handled in one place.
        if isinstance(file, str):
            file = Path(file)

        if isinstance(file, Path):
            file_name = file.name
            stream = open(file, "rb")
        else:
            # In-memory streams such as io.BytesIO have no ``name``
            # attribute; fall back to None instead of raising AttributeError.
            file_name = getattr(file, "name", None)
            stream = file

        with stream as fp:
            pdf = pypdf.PdfReader(fp)

            # Read the labels once rather than re-evaluating the
            # ``page_labels`` property for every page.
            page_labels = pdf.page_labels

            docs = []
            for page, page_label in zip(pdf.pages, page_labels):
                metadata = {"page_label": page_label}
                if file_name is not None:
                    metadata["file_name"] = file_name

                # Caller-supplied entries are applied last so they win.
                if extra_info is not None:
                    metadata.update(extra_info)

                docs.append(Document(text=page.extract_text(), extra_info=metadata))
            return docs