|
|
from pathlib import Path |
|
|
from typing import IO, Dict, List, Optional, Union |
|
|
|
|
|
from llama_index.readers.base import BaseReader |
|
|
from llama_index.readers.schema.base import Document |
|
|
|
|
|
|
|
|
class PDFLoader(BaseReader):
    """PDF reader that produces one ``Document`` per page."""

    def load_data(
        self, file: Union[IO[bytes], str, Path], extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Extract the text of every page of a PDF.

        Args:
            file: Location of the PDF as a ``str`` or ``Path``, or an
                already-open binary stream. Whatever is read from is used
                as a context manager, so it is closed once parsing is
                done -- this includes a stream supplied by the caller.
            extra_info: Optional mapping merged into each page's metadata.
                On a key collision its values replace ``page_label`` /
                ``file_name``.

        Returns:
            One ``Document`` per page, in page order. Each carries the
            page's extracted text and a metadata dict holding
            ``page_label`` and, when the input exposes a ``name``
            attribute, ``file_name``.
        """
        # Imported inside the method, so pypdf is only needed when a PDF is
        # actually loaded.
        import pypdf

        # A str can never be a Path instance, so a single check is enough.
        if isinstance(file, str):
            file = Path(file)

        # Resolve the name once, up front. For a Path this is the final path
        # component; for a stream it is whatever ``name`` the stream carries.
        # Streams such as io.BytesIO have no ``name`` at all, so fall back to
        # None instead of raising AttributeError on the first page.
        file_name = getattr(file, "name", None)

        # Paths are opened here; anything else is assumed to be a readable
        # binary stream and is used directly.
        context = open(file, "rb") if isinstance(file, Path) else file

        docs = []
        with context as fp:
            pdf = pypdf.PdfReader(fp)

            # Read the labels once rather than on every loop iteration.
            page_labels = pdf.page_labels

            for index, pdf_page in enumerate(pdf.pages):
                metadata = {"page_label": page_labels[index]}
                if file_name is not None:
                    metadata["file_name"] = file_name

                # Caller-supplied metadata is applied last so it wins on
                # key collisions.
                if extra_info is not None:
                    metadata.update(extra_info)

                docs.append(
                    Document(text=pdf_page.extract_text(), extra_info=metadata)
                )

        return docs