plg-score / PDFLoader.py
n8bit's picture
Upload folder using huggingface_hub
c7f747d
from pathlib import Path
from typing import IO, Dict, List, Optional, Union
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
class PDFLoader(BaseReader):
    """PDF reader that produces one ``Document`` per page.

    Text extraction is delegated to ``pypdf``, which is imported inside the
    method that uses it rather than at module import time.
    """

    def load_data(
        self, file: Union[IO[bytes], str, Path], extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse a PDF into per-page documents.

        Args:
            file: Location of the PDF as a ``str`` or ``Path``, or an
                already-open binary stream containing the PDF data.
            extra_info: Optional metadata merged into every page's metadata.
                Its keys win over ``page_label`` / ``file_name`` on collision.

        Returns:
            One ``Document`` per page, in page order. Each carries a
            ``page_label`` entry, plus ``file_name`` when a name is known
            (the path's base name, or the stream's ``name`` attribute).
        """
        if isinstance(file, (str, Path)):
            path = Path(file)
            # We opened this handle, so we are responsible for closing it.
            with open(path, "rb") as fp:
                return self._read_pages(fp, path.name, extra_info)

        # Caller-supplied stream: leave it open (the caller owns it), and do
        # not assume it has a ``name`` -- in-memory streams such as
        # ``io.BytesIO`` do not.
        return self._read_pages(file, getattr(file, "name", None), extra_info)

    def _read_pages(
        self,
        fp: IO[bytes],
        file_name: Optional[str],
        extra_info: Optional[Dict],
    ) -> List[Document]:
        """Extract every page of the PDF readable from *fp* into a ``Document``."""
        import pypdf

        pdf = pypdf.PdfReader(fp)
        # Fetch the labels once; indexing ``pdf.page_labels`` inside the loop
        # would re-read the property for every page.
        labels = pdf.page_labels

        docs = []
        for page, label in zip(pdf.pages, labels):
            metadata = {"page_label": label}
            if file_name is not None:
                metadata["file_name"] = file_name
            if extra_info is not None:
                metadata.update(extra_info)
            docs.append(Document(text=page.extract_text(), extra_info=metadata))
        return docs