chatpaper / src /ingestion /pdf_loader.py
Shafagh99's picture
add chatpaper project
c003cc2
import fitz  # This is PyMuPDF — the package is called pymupdf but imports as fitz
from pathlib import Path
from typing import List, Dict, Any
def extract_text_from_pdf(pdf_path: str) -> Dict[str, Any]:
    """
    Extract all text and metadata from a single PDF file.

    The PDF is read page by page rather than in one shot, so that callers
    can later report *which page* a piece of text came from.

    Args:
        pdf_path (str): Full file path to the PDF

    Returns:
        dict with three keys:
            - "text": the entire paper as one string; each page's text is
              followed by a newline
            - "metadata": the document metadata reported by PyMuPDF, plus
              "file_name", "total_pages" and "file_path"
            - "pages": list of {"page_number", "text"} dicts, one per page,
              with page numbers starting at 1

    Raises:
        Whatever fitz.open / page.get_text raise for unreadable or corrupt
        files; the document handle is still closed in that case.
    """
    # The context manager guarantees the document is closed even when
    # extraction of some page fails part-way through.
    with fitz.open(pdf_path) as doc:
        pages = [
            {"page_number": number, "text": page.get_text("text")}
            for number, page in enumerate(doc, start=1)
        ]

        # Copy the metadata so the document's own dict is not mutated, and
        # fall back to an empty dict if PyMuPDF reports no metadata at all.
        metadata = dict(doc.metadata or {})
        metadata["file_name"] = Path(pdf_path).name
        metadata["total_pages"] = len(doc)
        metadata["file_path"] = str(pdf_path)

    # Build the full text in one pass instead of repeated string +=.
    full_text = "".join(page["text"] + "\n" for page in pages)

    return {
        "text": full_text,
        "metadata": metadata,
        "pages": pages,
    }
def load_papers_from_folder(folder_path: str) -> List[Dict[str, Any]]:
    """
    Load every PDF in a folder and return their extracted content.

    Scans the top level of the folder (no recursion), runs
    extract_text_from_pdf on each PDF and collects the results. A file that
    fails to load is reported on stdout and skipped, so one bad PDF does not
    abort the whole batch.

    Args:
        folder_path (str): Path to a directory containing PDF files

    Returns:
        List of paper dicts (each from extract_text_from_pdf), ordered by
        file path. Empty if the folder contains no PDF files.

    Raises:
        FileNotFoundError: If the folder doesn't exist
    """
    folder = Path(folder_path)
    if not folder.exists():
        raise FileNotFoundError(f"Folder not found: {folder_path}")

    # Match the extension case-insensitively in a single pass. Globbing for
    # "*.pdf" and "*.PDF" separately misses mixed-case names such as
    # "paper.Pdf" and returns every file twice where glob matching is
    # case-insensitive. A path that exists but is not a directory simply
    # yields no candidates.
    entries = folder.iterdir() if folder.is_dir() else ()
    pdf_files = sorted(
        entry
        for entry in entries
        if entry.is_file() and entry.suffix.lower() == ".pdf"
    )

    if not pdf_files:
        print(f"No PDF files found in {folder_path}")
        return []

    papers = []
    for pdf_file in pdf_files:
        print(f"Loading: {pdf_file.name}")
        try:
            paper_data = extract_text_from_pdf(str(pdf_file))
            pages = paper_data["metadata"]["total_pages"]
            print(f"{pages} pages extracted")
            papers.append(paper_data)
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going
            # with the remaining files.
            print(f"Skipping {pdf_file.name}: {e}")

    print(f"\nLoaded {len(papers)} / {len(pdf_files)} papers")
    return papers