| import fitz |
| from pathlib import Path |
| from typing import List, Dict, Any |
|
|
|
|
def extract_text_from_pdf(pdf_path: str) -> Dict[str, Any]:
    """
    Extract all text and metadata from a single PDF file.

    The PDF is read page by page rather than in one shot, so that each
    piece of text stays associated with the page it came from.

    Args:
        pdf_path (str): Full file path to the PDF

    Returns:
        dict with three keys:
        - "text" — every page's text concatenated, each page followed by "\\n"
        - "metadata" — a copy of the document's own metadata, plus
          "file_name", "total_pages" and "file_path"
        - "pages" — list of {"page_number", "text"} dicts, one per page,
          with page numbers starting at 1

    Raises:
        Any exception raised by ``fitz.open`` or by text extraction is
        propagated; the document is closed before it propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        pages = [
            {"page_number": page_number, "text": page.get_text("text")}
            for page_number, page in enumerate(doc, start=1)
        ]
        # Copy so the library's own dict is not mutated, and tolerate a
        # missing metadata object.
        metadata = dict(doc.metadata or {})
        total_pages = len(doc)
    finally:
        # Release the file handle even when extraction fails part-way.
        doc.close()

    metadata["file_name"] = Path(pdf_path).name
    metadata["total_pages"] = total_pages
    metadata["file_path"] = str(pdf_path)

    # Single join instead of repeated "+=" on a growing string.
    full_text = "".join(entry["text"] + "\n" for entry in pages)

    return {
        "text": full_text,
        "metadata": metadata,
        "pages": pages,
    }
|
|
|
|
def load_papers_from_folder(folder_path: str) -> List[Dict[str, Any]]:
    """
    Load every PDF in a folder and return their extracted content.

    Scans the top level of the folder (no recursion), runs
    extract_text_from_pdf on each PDF, and collects the results.
    A PDF that fails to load is reported and skipped, so one bad file
    does not abort the whole batch.

    Args:
        folder_path (str): Path to a directory containing PDF files

    Returns:
        List of paper dicts (each from extract_text_from_pdf), ordered by
        file name. Empty if the folder contains no PDF files.

    Raises:
        FileNotFoundError: If the folder doesn't exist
    """
    folder = Path(folder_path)

    if not folder.exists():
        raise FileNotFoundError(f"Folder not found: {folder_path}")

    # Match the extension case-insensitively in a single pass. Globbing
    # "*.pdf" and "*.PDF" separately lists every file twice where glob
    # matching is case-insensitive, and misses mixed-case names such as
    # "paper.Pdf" where it is case-sensitive. Sorting makes the load
    # order deterministic.
    if folder.is_dir():
        pdf_files = sorted(
            entry
            for entry in folder.iterdir()
            if entry.is_file() and entry.suffix.lower() == ".pdf"
        )
    else:
        # The path exists but is not a directory: nothing to scan.
        pdf_files = []

    if not pdf_files:
        print(f"No PDF files found in {folder_path}")
        return []

    papers = []

    for pdf_file in pdf_files:
        print(f"Loading: {pdf_file.name}")
        try:
            paper_data = extract_text_from_pdf(str(pdf_file))
            pages = paper_data["metadata"]["total_pages"]
            print(f"{pages} pages extracted")
            papers.append(paper_data)
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"Skipping {pdf_file.name}: {e}")

    print(f"\nLoaded {len(papers)} / {len(pdf_files)} papers")
    return papers
|
|