| import os |
| import fitz |
| from typing import List, Optional |
| from langchain.schema import Document |
|
|
class PDFLoader:
    """Load PDF files and convert their text into ``Document`` objects.

    Each page with non-blank text yields one document, or several when the
    page text is longer than ``chunk_size`` characters.
    """

    # Split points tried in order of preference when a chunk has to end
    # before the end of the text: paragraph, line, then sentence boundaries.
    _BREAK_CHARS = ("\n\n", "\n", ". ", "? ", "! ")

    def __init__(self, chunk_size: int = 4000, chunk_overlap: int = 200):
        """
        Initialize the PDF document loader

        Args:
            chunk_size: Maximum size of each chunk, in characters
            chunk_overlap: Number of characters shared by consecutive chunks

        Raises:
            ValueError: If chunk_size is not positive or chunk_overlap is
                negative (either would make chunking hang or skip text)
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap must not be negative")

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def load_file(self, file_path: str) -> "List[Document]":
        """
        Load a PDF file and convert it to a list of documents

        Pages whose text is blank are skipped. A page whose text fits in
        chunk_size becomes a single document; a longer page is split with
        _chunk_text and each piece carries an extra 1-based "chunk" key in
        its metadata.

        Args:
            file_path: Path to the PDF file

        Returns:
            List of Document objects; an empty list if the PDF could not
            be read (the error is printed, not raised)

        Raises:
            FileNotFoundError: If file_path does not exist
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            print(f"Loading PDF: {file_path}")

            file_name = os.path.basename(file_path)
            documents = []

            # The context manager closes the underlying file even when
            # text extraction fails part-way through.
            with fitz.open(file_path) as pdf:
                # metadata may be None (e.g. encrypted documents) and its
                # values may be None, so normalise both to empty defaults.
                info = pdf.metadata or {}
                metadata = {
                    "source": file_path,
                    "title": info.get("title") or file_name,
                    "author": info.get("author") or "",
                    "creation_date": info.get("creationDate") or "",
                    "file_type": "pdf",
                    "page_count": len(pdf),
                }

                for page_num, page in enumerate(pdf):
                    text = page.get_text()
                    if not text.strip():
                        continue

                    page_metadata = metadata.copy()
                    page_metadata["page_number"] = page_num + 1

                    if len(text) <= self.chunk_size:
                        documents.append(Document(
                            page_content=text,
                            metadata=page_metadata
                        ))
                        continue

                    for i, chunk in enumerate(self._chunk_text(text)):
                        chunk_metadata = page_metadata.copy()
                        chunk_metadata["chunk"] = i + 1
                        documents.append(Document(
                            page_content=chunk,
                            metadata=chunk_metadata
                        ))

            print(f"Extracted {len(documents)} chunks from PDF")
            return documents

        except Exception as e:
            # Best-effort loader: report the failure and return no documents
            # so a single bad file does not abort the caller.
            print(f"Error loading PDF {file_path}: {str(e)}")
            return []

    def _chunk_text(self, text: str) -> List[str]:
        """
        Chunk text into smaller pieces

        Each chunk is at most chunk_size characters. When a chunk has to end
        before the end of the text, it is cut just after the last paragraph,
        line or sentence break found in the second half of the window, if
        there is one. Consecutive chunks overlap by chunk_overlap characters,
        except where that overlap would prevent the cursor from advancing.

        Args:
            text: Text to chunk

        Returns:
            List of text chunks (empty for empty text)
        """
        chunks = []
        text_len = len(text)
        # Guard against the attribute having been reassigned to a
        # non-positive value after construction.
        size = max(self.chunk_size, 1)
        start = 0

        while start < text_len:
            end = min(start + size, text_len)

            if end < text_len:
                # Only accept a break in the second half of the window so
                # chunks do not become needlessly short.
                midpoint = start + size / 2
                for break_char in self._BREAK_CHARS:
                    last_break = text.rfind(break_char, start, end)
                    if last_break > midpoint:
                        end = last_break + len(break_char)
                        break

            chunks.append(text[start:end])
            if end >= text_len:
                break

            # Step back by the overlap, but never to or before the current
            # start: that would repeat the same chunk forever.
            next_start = end - self.chunk_overlap
            start = next_start if next_start > start else end

        return chunks