import logging
import re
from pathlib import Path
from typing import Dict, List, Optional

import fitz
import PyPDF2

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# A whole line that is a page header/footer or an inserted page marker,
# e.g. "Page 3 of 10" or "--- Page 3 ---". Anchored at line start so that a
# body sentence merely mentioning "Page 3" is not truncated.
_PAGE_LINE_RE = re.compile(r"^[ \t]*(?:-+[ \t]*)?Page \d+[^\n]*\n?", re.MULTILINE)
_FIGURE_TABLE_RE = re.compile(r"\[Figure \d+\]|\[Table \d+\]")
_SPACES_RE = re.compile(r"[ \t]+")
_BLANK_LINES_RE = re.compile(r"\n{2,}")

# Tried in order; the first pattern that matches wins.
_YEAR_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"\b(19|20)\d{2}\b",     # Basic year
        r"©\s*(19|20)\d{2}",     # Copyright year
        r"\((19|20)\d{2}\)",     # Year in parentheses
        r"(19|20)\d{2}[,.)]",    # Year followed by comma/period
    )
)
_YEAR_RE = re.compile(r"(19|20)\d{2}")
_AUTHOR_NAME_RE = re.compile(r"[A-Z][a-z]+(?: [A-Z][a-z]+)*")

# Embedded metadata values that carry no real information.
_PLACEHOLDER_TITLES = frozenset({"untitled", "unknown"})
_PLACEHOLDER_AUTHORS = frozenset({"anonymous", "unknown"})


class DocumentProcessor:
    """Convert PDF files into cleaned, overlapping text chunks with metadata.

    Attributes:
        chunk_size: Soft upper bound, in characters, used when deciding
            whether another sentence still fits into the current chunk.
        chunk_overlap: Number of trailing characters of a finished chunk that
            are repeated at the start of the next one.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract the text of every page of a PDF.

        A ``--- Page N ---`` marker line (1-based) is appended after each
        page's text.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The concatenated page text, or ``""`` if the file could not be
            opened or read (the error is logged, not raised).
        """
        try:
            doc = fitz.open(pdf_path)
            try:
                parts = []
                for page in doc:
                    parts.append(page.get_text())
                    # page.number is 0-indexed
                    parts.append(f"\n--- Page {page.number + 1} ---\n")
                text = "".join(parts)
                logger.info(f"Extracted text from {pdf_path}: {len(text)} characters, {len(doc)} pages")
            finally:
                # Release the file handle even when a page fails to parse.
                doc.close()
            return text
        except Exception as e:
            # Deliberate best-effort boundary: callers get "" for unreadable PDFs.
            logger.error(f"Error extracting text from {pdf_path}: {e}")
            return ""

    def clean_text(self, text: str) -> str:
        """Normalise whitespace and strip page markers and figure/table tags.

        Args:
            text: Raw text as extracted from a PDF.

        Returns:
            The text with page header/footer lines and ``[Figure N]`` /
            ``[Table N]`` references removed, runs of spaces/tabs collapsed
            to one space, blank lines collapsed to single newlines, and
            leading/trailing whitespace stripped.
        """
        # Remove page headers/footers (whole lines only)
        text = _PAGE_LINE_RE.sub("", text)
        # Remove references to figures/tables
        text = _FIGURE_TABLE_RE.sub("", text)
        # Collapse whitespace last so the removals above leave no double
        # spaces or blank lines behind.
        text = _SPACES_RE.sub(" ", text)
        text = _BLANK_LINES_RE.sub("\n", text)  # keep single newlines
        return text.strip()

    def chunk_text(self, text: str, metadata: Optional[Dict] = None) -> List[Dict]:
        """Split text into sentence-aligned chunks with metadata.

        Sentences (split on ``". "``) are accumulated until adding the next
        one would exceed ``chunk_size``; the chunk is then emitted and the
        next chunk starts with the last ``chunk_overlap`` characters of the
        previous one. A single sentence longer than ``chunk_size`` is kept
        whole, so chunks may exceed the limit.

        Args:
            text: Cleaned text to split.
            metadata: Optional metadata attached to every chunk. Each chunk
                receives its own shallow copy.

        Returns:
            A list of dicts with keys ``"text"``, ``"metadata"`` and
            ``"chunk_id"`` (0-based position in the list). Empty for empty
            input.
        """
        if not text:
            return []

        chunks: List[Dict] = []

        def emit(body: str) -> None:
            body = body.strip()
            if not body:
                # Never emit a chunk that carries no text.
                return
            chunks.append({
                "text": body,
                # Copy so that mutating one chunk's metadata cannot affect others.
                "metadata": dict(metadata) if metadata else {},
                "chunk_id": len(chunks),
            })

        current = ""
        for sentence in text.split(". "):
            if not current:
                current = sentence
            elif len(current) + len(sentence) > self.chunk_size:
                emit(current)
                # Guard the zero case explicitly: s[-0:] is the whole string,
                # which would otherwise repeat the entire previous chunk.
                overlap = current[-self.chunk_overlap:] if self.chunk_overlap > 0 else ""
                # ". " restores the sentence separator consumed by split().
                current = f"{overlap}. {sentence}" if overlap else sentence
            else:
                current = f"{current}. {sentence}"

        # Add final chunk
        emit(current)

        logger.info(f"Created {len(chunks)} chunks")
        return chunks

    @staticmethod
    def _find_year(text: str) -> Optional[str]:
        """Return the first four-digit 19xx/20xx year found in text, if any."""
        for pattern in _YEAR_PATTERNS:
            match = pattern.search(text)
            if match:
                year = _YEAR_RE.search(match.group(0))
                if year:
                    return year.group(0)
        return None

    def extract_metadata(self, pdf_path: str) -> dict:
        """Extract metadata (title, authors, year, filename, file_size) from a PDF.

        Embedded PDF metadata is preferred for title and authors; missing
        values fall back to crude first-page heuristics. The year is always
        looked up on the first page.

        Args:
            pdf_path: Path of the PDF file to inspect.

        Returns:
            A dict with keys ``"filename"``, ``"file_size"``, ``"title"``
            (``"Unknown Title"`` if not found), ``"authors"`` (``None`` if
            not found) and ``"year"`` (``"n.d."`` if not found).

        Raises:
            OSError: If the file cannot be stat'ed or opened.
            Exception: Whatever ``PyPDF2.PdfReader`` raises for a file it
                cannot parse.
        """
        path = Path(pdf_path)
        metadata = {
            "filename": path.name,
            "file_size": path.stat().st_size,
            "title": None,
            "authors": None,
            "year": None,
        }

        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)

            # 1. Try embedded PDF metadata
            pdf_meta = reader.metadata
            if pdf_meta:
                # Entries may be absent, None, or non-string objects.
                title = str(pdf_meta.get("/Title") or "").strip()
                author = str(pdf_meta.get("/Author") or "").strip()
                if title and title.lower() not in _PLACEHOLDER_TITLES:
                    metadata["title"] = title
                if author and author.lower() not in _PLACEHOLDER_AUTHORS:
                    metadata["authors"] = author

            # 2. Fallback: look at first page. Must happen while the file is
            # still open because the reader parses pages lazily.
            try:
                first_page = reader.pages[0].extract_text() or ""
            except Exception as e:
                # Best effort: an empty or unparsable first page only means
                # the heuristics below have nothing to work with.
                logger.warning("Could not read first page of %s: %s", pdf_path, e)
                first_page = ""

        lines = [line.strip() for line in first_page.split("\n") if line.strip()]

        # crude heuristic: first line = title
        if not metadata["title"] and lines:
            metadata["title"] = lines[0]

        # crude heuristic: authors in line(s) after title
        if not metadata["authors"] and len(lines) > 1:
            possible_authors = lines[1]
            if _AUTHOR_NAME_RE.search(possible_authors):
                metadata["authors"] = possible_authors

        # crude heuristic: find year (e.g., 2023, 2024). Done regardless of
        # whether title/authors came from embedded metadata.
        metadata["year"] = self._find_year(first_page)

        # Defaults if missing
        metadata["title"] = metadata["title"] or "Unknown Title"
        metadata["authors"] = metadata["authors"] if metadata["authors"] else None
        metadata["year"] = metadata["year"] or "n.d."
        return metadata

    def process_document(self, pdf_path: str) -> List[Dict]:
        """Run the complete pipeline for one PDF.

        Extracts metadata and text, cleans the text and splits it into
        chunks.

        Args:
            pdf_path: Path of the PDF file to process.

        Returns:
            The chunk dicts produced by ``chunk_text``; empty if no text
            could be extracted.

        Raises:
            TypeError: If ``pdf_path`` is not a valid path type.
            OSError: If the file cannot be accessed.
        """
        try:
            # Constructed only to validate the argument type early.
            Path(pdf_path)
        except TypeError as e:
            # Catches specifically if pdf_path is the wrong type
            logger.error(f"Invalid path type: {pdf_path}: {e}")
            raise
        except OSError as e:
            # Catches other filesystem-related errors
            logger.error(f"OS error with path: {pdf_path}: {e}")
            raise

        metadata = self.extract_metadata(pdf_path)
        raw_text = self.extract_text_from_pdf(pdf_path)
        cleaned = self.clean_text(raw_text)
        chunks = self.chunk_text(cleaned, metadata)
        logger.info(f"Processed {pdf_path}: {len(chunks)} chunks created")
        return chunks

    def process_documents(self, pdf_paths: List[str]) -> List[Dict]:
        """Process several PDFs and return all their chunks in one flat list.

        Args:
            pdf_paths: Paths of the PDF files, processed in order.

        Returns:
            The concatenated chunk lists of every document. ``chunk_id``
            values restart at 0 for each document.

        Raises:
            Exception: Any error raised by ``process_document`` propagates
                and aborts the batch.
        """
        documents: List[Dict] = []
        for path in pdf_paths:
            documents.extend(self.process_document(path))
        return documents