import re


# -----------------------------
# TEXT EXTRACTION (Robust)
# -----------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract and clean text from a PDF using PyMuPDF.

    Handles both textual and scanned PDFs gracefully: when plain-text
    extraction yields nothing for a page, falls back to raw block
    extraction (helps with oddly-encoded PDFs).

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: Combined, whitespace-normalized extracted text.

    Raises:
        RuntimeError: If the PDF cannot be opened or read (original
            exception is chained for debugging).
    """
    # Lazy import: keeps the rest of this module (e.g. chunk_text)
    # importable even when PyMuPDF is not installed.
    import fitz  # PyMuPDF

    pages = []  # collect per-page text, join once (avoids quadratic +=)
    try:
        with fitz.open(file_path) as pdf:
            for page in pdf:
                page_text = page.get_text("text").strip()
                if not page_text:
                    # Fallback: extract raw blocks (helps with weird PDFs).
                    # block[4] is the text payload of a PyMuPDF block tuple.
                    blocks = page.get_text("blocks")
                    page_text = " ".join(
                        block[4] for block in blocks if isinstance(block[4], str)
                    )
                pages.append(page_text)
    except Exception as e:
        # Chain the original exception instead of discarding it
        raise RuntimeError(f"❌ PDF extraction failed: {e}") from e

    # Clean out any extra whitespace or control characters
    return re.sub(r'\s+', ' ', "\n".join(pages)).strip()


# -----------------------------
# SMART CHUNKING (Context Aware)
# -----------------------------
def chunk_text(text: str, chunk_size: int = 800, overlap: int = 150) -> list:
    """
    Split text into overlapping, sentence-based chunks.

    Optimized for embedding models (E5, MiniLM, etc.) for semantic
    retrieval. Sentences are packed greedily into chunks of at most
    `chunk_size` characters; each new chunk starts with the last
    `overlap` characters of the previous one for continuity.

    Args:
        text (str): Input text.
        chunk_size (int): Max characters per chunk (default: 800).
        overlap (int): Overlapping characters for continuity (default: 150).

    Returns:
        list[str]: Chunked text segments (empty list for empty input).

    Raises:
        ValueError: If chunk_size is not positive, or overlap is
            negative or >= chunk_size (would prevent forward progress).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must be in [0, chunk_size)")

    # Clean text once
    text = re.sub(r'\s+', ' ', text.strip())
    if not text:
        return []

    # Sentence segmentation (simple rule-based, fast): split after ./!/?
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks, current = [], ""
    for sent in sentences:
        # +1 accounts for the joining space
        if len(current) + len(sent) + 1 <= chunk_size:
            current += " " + sent
        else:
            # Store full chunk
            if current.strip():
                chunks.append(current.strip())
            # Overlap control: seed the next chunk with the tail of this one
            overlap_part = current[-overlap:] if overlap > 0 else ""
            current = overlap_part + " " + sent

    # Append the last chunk
    if current.strip():
        chunks.append(current.strip())
    return chunks


# -----------------------------
# DEBUGGING (Manual Run)
# -----------------------------
if __name__ == "__main__":
    sample_text = """
    Artificial Intelligence is transforming industries. Machine learning is a key subfield, driving automation and predictive analytics. Neural networks power most modern AI applications today. This technology is reshaping healthcare, finance, and manufacturing.
    """
    chunks = chunk_text(sample_text, chunk_size=80, overlap=20)
    print(f"✅ Chunks created: {len(chunks)}")
    for i, c in enumerate(chunks, 1):
        print(f"\n--- Chunk {i} ({len(c)} chars) ---\n{c}")