"""PDF text extraction and step-aware text chunking utilities."""

import re

try:
    import fitz  # PyMuPDF
except ImportError:
    # PyMuPDF is only needed for PDF extraction; chunk_text() relies on `re`
    # alone, so keep the module importable and fail at call time instead.
    fitz = None

_WHITESPACE_RE = re.compile(r"\s+")
# Zero-width split point in front of "Step <n>" followed by ':', '.' or
# whitespace. The look-behind rejects matches that start inside a longer word
# (e.g. "footstep 1.").
_STEP_BOUNDARY_RE = re.compile(
    r"(?<![A-Za-z])(?=Step\s*\d+[:.\s])", re.IGNORECASE
)
_SENTENCE_BOUNDARY_RE = re.compile(r"(?<=[.!?])\s+")


# -----------------------------
# TEXT EXTRACTION (Robust)
# -----------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract the text of every page of a PDF with PyMuPDF and normalize it.

    Each page is read with ``page.get_text("text")``. If that yields nothing,
    the page's "blocks" output is tried as a fallback. No OCR is performed, so
    pages without an extractable text layer (e.g. pure scans) contribute
    nothing to the result.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: Text of all pages, with every run of whitespace collapsed to a
        single space and leading/trailing whitespace removed.

    Raises:
        RuntimeError: If PyMuPDF is not installed, or if opening or reading
            the document fails (the original exception is chained).
    """
    if fitz is None:
        raise RuntimeError(
            "❌ PDF extraction failed: PyMuPDF (fitz) is not installed"
        )

    page_texts = []
    try:
        with fitz.open(file_path) as pdf:
            for page in pdf:
                page_text = page.get_text("text").strip()
                if not page_text:
                    # Fallback: raw blocks of *this page*. get_text() is a
                    # Page method, so it must not be called on the document.
                    # Index 4 of a block tuple is expected to hold the block
                    # text (per the PyMuPDF docs); non-string entries are
                    # skipped.
                    blocks = page.get_text("blocks")
                    page_text = " ".join(
                        block[4] for block in blocks
                        if isinstance(block[4], str)
                    )
                page_texts.append(page_text)
    except Exception as e:
        # Single failure type for callers; keep the cause for debugging.
        raise RuntimeError(f"❌ PDF extraction failed: {e}") from e

    # Collapse newlines, tabs and repeated spaces into single spaces.
    return _WHITESPACE_RE.sub(" ", "\n".join(page_texts)).strip()


# -----------------------------
# SMART CHUNKING (Step-Aware + Context Aware)
# -----------------------------
def _chunk_sentences(text: str, chunk_size: int, overlap: int) -> list:
    """Pack the sentences of *text* into chunks of at most *chunk_size* chars.

    Consecutive chunks share up to *overlap* trailing characters of the
    previous chunk. The overlap is shortened (or dropped) when it would push
    the new chunk past *chunk_size*. A single sentence longer than
    *chunk_size* is emitted whole rather than cut mid-sentence.
    """
    chunks = []
    current = ""
    for sentence in _SENTENCE_BOUNDARY_RE.split(text):
        candidate = f"{current} {sentence}" if current else sentence
        if len(candidate) <= chunk_size:
            current = candidate
            continue

        if not current:
            # Oversized first sentence: nothing to flush, keep it intact.
            current = sentence
            continue

        chunks.append(current)
        # Budget left for the overlap once the sentence and the joining
        # space are accounted for.
        room = chunk_size - len(sentence) - 1
        keep = min(overlap, room)
        tail = current[-keep:] if keep > 0 else ""
        current = f"{tail} {sentence}".strip()

    if current:
        chunks.append(current)
    return chunks


def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
    """
    Split text into overlapping chunks, keeping procedural steps intact.

    Whitespace is normalized first. If the text contains more than one
    segment when split in front of step markers (e.g. 'Step 1:', 'STEP 2.'),
    each segment becomes its own chunk. Segments longer than ``chunk_size``,
    and texts without step structure, are packed sentence by sentence.

    Args:
        text (str): Input text.
        chunk_size (int): Target maximum characters per chunk (default: 1000).
            Only a single sentence that is itself longer than this is allowed
            to exceed it.
        overlap (int): Maximum number of trailing characters of the previous
            chunk repeated at the start of the next sentence-based chunk
            (default: 80). Values <= 0 disable the overlap.

    Returns:
        list[str]: Chunked text segments, in document order. Empty or
        whitespace-only input yields an empty list.
    """
    text = _WHITESPACE_RE.sub(" ", text.strip())
    if not text:
        return []

    step_segments = [
        part.strip() for part in _STEP_BOUNDARY_RE.split(text) if part.strip()
    ]
    # Without at least two segments there is no step structure to preserve;
    # treat the whole text as one segment.
    segments = step_segments if len(step_segments) > 1 else [text]

    chunks = []
    for segment in segments:
        if len(segment) <= chunk_size:
            chunks.append(segment)
        else:
            chunks.extend(_chunk_sentences(segment, chunk_size, overlap))
    return chunks


# -----------------------------
# DEBUGGING (Manual Run)
# -----------------------------
def _demo() -> None:
    """Chunk a small sample text and print the resulting chunks."""
    sample_text = """
    Step 1: Open the application.
    Step 2: Navigate to the dashboard.
    Step 3: Review the summary and click ‘Export’.
    If the steps are missing, the function should still chunk by sentences.
    """

    chunks = chunk_text(sample_text, chunk_size=100, overlap=20)

    print(f"✅ Chunks created: {len(chunks)}")
    for i, c in enumerate(chunks, 1):
        print(f"\n--- Chunk {i} ---\n{c}")


if __name__ == "__main__":
    _demo()