import re

import fitz  # PyMuPDF

# Whitespace run that directly follows sentence-ending punctuation.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')
# Any run of one or more whitespace characters.
_WHITESPACE_RUN = re.compile(r"\s+")


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of every page in a PDF.

    Each page's text is obtained with ``page.get_text()`` (default
    arguments) and the results are joined with no separator.

    This is a best-effort helper: any exception raised while opening or
    reading the document is reported on stdout and an empty string is
    returned instead of propagating the error. Callers therefore cannot
    distinguish an unreadable file from a PDF that contains no text.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages in document order, or ``""`` on failure.
    """
    try:
        with fitz.open(pdf_path) as doc:
            # Join once rather than growing a string with ``+=`` per page.
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Deliberately broad: a failure yields "" rather than an exception.
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""


def split_sentences(text: str) -> list[str]:
    """Split text into sentences using a simple punctuation heuristic.

    The text is split at every whitespace run that immediately follows
    ``.``, ``!`` or ``?``. The punctuation stays attached to the sentence
    it ends; the whitespace is discarded.

    Fragments are returned as-is, without stripping or filtering: an
    empty input yields ``[""]``, and text ending in punctuation followed
    by whitespace yields a trailing ``""`` element.

    Args:
        text: The text to split.

    Returns:
        The list of sentence fragments, in order.
    """
    return _SENTENCE_BOUNDARY.split(text)


def clean_text(text: str) -> str:
    """Collapse whitespace runs to single spaces and trim both ends.

    The argument is first passed through ``str()``, so non-string values
    are converted rather than rejected (``None`` becomes ``"None"``).

    Args:
        text: The value to normalise.

    Returns:
        The text with every whitespace run replaced by one space and
        leading/trailing whitespace removed.
    """
    return _WHITESPACE_RUN.sub(" ", str(text)).strip()