Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| import re | |
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Pages are visited in document order and the result of each
    ``page.get_text()`` call is joined with no separator.

    This reader is best-effort: any exception raised while opening or
    reading the document is reported with ``print`` and an empty string is
    returned instead of propagating the error.

    Args:
        pdf_path: Path passed to ``fitz.open``.

    Returns:
        The text of all pages, or ``""`` if the document could not be read.
    """
    try:
        with fitz.open(pdf_path) as doc:
            # Join once rather than growing a string with ``+=`` per page,
            # which can degrade to quadratic time on large documents.
            return "".join(page.get_text() for page in doc)
    except Exception as e:  # catch-all kept so any read failure yields ""
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""
def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences on whitespace that follows ``.``, ``!`` or ``?``.

    The split is a simple regex heuristic: the terminating punctuation stays
    attached to its sentence, and the whitespace run after it is discarded.
    Abbreviations such as ``e.g.`` are therefore also treated as sentence ends.

    Surrounding whitespace is stripped before splitting and empty pieces are
    dropped, so empty or whitespace-only input yields ``[]`` and text ending
    in a newline does not produce a trailing ``""`` entry.

    Args:
        text: The text to split.

    Returns:
        The non-empty sentences, in their original order.
    """
    # Lookbehind keeps the punctuation with the sentence it terminates.
    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
    return [piece for piece in pieces if piece]
def clean_text(text: str) -> str:
    """Normalize whitespace in *text*.

    The argument is first converted with ``str()``. Every run of whitespace
    is then replaced by a single space, and leading and trailing whitespace
    is removed.

    Args:
        text: The value to normalize.

    Returns:
        The whitespace-normalized string.
    """
    raw = str(text)
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()