Spaces:
Sleeping
Sleeping
File size: 578 Bytes
02cc7f6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
import fitz # PyMuPDF
import re
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    This is a best-effort reader: if the document cannot be opened or a page
    cannot be read, the error is printed and an empty string is returned
    (text from pages read before the failure is discarded).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined in page order, or ``""`` on any error.
    """
    try:
        with fitz.open(pdf_path) as doc:
            # Join once rather than growing a string with ``+=`` per page,
            # which can degrade to quadratic time on large documents.
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Deliberately broad: callers rely on getting "" instead of an
        # exception for unreadable or corrupt files.
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""
def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences on terminal punctuation.

    A sentence boundary is any run of whitespace that directly follows
    ``.``, ``!`` or ``?``; the punctuation stays attached to its sentence.
    Leading and trailing whitespace is ignored, so empty or whitespace-only
    input yields an empty list and text ending in a newline does not
    produce a trailing empty sentence.

    Args:
        text: The text to split.

    Returns:
        The non-empty sentences in their original order.
    """
    # Strip first: otherwise trailing whitespace after the final punctuation
    # mark is treated as a boundary and yields a spurious "" element.
    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
    # re.split returns [""] for empty input; drop such empty pieces.
    return [piece for piece in pieces if piece]
def clean_text(text: str) -> str:
    """Normalize whitespace in *text*.

    The input is first converted with ``str()``, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is removed.
    """
    as_string = str(text)
    collapsed = re.sub(r"\s+", " ", as_string)
    return collapsed.strip()
|