File size: 578 Bytes
02cc7f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import fitz  # PyMuPDF
import re

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Pages are read in document order and their text is joined with no
    separator. This is a best-effort helper: if opening or reading the
    document raises, the error is printed and an empty string is returned
    (any text gathered from earlier pages is discarded).

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The text of all pages, or ``""`` when the document could not be read.
    """
    try:
        with fitz.open(pdf_path) as doc:
            # join() builds the result in one pass instead of relying on
            # repeated ``+=``, which can be quadratic in the page count.
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Deliberately broad so any failure yields "" instead of propagating.
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""

def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences on whitespace that follows ``.``, ``!`` or ``?``.

    The terminating punctuation stays attached to its sentence. Leading and
    trailing whitespace is ignored, and empty pieces are never returned, so
    empty or whitespace-only input yields ``[]``.

    This is a simple regex split: it has no knowledge of abbreviations, so
    text such as ``"Dr. Smith"`` is split after ``"Dr."``.

    Args:
        text: The text to split.

    Returns:
        The non-empty sentences in their original order.
    """
    # Strip first so trailing whitespace after the final punctuation mark
    # does not produce a dangling empty "sentence".
    pieces = re.split(r"(?<=[.!?])\s+", text.strip())
    # An empty input still splits to [""]; filter so callers get [].
    return [piece for piece in pieces if piece]

def clean_text(text: str) -> str:
    """Normalize whitespace in *text*.

    The argument is first converted with ``str()``, then every run of
    whitespace is collapsed to a single space and leading/trailing
    whitespace is removed.

    Args:
        text: The value to clean; non-string values are stringified.

    Returns:
        The whitespace-normalized string.
    """
    as_string = str(text)
    collapsed = re.sub(r"\s+", " ", as_string)
    return collapsed.strip()