greenintellect / app /services /pdf_processor.py
Tanxshh's picture
Deploy GreenIntellect Backend API with ML models and scraping
02cc7f6
raw
history blame contribute delete
578 Bytes
import fitz # PyMuPDF
import re
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The result of ``page.get_text()`` for each page, concatenated with
        no separator in the order the document yields its pages. An empty
        string is returned if opening or reading the document raises.
    """
    try:
        with fitz.open(pdf_path) as doc:
            # Join once instead of growing a string with ``+=`` per page.
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Deliberately best-effort: report the failure and return "no text"
        # so a single unreadable file does not abort the caller.
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""
def split_sentences(text: str) -> list[str]:
    """Split text into sentences at whitespace that follows ``.``, ``!`` or ``?``.

    The terminating punctuation stays attached to its sentence. Leading and
    trailing whitespace of the input is ignored, and empty fragments are
    never returned, so empty or whitespace-only input yields ``[]``.

    Args:
        text: The text to split.

    Returns:
        The non-empty sentence fragments, in their original order.
    """
    # Strip first: trailing whitespace after the final punctuation mark would
    # otherwise produce a spurious empty last element.
    fragments = re.split(r'(?<=[.!?])\s+', text.strip())
    # re.split returns [''] for an empty string; filter that out.
    return [fragment for fragment in fragments if fragment]
def clean_text(text: str) -> str:
    """Normalize whitespace in *text*.

    The argument is converted with ``str()``, every run of whitespace is
    replaced by a single space, and leading/trailing whitespace is removed.

    Args:
        text: The value to normalize.

    Returns:
        The whitespace-normalized string.
    """
    as_string = str(text)
    collapsed = re.sub(r"\s+", " ", as_string)
    return collapsed.strip()