Spaces:

Tanxshh
/

greenintellect

Sleeping

greenintellect / app /services /pdf_processor.py

Deploy GreenIntellect Backend API with ML models and scraping

02cc7f6 about 1 month ago

578 Bytes

	import fitz # PyMuPDF
	import re

	def extract_text_from_pdf(pdf_path: str) -> str:
	    """Return the text of every page of a PDF as one string.

	    Args:
	        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

	    Returns:
	        The ``page.get_text()`` output of each page, concatenated in the
	        order the document yields its pages. If anything goes wrong while
	        opening or reading the file, an empty string is returned instead.
	    """
	    try:
	        with fitz.open(pdf_path) as doc:
	            # Join once rather than growing a string page by page, which
	            # can degrade to quadratic time on long documents.
	            return "".join(page.get_text() for page in doc)
	    except Exception as e:
	        # Deliberately best-effort: report the problem and fall back to ""
	        # so callers never have to handle an exception from this helper.
	        print(f"Error reading PDF {pdf_path}: {e}")
	        return ""

	def split_sentences(text: str) -> list[str]:
	    """Split text into sentences at whitespace that follows ``.``, ``!`` or ``?``.

	    The terminating punctuation stays attached to its sentence. Fragments
	    that are empty or contain only whitespace are dropped, so empty input
	    or text ending in whitespace (for example a trailing newline) does not
	    produce ``''`` entries.

	    Args:
	        text: The text to split.

	    Returns:
	        The non-blank fragments in their original order; an empty list when
	        ``text`` is empty or contains only whitespace.
	    """
	    # Simple regex split: break on whitespace preceded by sentence punctuation.
	    fragments = re.split(r'(?<=[.!?])\s+', text)
	    # A trailing "punctuation + whitespace" leaves an empty last fragment,
	    # and "" itself splits to [""]; neither is a sentence.
	    return [fragment for fragment in fragments if fragment.strip()]

	def clean_text(text: str) -> str:
	    """Collapse whitespace runs to single spaces and trim both ends.

	    Args:
	        text: The value to normalise; it is converted with ``str()`` first.

	    Returns:
	        The string form of ``text`` with every run of whitespace replaced
	        by one space and no leading or trailing whitespace.
	    """
	    as_string = str(text)
	    collapsed = re.sub(r"\s+", " ", as_string)
	    return collapsed.strip()