Spaces:

harshvisualz
/

medextract

Sleeping

medextract/app/utils/pdf_utils.py

Add

ec563fd 9 months ago

1.07 kB

	import re
	from pypdf import PdfReader

	def extract_text_from_pdf(pdf_path: str) -> str:
	    """Return the text of every page of a PDF as one normalized string.

	    Args:
	        pdf_path: Location of the PDF, passed straight to ``PdfReader``.

	    Returns:
	        The text of all pages that yielded any text, in page order. Every
	        run of whitespace (line breaks included) is collapsed to a single
	        space and the result is stripped. An empty string is returned when
	        no page yields text.

	    Raises:
	        Nothing is caught here; any error raised by ``PdfReader`` or
	        ``extract_text`` propagates to the caller.
	    """
	    reader = PdfReader(pdf_path)

	    # Pages whose extraction returns nothing (None or "") are skipped.
	    extracted = (page.extract_text() for page in reader.pages)
	    # Join once instead of growing a string with += inside the loop.
	    combined = "\n".join(chunk for chunk in extracted if chunk)

	    # Page and line breaks are deliberately flattened into single spaces.
	    return re.sub(r'\s+', ' ', combined).strip()

	def structure_pdf_text(text: str) -> dict:
	    """Group the sentences of a report under the headings that introduce them.

	    The text is split on ``". "``. A sentence that begins with one of the
	    known headings (case-insensitive, and followed by a non-alphanumeric
	    character or the end of the sentence) starts that section; the sentence
	    itself and all following ones are filed under it until another heading
	    appears. Sentences before the first heading go to
	    ``"General Information"``.

	    Args:
	        text: Flattened report text, e.g. from ``extract_text_from_pdf``.

	    Returns:
	        A dict mapping section name to the section's sentences joined with
	        single spaces. ``"General Information"`` is always present (possibly
	        as an empty string); other sections appear only if their heading was
	        found. The ``". "`` separators consumed by the split are not
	        restored. A heading that occurs more than once accumulates all of
	        its sentences instead of keeping only the last occurrence.
	    """
	    section_patterns = [
	        "Patient Name", "Age", "Gender", "Diagnosis", "Findings",
	        "Test Results", "Impression", "Prescription", "Doctor's Notes"
	    ]
	    # Lower-case each heading once rather than once per sentence.
	    lowered_sections = [(section, section.lower()) for section in section_patterns]

	    structured_report = {"General Information": []}
	    current_section = "General Information"

	    for line in text.split(". "):
	        line = line.strip()
	        if not line:
	            continue

	        lowered = line.lower()
	        for section, key in lowered_sections:
	            # The character right after the heading must not be a letter or
	            # digit, so "Agent ..." is not mistaken for the "Age" heading.
	            # The slice is "" at end of sentence, and "".isalnum() is False.
	            following = lowered[len(key):len(key) + 1]
	            if lowered.startswith(key) and not following.isalnum():
	                current_section = section
	                # setdefault keeps sentences already collected when the same
	                # heading shows up again later in the document.
	                structured_report.setdefault(section, [])
	                break
	        structured_report[current_section].append(line)

	    return {k: " ".join(v) for k, v in structured_report.items()}