# medextract/app/utils/pdf_utils.py
# (hosting-page header: harsh-dev's picture / commit "Add" / ec563fd)
import re
from pypdf import PdfReader
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF as one normalized string.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        The extracted text of all pages, in page order, with every run of
        whitespace (including line and page breaks) collapsed to a single
        space and leading/trailing whitespace removed. Pages whose
        extraction yields nothing are skipped; if no page yields text the
        result is an empty string.
    """
    reader = PdfReader(pdf_path)
    page_texts = (page.extract_text() for page in reader.pages)
    # Join once instead of growing a string with ``+=`` per page; the
    # newline separator keeps words on adjacent pages from fusing before
    # whitespace is normalized below.
    text = "\n".join(chunk for chunk in page_texts if chunk)
    return re.sub(r'\s+', ' ', text).strip()
def structure_pdf_text(text: str) -> dict:
    """Group the sentences of a report under the section heading they follow.

    The text is split on ``". "``. A piece that starts (case-insensitively)
    with one of the known section headings switches the current section;
    every non-empty piece, including the one carrying the heading, is
    stored under the section that is current at that point. Pieces seen
    before any heading go under ``"General Information"``.

    Args:
        text: Report text to split into sections.

    Returns:
        A dict mapping each section name that occurred (always including
        ``"General Information"``) to its pieces joined by single spaces.
        Keys use the canonical heading spelling from the pattern list, not
        the spelling found in ``text``.
    """
    section_patterns = [
        "Patient Name", "Age", "Gender", "Diagnosis", "Findings",
        "Test Results", "Impression", "Prescription", "Doctor's Notes"
    ]
    # Lower-case each heading once rather than on every comparison.
    lowered_patterns = [(section, section.lower()) for section in section_patterns]

    default_section = "General Information"
    structured_report = {default_section: []}
    current_section = default_section

    for line in text.split(". "):
        line = line.strip()
        if not line:
            continue
        lowered_line = line.lower()
        for section, prefix in lowered_patterns:
            if lowered_line.startswith(prefix):
                current_section = section
                # setdefault, not assignment: a heading that appears more
                # than once must extend its section rather than wipe the
                # content already collected for it.
                structured_report.setdefault(current_section, [])
                break
        structured_report[current_section].append(line)

    return {k: " ".join(v) for k, v in structured_report.items()}