import re

from pypdf import PdfReader

# Headings recognised by structure_pdf_text, in matching priority order.
_SECTION_HEADINGS = (
    "Patient Name",
    "Age",
    "Gender",
    "Diagnosis",
    "Findings",
    "Test Results",
    "Impression",
    "Prescription",
    "Doctor's Notes",
)

# Bucket for text that appears before the first recognised heading.
_DEFAULT_SECTION = "General Information"


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF as one whitespace-normalised string.

    Pages whose ``extract_text()`` result is empty or ``None`` are skipped.
    Every run of whitespace (including the page separators) is collapsed to a
    single space and the result is stripped, so the returned text contains no
    newlines.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        The concatenated page text, or ``""`` when no page yields any text.
    """
    reader = PdfReader(pdf_path)
    page_texts = (page.extract_text() for page in reader.pages)
    # join() instead of repeated "+=" avoids quadratic string building; the
    # separator is normalised away below, exactly like the original "\n".
    text = "\n".join(chunk for chunk in page_texts if chunk)
    return re.sub(r"\s+", " ", text).strip()


def structure_pdf_text(text: str) -> dict:
    """Group report text into named sections keyed by known headings.

    The text is split on ``". "``; each non-empty fragment is assigned to the
    most recently seen heading, and fragments seen before any heading go to
    ``"General Information"``. A fragment selects a heading when it starts
    with that heading, compared case-insensitively; the first heading in
    ``_SECTION_HEADINGS`` that matches wins. The fragment that carries the
    heading is itself kept in the section text.

    A heading that is matched more than once keeps accumulating text. The
    previous behaviour reset the section on every match, silently discarding
    what had already been collected for it.

    NOTE: matching is a plain prefix test, so a fragment such as
    ``"Aged 70 ..."`` also selects ``"Age"``. The period consumed by the
    ``". "`` split is not restored when fragments are joined.

    Args:
        text: Report text, typically the output of ``extract_text_from_pdf``.

    Returns:
        A dict mapping section name to that section's fragments joined by a
        single space, in order of first appearance. ``"General Information"``
        is always present, possibly as an empty string.
    """
    sections = {_DEFAULT_SECTION: []}
    current = _DEFAULT_SECTION

    for fragment in text.split(". "):
        fragment = fragment.strip()
        if not fragment:
            continue

        lowered = fragment.lower()
        heading = next(
            (name for name in _SECTION_HEADINGS if lowered.startswith(name.lower())),
            None,
        )
        if heading is not None:
            current = heading
            # setdefault keeps earlier content when a heading reappears.
            sections.setdefault(current, [])

        sections[current].append(fragment)

    return {name: " ".join(parts) for name, parts in sections.items()}