import os import re import json import pdfplumber from tqdm import tqdm from bs4 import BeautifulSoup from dataclasses import dataclass, field, asdict from typing import Optional PDF_DIR = "data/pdfs" OUTPUT_FILE = "data/processed/extracted_sections.json" SKIP_FILES = { "TOU-UHCPROVIDER-COM-EN.pdf", "OSPP-UHCPROVIDER-COM-EN.pdf", } SECTION_HEADERS = [ "Instructions for Use", "Coverage Rationale", "Coverage Summary", "Application", "Medical Records Documentation Used for Reviews", "Documentation Requirements", "Definitions", "Applicable Codes", "Description of Services", "Benefit Considerations", "Clinical Evidence", "Background", "U.S. Food and Drug Administration", "Centers for Medicare and Medicaid Services", "References", "Policy History/Revision Information", "Frequently Asked Questions", ] SKIP_SECTIONS = { "Instructions for Use", "Policy History/Revision Information", } PAGE_HEADER_PATTERNS = [ re.compile(r"^.{0,120}Page\s+\d+\s+of\s+\d+\s*$"), re.compile(r"^UnitedHealthcare.*(?:Medical|Drug)\s+(?:Policy|Benefit).*(?:Effective|Policy)"), re.compile(r"^Proprietary Information of UnitedHealthcare"), re.compile(r"^©\s*\d{4}"), re.compile(r"^Effective\s+\d{2}/\d{2}/\d{4}\s*$"), ] SIDEBAR_PATTERNS = [ re.compile(r"^(?:Related\s+)?(?:Commercial|Community\s+Plan|Medicare\s+Advantage)\s+(?:Policy|Policies)", re.IGNORECASE), re.compile(r"^Related\s+(?:Commercial|List)", re.IGNORECASE), re.compile(r"^Medicare\s+Advantage\s+Policy", re.IGNORECASE), ] POLICY_NUMBER_RE = re.compile(r"Policy\s+Number:\s*(\S+)") EFFECTIVE_DATE_RE = re.compile(r"Effective\s+Date:\s*(.+?)(?:\s{2,}|$)") PLAN_TYPE_RE = re.compile(r"UnitedHealthcare®?\s+(Commercial.*?)$", re.MULTILINE) @dataclass class PolicySection: section: str content: str page_start: int page_end: int @dataclass class PolicyDocument: filename: str policy_name: str policy_number: str effective_date: str plan_type: str doc_type: str sections: list = field(default_factory=list) def is_html_file(path): try: with open(path, "rb") as f: start = f.read(200).decode(errors="ignore").lower() return " 10) if cleaned_content.strip(): sec.content = cleaned_content filtered_sections.append(sec) return PolicyDocument( filename=filename, policy_name=policy_name, policy_number=policy_number, effective_date=effective_date, plan_type=plan_type, doc_type=doc_type, sections=[asdict(s) for s in filtered_sections] ) def main(): all_policies = [] pdfs = [f for f in os.listdir(PDF_DIR) if f not in SKIP_FILES] pdfs.sort() for filename in tqdm(pdfs, desc="Extracting policies"): path = os.path.join(PDF_DIR, filename) if is_html_file(path): doc = extract_html(path, filename) else: doc = extract_policy(path, filename) if doc and doc.sections: all_policies.append(asdict(doc)) os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) with open(OUTPUT_FILE, "w", encoding="utf-8") as f: json.dump(all_policies, f, indent=2, ensure_ascii=False) total_sections = sum(len(p["sections"]) for p in all_policies) print(f"Extracted {len(all_policies)} policies with {total_sections} sections") print(f"Saved to: {OUTPUT_FILE}") if __name__ == "__main__": main()