| | """Medical metadata extractor for EyeWiki articles.""" |
| |
|
| | import re |
| | from typing import Dict, List, Set |
| |
|
| |
|
class MetadataExtractor:
    """
    Extract medical metadata from EyeWiki articles.

    Extracts:
    - Disease names
    - ICD-10 codes
    - Anatomical structures
    - Symptoms
    - Treatments (medications and procedures)
    - Categories

    All vocabulary terms are stored in lowercase and matched against
    lowercased text, so matching is case-insensitive (abbreviations such as
    "ppv" match "PPV").
    """

    # Anatomical structures (lowercase)
    ANATOMICAL_STRUCTURES = {
        # Major ocular structures
        "cornea", "corneal", "sclera", "scleral", "retina", "retinal",
        "lens", "crystalline lens", "iris", "iridial", "pupil", "pupillary",
        "choroid", "choroidal", "vitreous", "vitreous humor",
        "optic nerve", "optic disc", "optic cup",

        # Anterior segment
        "anterior chamber", "posterior chamber", "anterior segment",
        "trabecular meshwork", "schlemm's canal", "ciliary body", "ciliary muscle",
        "zonules", "zonular", "aqueous humor", "aqueous",

        # Posterior segment
        "posterior segment", "macula", "macular", "fovea", "foveal",
        "retinal pigment epithelium", "rpe", "photoreceptors",
        "rods", "cones", "ganglion cells",

        # Retinal layers
        "inner limiting membrane", "nerve fiber layer", "ganglion cell layer",
        "inner plexiform layer", "inner nuclear layer", "outer plexiform layer",
        "outer nuclear layer", "external limiting membrane",
        "photoreceptor layer", "bruch's membrane",

        # Adnexa, orbit and visual pathway
        "eyelid", "eyelids", "conjunctiva", "conjunctival",
        "lacrimal gland", "tear film", "meibomian glands",
        "extraocular muscles", "rectus muscle", "oblique muscle",
        "orbit", "orbital", "optic chiasm",

        # Vasculature
        "central retinal artery", "central retinal vein",
        "retinal vessels", "vascular", "vasculature",
        "choriocapillaris",

        # Angle and spaces
        "angle", "iridocorneal angle", "suprachoroidal space",
    }

    # Medications (lowercase)
    MEDICATIONS = {
        # Glaucoma medications
        "latanoprost", "timolol", "dorzolamide", "brinzolamide",
        "brimonidine", "apraclonidine", "bimatoprost", "travoprost",
        "tafluprost", "pilocarpine", "carbachol",
        "acetazolamide", "methazolamide",

        # Anti-VEGF agents
        "bevacizumab", "ranibizumab", "aflibercept", "brolucizumab",
        "pegaptanib", "faricimab",

        # Corticosteroids
        "prednisolone", "dexamethasone", "triamcinolone", "fluocinolone",
        "difluprednate", "fluorometholone", "loteprednol",
        "betamethasone", "hydrocortisone",

        # Antibiotics
        "moxifloxacin", "gatifloxacin", "ciprofloxacin", "ofloxacin",
        "levofloxacin", "tobramycin", "gentamicin", "erythromycin",
        "azithromycin", "bacitracin", "polymyxin", "neomycin",
        "vancomycin", "ceftazidime", "cefazolin",

        # Antivirals
        "acyclovir", "ganciclovir", "valganciclovir", "valacyclovir",
        "trifluridine", "foscarnet",

        # NSAIDs and immunomodulators
        "ketorolac", "diclofenac", "nepafenac", "bromfenac",
        "cyclosporine", "tacrolimus", "lifitegrast",

        # Mydriatics and cycloplegics
        "tropicamide", "cyclopentolate", "atropine", "homatropine",
        "phenylephrine",

        # Antimetabolites and other systemic/adjunct agents
        "mitomycin", "5-fluorouracil", "interferon",
        "methotrexate", "chlorambucil",
    }

    # Procedures (lowercase)
    PROCEDURES = {
        # Cataract surgery
        "phacoemulsification", "phaco", "cataract extraction",
        "extracapsular cataract extraction", "ecce",
        "intracapsular cataract extraction", "icce",
        "iol implantation", "intraocular lens",

        # Glaucoma surgery and laser
        "trabeculectomy", "tube shunt", "glaucoma drainage device",
        "ahmed valve", "baerveldt implant", "molteno implant",
        "selective laser trabeculoplasty", "slt", "argon laser trabeculoplasty", "alt",
        "laser peripheral iridotomy", "lpi", "iridotomy",
        "cyclophotocoagulation", "cyclocryotherapy",
        "minimally invasive glaucoma surgery", "migs",
        "trabectome", "istent", "kahook dual blade", "goniotomy",

        # Vitreoretinal surgery and laser
        "vitrectomy", "pars plana vitrectomy", "ppv",
        "membrane peeling", "epiretinal membrane peeling",
        "endolaser", "photocoagulation", "panretinal photocoagulation", "prp",
        "focal laser", "grid laser",
        "pneumatic retinopexy", "scleral buckle",
        "silicone oil", "gas tamponade", "c3f8", "sf6",

        # Corneal surgery
        "penetrating keratoplasty", "pkp", "corneal transplant",
        "descemet stripping endothelial keratoplasty", "dsek", "dsaek",
        "descemet membrane endothelial keratoplasty", "dmek",
        "deep anterior lamellar keratoplasty", "dalk",
        "phototherapeutic keratectomy", "ptk",
        "corneal crosslinking", "cxl",

        # Refractive surgery
        "lasik", "prk", "photorefractive keratectomy",
        "smile", "lasek", "refractive lens exchange",
        "phakic iol", "icl",

        # Injections and anesthetic blocks
        "intravitreal injection", "intravitreal",
        "subtenon injection", "retrobulbar block", "peribulbar block",

        # Laser procedures
        "yag laser capsulotomy", "laser capsulotomy",
        "laser iridotomy", "laser trabeculoplasty",

        # Oculoplastic and orbital surgery
        "enucleation", "evisceration", "exenteration",
        "orbital decompression", "ptosis repair", "blepharoplasty",
        "dacryocystorhinostomy", "dcr",
    }

    # Symptoms (lowercase)
    SYMPTOMS = {
        # Visual disturbances
        "blurred vision", "blurring", "vision loss", "visual loss",
        "decreased vision", "blindness", "blind spot",
        "photophobia", "light sensitivity", "glare", "halos",
        "diplopia", "double vision", "metamorphopsia", "distortion",
        "scotoma", "floaters", "flashes", "photopsia",
        "night blindness", "nyctalopia", "color vision defect",
        "visual field defect", "peripheral vision loss",

        # Pain and discomfort
        "eye pain", "ocular pain", "pain", "foreign body sensation",
        "irritation", "burning", "stinging", "grittiness",
        "discomfort", "ache", "headache",

        # Discharge and tearing
        "discharge", "tearing", "epiphora", "watery eyes",
        "mucus", "crusting", "mattering",

        # Redness and swelling
        "redness", "red eye", "injection", "hyperemia",
        "swelling", "edema", "chemosis", "inflammation",

        # Itching, dryness and fatigue
        "itching", "pruritus", "dryness", "dry eye",
        "eye strain", "asthenopia", "fatigue",
    }

    def __init__(self):
        """Initialize the metadata extractor and compile the ICD-10 regex."""
        # The ophthalmic (H00-H59) alternative must come FIRST: regex
        # alternation is ordered, and the generic alternative also matches
        # "H40" and would truncate a three-decimal code such as "H40.123"
        # to "H40" before the H-specific branch is ever tried.
        self.icd_pattern = re.compile(
            r'\bH[0-5]\d(?:\.\d{1,3})?\b|'
            r'\b[A-Z]\d{2}(?:\.\d{1,2})?\b'
        )

    @staticmethod
    def _find_terms(text: str, vocabulary: Set[str], allow_plural: bool = False) -> List[str]:
        """Return the sorted vocabulary terms that occur as whole words in text.

        Args:
            text: Input text to search; empty or None yields an empty list.
            vocabulary: Lowercase terms to look for.
            allow_plural: When True, a term also matches with a trailing "s".

        Returns:
            Sorted list of the vocabulary terms found (as spelled in the
            vocabulary, not as spelled in the text).
        """
        if not text:
            return []

        # Lowercase once for case-insensitive matching, and fold the
        # typographic apostrophe so "Schlemm’s canal" matches "schlemm's canal".
        haystack = text.lower().replace("\u2019", "'")
        suffix = r's?\b' if allow_plural else r'\b'

        # Each term is searched separately (rather than one big alternation)
        # so overlapping terms such as "vitrectomy" and "pars plana
        # vitrectomy" are both reported.
        return sorted(
            term
            for term in vocabulary
            if re.search(r'\b' + re.escape(term) + suffix, haystack)
        )

    def extract_icd_codes(self, text: str) -> List[str]:
        """
        Extract ICD-10 codes from text using regex.

        Patterns matched:
        - Ophthalmic codes: H00-H59, with up to three decimal digits (H40.123)
        - Generic codes: A00, B99.9, etc., with up to two decimal digits

        Codes starting with "H" whose two-digit category is above 59 are
        discarded. Matching is case-sensitive (uppercase letter required).

        Args:
            text: Input text to search

        Returns:
            Sorted list of unique ICD-10 codes found
        """
        if not text:
            return []

        # Both regex alternatives guarantee two digits after the letter, so
        # int(code[1:3]) cannot fail here.
        valid_codes = {
            code
            for code in self.icd_pattern.findall(text)
            if not code.startswith("H") or int(code[1:3]) <= 59
        }
        return sorted(valid_codes)

    def extract_anatomical_terms(self, text: str) -> List[str]:
        """
        Extract anatomical structure mentions from text.

        Uses case-insensitive whole-word matching against the predefined
        anatomical vocabulary; a trailing "s" on the term is also accepted.

        Args:
            text: Input text to search

        Returns:
            Sorted list of unique anatomical structures found
        """
        return self._find_terms(text, self.ANATOMICAL_STRUCTURES, allow_plural=True)

    def extract_medications(self, text: str) -> List[str]:
        """
        Extract medication mentions from text.

        Args:
            text: Input text to search

        Returns:
            Sorted list of unique medications found
        """
        return self._find_terms(text, self.MEDICATIONS)

    def extract_procedures(self, text: str) -> List[str]:
        """
        Extract procedure mentions from text.

        Args:
            text: Input text to search

        Returns:
            Sorted list of unique procedures found
        """
        return self._find_terms(text, self.PROCEDURES)

    def extract_symptoms(self, text: str) -> List[str]:
        """
        Extract symptom mentions from text.

        Args:
            text: Input text to search

        Returns:
            Sorted list of unique symptoms found
        """
        return self._find_terms(text, self.SYMPTOMS)

    def extract_disease_name(self, existing_metadata: Dict) -> str:
        """
        Extract primary disease name from metadata.

        Tries multiple sources, moving on whenever a source is missing or
        blank after cleaning:
        1. Article title (leading "Disease:", "Condition:" or "Syndrome:"
           prefix removed)
        2. First category
        3. Last path segment of the URL (percent-decoded, underscores
           replaced with spaces)

        Args:
            existing_metadata: Metadata dict with 'title', 'url', 'categories'

        Returns:
            Primary disease/condition name, or "Unknown" if no source
            yields a non-empty name
        """
        # Local import so this method does not depend on the file's import block.
        from urllib.parse import unquote, urlsplit

        title = existing_metadata.get("title") or ""
        if title:
            cleaned = re.sub(
                r'^(Disease|Condition|Syndrome):\s*', '', title, flags=re.IGNORECASE
            ).strip()
            # A title that is blank or only a prefix must not win over the
            # remaining sources.
            if cleaned:
                return cleaned

        categories = existing_metadata.get("categories") or []
        if categories:
            first_category = categories[0].strip()
            if first_category:
                return first_category

        url = existing_metadata.get("url") or ""
        if url:
            try:
                # Use only the path so query strings and fragments are ignored.
                path = urlsplit(url).path
            except ValueError:
                # urlsplit rejects some malformed URLs; fall back to the raw string.
                path = url
            # Drop trailing slashes so ".../Glaucoma/" still yields "Glaucoma".
            segment = path.rstrip("/").rsplit("/", 1)[-1]
            name = unquote(segment).replace("_", " ").strip()
            if name:
                return name

        return "Unknown"

    def extract(self, content: str, existing_metadata: Dict) -> Dict:
        """
        Extract comprehensive medical metadata from article content.

        The input metadata dict is shallow-copied, not modified.

        Args:
            content: Article text content (markdown)
            existing_metadata: Existing metadata dict with basic info

        Returns:
            Enhanced metadata dictionary with the added keys
            'disease_name', 'icd_codes', 'anatomical_structures',
            'symptoms', 'treatments', 'extraction_stats', and 'categories'
            (defaulted to an empty list when absent)
        """
        enhanced_metadata = existing_metadata.copy()

        enhanced_metadata["disease_name"] = self.extract_disease_name(existing_metadata)
        enhanced_metadata["icd_codes"] = self.extract_icd_codes(content)
        enhanced_metadata["anatomical_structures"] = self.extract_anatomical_terms(content)
        enhanced_metadata["symptoms"] = self.extract_symptoms(content)

        medications = self.extract_medications(content)
        procedures = self.extract_procedures(content)
        enhanced_metadata["treatments"] = {
            "medications": medications,
            "procedures": procedures,
        }

        # Guarantee the key exists without overwriting a caller-supplied value.
        enhanced_metadata.setdefault("categories", [])

        enhanced_metadata["extraction_stats"] = {
            "icd_codes_found": len(enhanced_metadata["icd_codes"]),
            "anatomical_terms_found": len(enhanced_metadata["anatomical_structures"]),
            "symptoms_found": len(enhanced_metadata["symptoms"]),
            "medications_found": len(medications),
            "procedures_found": len(procedures),
        }

        return enhanced_metadata

    def extract_batch(self, documents: List[Dict]) -> List[Dict]:
        """
        Extract metadata from multiple documents.

        A missing or None 'content' is treated as empty text and a missing
        or None 'metadata' as an empty dict.

        Args:
            documents: List of dicts with 'content' and 'metadata' keys

        Returns:
            List of enhanced metadata dictionaries, in input order
        """
        return [
            self.extract(doc.get("content") or "", doc.get("metadata") or {})
            for doc in documents
        ]

    def get_anatomical_vocabulary(self) -> Set[str]:
        """Get a copy of the full anatomical vocabulary set."""
        return self.ANATOMICAL_STRUCTURES.copy()

    def get_medication_vocabulary(self) -> Set[str]:
        """Get a copy of the full medication vocabulary set."""
        return self.MEDICATIONS.copy()

    def get_procedure_vocabulary(self) -> Set[str]:
        """Get a copy of the full procedure vocabulary set."""
        return self.PROCEDURES.copy()

    def get_symptom_vocabulary(self) -> Set[str]:
        """Get a copy of the full symptom vocabulary set."""
        return self.SYMPTOMS.copy()
| |
|