import re
import time
from collections import Counter
from datetime import datetime, timezone
from typing import Optional

# The ML stack is imported defensively so that the pure-Python parts of this
# module (regex extraction, summaries, panel grouping) remain importable in
# environments where the heavy dependencies are not installed. Constructing a
# RadioloLabProcessor still requires all of them (see ``__init__``).
try:
    import spacy
    import torch
    from transformers import AutoModel, AutoTokenizer
except ImportError as _import_error:
    spacy = torch = AutoModel = AutoTokenizer = None
    _ML_IMPORT_ERROR = _import_error
else:
    _ML_IMPORT_ERROR = None

_CLINICAL_BERT_NAME = "nlpie/clinical-distilbert"

# Reference range and reporting unit for every lab test the regex extractor
# knows about. Dict order is the order results are reported in.
_LAB_REFERENCE_RANGES = {
    "White Blood Cell Count": {"unit": "x10^9/L", "min": 4.0, "max": 11.0},
    "Red Blood Cell Count": {"unit": "x10^12/L", "min": 4.2, "max": 5.9},
    "Hemoglobin": {"unit": "g/dL", "min": 13.5, "max": 17.5},
    "Hematocrit": {"unit": "%", "min": 38.3, "max": 48.6},
    "Platelet Count": {"unit": "x10^9/L", "min": 150, "max": 450},
    "Glucose": {"unit": "mg/dL", "min": 70, "max": 99},
    "Creatinine": {"unit": "mg/dL", "min": 0.6, "max": 1.2},
    "Urea": {"unit": "mg/dL", "min": 15, "max": 50},
    "Cholesterol": {"unit": "mg/dL", "min": 0, "max": 200},
    "ALT": {"unit": "U/L", "min": 7, "max": 56},
    "AST": {"unit": "U/L", "min": 10, "max": 40},
    "ALP": {"unit": "U/L", "min": 44, "max": 147},
    "Bilirubin": {"unit": "mg/dL", "min": 0.3, "max": 1.9},
    "Albumin": {"unit": "g/dL", "min": 3.5, "max": 5.5},
    "Thyroid Stimulating Hormone": {"unit": "mIU/L", "min": 0.5, "max": 4.5},
    "Free T4": {"unit": "ng/dL", "min": 0.8, "max": 1.8},
}

# Lower-cased TEST_NAME entity texts that the NER step discards.
_NER_INVALID_TEST_NAMES = frozenset({
    'hemolab', 'central', 'health', 'laboratory', 'medicity', 'wellbeing',
    'healthland', 'age', 'gender', 'email', 'male', 'sample', 'results',
    'verified by', 'dr', 'emily', 'johnson', 'normal', 'elevated',
    'johnatan', 'doe', 'page', 'blood test', 'hematology',
    'processing details',
})

# NER labels that are kept without any text filtering.
_NER_PASSTHROUGH_LABELS = frozenset(
    {"TEST_VALUE", "TEST_UNIT", "MedicalCondition"})

# Placeholder confidence attached to every NER entity; it is a constant,
# not a score produced by the model.
_NER_CONFIDENCE = 0.92

# Patient-facing wording per test and status. Keys are matched as
# case-insensitive substrings of the test name; the first match wins.
_TEST_EXPLANATIONS = {
    "White Blood Cell Count": {
        "normal": "Your immune system is functioning properly",
        "high": "Your body may be fighting an infection or inflammation",
        "low": "Your immune system may be weakened",
    },
    "Red Blood Cell Count": {
        "normal": "Your blood is carrying oxygen efficiently",
        "high": "You may have dehydration or a blood disorder requiring evaluation",
        "low": "You may have anemia, causing fatigue and weakness",
    },
    "Hemoglobin": {
        "normal": "Your blood oxygen levels are healthy",
        "high": "May indicate dehydration or lung problems",
        "low": "You may be anemic - your blood isn't carrying enough oxygen",
    },
    "Hematocrit": {
        "normal": "Blood volume and red blood cell ratio is normal",
        "high": "May indicate dehydration",
        "low": "May indicate anemia or blood loss",
    },
    "Platelet Count": {
        "normal": "Your blood clotting ability is normal",
        "high": "Increased risk of blood clots",
        "low": "Increased risk of bleeding",
    },
    "Glucose": {
        "normal": "Your blood sugar levels are well controlled",
        "high": "Your blood sugar is elevated - may indicate diabetes or prediabetes",
        "low": "Your blood sugar is low - may cause dizziness and weakness",
    },
    "Cholesterol": {
        "normal": "Your cholesterol levels are healthy for your heart",
        "high": "Elevated cholesterol increases heart disease risk",
        "low": "Unusually low cholesterol",
    },
    "Creatinine": {
        "normal": "Your kidneys are filtering waste properly",
        "high": "Your kidneys may not be working optimally",
        "low": "May indicate low muscle mass",
    },
    "Urea": {
        "normal": "Kidney function is normal",
        "high": "May indicate kidney problems or dehydration",
        "low": "May indicate liver problems",
    },
    "ALT": {
        "normal": "Your liver is functioning normally",
        "high": "Your liver may be inflamed or damaged",
        "low": "Generally not concerning",
    },
    "AST": {
        "normal": "Liver and heart function appear normal",
        "high": "May indicate liver or heart problems",
        "low": "Generally not concerning",
    },
    "Bilirubin": {
        "normal": "Liver is processing waste products normally",
        "high": "May cause jaundice - liver may not be functioning properly",
        "low": "Generally not concerning",
    },
    "Albumin": {
        "normal": "Good protein levels and liver function",
        "high": "May indicate dehydration",
        "low": "May indicate liver or kidney disease",
    },
    "Thyroid Stimulating Hormone": {
        "normal": "Your thyroid hormone levels are balanced",
        "high": "Your thyroid may be underactive (hypothyroidism)",
        "low": "Your thyroid may be overactive (hyperthyroidism)",
    },
    "Free T4": {
        "normal": "Thyroid hormone levels are appropriate",
        "high": "May indicate hyperthyroidism",
        "low": "May indicate hypothyroidism",
    },
}

# (panel name, lower-case substrings that place a test in that panel).
_PANEL_DEFINITIONS = (
    ("Complete Blood Count",
     ('blood cell', 'hemoglobin', 'hematocrit', 'platelet')),
    ("General Chemistry",
     ('glucose', 'creatinine', 'urea', 'cholesterol')),
    ("Liver Function Panel",
     ('alt', 'ast', 'alp', 'bilirubin', 'albumin')),
    ("Thyroid Function Panel",
     ('thyroid', 'tsh', 't4', 't3')),
)


class RadioloLabProcessor:
    """Extract lab test values from report text and build a structured report.

    Values are found with regular expressions derived from ``self.lab_tests``
    and classified against the reference ranges stored there. A spaCy pipeline
    contributes named entities, and a ClinicalDistilBERT model produces a
    mean-pooled embedding of the text. Disease hints, status flags and the
    patient summary are rule-based on the regex results.
    """

    def __init__(self, model_path: str):
        """Load the spaCy pipeline at ``model_path`` and the BERT encoder.

        Raises:
            ImportError: if spacy, torch or transformers is not installed.
        """
        if _ML_IMPORT_ERROR is not None:
            raise ImportError(
                "RadioloLabProcessor requires spacy, torch and transformers"
            ) from _ML_IMPORT_ERROR
        self.nlp = spacy.load(model_path)
        self.clinical_bert_tokenizer = AutoTokenizer.from_pretrained(
            _CLINICAL_BERT_NAME)
        self.clinical_bert_model = AutoModel.from_pretrained(
            _CLINICAL_BERT_NAME)
        # Per-instance copy so callers may adjust ranges without touching
        # the shared defaults.
        self.lab_tests = {
            name: dict(spec) for name, spec in _LAB_REFERENCE_RANGES.items()
        }

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _classify_value(value: float, ref_range: dict) -> tuple:
        """Return ``(status, deviation_percent)`` for ``value``.

        The deviation is relative to the bound that was crossed; more than
        20% beyond a bound is reported as ``critical_low``/``critical_high``.
        """
        low, high = ref_range["min"], ref_range["max"]
        if value < low:
            deviation = ((low - value) / low) * 100
            return ("critical_low" if deviation > 20 else "low"), deviation
        if value > high:
            deviation = ((value - high) / high) * 100
            return ("critical_high" if deviation > 20 else "high"), deviation
        return "normal", 0.0

    @staticmethod
    def _status_flags(test_results: list) -> list:
        """Return the distinct abnormal statuses as display strings.

        ``critical_high`` becomes ``"Critical High"``. First-seen order is
        kept, and ``["Normal"]`` is returned when nothing is abnormal.
        """
        seen = set()
        flags = []
        for test in test_results:
            status = test['status']
            # De-duplicate on the raw status: the display form replaces the
            # underscore, so comparing against it would never match.
            if status == 'normal' or status in seen:
                continue
            seen.add(status)
            flags.append(status.replace('_', ' ').title())
        return flags or ["Normal"]

    @staticmethod
    def _explanations_for(test_name: str) -> dict:
        """Return the explanation texts for ``test_name``, or ``{}`` if none."""
        lowered = test_name.lower()
        for key, texts in _TEST_EXPLANATIONS.items():
            if key.lower() in lowered:
                return texts
        return {}

    @staticmethod
    def _build_panels(test_results: list) -> list:
        """Group results into the panels of ``_PANEL_DEFINITIONS``.

        Panels with no matching test are omitted.
        """
        panels = []
        for panel_name, keywords in _PANEL_DEFINITIONS:
            members = [
                t for t in test_results
                if any(word in t['test_name'].lower() for word in keywords)
            ]
            if not members:
                continue
            abnormal = [t for t in members if t['status'] != 'normal']
            panels.append({
                "panel_name": panel_name,
                "tests_included": [t['test_name'] for t in members],
                "panel_status": "abnormal" if abnormal else "normal",
                "abnormal_count": len(abnormal),
                "total_tests": len(members),
            })
        return panels

    # ------------------------------------------------------------------
    # Extraction
    # ------------------------------------------------------------------
    def extract_with_regex(self, text: str) -> dict:
        """Find ``<test name>: <number> <unit>`` occurrences in ``text``.

        One pattern is derived per entry in ``self.lab_tests``; matching is
        case-insensitive and only the first occurrence of each test is used.
        The name must start at a word boundary so that short names such as
        "ALT" do not match inside longer words.

        Returns:
            ``{"test_results": [...]}``, one dict per matched test with its
            value, unit as written in the text, reference range, status,
            deviation percentage and a short clinical-significance string.
        """
        test_results = []
        for test_name, ref_range in self.lab_tests.items():
            pattern = (
                r"\b" + re.escape(test_name)
                + r"[:\s]+(\d+\.?\d*)\s*("
                + re.escape(ref_range["unit"]) + r")"
            )
            match = re.search(pattern, text, re.IGNORECASE)
            if match is None:
                continue

            value = float(match.group(1))
            status, deviation = self._classify_value(value, ref_range)

            if status == "normal":
                clinical_sig = "Within normal limits"
            else:
                is_high = "high" in status
                side = "Above" if is_high else "Below"
                arrow = "↑" if is_high else "↓"
                clinical_sig = (
                    f"{side} normal range ({arrow}{deviation:.1f}%)")

            test_results.append({
                "test_name": test_name,
                "value": value,
                "unit": match.group(2),
                "reference_range": {
                    "min": ref_range["min"],
                    "max": ref_range["max"],
                    "unit": ref_range["unit"],
                },
                "status": status,
                "deviation_percentage": deviation,
                "clinical_significance": clinical_sig,
                "trend": None,
                "source": "regex",
            })
        return {"test_results": test_results}

    def extract_with_ner(self, text: str) -> dict:
        """Run the spaCy pipeline and return the relevant entities.

        ``TEST_NAME`` entities are kept only if longer than two characters
        and not in the block-list of known false positives. ``TEST_VALUE``,
        ``TEST_UNIT`` and ``MedicalCondition`` entities are kept as-is. All
        other labels are dropped.

        Returns:
            ``{"entities": [...]}`` with text, label, character offsets and a
            fixed placeholder confidence per entity.
        """
        entities = []
        for ent in self.nlp(text).ents:
            if ent.label_ == "TEST_NAME":
                keep = (ent.text.lower() not in _NER_INVALID_TEST_NAMES
                        and len(ent.text) > 2)
            else:
                keep = ent.label_ in _NER_PASSTHROUGH_LABELS
            if keep:
                entities.append({
                    "text": ent.text,
                    "label": ent.label_,
                    "start_char": ent.start_char,
                    "end_char": ent.end_char,
                    "confidence": _NER_CONFIDENCE,
                })
        return {"entities": entities}

    def get_clinical_bert_embeddings(self, text: str):
        """Return the mean-pooled last hidden state of ``text`` as a list.

        The text is truncated to 512 tokens before encoding.
        """
        inputs = self.clinical_bert_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True,
            return_token_type_ids=False,
        )
        with torch.no_grad():
            outputs = self.clinical_bert_model(**inputs)
        # Single input text: drop only the batch axis after pooling tokens.
        pooled = outputs.last_hidden_state.mean(dim=1).squeeze(0)
        return pooled.numpy().tolist()

    # ------------------------------------------------------------------
    # Interpretation
    # ------------------------------------------------------------------
    def analyze_with_clinical_bert(self, text: str, test_results: list):
        """Summarise abnormal findings and report the embedding size.

        The embedding is computed only to report its dimension. Disease
        hints, status flags and the relevance score are derived by rule from
        ``test_results``.

        Returns:
            dict with embedding metadata, ``diseases_detected``,
            ``status_flags``, ``abnormality_patterns`` and
            ``clinical_relevance_score`` (percentage of abnormal tests).
        """
        embeddings = self.get_clinical_bert_embeddings(text)

        abnormal_tests = [t for t in test_results if t['status'] != 'normal']

        def elevated(keyword: str) -> bool:
            return any(
                keyword in t['test_name'].lower() and 'high' in t['status']
                for t in abnormal_tests
            )

        diseases_detected = []
        if elevated('glucose'):
            diseases_detected.append("Potential Diabetes")
        if elevated('cholesterol'):
            diseases_detected.append("Dyslipidemia")

        abnormal_count = len(abnormal_tests)
        critical_count = sum(
            1 for t in test_results if 'critical' in t['status'])

        abnormality_patterns = []
        if abnormal_count > 0:
            abnormality_patterns.append(
                f"Detected {abnormal_count} abnormal parameter(s)")
        if critical_count > 0:
            abnormality_patterns.append(
                f"{critical_count} critical finding(s) require immediate attention")

        if test_results:
            clinical_relevance = min(
                100, (abnormal_count / len(test_results)) * 100)
        else:
            clinical_relevance = 0

        return {
            "embedding_dimension": len(embeddings),
            "clinical_context_captured": True,
            "embeddings_generated": True,
            "diseases_detected": diseases_detected,
            "status_flags": self._status_flags(test_results),
            "abnormality_patterns": abnormality_patterns,
            "clinical_relevance_score": round(clinical_relevance, 1),
        }

    def generate_patient_summary(self, test_results: list,
                                 abnormal_results: list) -> dict:
        """Build a plain-language summary of the results.

        Args:
            test_results: result dicts with ``test_name``, ``value``,
                ``unit`` and ``status``.
            abnormal_results: dicts with a ``severity`` key; entries whose
                severity is ``"critical"`` drive the urgent wording.

        Returns:
            dict with overall status, explanation, up to five normal
            ``key_findings``, every abnormal result in ``areas_of_concern``,
            recommended ``next_steps`` and ``summary_stats``.
        """
        total_tests = len(test_results)
        normal_count = sum(
            1 for t in test_results if t['status'] == 'normal')
        abnormal_count = len(abnormal_results)
        critical_count = sum(
            1 for a in abnormal_results if a['severity'] == 'critical')

        if critical_count > 0:
            overall_status = "⚠️ URGENT - IMMEDIATE ATTENTION NEEDED"
            explanation = f"Your lab results show {critical_count} critical finding(s) that require immediate medical attention. Please consult your doctor as soon as possible."
            next_steps = [
                "Contact your doctor immediately",
                "Do not delay medical consultation",
                "Bring these results to your healthcare provider",
                "Follow your doctor's treatment recommendations",
            ]
        elif abnormal_count > 0:
            overall_status = "⚠️ ABNORMALITIES DETECTED"
            explanation = f"Your lab results show {abnormal_count} test(s) outside normal range. While not immediately critical, these findings should be discussed with your healthcare provider."
            next_steps = [
                "Schedule an appointment with your doctor within the next few days",
                "Discuss these results with your healthcare provider",
                "Your doctor may recommend additional tests",
                "Follow any lifestyle or treatment recommendations",
            ]
        else:
            overall_status = "✅ ALL TESTS NORMAL"
            explanation = f"Great news! All {total_tests} lab tests are within normal ranges. Your results indicate good health in the tested parameters."
            next_steps = [
                "Maintain your current healthy lifestyle",
                "Continue regular health checkups",
                "Keep these results for your medical records",
                "Discuss with your doctor during your next routine visit",
            ]

        key_findings = []
        areas_of_concern = []
        # Every result is inspected: an abnormal value must never be left
        # out of areas_of_concern because of its position in the list.
        # Tests without a dedicated explanation get the generic wording.
        for test in test_results:
            status = test['status']
            texts = self._explanations_for(test['test_name'])
            measurement = (
                f"{test['test_name']}: {test['value']} {test['unit']}")
            severity = "critical" if "critical" in status else "moderate"

            if status == 'normal':
                key_findings.append({
                    "finding": measurement,
                    "explanation": texts.get(
                        'normal', 'Within normal range'),
                })
            elif 'high' in status.lower():
                areas_of_concern.append({
                    "finding": f"{measurement} (HIGH)",
                    "explanation": texts.get('high', 'Above normal range'),
                    "severity": severity,
                })
            elif 'low' in status.lower():
                areas_of_concern.append({
                    "finding": f"{measurement} (LOW)",
                    "explanation": texts.get('low', 'Below normal range'),
                    "severity": severity,
                })

        return {
            "overall_status": overall_status,
            "explanation": explanation,
            "key_findings": key_findings[:5],
            "areas_of_concern": areas_of_concern,
            "next_steps": next_steps,
            "summary_stats": {
                "total_tests": total_tests,
                "normal_tests": normal_count,
                "abnormal_tests": abnormal_count,
                "critical_findings": critical_count,
            },
        }

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------
    def extract_and_format(self, text: str, report_id: Optional[str] = None,
                           patient_id: Optional[str] = None) -> dict:
        """Run the full pipeline on ``text`` and return the report payload.

        Args:
            text: raw laboratory report text.
            report_id: identifier to echo back; a ``lab_<unix time>`` id is
                generated when omitted.
            patient_id: accepted for interface compatibility; not used.

        Returns:
            dict containing classification, extraction statistics, entities,
            test results, abnormal results, summaries, panels, chart data
            and processing metadata.
        """
        started = time.perf_counter()

        test_results = self.extract_with_regex(text)['test_results']
        entities_list = self.extract_with_ner(text)['entities']

        abnormal_tests = [t for t in test_results if t['status'] != 'normal']
        abnormal_results = [
            {
                "test_name": t['test_name'],
                "severity": (
                    'critical' if 'critical' in t['status'] else 'moderate'),
                "requires_attention": 'critical' in t['status'],
            }
            for t in abnormal_tests
        ]

        normal_params = [
            t['test_name'] for t in test_results if t['status'] == 'normal']
        ai_summary = {
            "overall_assessment": f"Detected {len(abnormal_results)} abnormal result(s). {len(normal_params)} parameters within normal limits.",
            "key_abnormalities": [
                f"{t['test_name']}: {t['clinical_significance']}"
                for t in abnormal_tests
            ],
            "normal_parameters": normal_params,
            "recommendations": [
                "Correlate with clinical symptoms",
                "Consider follow-up testing if symptoms persist",
                "Consult with healthcare provider for interpretation",
            ],
        }

        clinical_insights = self.analyze_with_clinical_bert(
            text, test_results)
        patient_summary = self.generate_patient_summary(
            test_results, abnormal_results)
        test_panels = self._build_panels(test_results)

        chart_data = [
            {
                "test": t['test_name'],
                "value": t['value'],
                "ref_min": t['reference_range']['min'],
                "ref_max": t['reference_range']['max'],
            }
            for t in test_results if t['reference_range']
        ]
        visualization_data = {
            "charts": [{
                "chart_type": "bar",
                "title": "Lab Results vs Reference Range",
                "data": chart_data,
            }],
            "trend_data": [],
        }

        ner_stats = dict(Counter(ent['label'] for ent in entities_list))

        # A glucose result switches the report to the chemistry category;
        # otherwise it is filed as hematology.
        if any('glucose' in t['test_name'].lower() for t in test_results):
            test_category = "clinical_chemistry"
            sub_category = "metabolic_panel"
        else:
            test_category = "hematology"
            sub_category = "complete_blood_count"
        has_critical = any(
            a['severity'] == 'critical' for a in abnormal_results)
        classification = {
            "test_category": test_category,
            "sub_category": sub_category,
            "urgency_level": "critical" if has_critical else "routine",
            "confidence": 0.96,
        }

        extraction_stats = {
            "tests_with_values": len(test_results),
            "additional_tests_found": sum(
                1 for e in entities_list if e['label'] == 'TEST_NAME'),
            "diseases_detected": len(clinical_insights['diseases_detected']),
            "interpretations_found": len(abnormal_tests),
            "ner_model_stats": ner_stats,
        }

        processing_time_ms = int((time.perf_counter() - started) * 1000)

        # Timezone-aware "now", emitted in the same naive-ISO + "Z" form the
        # report format has always used.
        utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
        metadata = {
            "model_version": "radiolo_smart_ner_v2.0.0",
            "processing_date": utc_now.isoformat() + "Z",
            "tests_extracted": len(test_results),
            "confidence_score": 0.94,
            "nlp_models": {
                "ner": "Custom Lab NER (Smart Filtered)",
                "clinical_bert": "ClinicalDistilBERT",
                "extraction_method": "Hybrid (Regex + Filtered NER)",
            },
        }

        return {
            "report_id": report_id or f"lab_{int(time.time())}",
            "report_type": "laboratory",
            "processing_time_ms": processing_time_ms,
            "classification": classification,
            "extraction_stats": extraction_stats,
            "entities": entities_list,
            "test_results": test_results,
            "abnormal_results": abnormal_results,
            "ai_summary": ai_summary,
            "clinical_insights": clinical_insights,
            "patient_friendly_summary": patient_summary,
            "test_panels": test_panels,
            "visualization_data": visualization_data,
            "metadata": metadata,
        }