Agentic-RagBot/src/agents/confidence_assessor.py
"""
MediGuard AI RAG-Helper
Confidence Assessor Agent - Evaluates prediction reliability
"""
from typing import Any

from src.biomarker_validator import BiomarkerValidator
from src.llm_config import llm_config
from src.state import AgentOutput, GuildState


class ConfidenceAssessorAgent:
    """Agent that assesses the reliability and limitations of the prediction"""

    def __init__(self) -> None:
        self.llm = llm_config.analyzer

    def assess(self, state: GuildState) -> GuildState:
        """
        Assess prediction confidence and identify limitations.

        Args:
            state: Current guild state

        Returns:
            Partial state update containing this agent's output
        """
        print("\n" + "=" * 70)
        print("EXECUTING: Confidence Assessor Agent")
        print("=" * 70)
        model_prediction = state["model_prediction"]
        disease = model_prediction["disease"]
        ml_confidence = model_prediction["confidence"]
        probabilities = model_prediction.get("probabilities", {})
        biomarkers = state["patient_biomarkers"]

        # Collect previous agent findings
        biomarker_analysis = state.get("biomarker_analysis") or {}
        disease_explanation = self._get_agent_findings(state, "Disease Explainer")
        linker_findings = self._get_agent_findings(state, "Biomarker-Disease Linker")

        print(f"\nAssessing confidence for {disease} prediction...")

        # Evaluate evidence strength
        evidence_strength = self._evaluate_evidence_strength(biomarker_analysis, disease_explanation, linker_findings)

        # Identify limitations
        limitations = self._identify_limitations(biomarkers, biomarker_analysis, probabilities)

        # Calculate aggregate reliability
        reliability = self._calculate_reliability(ml_confidence, evidence_strength, len(limitations))

        # Generate assessment summary
        assessment_summary = self._generate_assessment(
            disease, ml_confidence, reliability, evidence_strength, limitations
        )

        # Create agent output
        output = AgentOutput(
            agent_name="Confidence Assessor",
            findings={
                "prediction_reliability": reliability,
                "ml_confidence": ml_confidence,
                "evidence_strength": evidence_strength,
                "limitations": limitations,
                "assessment_summary": assessment_summary,
                "recommendation": self._get_recommendation(reliability),
                "alternative_diagnoses": self._get_alternatives(probabilities),
            },
        )

        print("\nConfidence assessment complete")
        print(f" - Prediction reliability: {reliability}")
        print(f" - Evidence strength: {evidence_strength}")
        print(f" - Limitations identified: {len(limitations)}")

        # Return a partial state update; only this agent's output is appended
        return {"agent_outputs": [output]}

    def _get_agent_findings(self, state: GuildState, agent_name: str) -> dict[str, Any]:
        """Extract findings from a specific agent"""
        for output in state.get("agent_outputs", []):
            if output.agent_name == agent_name:
                return output.findings
        return {}

    def _evaluate_evidence_strength(
        self, biomarker_analysis: dict, disease_explanation: dict, linker_findings: dict
    ) -> str:
        """Evaluate the strength of supporting evidence"""
        score = 0  # out of a maximum of 5

        # Check biomarker validation quality
        flags = biomarker_analysis.get("biomarker_flags", [])
        abnormal_count = len([f for f in flags if f.get("status") != "NORMAL"])
        if abnormal_count >= 3:
            score += 1
        if abnormal_count >= 5:
            score += 1

        # Check disease explanation quality
        if disease_explanation.get("retrieval_quality", 0) >= 3:
            score += 1

        # Check biomarker-disease linking
        key_drivers = linker_findings.get("key_drivers", [])
        if len(key_drivers) >= 2:
            score += 1
        if len(key_drivers) >= 4:
            score += 1
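
        # Worked example (illustrative values): 5 abnormal flags (+2), a
        # retrieval quality of 3 (+1), and 2 key drivers (+1) score 4 of 5,
        # which maps to STRONG below.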
        # Map score to categorical rating
        if score >= 4:
            return "STRONG"
        elif score >= 2:
            return "MODERATE"
        else:
            return "WEAK"

    def _identify_limitations(
        self, biomarkers: dict[str, float], biomarker_analysis: dict, probabilities: dict[str, float]
    ) -> list[str]:
        """Identify limitations and uncertainties"""
        limitations = []

        # Check for missing biomarkers
        expected_biomarkers = BiomarkerValidator().expected_biomarker_count()
        if len(biomarkers) < expected_biomarkers:
            missing = expected_biomarkers - len(biomarkers)
            limitations.append(f"Missing data: {missing} biomarker(s) not provided")

        # Check for close alternative predictions
        sorted_probs = sorted(probabilities.items(), key=lambda x: x[1], reverse=True)
        if len(sorted_probs) >= 2:
            runner_up, runner_up_prob = sorted_probs[1]
            if runner_up_prob > 0.15:  # Alternative is significant
                limitations.append(
                    f"Differential diagnosis: {runner_up} also possible ({runner_up_prob:.1%} probability)"
                )

        # Check for normal biomarkers despite prediction
        flags = biomarker_analysis.get("biomarker_flags", [])
        relevant = biomarker_analysis.get("relevant_biomarkers", [])
        normal_relevant = [f for f in flags if f.get("name") in relevant and f.get("status") == "NORMAL"]
        if len(normal_relevant) >= 2:
            limitations.append("Some disease-relevant biomarkers are within normal range")

        # Check for safety alerts (indicates complexity)
        alerts = biomarker_analysis.get("safety_alerts", [])
        if len(alerts) >= 2:
            limitations.append("Multiple critical values detected; professional evaluation essential")
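
        # Illustrative outcome: a panel missing two expected biomarkers with a
        # 21% runner-up probability would record two limitations here.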
        return limitations

    def _calculate_reliability(self, ml_confidence: float, evidence_strength: str, limitation_count: int) -> str:
        """Calculate overall prediction reliability"""
        score = 0

        # ML confidence contribution
        if ml_confidence >= 0.8:
            score += 3
        elif ml_confidence >= 0.6:
            score += 2
        elif ml_confidence >= 0.4:
            score += 1

        # Evidence strength contribution
        if evidence_strength == "STRONG":
            score += 3
        elif evidence_strength == "MODERATE":
            score += 2
        else:
            score += 1

        # Limitation penalty
        score -= min(limitation_count, 3)
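
        # Worked example (illustrative values): 0.62 ML confidence (+2) with
        # MODERATE evidence (+2) and one limitation (-1) scores 3, which maps
        # to MODERATE below.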
        # Map to categorical
        if score >= 5:
            return "HIGH"
        elif score >= 3:
            return "MODERATE"
        else:
            return "LOW"

    def _generate_assessment(
        self, disease: str, ml_confidence: float, reliability: str, evidence_strength: str, limitations: list[str]
    ) -> str:
        """Generate human-readable assessment summary"""
        prompt = f"""As a medical AI assessment system, provide a brief confidence statement about this prediction:

Disease Predicted: {disease}
ML Model Confidence: {ml_confidence:.1%}
Overall Reliability: {reliability}
Evidence Strength: {evidence_strength}
Limitations: {len(limitations)} identified

Write a 2-3 sentence assessment that:
1. States the overall reliability
2. Mentions key strengths or weaknesses
3. Emphasizes the need for professional medical consultation

Be honest about uncertainty. Patient safety is paramount."""
        try:
            response = self.llm.invoke(prompt)
            return response.content.strip()
        except Exception as e:
            print(f"Warning: Assessment generation failed: {e}")
            return (
                f"The {disease} prediction has {reliability.lower()} reliability based on available data. "
                "Professional medical evaluation is strongly recommended for accurate diagnosis."
            )

    def _get_recommendation(self, reliability: str) -> str:
        """Get action recommendation based on reliability"""
        if reliability == "HIGH":
            return "High confidence prediction. Schedule medical consultation to confirm diagnosis and discuss treatment options."
        elif reliability == "MODERATE":
            return "Moderate confidence prediction. Medical consultation recommended for professional evaluation and additional testing if needed."
        else:
            return "Low confidence prediction. Professional medical assessment essential. Additional tests may be required for accurate diagnosis."

    def _get_alternatives(self, probabilities: dict[str, float]) -> list[dict[str, Any]]:
        """Get alternative diagnoses to consider"""
        sorted_probs = sorted(probabilities.items(), key=lambda x: x[1], reverse=True)
        alternatives = []
        for disease, prob in sorted_probs[1:4]:  # Top 3 alternatives
            if prob > 0.05:  # Only significant alternatives
                alternatives.append(
                    {"disease": disease, "probability": prob, "note": "Consider discussing with healthcare provider"}
                )
        return alternatives

# Create agent instance for import
confidence_assessor_agent = ConfidenceAssessorAgent()
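
# Minimal smoke-test sketch (an assumption, not part of the original pipeline):
# exercises the pure scoring helpers without invoking the LLM. The disease
# labels and probabilities below are illustrative, not real model output.
if __name__ == "__main__":
    demo_probs = {"Diabetes": 0.62, "Thalassemia": 0.21, "Anemia": 0.09, "Heart Disease": 0.04}
    agent = confidence_assessor_agent
    # 0.62 confidence (+2) + MODERATE evidence (+2) - 1 limitation = 3 -> MODERATE
    print(agent._calculate_reliability(ml_confidence=0.62, evidence_strength="MODERATE", limitation_count=1))
    # Thalassemia (21%) and Anemia (9%) clear the 5% cutoff; Heart Disease does not
    print(agent._get_alternatives(demo_probs))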