frabbani committed on
Commit
82a0e99
·
1 Parent(s): 3ade5b9

Fix fact extraction - pass raw data for simple tools

Browse files
Files changed (2) hide show
  1. evaluation/llm_eval.py +204 -0
  2. server.py +68 -28
evaluation/llm_eval.py CHANGED
@@ -238,6 +238,210 @@ def extract_numbers_from_text(text: str) -> Dict[str, Any]:
238
  return numbers
239
 
240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  def extract_numbers_from_chart(chart_data: Dict) -> Dict[str, Any]:
242
  """
243
  Extract numerical values from chart data returned by tools.
 
238
  return numbers
239
 
240
 
241
+ def extract_medication_names(text: str, expected_meds: List[str]) -> Dict[str, Any]:
242
+ """
243
+ Extract medication names from LLM response and compare against expected list.
244
+
245
+ Uses fuzzy matching since LLM might abbreviate or paraphrase.
246
+ """
247
+ text_lower = text.lower()
248
+
249
+ found = []
250
+ missing = []
251
+
252
+ for med in expected_meds:
253
+ med_lower = med.lower()
254
+ # Extract the drug name (first word or two before dosage)
255
+ # "Metformin 500 MG Oral Tablet" -> "metformin"
256
+ drug_name = med_lower.split()[0] if med_lower else ""
257
+
258
+ # Check if drug name appears in text
259
+ if drug_name and drug_name in text_lower:
260
+ found.append(med)
261
+ # Also check for common abbreviations or alternate names
262
+ elif any(part in text_lower for part in med_lower.split()[:2] if len(part) > 3):
263
+ found.append(med)
264
+ else:
265
+ missing.append(med)
266
+
267
+ return {
268
+ "found": found,
269
+ "missing": missing,
270
+ "found_count": len(found),
271
+ "expected_count": len(expected_meds),
272
+ "accuracy": len(found) / len(expected_meds) if expected_meds else 1.0
273
+ }
274
+
275
+
276
+ def extract_condition_names(text: str, expected_conditions: List[str]) -> Dict[str, Any]:
277
+ """
278
+ Extract condition names from LLM response and compare against expected list.
279
+ """
280
+ text_lower = text.lower()
281
+
282
+ found = []
283
+ missing = []
284
+
285
+ for condition in expected_conditions:
286
+ cond_lower = condition.lower()
287
+ # Check for key words from condition name
288
+ key_words = [w for w in cond_lower.split() if len(w) > 4]
289
+
290
+ # If any significant word from condition appears in text
291
+ if any(word in text_lower for word in key_words):
292
+ found.append(condition)
293
+ # Also check for common abbreviations
294
+ elif "diabetes" in cond_lower and ("diabetes" in text_lower or "diabetic" in text_lower):
295
+ found.append(condition)
296
+ elif "hypertension" in cond_lower and ("hypertension" in text_lower or "blood pressure" in text_lower or "htn" in text_lower):
297
+ found.append(condition)
298
+ elif "cholesterol" in cond_lower and ("cholesterol" in text_lower or "hyperlipidemia" in text_lower):
299
+ found.append(condition)
300
+ else:
301
+ missing.append(condition)
302
+
303
+ return {
304
+ "found": found,
305
+ "missing": missing,
306
+ "found_count": len(found),
307
+ "expected_count": len(expected_conditions),
308
+ "accuracy": len(found) / len(expected_conditions) if expected_conditions else 1.0
309
+ }
310
+
311
+
312
+ def extract_allergy_names(text: str, expected_allergies: List[str]) -> Dict[str, Any]:
313
+ """
314
+ Extract allergy/allergen names from LLM response.
315
+ """
316
+ text_lower = text.lower()
317
+
318
+ found = []
319
+ missing = []
320
+
321
+ for allergy in expected_allergies:
322
+ allergy_lower = allergy.lower()
323
+ # Check if allergen name appears
324
+ if allergy_lower in text_lower:
325
+ found.append(allergy)
326
+ # Check key words
327
+ elif any(word in text_lower for word in allergy_lower.split() if len(word) > 3):
328
+ found.append(allergy)
329
+ else:
330
+ missing.append(allergy)
331
+
332
+ return {
333
+ "found": found,
334
+ "missing": missing,
335
+ "found_count": len(found),
336
+ "expected_count": len(expected_allergies),
337
+ "accuracy": len(found) / len(expected_allergies) if expected_allergies else 1.0
338
+ }
339
+
340
+
341
+ @dataclass
342
+ class TextComparisonResult:
343
+ """Result of comparing LLM text response against expected items."""
344
+ case_id: str
345
+ query: str
346
+ query_type: str
347
+ success: bool
348
+ expected_items: List[str] = field(default_factory=list)
349
+ found_items: List[str] = field(default_factory=list)
350
+ missing_items: List[str] = field(default_factory=list)
351
+ accuracy: float = 0.0
352
+ raw_response: str = ""
353
+
354
+
355
+ async def evaluate_text_query(
356
+ patient_id: str,
357
+ query: str,
358
+ query_type: str,
359
+ expected_items: List[str],
360
+ case_id: str = ""
361
+ ) -> TextComparisonResult:
362
+ """
363
+ Evaluate LLM response for text-based queries (medications, conditions, allergies).
364
+ """
365
+ result = TextComparisonResult(
366
+ case_id=case_id,
367
+ query=query,
368
+ query_type=query_type,
369
+ success=False,
370
+ expected_items=expected_items
371
+ )
372
+
373
+ # Call the agent
374
+ llm_response = await call_agent_endpoint(patient_id, query, timeout=90.0)
375
+
376
+ if llm_response.error:
377
+ result.raw_response = f"Error: {llm_response.error}"
378
+ return result
379
+
380
+ result.raw_response = llm_response.raw_response
381
+
382
+ # Compare based on query type
383
+ if query_type == "medication_list":
384
+ comparison = extract_medication_names(llm_response.raw_response, expected_items)
385
+ elif query_type == "condition_list":
386
+ comparison = extract_condition_names(llm_response.raw_response, expected_items)
387
+ elif query_type == "allergy_list":
388
+ comparison = extract_allergy_names(llm_response.raw_response, expected_items)
389
+ else:
390
+ # Generic text comparison
391
+ comparison = extract_condition_names(llm_response.raw_response, expected_items)
392
+
393
+ result.found_items = comparison["found"]
394
+ result.missing_items = comparison["missing"]
395
+ result.accuracy = comparison["accuracy"]
396
+ result.success = comparison["accuracy"] >= 0.7 # 70% threshold
397
+
398
+ return result
399
+
400
+
401
+ def aggregate_text_results(results: List[TextComparisonResult]) -> Dict[str, Any]:
402
+ """Aggregate text evaluation results."""
403
+ if not results:
404
+ return {"total_cases": 0, "message": "No text cases evaluated"}
405
+
406
+ total = len(results)
407
+ successful = sum(1 for r in results if r.success)
408
+
409
+ total_expected = sum(len(r.expected_items) for r in results)
410
+ total_found = sum(len(r.found_items) for r in results)
411
+
412
+ by_type = {}
413
+ for r in results:
414
+ if r.query_type not in by_type:
415
+ by_type[r.query_type] = {"total": 0, "passed": 0, "accuracy_sum": 0}
416
+ by_type[r.query_type]["total"] += 1
417
+ by_type[r.query_type]["passed"] += 1 if r.success else 0
418
+ by_type[r.query_type]["accuracy_sum"] += r.accuracy
419
+
420
+ # Compute average accuracy per type
421
+ for qtype in by_type:
422
+ by_type[qtype]["avg_accuracy"] = f"{by_type[qtype]['accuracy_sum'] / by_type[qtype]['total'] * 100:.1f}%"
423
+
424
+ return {
425
+ "total_cases": total,
426
+ "successful_cases": successful,
427
+ "failed_cases": total - successful,
428
+ "success_rate": f"{(successful/total*100):.1f}%",
429
+ "total_expected_items": total_expected,
430
+ "total_found_items": total_found,
431
+ "item_recall": f"{(total_found/total_expected*100):.1f}%" if total_expected > 0 else "N/A",
432
+ "by_type": by_type,
433
+ "failed_details": [
434
+ {
435
+ "case_id": r.case_id,
436
+ "query_type": r.query_type,
437
+ "accuracy": f"{r.accuracy:.0%}",
438
+ "missing": r.missing_items[:5]
439
+ }
440
+ for r in results if not r.success
441
+ ]
442
+ }
443
+
444
+
445
  def extract_numbers_from_chart(chart_data: Dict) -> Dict[str, Any]:
446
  """
447
  Extract numerical values from chart data returned by tools.
server.py CHANGED
@@ -698,32 +698,27 @@ async def run_evaluation(
698
  extract_numbers_from_text,
699
  compare_llm_response,
700
  aggregate_llm_results,
701
- LLMComparisonResult
 
 
702
  )
703
 
704
  print("\nRunning FULL LLM evaluation (this calls actual MedGemma)...")
705
- print("Note: Only testing vital_trend queries (charts) for number accuracy\n")
706
 
707
- # Filter to just vital trend cases (charts with numbers)
708
- vital_cases = [tc for tc in test_cases if tc["query_type"] == "vital_trend"]
709
-
710
- if not vital_cases:
711
- return {
712
- "success": False,
713
- "error": "No vital trend test cases found"
714
- }
715
 
 
716
  llm_results = []
717
 
718
- for i, test_case in enumerate(vital_cases[:5]): # Limit to 5 for speed
719
  patient_id = test_case["patient_id"]
720
  query = test_case["query"]
721
  case_id = test_case["case_id"]
722
  expected = compute_expected_values(test_case)
723
 
724
- print(f" [{i+1}/{min(5, len(vital_cases))}] {query[:50]}...")
725
 
726
- # Call actual LLM
727
  llm_response = await call_agent_endpoint(patient_id, query, timeout=90.0)
728
 
729
  if llm_response.error:
@@ -735,17 +730,12 @@ async def run_evaluation(
735
  errors=[llm_response.error]
736
  ))
737
  else:
738
- # Extract numbers from chart (ground truth) and text (LLM said)
739
  chart_nums = extract_numbers_from_chart(llm_response.chart_data)
740
  text_nums = extract_numbers_from_text(llm_response.raw_response)
741
 
742
- # Debug: show first 300 chars of LLM response
743
- print(f" LLM response (first 300 chars):")
744
- print(f" {llm_response.raw_response[:300].replace(chr(10), ' ')}")
745
  print(f" Chart numbers: {chart_nums}")
746
  print(f" Text numbers: {text_nums}")
747
 
748
- # Compare
749
  result = compare_llm_response(llm_response, expected)
750
  result.case_id = case_id
751
  llm_results.append(result)
@@ -757,26 +747,76 @@ async def run_evaluation(
757
  for err in result.errors[:3]:
758
  print(f" - {err}")
759
 
760
- # Aggregate LLM results
761
- llm_summary = aggregate_llm_results(llm_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
762
 
763
  print("\n" + "="*60)
764
  print("LLM RESPONSE ACCURACY REPORT")
765
  print("="*60)
766
- print(f"Test Cases: {llm_summary['total_cases']}")
767
- print(f"Successful: {llm_summary['successful_cases']}")
768
- print(f"Failed: {llm_summary['failed_cases']}")
769
- print(f"Success Rate: {llm_summary['success_rate']}")
770
- print(f"Number Checks: {llm_summary['total_number_checks']}")
771
- print(f"Correct Numbers: {llm_summary['correct_numbers']}")
772
- print(f"Number Accuracy: {llm_summary['number_accuracy']}")
 
 
 
 
 
 
 
 
773
  print("="*60)
774
 
775
  return {
776
  "success": True,
777
  "mode": "llm",
778
  "patients_tested": patients,
779
- "metrics": llm_summary
 
 
 
780
  }
781
 
782
  elif mode == "agent":
 
698
  extract_numbers_from_text,
699
  compare_llm_response,
700
  aggregate_llm_results,
701
+ LLMComparisonResult,
702
+ evaluate_text_query,
703
+ aggregate_text_results
704
  )
705
 
706
  print("\nRunning FULL LLM evaluation (this calls actual MedGemma)...")
 
707
 
708
+ # === PART 1: NUMERIC EVALUATION (Vitals) ===
709
+ print("\n--- PART 1: NUMERIC ACCURACY (Vital Charts) ---\n")
 
 
 
 
 
 
710
 
711
+ vital_cases = [tc for tc in test_cases if tc["query_type"] == "vital_trend"]
712
  llm_results = []
713
 
714
+ for i, test_case in enumerate(vital_cases[:4]): # Limit to 4
715
  patient_id = test_case["patient_id"]
716
  query = test_case["query"]
717
  case_id = test_case["case_id"]
718
  expected = compute_expected_values(test_case)
719
 
720
+ print(f" [{i+1}/{min(4, len(vital_cases))}] {query[:50]}...")
721
 
 
722
  llm_response = await call_agent_endpoint(patient_id, query, timeout=90.0)
723
 
724
  if llm_response.error:
 
730
  errors=[llm_response.error]
731
  ))
732
  else:
 
733
  chart_nums = extract_numbers_from_chart(llm_response.chart_data)
734
  text_nums = extract_numbers_from_text(llm_response.raw_response)
735
 
 
 
 
736
  print(f" Chart numbers: {chart_nums}")
737
  print(f" Text numbers: {text_nums}")
738
 
 
739
  result = compare_llm_response(llm_response, expected)
740
  result.case_id = case_id
741
  llm_results.append(result)
 
747
  for err in result.errors[:3]:
748
  print(f" - {err}")
749
 
750
+ # === PART 2: TEXT EVALUATION (Medications, Conditions, Allergies) ===
751
+ print("\n--- PART 2: TEXT ACCURACY (Medications, Conditions, Allergies) ---\n")
752
+
753
+ text_cases = [tc for tc in test_cases if tc["query_type"] in ["medication_list", "condition_list", "allergy_list"]]
754
+ text_results = []
755
+
756
+ for i, test_case in enumerate(text_cases[:4]): # Limit to 4
757
+ patient_id = test_case["patient_id"]
758
+ query = test_case["query"]
759
+ query_type = test_case["query_type"]
760
+ case_id = test_case["case_id"]
761
+ expected = compute_expected_values(test_case)
762
+
763
+ # Get expected items list based on query type
764
+ if query_type == "medication_list":
765
+ expected_items = expected.get("medication_names", [])
766
+ elif query_type == "condition_list":
767
+ expected_items = expected.get("condition_names", [])
768
+ elif query_type == "allergy_list":
769
+ expected_items = expected.get("substances", [])
770
+ else:
771
+ expected_items = []
772
+
773
+ print(f" [{i+1}/{min(4, len(text_cases))}] {query[:50]}...")
774
+ print(f" Expected {len(expected_items)} items: {[x[:30] for x in expected_items[:3]]}...")
775
+
776
+ result = await evaluate_text_query(
777
+ patient_id, query, query_type, expected_items, case_id
778
+ )
779
+ text_results.append(result)
780
+
781
+ if result.success:
782
+ print(f" ✓ PASS ({result.accuracy:.0%} - found {len(result.found_items)}/{len(expected_items)})")
783
+ else:
784
+ print(f" ✗ FAIL ({result.accuracy:.0%} - found {len(result.found_items)}/{len(expected_items)})")
785
+ if result.missing_items:
786
+ print(f" Missing: {result.missing_items[:3]}")
787
+
788
+ # === AGGREGATE RESULTS ===
789
+ numeric_summary = aggregate_llm_results(llm_results)
790
+ text_summary = aggregate_text_results(text_results) if text_results else {}
791
 
792
  print("\n" + "="*60)
793
  print("LLM RESPONSE ACCURACY REPORT")
794
  print("="*60)
795
+
796
+ print("\n📊 NUMERIC ACCURACY (Vital Charts):")
797
+ print(f" Test Cases: {numeric_summary['total_cases']}")
798
+ print(f" Success Rate: {numeric_summary['success_rate']}")
799
+ print(f" Number Accuracy: {numeric_summary['number_accuracy']}")
800
+
801
+ if text_summary:
802
+ print("\n📝 TEXT ACCURACY (Medications, Conditions, Allergies):")
803
+ print(f" Test Cases: {text_summary['total_cases']}")
804
+ print(f" Success Rate: {text_summary['success_rate']}")
805
+ print(f" Item Recall: {text_summary['item_recall']}")
806
+ if text_summary.get('by_type'):
807
+ for qtype, stats in text_summary['by_type'].items():
808
+ print(f" {qtype}: {stats['passed']}/{stats['total']} passed ({stats['avg_accuracy']})")
809
+
810
  print("="*60)
811
 
812
  return {
813
  "success": True,
814
  "mode": "llm",
815
  "patients_tested": patients,
816
+ "metrics": {
817
+ "numeric": numeric_summary,
818
+ "text": text_summary
819
+ }
820
  }
821
 
822
  elif mode == "agent":