File size: 5,162 Bytes
00bd0c6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 | import pandas as pd
import sys
import os
import numpy as np
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from intent_router_ml import route_intent
# Location of the evaluation CSV: two directories above this file's directory,
# inside the "data" folder.
test_data_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "..",
    "..",
    "data",
    "intent_classification_eval_data.csv",
)

# Loaded once at import time; the functions below read the 'input_text' and
# 'expected_intent' columns from this frame.
test_data = pd.read_csv(test_data_path)
def result_list_extraction():
    """Run the intent router over every evaluation example.

    Each value of ``test_data['input_text']`` is passed to
    ``route_intent(text, 1, 1)``. The two trailing ``1`` arguments are
    forwarded as-is; their meaning is defined by ``route_intent``.

    Returns:
        A ``(result_list, expected_res_list)`` tuple: the router outputs in
        dataset order, and the values of ``test_data['expected_intent']`` in
        the same order.
    """
    predictions = [
        route_intent(utterance, 1, 1)
        for utterance in test_data['input_text']
    ]
    labels = list(test_data['expected_intent'])
    return predictions, labels
def _compute_metrics():
    """Shared computation used by both evaluate_intent_classifier and return_metrics.

    Runs ``result_list_extraction()`` and scores every prediction against its
    expected intent.

    Scoring policy:
        * status ``'ROUTE'`` with a matching ``'intent'`` counts as correct;
        * status ``'ROUTE'`` with a different ``'intent'`` is recorded in the
          ``'misclassified'`` list;
        * status ``'NEEDS_CLARIFICATION'`` is counted as a true positive *and*
          as uncertain, so it raises both the overall and per-class accuracy;
        * any other status is reported with ``print`` and only contributes to
          the totals.

    Returns:
        dict with the keys:
            ``'overall'``: accuracy, tp, total, uncertain count/rate and
                misclassification count/rate;
            ``'per_class'``: per-intent ``total``/``correct``/``uncertain``
                counters plus ``accuracy``;
            ``'misclassified'``: list of dicts (index, text, expected,
                predicted, confidence);
            ``'confidence'``: mean/min/max/std over ``'ROUTE'`` predictions
                (``None`` when there are none) and the raw values.
        Every ratio is reported as ``0.0`` when its denominator is zero (empty
        evaluation set, or an intent class with no examples).

    Raises:
        KeyError: if an expected intent is not one of the four known classes.
    """

    def ratio(numerator, denominator):
        # An empty dataset or a class without examples must not abort the whole
        # evaluation with ZeroDivisionError; report 0.0 so callers that format
        # the value as a percentage keep working.
        return numerator / denominator if denominator else 0.0

    tp = 0
    misclassified = []
    uncertain_count = 0
    result_list, expected_list = result_list_extraction()
    total_len = len(expected_list)

    class_stats = {
        intent_class: {'total': 0, 'correct': 0, 'uncertain': 0}
        for intent_class in (
            'PATIENT_EVIDENCE_QUERY',
            'FOLLOW_UP_EXPLANATION',
            'SOURCE_REQUEST',
            'HELP_OR_OTHER',
        )
    }

    for i, expected in enumerate(expected_list):
        result = result_list[i]
        status = result['status']
        class_stats[expected]['total'] += 1

        if status == 'ROUTE':
            if result['intent'] == expected:
                tp += 1
                class_stats[expected]['correct'] += 1
            else:
                misclassified.append({
                    'index': i,
                    'text': test_data['input_text'].iloc[i],
                    'expected': expected,
                    'predicted': result['intent'],
                    'confidence': result['confidence'],
                })
        elif status == 'NEEDS_CLARIFICATION':
            # Asking for clarification is treated as an acceptable outcome.
            tp += 1
            uncertain_count += 1
            class_stats[expected]['uncertain'] += 1
        else:
            print(f"error in intent classification: Unexpected status at index {i}: {status}")

    # Confidence statistics only cover predictions the router committed to.
    confident_predictions = [
        result['confidence']
        for result in result_list[:total_len]
        if result['status'] == 'ROUTE'
    ]

    per_class_metrics = {
        intent_class: {
            **stats,
            'accuracy': ratio(stats['correct'] + stats['uncertain'], stats['total']),
        }
        for intent_class, stats in class_stats.items()
    }

    has_confidence = bool(confident_predictions)
    return {
        'overall': {
            'accuracy': ratio(tp, total_len),
            'tp': tp,
            'total': total_len,
            'uncertain_count': uncertain_count,
            'uncertain_rate': ratio(uncertain_count, total_len),
            'misclassification_count': len(misclassified),
            'misclassification_rate': ratio(len(misclassified), total_len),
        },
        'per_class': per_class_metrics,
        'misclassified': misclassified,
        'confidence': {
            'mean': float(np.mean(confident_predictions)) if has_confidence else None,
            'min': float(np.min(confident_predictions)) if has_confidence else None,
            'max': float(np.max(confident_predictions)) if has_confidence else None,
            'std': float(np.std(confident_predictions)) if has_confidence else None,
            'all_values': confident_predictions,
        },
    }
def return_metrics():
    """Return the metrics dictionary built by ``_compute_metrics()``.

    Nothing is printed by this function itself; the result of
    ``_compute_metrics()`` is handed back unchanged.
    """
    metrics = _compute_metrics()
    return metrics
def evaluate_intent_classifier():
    """Compute the evaluation metrics and print a human-readable report.

    The report has four sections: overall results, per-class performance
    (classes in sorted order), details of each misclassification (only when
    there are any), and the confidence distribution (values are printed only
    when at least one confidence statistic is available).

    Returns:
        None. All output goes to stdout via ``print``.
    """
    metrics = _compute_metrics()
    overall = metrics['overall']
    per_class = metrics['per_class']
    misclassified = metrics['misclassified']
    confidence = metrics['confidence']

    print("\n\n\nINTENT CLASSIFICATION EVALUATION RESULTS")
    print(f"\nOverall Accuracy: {overall['accuracy']:.2%} ({overall['tp']}/{overall['total']})")
    print(f"Uncertain/Clarification Needed: {overall['uncertain_count']} ({overall['uncertain_rate']:.1%})")
    print(f"Misclassifications: {overall['misclassification_count']} ({overall['misclassification_rate']:.1%})")

    print("\n\n\nPER-CLASS PERFORMANCE")
    for intent_class in sorted(per_class):
        stats = per_class[intent_class]
        print(f"\n{intent_class}:")
        print(f" Total: {stats['total']}")
        print(f" Correct: {stats['correct']}")
        print(f" Uncertain: {stats['uncertain']}")
        print(f" Accuracy: {stats['accuracy']:.2%}")

    if misclassified:
        print("\n\n\nMISCLASSIFICATION DETAILS")
        # Use a dedicated loop name so the metrics dict is not rebound.
        for entry in misclassified:
            text = entry['text']
            # Only mark the preview as truncated when text was actually cut.
            preview = f"{text[:100]}..." if len(text) > 100 else text
            print(f"\n[Index {entry['index']}]")
            print(f"Text: {preview}")
            print(f"Expected: {entry['expected']}")
            print(f"Predicted: {entry['predicted']}")
            print(f"Confidence: {entry['confidence']:.3f}")

    print("\n\n\nCONFIDENCE DISTRIBUTION")
    if confidence['mean'] is not None:
        print(f"Mean confidence: {confidence['mean']:.3f}")
        print(f"Min confidence: {confidence['min']:.3f}")
        print(f"Max confidence: {confidence['max']:.3f}")
        print(f"Std dev: {confidence['std']:.3f}")
|