fatimaxa's picture
Upload 112 files
00bd0c6 verified
import pandas as pd
import sys
import os
import numpy as np
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from intent_router_ml import route_intent
# The evaluation CSV is resolved relative to this script's own location:
# two directories up, then data/intent_classification_eval_data.csv.
test_data_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "..",
    "..",
    "data",
    "intent_classification_eval_data.csv",
)

# Loaded once at import time; the functions in this module read the
# 'input_text' and 'expected_intent' columns from this DataFrame.
test_data = pd.read_csv(test_data_path)
def result_list_extraction():
    """Run the intent router over every row of the evaluation set.

    Returns:
        A tuple ``(predictions, labels)`` where ``predictions`` holds the
        ``route_intent`` result for each ``input_text`` value and ``labels``
        holds the corresponding ``expected_intent`` values, both in dataset
        order.
    """
    # The two trailing ``1`` arguments are forwarded as-is; their meaning is
    # defined by ``route_intent`` (not visible here).
    predictions = [
        route_intent(utterance, 1, 1) for utterance in test_data['input_text']
    ]
    labels = list(test_data['expected_intent'])
    return predictions, labels
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _compute_metrics():
    """Shared computation used by both evaluate_intent_classifier and return_metrics.

    Obtains predictions and labels from ``result_list_extraction`` and
    aggregates them into overall, per-class and confidence statistics.

    Scoring rules:
      * status ``'ROUTE'``: counted correct when the predicted intent equals
        the expected one; otherwise recorded in ``misclassified``.
      * status ``'NEEDS_CLARIFICATION'``: counted as uncertain AND added to
        ``tp``, so clarification requests count toward accuracy.
      * any other status: reported on stdout and counted toward neither.

    Every ratio falls back to 0.0 when its denominator is zero (empty
    evaluation set, or a class with no examples) instead of raising
    ``ZeroDivisionError``. An expected label outside the four predefined
    intents gets its own per-class entry instead of raising ``KeyError``.

    Returns:
        dict with keys:
          * ``'overall'``: accuracy, tp, total, uncertain count/rate and
            misclassification count/rate.
          * ``'per_class'``: for each intent, its total/correct/uncertain
            counts plus ``'accuracy'`` = (correct + uncertain) / total.
          * ``'misclassified'``: list of dicts (index, text, expected,
            predicted, confidence) for wrongly routed rows.
          * ``'confidence'``: mean/min/max/std (``None`` when there are no
            routed predictions) and ``'all_values'``, computed over rows
            whose status is ``'ROUTE'``.
    """
    result_list, expected_list = result_list_extraction()
    total_len = len(expected_list)

    class_stats = {
        'PATIENT_EVIDENCE_QUERY': {'total': 0, 'correct': 0, 'uncertain': 0},
        'FOLLOW_UP_EXPLANATION': {'total': 0, 'correct': 0, 'uncertain': 0},
        'SOURCE_REQUEST': {'total': 0, 'correct': 0, 'uncertain': 0},
        'HELP_OR_OTHER': {'total': 0, 'correct': 0, 'uncertain': 0}
    }
    tp = 0
    uncertain_count = 0
    misclassified = []
    confident_predictions = []

    for i, (result, expected) in enumerate(zip(result_list, expected_list)):
        # Labels not listed above get a fresh bucket rather than a KeyError.
        stats = class_stats.setdefault(
            expected, {'total': 0, 'correct': 0, 'uncertain': 0}
        )
        stats['total'] += 1

        status = result['status']
        if status == 'ROUTE':
            # Confidence statistics only cover routed predictions.
            confident_predictions.append(result['confidence'])
            if result['intent'] == expected:
                tp += 1
                stats['correct'] += 1
            else:
                misclassified.append({
                    'index': i,
                    'text': test_data['input_text'].iloc[i],
                    'expected': expected,
                    'predicted': result['intent'],
                    'confidence': result['confidence']
                })
        elif status == 'NEEDS_CLARIFICATION':
            # Asking for clarification is scored as an acceptable outcome.
            tp += 1
            uncertain_count += 1
            stats['uncertain'] += 1
        else:
            print(f"error in intent classification: Unexpected status at index {i}: {status}")

    per_class_metrics = {
        intent_class: {
            **stats,
            'accuracy': _safe_ratio(stats['correct'] + stats['uncertain'], stats['total']),
        }
        for intent_class, stats in class_stats.items()
    }

    has_confidences = bool(confident_predictions)
    return {
        'overall': {
            'accuracy': _safe_ratio(tp, total_len),
            'tp': tp,
            'total': total_len,
            'uncertain_count': uncertain_count,
            'uncertain_rate': _safe_ratio(uncertain_count, total_len),
            'misclassification_count': len(misclassified),
            'misclassification_rate': _safe_ratio(len(misclassified), total_len),
        },
        'per_class': per_class_metrics,
        'misclassified': misclassified,
        'confidence': {
            'mean': float(np.mean(confident_predictions)) if has_confidences else None,
            'min': float(np.min(confident_predictions)) if has_confidences else None,
            'max': float(np.max(confident_predictions)) if has_confidences else None,
            'std': float(np.std(confident_predictions)) if has_confidences else None,
            'all_values': confident_predictions,
        }
    }
def return_metrics():
    """Return the metrics dictionary produced by ``_compute_metrics``.

    Public entry point for callers that want the raw numbers rather than
    the printed report.
    """
    metrics = _compute_metrics()
    return metrics
def evaluate_intent_classifier():
    """Print an evaluation report for the intent classifier to stdout.

    The report is built from ``_compute_metrics`` and has four sections:
    overall results, per-class performance (classes in sorted order),
    misclassification details (only when there are any), and the confidence
    distribution (values only when routed predictions exist).
    """
    report = _compute_metrics()
    summary = report['overall']
    by_class = report['per_class']
    errors = report['misclassified']
    conf_stats = report['confidence']

    print("\n\n\nINTENT CLASSIFICATION EVALUATION RESULTS")
    print(f"\nOverall Accuracy: {summary['accuracy']:.2%} ({summary['tp']}/{summary['total']})")
    print(f"Uncertain/Clarification Needed: {summary['uncertain_count']} ({summary['uncertain_rate']:.1%})")
    print(f"Misclassifications: {summary['misclassification_count']} ({summary['misclassification_rate']:.1%})")

    print("\n\n\nPER-CLASS PERFORMANCE")
    for class_name in sorted(by_class):
        class_row = by_class[class_name]
        print(f"\n{class_name}:")
        print(f" Total: {class_row['total']}")
        print(f" Correct: {class_row['correct']}")
        print(f" Uncertain: {class_row['uncertain']}")
        print(f" Accuracy: {class_row['accuracy']:.2%}")

    if errors:
        print("\n\n\nMISCLASSIFICATION DETAILS")
        for miss in errors:
            print(f"\n[Index {miss['index']}]")
            # Text is cut to its first 100 characters; the ellipsis is
            # always appended, even for shorter inputs.
            print(f"Text: {miss['text'][:100]}...")
            print(f"Expected: {miss['expected']}")
            print(f"Predicted: {miss['predicted']}")
            print(f"Confidence: {miss['confidence']:.3f}")

    print("\n\n\nCONFIDENCE DISTRIBUTION")
    # 'mean' is None when no prediction had status ROUTE; the other
    # statistics are None in exactly the same case.
    if conf_stats['mean'] is not None:
        print(f"Mean confidence: {conf_stats['mean']:.3f}")
        print(f"Min confidence: {conf_stats['min']:.3f}")
        print(f"Max confidence: {conf_stats['max']:.3f}")
        print(f"Std dev: {conf_stats['std']:.3f}")