import os
import sys

import numpy as np
import pandas as pd

# The router module is imported by bare name, so this file's own directory
# must be on sys.path before the import below runs.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from intent_router_ml import route_intent  # noqa: E402 (depends on sys.path tweak)

# Evaluation set, loaded once at import time. The 'input_text' and
# 'expected_intent' columns are read by the functions below.
test_data_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "..",
    "..",
    "data",
    "intent_classification_eval_data.csv",
)
test_data = pd.read_csv(test_data_path)

# Intent labels that are always reported, even when the evaluation data
# contains no examples for them.
_INTENT_CLASSES = (
    'PATIENT_EVIDENCE_QUERY',
    'FOLLOW_UP_EXPLANATION',
    'SOURCE_REQUEST',
    'HELP_OR_OTHER',
)


def _new_class_stats():
    """Return a zeroed per-class counter record."""
    return {'total': 0, 'correct': 0, 'uncertain': 0}


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


def result_list_extraction():
    """Run the router over every evaluation input.

    Returns:
        A ``(result_list, expected_res_list)`` tuple: the value returned by
        ``route_intent`` for each row of ``test_data['input_text']``, and the
        matching entries of ``test_data['expected_intent']``, in row order.
    """
    # NOTE(review): the meaning of the two positional ``1`` arguments is
    # defined by route_intent, which is not visible here -- confirm.
    result_list = [route_intent(text, 1, 1) for text in test_data['input_text']]
    expected_res_list = list(test_data['expected_intent'])
    return result_list, expected_res_list


def _compute_metrics():
    """Shared computation used by evaluate_intent_classifier and return_metrics.

    A result with status ``'ROUTE'`` is correct when its ``'intent'`` equals
    the expected label; otherwise it is recorded as a misclassification. A
    result with status ``'NEEDS_CLARIFICATION'`` is counted both as uncertain
    and toward ``tp`` (it is never treated as wrong). Any other status is
    reported on stdout and counted as neither.

    Every ratio is 0.0 when its denominator is zero (empty evaluation set, or
    a class with no examples) instead of raising ``ZeroDivisionError``.
    Expected labels outside the predefined classes are tallied under their
    own key rather than raising ``KeyError``.

    Returns:
        dict with keys ``'overall'``, ``'per_class'``, ``'misclassified'`` and
        ``'confidence'``.
    """
    result_list, expected_list = result_list_extraction()
    total_len = len(expected_list)

    class_stats = {name: _new_class_stats() for name in _INTENT_CLASSES}
    tp = 0
    uncertain_count = 0
    misclassified = []
    confident_predictions = []  # confidences of 'ROUTE' results only

    for i, (result, expected) in enumerate(zip(result_list, expected_list)):
        # setdefault keeps one unexpected label from aborting the whole run.
        stats = class_stats.setdefault(expected, _new_class_stats())
        stats['total'] += 1

        status = result['status']
        if status == 'ROUTE':
            confident_predictions.append(result['confidence'])
            if result['intent'] == expected:
                tp += 1
                stats['correct'] += 1
            else:
                misclassified.append({
                    'index': i,
                    'text': test_data['input_text'].iloc[i],
                    'expected': expected,
                    'predicted': result['intent'],
                    'confidence': result['confidence'],
                })
        elif status == 'NEEDS_CLARIFICATION':
            tp += 1
            uncertain_count += 1
            stats['uncertain'] += 1
        else:
            print(f"error in intent classification: Unexpected status at index {i}: {status}")

    per_class_metrics = {
        intent_class: {
            **stats,
            'accuracy': _safe_ratio(
                stats['correct'] + stats['uncertain'], stats['total']
            ),
        }
        for intent_class, stats in class_stats.items()
    }

    has_confidences = bool(confident_predictions)
    return {
        'overall': {
            'accuracy': _safe_ratio(tp, total_len),
            'tp': tp,
            'total': total_len,
            'uncertain_count': uncertain_count,
            'uncertain_rate': _safe_ratio(uncertain_count, total_len),
            'misclassification_count': len(misclassified),
            'misclassification_rate': _safe_ratio(len(misclassified), total_len),
        },
        'per_class': per_class_metrics,
        'misclassified': misclassified,
        'confidence': {
            'mean': float(np.mean(confident_predictions)) if has_confidences else None,
            'min': float(np.min(confident_predictions)) if has_confidences else None,
            'max': float(np.max(confident_predictions)) if has_confidences else None,
            'std': float(np.std(confident_predictions)) if has_confidences else None,
            'all_values': confident_predictions,
        },
    }


def return_metrics():
    """Return the metrics dict produced by ``_compute_metrics``."""
    return _compute_metrics()


def evaluate_intent_classifier():
    """Compute the evaluation metrics and print a report to stdout.

    The report has an overall summary, per-class counts and accuracy, details
    of each misclassified example (only when there are any), and confidence
    statistics (only when at least one result had status ``'ROUTE'``).
    """
    metrics = _compute_metrics()
    overall = metrics['overall']
    per_class = metrics['per_class']
    misclassified = metrics['misclassified']
    confidence = metrics['confidence']

    print("\n\n\nINTENT CLASSIFICATION EVALUATION RESULTS")
    print(f"\nOverall Accuracy: {overall['accuracy']:.2%} ({overall['tp']}/{overall['total']})")
    print(f"Uncertain/Clarification Needed: {overall['uncertain_count']} ({overall['uncertain_rate']:.1%})")
    print(f"Misclassifications: {overall['misclassification_count']} ({overall['misclassification_rate']:.1%})")

    print("\n\n\nPER-CLASS PERFORMANCE")
    # key=str gives the same order for string labels and stays sortable if a
    # non-string label (e.g. an empty CSV cell) was tallied.
    for intent_class in sorted(per_class, key=str):
        stats = per_class[intent_class]
        print(f"\n{intent_class}:")
        print(f" Total: {stats['total']}")
        print(f" Correct: {stats['correct']}")
        print(f" Uncertain: {stats['uncertain']}")
        print(f" Accuracy: {stats['accuracy']:.2%}")

    if misclassified:
        print("\n\n\nMISCLASSIFICATION DETAILS")
        for entry in misclassified:
            print(f"\n[Index {entry['index']}]")
            print(f"Text: {entry['text'][:100]}...")
            print(f"Expected: {entry['expected']}")
            print(f"Predicted: {entry['predicted']}")
            print(f"Confidence: {entry['confidence']:.3f}")

    print("\n\n\nCONFIDENCE DISTRIBUTION")
    if confidence['mean'] is not None:
        print(f"Mean confidence: {confidence['mean']:.3f}")
        print(f"Min confidence: {confidence['min']:.3f}")
        print(f"Max confidence: {confidence['max']:.3f}")
        print(f"Std dev: {confidence['std']:.3f}")