# Lung-Cancer-Risk-Diagnosis-Assistant/backend/intent_classification/intent_classification_eval.py
| import pandas as pd | |
| import sys | |
| import os | |
| import numpy as np | |
| sys.path.append(os.path.dirname(os.path.abspath(__file__))) | |
| from intent_router_ml import route_intent | |
# The evaluation CSV is addressed relative to this file: two directory levels
# up, then data/intent_classification_eval_data.csv.
test_data_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "..",
    "..",
    "data",
    "intent_classification_eval_data.csv",
)

# Loaded once at import time; the functions below read the 'input_text' and
# 'expected_intent' columns from this frame.
test_data = pd.read_csv(test_data_path)
def result_list_extraction():
    """Run the intent router over every row of the evaluation data.

    Returns:
        A ``(result_list, expected_res_list)`` tuple. The first list holds
        whatever ``route_intent(text, 1, 1)`` returns for each entry of the
        ``input_text`` column; the second holds the entries of the
        ``expected_intent`` column. Both follow the row order of ``test_data``.
    """
    # NOTE(review): the meaning of the two literal ``1`` arguments is defined
    # by route_intent, which is not visible in this file.
    predictions = [
        route_intent(utterance, 1, 1) for utterance in test_data['input_text']
    ]
    labels = list(test_data['expected_intent'])
    return predictions, labels
def _compute_metrics():
    """Score the router's predictions against the labelled evaluation data.

    Shared computation used by both ``evaluate_intent_classifier`` and
    ``return_metrics``.

    A row counts towards ``tp`` when its status is ``'ROUTE'`` and the
    predicted intent equals the expected label, or when its status is
    ``'NEEDS_CLARIFICATION'``. A ``'ROUTE'`` row with a different intent is
    recorded in ``misclassified``. Any other status is reported on stdout and
    contributes only to the totals.

    Returns:
        dict with four keys:

        * ``'overall'`` - ``accuracy``, ``tp``, ``total``, ``uncertain_count``,
          ``uncertain_rate``, ``misclassification_count`` and
          ``misclassification_rate``.
        * ``'per_class'`` - for each tracked intent: ``total``, ``correct``,
          ``uncertain`` and ``accuracy`` (correct + uncertain over total).
        * ``'misclassified'`` - one dict per wrong ``'ROUTE'`` prediction with
          ``index``, ``text``, ``expected``, ``predicted`` and ``confidence``.
        * ``'confidence'`` - ``mean``/``min``/``max``/``std`` over the
          confidences of all ``'ROUTE'`` rows (``None`` when there are none)
          plus the raw ``all_values`` list.

        Every ratio is ``0.0`` when its denominator is zero, so an empty
        evaluation file or a class with no rows does not raise.

    Raises:
        KeyError: if an expected label is not one of the four intents tracked
            in ``class_stats``.
    """

    def ratio(numerator, denominator):
        # Zero-safe division: a class missing from the CSV (or an empty CSV)
        # would otherwise abort the whole evaluation with ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    result_list, expected_list = result_list_extraction()
    total_len = len(expected_list)

    class_stats = {
        'PATIENT_EVIDENCE_QUERY': {'total': 0, 'correct': 0, 'uncertain': 0},
        'FOLLOW_UP_EXPLANATION': {'total': 0, 'correct': 0, 'uncertain': 0},
        'SOURCE_REQUEST': {'total': 0, 'correct': 0, 'uncertain': 0},
        'HELP_OR_OTHER': {'total': 0, 'correct': 0, 'uncertain': 0},
    }

    tp = 0
    uncertain_count = 0
    misclassified = []
    confident_predictions = []  # confidences of every 'ROUTE' row

    for i, (result, expected) in enumerate(zip(result_list, expected_list)):
        stats = class_stats[expected]
        stats['total'] += 1

        status = result['status']
        if status == 'ROUTE':
            confident_predictions.append(result['confidence'])
            if result['intent'] == expected:
                tp += 1
                stats['correct'] += 1
            else:
                misclassified.append({
                    'index': i,
                    'text': test_data['input_text'].iloc[i],
                    'expected': expected,
                    'predicted': result['intent'],
                    'confidence': result['confidence'],
                })
        elif status == 'NEEDS_CLARIFICATION':
            # NOTE(review): a clarification request is scored as a hit here
            # and in the per-class accuracy below; this looks deliberate -
            # confirm it is the intended metric definition.
            tp += 1
            uncertain_count += 1
            stats['uncertain'] += 1
        else:
            print(f"error in intent classification: Unexpected status at index {i}: {status}")

    per_class_metrics = {
        intent_class: {
            **stats,
            'accuracy': ratio(stats['correct'] + stats['uncertain'], stats['total']),
        }
        for intent_class, stats in class_stats.items()
    }

    has_confidences = bool(confident_predictions)
    return {
        'overall': {
            'accuracy': ratio(tp, total_len),
            'tp': tp,
            'total': total_len,
            'uncertain_count': uncertain_count,
            'uncertain_rate': ratio(uncertain_count, total_len),
            'misclassification_count': len(misclassified),
            'misclassification_rate': ratio(len(misclassified), total_len),
        },
        'per_class': per_class_metrics,
        'misclassified': misclassified,
        'confidence': {
            'mean': float(np.mean(confident_predictions)) if has_confidences else None,
            'min': float(np.min(confident_predictions)) if has_confidences else None,
            'max': float(np.max(confident_predictions)) if has_confidences else None,
            'std': float(np.std(confident_predictions)) if has_confidences else None,
            'all_values': confident_predictions,
        },
    }
def return_metrics():
    """Return the evaluation metrics dictionary built by ``_compute_metrics``."""
    metrics = _compute_metrics()
    return metrics
def evaluate_intent_classifier():
    """Print the intent-classification evaluation report to stdout.

    The report is built from ``_compute_metrics()`` and has four sections:
    overall figures, per-class figures (classes in sorted name order), details
    of each misclassified row (only when there are any), and the confidence
    statistics (only when a mean is available).
    """
    report = _compute_metrics()
    overall, per_class = report['overall'], report['per_class']
    misclassified, confidence = report['misclassified'], report['confidence']

    # --- Overall summary -------------------------------------------------
    print("\n\n\nINTENT CLASSIFICATION EVALUATION RESULTS")
    print(f"\nOverall Accuracy: {overall['accuracy']:.2%} ({overall['tp']}/{overall['total']})")
    print(f"Uncertain/Clarification Needed: {overall['uncertain_count']} ({overall['uncertain_rate']:.1%})")
    print(f"Misclassifications: {overall['misclassification_count']} ({overall['misclassification_rate']:.1%})")

    # --- Per-class breakdown ---------------------------------------------
    print("\n\n\nPER-CLASS PERFORMANCE")
    for intent_class, stats in sorted(per_class.items(), key=lambda item: item[0]):
        print(f"\n{intent_class}:")
        print(f" Total: {stats['total']}")
        print(f" Correct: {stats['correct']}")
        print(f" Uncertain: {stats['uncertain']}")
        print(f" Accuracy: {stats['accuracy']:.2%}")

    # --- Misclassified rows (header only when there is something to show) -
    if misclassified:
        print("\n\n\nMISCLASSIFICATION DETAILS")
    for entry in misclassified:
        print(f"\n[Index {entry['index']}]")
        # Only the first 100 characters of the input text are shown.
        print(f"Text: {entry['text'][:100]}...")
        print(f"Expected: {entry['expected']}")
        print(f"Predicted: {entry['predicted']}")
        print(f"Confidence: {entry['confidence']:.3f}")

    # --- Confidence statistics -------------------------------------------
    print("\n\n\nCONFIDENCE DISTRIBUTION")
    if confidence['mean'] is None:
        # No statistics available, so nothing to print after the header.
        return
    print(f"Mean confidence: {confidence['mean']:.3f}")
    print(f"Min confidence: {confidence['min']:.3f}")
    print(f"Max confidence: {confidence['max']:.3f}")
    print(f"Std dev: {confidence['std']:.3f}")