Spaces:
Sleeping
Sleeping
| from sklearn.metrics import accuracy_score, precision_score, recall_score | |
| from typing import List, Dict, Any | |
| import pandas as pd | |
| from llama_index import GPTVectorStoreIndex | |
class EvaluationModule:
    """Evaluate a llama_index vector-store index against labeled test data.

    Maps metric display names ("Accuracy", "Precision", "Recall") to sklearn
    scorer callables and keeps a running history of evaluation result dicts.
    Annotations referencing llama_index / pandas are quoted so the class can
    be imported without those packages installed.
    """

    def __init__(self):
        # Display name -> sklearn scorer with signature (y_true, y_pred) -> float.
        self.metric_functions = {
            "Accuracy": accuracy_score,
            "Precision": precision_score,
            "Recall": recall_score
        }
        # History of result dicts appended by save_evaluation_results().
        self.evaluation_results = []

    def evaluate_model(
        self,
        index: "GPTVectorStoreIndex",
        selected_metrics: List[str],
        test_data: "pd.DataFrame | None" = None
    ) -> Dict[str, float]:
        """Evaluate model performance using selected metrics.

        Args:
            index: Vector-store index queried for predictions.
            selected_metrics: Metric names; names not in
                ``self.metric_functions`` are silently skipped
                (same behavior as before).
            test_data: Frame whose ``'target'`` column holds the true
                labels. When None, falls back to the default evaluation.

        Returns:
            Mapping of metric name to score.
        """
        # If test data is not provided, use the built-in default test set.
        if test_data is None:
            return self._run_default_evaluation(index, selected_metrics)

        known = [m for m in selected_metrics if m in self.metric_functions]
        if not known:
            # Nothing to score -> avoid querying the index at all.
            return {}

        # BUG FIX: predictions were previously recomputed inside the metric
        # loop, re-querying the LLM index once per metric for identical
        # results. Compute them exactly once.
        predictions = self._get_predictions(index, test_data)
        true_values = test_data['target'].values  # adjust column name as needed

        return {
            metric: self.metric_functions[metric](true_values, predictions)
            for metric in known
        }

    def _get_predictions(
        self,
        index: "GPTVectorStoreIndex",
        test_data: "pd.DataFrame"
    ) -> List[Any]:
        """Query the index once per row and collect the raw response strings.

        Each row is serialized as ``"col: val col: val ..."`` to form the
        query text, and the engine's ``response.response`` attribute is
        collected as the prediction.
        """
        # PERF FIX: build the query engine once, not once per row.
        query_engine = index.as_query_engine()
        predictions = []
        for _, row in test_data.iterrows():
            # Convert the row to a flat "column: value" query string.
            query = " ".join(f"{col}: {val}" for col, val in row.items())
            response = query_engine.query(query)
            predictions.append(response.response)
        return predictions

    def _run_default_evaluation(
        self,
        index: "GPTVectorStoreIndex",
        selected_metrics: List[str]
    ) -> Dict[str, float]:
        """Run evaluation using built-in default test cases.

        NOTE(review): this is still a placeholder — it returns a fixed 0.85
        per metric and does not actually query ``index`` against
        ``default_tests``. Replace with real evaluation logic.
        """
        # Default test cases (currently unused by the placeholder scoring).
        default_tests = [
            {
                "query": "What is machine learning?",
                "expected": "Machine learning is a branch of artificial intelligence..."
            },
            # Add more default test cases as needed
        ]
        # For demonstration, return placeholder scores.
        return {metric: 0.85 for metric in selected_metrics}

    def save_evaluation_results(self, results: Dict[str, float]):
        """Append one evaluation result dict to the tracked history."""
        self.evaluation_results.append(results)

    def get_evaluation_history(self) -> List[Dict[str, float]]:
        """Return the list of all saved evaluation result dicts."""
        return self.evaluation_results