Spaces:
Sleeping
Sleeping
| # LightEval_Mimir.py | |
'''This document outlines the LightEval setup for tracking performance metrics of Mimir, to be sent to the trackio page for visualization.'''
# Imports
import time
import uuid

from lighteval.metrics.metrics_sample import BertScore, ROUGE
from lighteval.tasks.requests import Doc
async def evaluate_educational_quality(user_query, response, thread_id):
    """Evaluate one response turn with LightEval-style metrics.

    Args:
        user_query: The user's question for this turn.
        response: The agent's generated answer text.
        thread_id: Conversation thread identifier, used to name the
            ephemeral LightEval task.

    Returns:
        dict with:
          - 'semantic_quality': output of BertScore().compute on this turn,
          - 'educational_score': fraction of heuristic indicators satisfied
            (0.0–1.0),
          - 'response_time': seconds spent inside this evaluation.
    """
    # Bug fix: start_time was read at the end of the function but never
    # assigned (NameError on every call); start the clock here so
    # response_time measures this evaluation.
    start_time = time.time()

    # Ephemeral single-turn task; no ground truth yet, hence gold_index=-1.
    doc = Doc(
        task_name=f"turn_{thread_id}",
        query=user_query,
        choices=[response],
        gold_index=-1,
        specific_output=response,
    )

    # Semantic quality via BERTScore over the single candidate response.
    bert_score = BertScore().compute(doc)

    # Hoist the lowercasing out of the individual checks.
    lowered = response.lower()

    # Heuristic signals of educational quality; each value is a boolean.
    educational_indicators = {
        'has_examples': 'example' in lowered,
        'structured_explanation': '##' in response or '1.' in response,
        'appropriate_length': 100 < len(response) < 1500,
        'encourages_learning': any(
            phrase in lowered
            for phrase in ('practice', 'try', 'consider', 'think about')
        ),
    }

    return {
        'semantic_quality': bert_score,
        # Mean of the boolean indicators -> score in [0.0, 1.0].
        'educational_score': sum(educational_indicators.values()) / len(educational_indicators),
        'response_time': time.time() - start_time,
    }
def track_rag_performance(query, retrieved_docs, used_in_response):
    """Evaluate RAG retrieval quality and ship the metrics to trackio.

    Args:
        query: The user query that triggered retrieval.
        retrieved_docs: Documents returned by the retriever. Each is expected
            to expose a ``.metadata`` dict with a 'source' key (looks like a
            LangChain-style Document — TODO confirm against the retriever).
        used_in_response: Subset of retrieved docs actually used in the answer.

    Side effects:
        Sends a metric payload to trackio via send_evaluation_to_trackio().
    """
    # Note: the original imported SampleLevelMetric here but never used it;
    # the dead import has been removed.

    # Retrieval-to-response alignment score (helper defined elsewhere).
    retrieval_relevance = calculate_relevance(query, retrieved_docs)

    # Guard against empty retrieval to avoid ZeroDivisionError.
    retrieval_usage = len(used_in_response) / len(retrieved_docs) if retrieved_docs else 0

    # Payload shaped to the LightEval/trackio logging structure.
    # Bug fix: uuid was used but never imported (NameError); it is now
    # imported at the top of the file.
    metric_payload = {
        "evaluation_id": str(uuid.uuid4()),
        "task": "rag_retrieval",
        "metrics": {
            "retrieval_relevance": retrieval_relevance,
            "retrieval_usage_rate": retrieval_usage,
            "num_docs_retrieved": len(retrieved_docs),
        },
        "metadata": {
            # Truncate the query so payloads stay small.
            "query": query[:100],
            "sources": [doc.metadata.get('source') for doc in retrieved_docs],
        },
    }
    send_evaluation_to_trackio(metric_payload)
def evaluate_prompt_classification(predicted_mode, actual_conversation_outcome,
                                   thread_id, conversation_state=None):
    """Track prompt-classifier accuracy in production.

    Args:
        predicted_mode: Mode the classifier chose ('discovery_mode',
            'teaching_mode', 'conversational', or any other string).
        actual_conversation_outcome: Outcome record for the turn; inspected
            by the per-mode success checks (dict-like, supports ``in`` and
            ``.get``).
        thread_id: Conversation thread identifier (currently unused; kept
            for interface compatibility).
        conversation_state: Optional message list whose length is reported.
            Bug fix: the original read this name as an undefined global,
            raising NameError on every call; it is now an explicit optional
            parameter, defaulting to an empty conversation.

    Returns:
        dict with 'prompt_classifier_accuracy' (1.0 or 0.0),
        'predicted_mode', and 'conversation_length'.
    """
    # Per-mode definition of "the predicted mode led to a good interaction".
    success_indicators = {
        'discovery_mode': lambda outcome: 'clarified_topic' in outcome,
        'teaching_mode': lambda outcome: outcome.get('quality_score', 0) > 3.5,
        'conversational': lambda outcome: outcome.get('user_satisfied', False),
    }

    # Unknown modes are treated as correct — there is no check to apply.
    mode_was_correct = success_indicators.get(
        predicted_mode,
        lambda _outcome: True,
    )(actual_conversation_outcome)

    # Bug fix: the original returned the Metrics.ACCURACY enum member (not a
    # number) on success and 0 on failure; report a plain 1.0/0.0 instead.
    return {
        "prompt_classifier_accuracy": 1.0 if mode_was_correct else 0.0,
        "predicted_mode": predicted_mode,
        "conversation_length": len(conversation_state) if conversation_state else 0,
    }
def process_user_feedback(response_id, feedback_type, conversation_state):
    """Convert explicit user feedback into LightEval-style ground truth.

    Args:
        response_id: Identifier of the response being rated.
        feedback_type: Feedback label; "thumbs_up" counts as positive,
            anything else as negative.
        conversation_state: Message list; the last two entries must be the
            user's question followed by the agent's response (each a dict
            with a "content" key).

    Returns:
        dict with 'user_feedback_accuracy' (1.0 for thumbs_up, else 0.0)
        and the echoed 'response_id'.

    Raises:
        ValueError: If conversation_state holds fewer than two messages,
            so there is no query/response pair to grade.
    """
    # Bug fix: the original indexed [-2]/[-1] unguarded and raised a bare
    # IndexError on conversations shorter than two messages.
    if conversation_state is None or len(conversation_state) < 2:
        raise ValueError(
            "conversation_state needs at least a user query and an agent response"
        )

    is_positive = feedback_type == "thumbs_up"
    last_exchange = {
        "query": conversation_state[-2]["content"],      # user's question
        "response": conversation_state[-1]["content"],   # agent's response
        # gold_index 0 marks the single choice as correct; -1 means no gold.
        "gold_index": 0 if is_positive else -1,
    }

    # Retrospective evaluation Doc now carrying ground truth.
    # NOTE(review): this Doc is built but not returned or sent anywhere
    # visible in this file — confirm a downstream consumer exists.
    from lighteval.tasks.requests import Doc
    doc = Doc(
        task_name="user_feedback_eval",
        query=last_exchange["query"],
        choices=[last_exchange["response"]],
        gold_index=last_exchange["gold_index"],
    )

    accuracy = 1.0 if is_positive else 0.0
    return {"user_feedback_accuracy": accuracy, "response_id": response_id}