# LightEval_Mimir.py
"""This module outlines the LightEval setup for tracking performance metrics
of Mimir, to be sent to the trackio page for visualization."""

# Imports
import time
import uuid

from lighteval.metrics.metrics_sample import BertScore
from lighteval.tasks.requests import Doc


async def evaluate_educational_quality(user_query, response, thread_id, start_time):
    """Dynamic evaluation of a single turn using LightEval metrics.

    `start_time` is the timestamp captured when generation began, so the
    response latency can be reported alongside the quality scores.
    """
    # Create an ephemeral task for this turn
    doc = Doc(
        task_name=f"turn_{thread_id}",
        query=user_query,
        choices=[response],
        gold_index=-1,  # No ground truth initially
    )

    # Use BertScore for semantic quality. Note: the exact compute() signature
    # varies across lighteval releases; adjust to the installed version.
    bert_score = BertScore().compute(doc)

    # Custom educational coherence heuristics
    educational_indicators = {
        'has_examples': 'example' in response.lower(),
        'structured_explanation': '##' in response or '1.' in response,
        'appropriate_length': 100 < len(response) < 1500,
        'encourages_learning': any(
            phrase in response.lower()
            for phrase in ['practice', 'try', 'consider', 'think about']
        ),
    }

    return {
        'semantic_quality': bert_score,
        'educational_score': sum(educational_indicators.values()) / len(educational_indicators),
        'response_time': time.time() - start_time,
    }


def track_rag_performance(query, retrieved_docs, used_in_response):
    """Evaluate RAG retrieval quality and log it to trackio."""
    # Track retrieval-to-response alignment. calculate_relevance() and
    # send_evaluation_to_trackio() are project-local helpers (sketched below).
    retrieval_relevance = calculate_relevance(query, retrieved_docs)
    retrieval_usage = len(used_in_response) / len(retrieved_docs) if retrieved_docs else 0

    # Log to trackio with a LightEval-style structure
    metric_payload = {
        "evaluation_id": str(uuid.uuid4()),
        "task": "rag_retrieval",
        "metrics": {
            "retrieval_relevance": retrieval_relevance,
            "retrieval_usage_rate": retrieval_usage,
            "num_docs_retrieved": len(retrieved_docs),
        },
        "metadata": {
            "query": query[:100],
            "sources": [doc.metadata.get('source') for doc in retrieved_docs],
        },
    }
    send_evaluation_to_trackio(metric_payload)


def evaluate_prompt_classification(predicted_mode, actual_conversation_outcome,
                                   conversation_state, thread_id):
    """Track prompt-classifier accuracy in production.

    `conversation_state` is the running message list for this thread, passed
    in so the conversation length can be reported.
    """
    # Did the predicted mode lead to a successful interaction?
    success_indicators = {
        'discovery_mode': lambda outcome: 'clarified_topic' in outcome,
        'teaching_mode': lambda outcome: outcome.get('quality_score', 0) > 3.5,
        'conversational': lambda outcome: outcome.get('user_satisfied', False),
    }

    mode_was_correct = success_indicators.get(
        predicted_mode, lambda x: True
    )(actual_conversation_outcome)

    # Report a simple 0/1 accuracy for this turn
    return {
        "prompt_classifier_accuracy": 1.0 if mode_was_correct else 0.0,
        "predicted_mode": predicted_mode,
        "conversation_length": len(conversation_state),
    }


def process_user_feedback(response_id, feedback_type, conversation_state):
    """Convert user feedback into LightEval ground truth."""
    last_exchange = {
        "query": conversation_state[-2]["content"],     # User's question
        "response": conversation_state[-1]["content"],  # Agent's response
        "gold_index": 0 if feedback_type == "thumbs_up" else -1,
    }

    # Create a retrospective evaluation with ground truth
    doc = Doc(
        task_name="user_feedback_eval",
        query=last_exchange["query"],
        choices=[last_exchange["response"]],
        gold_index=last_exchange["gold_index"],
    )

    # With explicit feedback we now have ground truth for accuracy metrics
    accuracy = 1.0 if feedback_type == "thumbs_up" else 0.0
    return {"user_feedback_accuracy": accuracy, "response_id": response_id}
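

# --- Assumed helpers --------------------------------------------------------
# track_rag_performance() above calls calculate_relevance() and
# send_evaluation_to_trackio(), which are not defined in this module. Minimal
# sketches follow; the term-overlap scoring, the `page_content` attribute
# (a LangChain-style Document is assumed, matching the `doc.metadata` access
# above), and the trackio logging shape are assumptions, not the actual
# implementations.

def calculate_relevance(query, retrieved_docs):
    """Hypothetical relevance score: fraction of query terms appearing in
    each retrieved document, averaged over the documents."""
    if not retrieved_docs:
        return 0.0
    query_terms = set(query.lower().split())
    if not query_terms:
        return 0.0
    scores = []
    for doc in retrieved_docs:
        text = doc.page_content.lower()  # assumes LangChain-style Documents
        hits = sum(1 for term in query_terms if term in text)
        scores.append(hits / len(query_terms))
    return sum(scores) / len(scores)


def send_evaluation_to_trackio(payload):
    """Hypothetical logger: flattens the numeric metrics and logs them via
    trackio's wandb-style API (trackio.init() must have been called once)."""
    import trackio  # optional dependency, imported lazily in this sketch
    metrics = {f"rag/{name}": value for name, value in payload["metrics"].items()}
    trackio.log(metrics)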
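

# --- Usage sketch ------------------------------------------------------------
# A minimal example of wiring the per-turn evaluation into a chat handler.
# The handler itself and the `generate_response` coroutine are hypothetical;
# only the evaluation call mirrors the function defined above.

async def handle_turn(user_query, thread_id, conversation_state, generate_response):
    start_time = time.time()
    response = await generate_response(user_query)  # assumed async model call

    turn_metrics = await evaluate_educational_quality(
        user_query, response, thread_id, start_time
    )
    # Forward turn_metrics to trackio alongside the RAG metrics, e.g. via
    # send_evaluation_to_trackio(), or inspect them locally during development.
    return response, turn_metrics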