Spaces:
Sleeping
Sleeping
Upload LightEval_Mimir.py
Browse files- LightEval_Mimir.py +109 -0
LightEval_Mimir.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LightEval_Mimir.py
|
| 2 |
+
'''This module outlines the LightEval setup for tracking performance metrics of Mimir, to be sent to the trackio page for visualization.'''
|
| 3 |
+
|
| 4 |
+
# Imports
import time
import uuid

from lighteval.metrics.metrics_sample import BertScore, ROUGE
from lighteval.tasks.requests import Doc
|
| 7 |
+
|
| 8 |
+
async def evaluate_educational_quality(user_query, response, thread_id):
    """Evaluate one conversational turn with LightEval-style metrics.

    Args:
        user_query: The user's question for this turn.
        response: The agent's generated answer (plain text).
        thread_id: Conversation thread identifier, used to name the
            ephemeral evaluation task.

    Returns:
        dict with:
            'semantic_quality'  - result of BertScore().compute(doc)
            'educational_score' - fraction (0.0-1.0) of heuristic
                                  educational indicators satisfied
            'response_time'     - seconds spent inside this evaluation
    """
    # BUG FIX: the original referenced `start_time` (and `time`) without
    # ever defining them, raising NameError; capture the start here so
    # 'response_time' measures the duration of this evaluation.
    start_time = time.time()

    # Ephemeral task for this single turn; gold_index=-1 because there is
    # no ground truth at generation time.
    doc = Doc(
        task_name=f"turn_{thread_id}",
        query=user_query,
        choices=[response],
        gold_index=-1,  # No ground truth initially
        specific_output=response,
    )

    # Semantic quality via BertScore.
    bert_score = BertScore().compute(doc)

    # Heuristic educational-coherence indicators (all booleans).
    lowered = response.lower()  # hoisted: reused by two indicators
    educational_indicators = {
        'has_examples': 'example' in lowered,
        'structured_explanation': '##' in response or '1.' in response,
        'appropriate_length': 100 < len(response) < 1500,
        'encourages_learning': any(
            phrase in lowered
            for phrase in ['practice', 'try', 'consider', 'think about']
        ),
    }

    return {
        'semantic_quality': bert_score,
        'educational_score': sum(educational_indicators.values()) / len(educational_indicators),
        'response_time': time.time() - start_time,
    }
|
| 36 |
+
|
| 37 |
+
def track_rag_performance(query, retrieved_docs, used_in_response):
    """Evaluate RAG retrieval quality and ship the result to trackio.

    Args:
        query: The user query that triggered retrieval.
        retrieved_docs: Documents returned by the retriever; each is
            expected to expose a ``metadata`` dict with a 'source' key
            (assumption — TODO confirm against the retriever's doc type).
        used_in_response: The subset of retrieved docs actually used in
            the generated answer.

    Side effects:
        Sends a structured evaluation payload via
        ``send_evaluation_to_trackio``; returns None.
    """
    # NOTE: the original imported SampleLevelMetric here but never used
    # it; the dead import has been removed. `uuid` is now imported at
    # module level (it was previously referenced without any import).

    # Retrieval-to-response alignment.
    retrieval_relevance = calculate_relevance(query, retrieved_docs)
    # Guard against division by zero when nothing was retrieved.
    retrieval_usage = len(used_in_response) / len(retrieved_docs) if retrieved_docs else 0

    # Log to trackio with a LightEval-style payload.
    metric_payload = {
        "evaluation_id": str(uuid.uuid4()),
        "task": "rag_retrieval",
        "metrics": {
            "retrieval_relevance": retrieval_relevance,
            "retrieval_usage_rate": retrieval_usage,
            "num_docs_retrieved": len(retrieved_docs)
        },
        "metadata": {
            "query": query[:100],  # truncated to keep the payload small
            "sources": [doc.metadata.get('source') for doc in retrieved_docs]
        }
    }

    send_evaluation_to_trackio(metric_payload)
|
| 61 |
+
|
| 62 |
+
def evaluate_prompt_classification(predicted_mode, actual_conversation_outcome,
                                   thread_id, conversation_state=None):
    """Track prompt-classifier accuracy in production.

    Args:
        predicted_mode: Mode chosen by the prompt classifier
            ('discovery_mode', 'teaching_mode' or 'conversational').
        actual_conversation_outcome: dict describing how the turn went
            (keys such as 'clarified_topic', 'quality_score',
            'user_satisfied' — schema assumed from usage below).
        thread_id: Conversation thread identifier (currently unused in
            the computation; kept for interface stability).
        conversation_state: Optional list of conversation messages.
            BUG FIX: the original referenced this name without defining
            it (guaranteed NameError); it is now an optional,
            backward-compatible parameter.

    Returns:
        dict with 'prompt_classifier_accuracy' (1.0 or 0.0),
        'predicted_mode', and 'conversation_length'.
    """
    # Per-mode success predicates: did the predicted mode lead to a
    # successful interaction?
    success_indicators = {
        'discovery_mode': lambda outcome: 'clarified_topic' in outcome,
        'teaching_mode': lambda outcome: outcome.get('quality_score', 0) > 3.5,
        'conversational': lambda outcome: outcome.get('user_satisfied', False)
    }

    # Unknown modes are treated as correct (no predicate to falsify them).
    mode_was_correct = success_indicators.get(
        predicted_mode,
        lambda outcome: True
    )(actual_conversation_outcome)

    # BUG FIX: the original returned Metrics.ACCURACY (a metric-definition
    # object) on success but the int 0 on failure — inconsistent types for
    # downstream aggregation. Emit a numeric score in both cases.
    accuracy_metric = 1.0 if mode_was_correct else 0.0

    return {
        "prompt_classifier_accuracy": accuracy_metric,
        "predicted_mode": predicted_mode,
        "conversation_length": len(conversation_state) if conversation_state is not None else 0
    }
|
| 86 |
+
|
| 87 |
+
def process_user_feedback(response_id, feedback_type, conversation_state):
    """Convert user feedback into LightEval-style ground truth.

    Args:
        response_id: Identifier of the agent response being rated.
        feedback_type: 'thumbs_up' or anything else (treated as negative).
        conversation_state: Message list; the last two entries must be the
            user's question and the agent's response, in that order
            (assumption — raises IndexError if fewer than two messages).

    Returns:
        dict with 'user_feedback_accuracy' (1.0 / 0.0), 'response_id',
        and 'eval_doc' (the retrospective LightEval Doc).
    """
    last_exchange = {
        "query": conversation_state[-2]["content"],     # User's question
        "response": conversation_state[-1]["content"],  # Agent's response
        # gold_index 0 marks the response as the correct choice;
        # -1 means no/negative ground truth (NOTE(review): confirm -1 is
        # a valid sentinel for lighteval's Doc).
        "gold_index": 0 if feedback_type == "thumbs_up" else -1
    }

    # Retrospective evaluation document with ground truth attached.
    from lighteval.tasks.requests import Doc
    doc = Doc(
        task_name="user_feedback_eval",
        query=last_exchange["query"],
        choices=[last_exchange["response"]],
        gold_index=last_exchange["gold_index"]
    )

    accuracy = 1.0 if feedback_type == "thumbs_up" else 0.0

    # BUG FIX: the original built `doc` and then discarded it; return it
    # so callers can actually use the ground-truth Doc (extra key is
    # backward-compatible for existing callers).
    return {
        "user_feedback_accuracy": accuracy,
        "response_id": response_id,
        "eval_doc": doc
    }
|
| 109 |
+
|