Spaces:
Sleeping
Sleeping
Upload LightEval_Mimir.py
Browse files- LightEval_Mimir.py +109 -0
LightEval_Mimir.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LightEval_Mimir.py
|
| 2 |
+
'''This module outlines the LightEval setup for tracking performance metrics of Mimir, to be sent to the trackio page for visualization.'''
|
| 3 |
+
|
| 4 |
+
# Imports
import time
import uuid

from lighteval.metrics.metrics_sample import BertScore, ROUGE
from lighteval.tasks.requests import Doc
|
| 7 |
+
|
| 8 |
+
async def evaluate_educational_quality(user_query, response, thread_id):
    """Evaluate one conversational turn with LightEval-style metrics.

    Args:
        user_query: The user's question for this turn.
        response: The agent's generated answer (plain text).
        thread_id: Conversation thread identifier, used to name the
            ephemeral evaluation task.

    Returns:
        dict with:
            'semantic_quality'  - result of BertScore().compute(doc)
            'educational_score' - fraction (0.0-1.0) of heuristic
                                  educational indicators satisfied
            'response_time'     - seconds spent inside this evaluation
    """
    # BUG FIX: the original referenced `start_time` (and `time`) without
    # ever defining them, raising NameError; capture the start here so
    # 'response_time' measures the duration of this evaluation.
    start_time = time.time()

    # Ephemeral task for this single turn; gold_index=-1 because there is
    # no ground truth at generation time.
    doc = Doc(
        task_name=f"turn_{thread_id}",
        query=user_query,
        choices=[response],
        gold_index=-1,  # No ground truth initially
        specific_output=response,
    )

    # Semantic quality via BertScore.
    bert_score = BertScore().compute(doc)

    # Heuristic educational-coherence indicators (all booleans).
    lowered = response.lower()  # hoisted: reused by two indicators
    educational_indicators = {
        'has_examples': 'example' in lowered,
        'structured_explanation': '##' in response or '1.' in response,
        'appropriate_length': 100 < len(response) < 1500,
        'encourages_learning': any(
            phrase in lowered
            for phrase in ['practice', 'try', 'consider', 'think about']
        ),
    }

    return {
        'semantic_quality': bert_score,
        'educational_score': sum(educational_indicators.values()) / len(educational_indicators),
        'response_time': time.time() - start_time,
    }
|
| 36 |
+
|
| 37 |
+
def track_rag_performance(query, retrieved_docs, used_in_response):
    """Evaluate RAG retrieval quality and ship the result to trackio.

    Args:
        query: The user query that triggered retrieval.
        retrieved_docs: Documents returned by the retriever; each is
            expected to expose a ``metadata`` dict with a 'source' key
            (assumption — TODO confirm against the retriever's doc type).
        used_in_response: The subset of retrieved docs actually used in
            the generated answer.

    Side effects:
        Sends a structured evaluation payload via
        ``send_evaluation_to_trackio``; returns None.
    """
    # NOTE: the original imported SampleLevelMetric here but never used
    # it; the dead import has been removed. `uuid` is now imported at
    # module level (it was previously referenced without any import).

    # Retrieval-to-response alignment.
    retrieval_relevance = calculate_relevance(query, retrieved_docs)
    # Guard against division by zero when nothing was retrieved.
    retrieval_usage = len(used_in_response) / len(retrieved_docs) if retrieved_docs else 0

    # Log to trackio with a LightEval-style payload.
    metric_payload = {
        "evaluation_id": str(uuid.uuid4()),
        "task": "rag_retrieval",
        "metrics": {
            "retrieval_relevance": retrieval_relevance,
            "retrieval_usage_rate": retrieval_usage,
            "num_docs_retrieved": len(retrieved_docs)
        },
        "metadata": {
            "query": query[:100],  # truncated to keep the payload small
            "sources": [doc.metadata.get('source') for doc in retrieved_docs]
        }
    }

    send_evaluation_to_trackio(metric_payload)
|
| 61 |
+
|
| 62 |
+
def evaluate_prompt_classification(predicted_mode, actual_conversation_outcome,
                                   thread_id, conversation_state=None):
    """Track prompt-classifier accuracy in production.

    Args:
        predicted_mode: Mode chosen by the prompt classifier
            ('discovery_mode', 'teaching_mode' or 'conversational').
        actual_conversation_outcome: dict describing how the turn went
            (keys such as 'clarified_topic', 'quality_score',
            'user_satisfied' — schema assumed from usage below).
        thread_id: Conversation thread identifier (currently unused in
            the computation; kept for interface stability).
        conversation_state: Optional list of conversation messages.
            BUG FIX: the original referenced this name without defining
            it (guaranteed NameError); it is now an optional,
            backward-compatible parameter.

    Returns:
        dict with 'prompt_classifier_accuracy' (1.0 or 0.0),
        'predicted_mode', and 'conversation_length'.
    """
    # Per-mode success predicates: did the predicted mode lead to a
    # successful interaction?
    success_indicators = {
        'discovery_mode': lambda outcome: 'clarified_topic' in outcome,
        'teaching_mode': lambda outcome: outcome.get('quality_score', 0) > 3.5,
        'conversational': lambda outcome: outcome.get('user_satisfied', False)
    }

    # Unknown modes are treated as correct (no predicate to falsify them).
    mode_was_correct = success_indicators.get(
        predicted_mode,
        lambda outcome: True
    )(actual_conversation_outcome)

    # BUG FIX: the original returned Metrics.ACCURACY (a metric-definition
    # object) on success but the int 0 on failure — inconsistent types for
    # downstream aggregation. Emit a numeric score in both cases.
    accuracy_metric = 1.0 if mode_was_correct else 0.0

    return {
        "prompt_classifier_accuracy": accuracy_metric,
        "predicted_mode": predicted_mode,
        "conversation_length": len(conversation_state) if conversation_state is not None else 0
    }
|
| 86 |
+
|
| 87 |
+
def process_user_feedback(response_id, feedback_type, conversation_state):
    """Convert user feedback into LightEval-style ground truth.

    Args:
        response_id: Identifier of the agent response being rated.
        feedback_type: 'thumbs_up' or anything else (treated as negative).
        conversation_state: Message list; the last two entries must be the
            user's question and the agent's response, in that order
            (assumption — raises IndexError if fewer than two messages).

    Returns:
        dict with 'user_feedback_accuracy' (1.0 / 0.0), 'response_id',
        and 'eval_doc' (the retrospective LightEval Doc).
    """
    last_exchange = {
        "query": conversation_state[-2]["content"],     # User's question
        "response": conversation_state[-1]["content"],  # Agent's response
        # gold_index 0 marks the response as the correct choice;
        # -1 means no/negative ground truth (NOTE(review): confirm -1 is
        # a valid sentinel for lighteval's Doc).
        "gold_index": 0 if feedback_type == "thumbs_up" else -1
    }

    # Retrospective evaluation document with ground truth attached.
    from lighteval.tasks.requests import Doc
    doc = Doc(
        task_name="user_feedback_eval",
        query=last_exchange["query"],
        choices=[last_exchange["response"]],
        gold_index=last_exchange["gold_index"]
    )

    accuracy = 1.0 if feedback_type == "thumbs_up" else 0.0

    # BUG FIX: the original built `doc` and then discarded it; return it
    # so callers can actually use the ground-truth Doc (extra key is
    # backward-compatible for existing callers).
    return {
        "user_feedback_accuracy": accuracy,
        "response_id": response_id,
        "eval_doc": doc
    }
|
| 109 |
+
|