animetix-web / src /scripts /mlops_rag_eval.py
MissawB's picture
Upload folder using huggingface_hub
6a8fee8 verified
Raw
History Blame Contribute Delete
1.92 kB
import os
import json
import argparse
from backend.animetix.services import AnimetixService
from core.domain.services.ragas_eval_service import RagasEvalService
def run_mlops_eval():
    """
    Run an automated RAG quality evaluation on a small sample of queries.

    For every sample query an answer is generated with
    ``animetix.agentic_rag.plan_and_solve`` and then handed, together with
    the query and its ``ctx`` string, to
    ``RagasEvalService.evaluate_response``. The ``faithfulness`` and
    ``answer_relevancy`` values of the returned scores are averaged over
    all queries, printed as a short report, and returned.

    Returns:
        dict: ``avg_faithfulness`` and ``avg_answer_relevancy`` (means over
        the sample) plus ``timestamp`` (``str(datetime.now())``).
    """
    # Imported here rather than relied upon as a module global: the file
    # only binds `datetime` inside its `__main__` guard, so calling this
    # function after `import mlops_rag_eval` used to raise NameError when
    # the report was built.
    from datetime import datetime

    print("🧪 Starting Automated RAG Evaluation (RAGAS)...")

    animetix = AnimetixService()
    # The service's inference adapter is reused as the judge engine.
    judge = animetix.inference_adapter
    eval_service = RagasEvalService(judge_engine=judge)

    # Test sample (simplified golden dataset).
    test_queries = [
        {"q": "Qui est le créateur de One Piece ?", "type": "Anime", "ctx": "Eiichiro Oda"},
        {"q": "Quelle est l'intrigue de Akira ?", "type": "Movie", "ctx": "Katsuhiro Otomo, néo-tokyo"},
    ]

    all_scores = []
    for item in test_queries:
        print(f"🧐 Evaluating query: '{item['q']}'")

        # 1. Generate the answer.
        response = animetix.agentic_rag.plan_and_solve(item['q'], item['type'])

        # 2. Score it.
        scores = eval_service.evaluate_response(item['q'], item['ctx'], response)
        all_scores.append({"query": item['q'], "scores": scores})

    # Final statistics: plain arithmetic means over the evaluated queries.
    total = len(all_scores)
    avg_faith = sum(s['scores']['faithfulness'] for s in all_scores) / total
    avg_relevancy = sum(s['scores']['answer_relevancy'] for s in all_scores) / total

    report = {
        "avg_faithfulness": avg_faith,
        "avg_answer_relevancy": avg_relevancy,
        "timestamp": str(datetime.now()),
    }

    print("\n" + "═"*30)
    print("📊 MLOPS REPORT")
    print(f"✅ Avg Faithfulness: {avg_faith:.2f}")
    print(f"🎯 Avg Relevancy: {avg_relevancy:.2f}")
    print("═"*30)

    return report
if __name__ == "__main__":
    # Script entry point. This import binds `datetime` as a module-level
    # name before the evaluation runs; it only happens when the file is
    # executed directly, not when the module is imported.
    from datetime import datetime
    run_mlops_eval()