# Spaces: Sleeping
| import warnings | |
| warnings.filterwarnings("ignore", category=DeprecationWarning) | |
| from datasets import Dataset | |
| from ragas import evaluate | |
| from ragas.metrics import faithfulness, context_precision | |
| from app.capstone import full_chain | |
def _mean_score(raw):
    """Collapse a metric value (scalar or per-sample sequence) into one float.

    Args:
        raw: Either a single number or an iterable of per-sample scores.

    Returns:
        The scalar converted to ``float``, or the arithmetic mean of the
        usable per-sample scores. ``None`` and NaN entries are skipped so a
        single unscored sample does not turn the whole mean into NaN. When no
        usable score remains (including an empty sequence) NaN is returned
        instead of raising ``ZeroDivisionError``.
    """
    import math  # local import keeps this block self-contained

    try:
        values = list(raw)
    except TypeError:
        # Not iterable: already a scalar score.
        return float(raw)

    usable = [float(v) for v in values if v is not None]
    usable = [v for v in usable if not math.isnan(v)]
    if not usable:
        return float("nan")
    return sum(usable) / len(usable)


def run_eval(target_score: float = 0.8) -> dict:
    """Evaluate ``full_chain`` on a fixed question set and report the scores.

    Each hard-coded question is passed to ``full_chain.invoke``; the
    ``answer`` and ``retrieved_contexts`` attributes of every result are added
    to the evaluation data, which is then scored with the ``faithfulness`` and
    ``context_precision`` metrics. Progress, raw results, the aggregated
    scores and a pass/fail verdict are printed to stdout.

    Args:
        target_score: Minimum value both aggregated metrics must reach for
            the run to be reported as successful. Defaults to 0.8.

    Returns:
        A dict with the keys ``"faithfulness"`` and ``"context_precision"``
        mapped to the aggregated (mean) score of each metric.
    """
    eval_data = {
        "question": [
            "What is the minimum password length?",
            "Can interns work remotely?",
            "What is the per diem for international travel?",
            "How quickly must I report a security breach?",
            "What happens after two bad performance reviews?",
        ],
        "ground_truth": [
            "Passwords must be minimum 14 characters with uppercase, lowercase, numbers, and special characters.",
            "Interns must receive written approval from their department head.",
            "Per diem for meals is $100 per day international.",
            "All security incidents must be reported to security@techcorp.com within 1 hour of discovery.",
            "Two consecutive unsatisfactory reviews may result in revocation of remote work privileges.",
        ],
    }

    # Run every question through the chain, keeping answers and contexts
    # in the same order as the questions.
    answers = []
    contexts = []
    for question in eval_data["question"]:
        print(f"Processing: {question}")
        result = full_chain.invoke({"question": question})
        answers.append(result.answer)
        contexts.append(result.retrieved_contexts)

    eval_data["answer"] = answers
    eval_data["contexts"] = contexts
    dataset = Dataset.from_dict(eval_data)

    # Compute metrics
    results = evaluate(dataset, metrics=[faithfulness, context_precision])
    print("\nEvaluation Results:", results)

    # Extract scores. A metric may come back as a single number or as a
    # sequence of per-sample scores; sequences are averaged.
    raw_faith = results["faithfulness"]
    print(f"DEBUG: raw_faith = {raw_faith!r} (type: {type(raw_faith)})")
    raw_precision = results["context_precision"]
    print(f"DEBUG: raw_precision = {raw_precision!r} (type: {type(raw_precision)})")

    faithfulness_score = _mean_score(raw_faith)
    context_precision_score = _mean_score(raw_precision)

    print(f"\nFaithfulness: {faithfulness_score:.4f}")
    print(f"Context Precision: {context_precision_score:.4f}")

    # A NaN score compares False here, so an unscorable run is reported
    # as below target rather than as a success.
    if faithfulness_score >= target_score and context_precision_score >= target_score:
        print("✅ Target scores achieved!")
    else:
        print("❌ Scores below target. Review retrieval or prompts.")

    return {
        "faithfulness": faithfulness_score,
        "context_precision": context_precision_score,
    }
# Script entry point: run the evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_eval()