"""Tests for RAG evaluation: retrieval metrics, LLM-as-judge scoring, and
end-to-end evaluation of the RAG system."""
import sys
from pathlib import Path
# Make the sibling modules importable before the local imports below.
sys.path.insert(0, str(Path(__file__).parent.parent))
from openai import OpenAI
from dotenv import load_dotenv
import os
load_dotenv(Path(__file__).parent.parent.parent.parent.parent / ".env", override=True)
from evaluation import RAGEvaluator, create_test_cases
from rag_system import RAGSystem
def test_mrr_calculation():
    print("\n" + "="*60)
    print("TEST: Mean Reciprocal Rank")
    print("="*60)
    try:
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        evaluator = RAGEvaluator(client)
        # The first relevant document ("doc1") sits at rank 2, so MRR = 1/2.
        retrieved = ["doc3", "doc1", "doc2"]
        relevant = ["doc1"]
        mrr = evaluator.mean_reciprocal_rank(retrieved, relevant)
        expected = 1.0 / 2
        assert abs(mrr - expected) < 0.001, f"MRR should be {expected}, got {mrr}"
        print(f"✓ MRR calculation correct: {mrr}")
        print("✅ MRR test PASSED")
        return True
    except Exception as e:
        print(f"❌ MRR test FAILED: {e}")
        return False
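
# For reference, a minimal sketch of the standard MRR definition this test
# assumes `mean_reciprocal_rank` implements: the reciprocal rank of the first
# relevant hit. Illustrative only; `_reference_mrr` is not part of the evaluator.
def _reference_mrr(retrieved, relevant):
    for rank, doc_id in enumerate(retrieved, start=1):
        if doc_id in relevant:
            return 1.0 / rank
    return 0.0  # no relevant document was retrieved
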
def test_ndcg_calculation():
    print("\n" + "="*60)
    print("TEST: Normalized DCG")
    print("="*60)
    try:
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        evaluator = RAGEvaluator(client)
        # Documents arrive already in ideal (descending-relevance) order.
        retrieved = ["doc1", "doc2", "doc3"]
        relevance_scores = {"doc1": 5, "doc2": 3, "doc3": 1}
        ndcg = evaluator.ndcg_at_k(retrieved, relevance_scores, k=3)
        assert 0 <= ndcg <= 1, f"nDCG should be between 0 and 1, got {ndcg}"
        print(f"✓ nDCG calculation: {ndcg:.4f}")
        print("✅ nDCG test PASSED")
        return True
    except Exception as e:
        print(f"❌ nDCG test FAILED: {e}")
        return False
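
# A sketch of nDCG@k under the usual graded-relevance definition, shown as an
# assumption about `ndcg_at_k`'s internals for illustration only:
# DCG@k = sum(rel_i / log2(i + 1)) over ranks i = 1..k, normalized by the
# DCG of the ideal (relevance-sorted) ordering.
def _reference_ndcg_at_k(retrieved, relevance_scores, k):
    from math import log2
    gains = [relevance_scores.get(doc, 0) for doc in retrieved[:k]]
    dcg = sum(g / log2(i + 2) for i, g in enumerate(gains))
    ideal_gains = sorted(relevance_scores.values(), reverse=True)[:k]
    idcg = sum(g / log2(i + 2) for i, g in enumerate(ideal_gains))
    return dcg / idcg if idcg > 0 else 0.0
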
def test_precision_recall():
    print("\n" + "="*60)
    print("TEST: Precision and Recall")
    print("="*60)
    try:
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        evaluator = RAGEvaluator(client)
        # 2 of the 5 retrieved docs are relevant (doc1, doc3), and 2 of the
        # 3 relevant docs are retrieved, so P@5 = 2/5 and R@5 = 2/3.
        retrieved = ["doc1", "doc2", "doc3", "doc4", "doc5"]
        relevant = ["doc1", "doc3", "doc6"]
        precision = evaluator.precision_at_k(retrieved, relevant, k=5)
        recall = evaluator.recall_at_k(retrieved, relevant, k=5)
        expected_precision = 2 / 5
        expected_recall = 2 / 3
        assert abs(precision - expected_precision) < 0.001, f"Precision should be {expected_precision}, got {precision}"
        assert abs(recall - expected_recall) < 0.001, f"Recall should be {expected_recall}, got {recall}"
        print(f"✓ Precision@5: {precision:.4f}")
        print(f"✓ Recall@5: {recall:.4f}")
        print("✅ Precision/Recall test PASSED")
        return True
    except Exception as e:
        print(f"❌ Precision/Recall test FAILED: {e}")
        return False
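
# The set-based definitions the assertions above rely on, sketched for
# reference (an assumption about the evaluator's internals; not called here):
def _reference_precision_recall_at_k(retrieved, relevant, k):
    hits = len(set(retrieved[:k]) & set(relevant))
    precision = hits / k
    recall = hits / len(relevant) if relevant else 0.0
    return precision, recall
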
def test_llm_as_judge():
    print("\n" + "="*60)
    print("TEST: LLM-as-Judge")
    print("="*60)
    try:
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        evaluator = RAGEvaluator(client)
        query = "What programming languages do you know?"
        answer = "I am proficient in Python, JavaScript, and SQL."
        result = evaluator.llm_as_judge_answer(query, answer)
        # The judge should return every scoring dimension plus written feedback.
        assert "accuracy" in result, "Should have accuracy score"
        assert "completeness" in result, "Should have completeness score"
        assert "relevance" in result, "Should have relevance score"
        assert "coherence" in result, "Should have coherence score"
        assert "overall_score" in result, "Should have overall score"
        assert "feedback" in result, "Should have feedback"
        print(f"✓ Accuracy: {result['accuracy']}/5")
        print(f"✓ Completeness: {result['completeness']}/5")
        print(f"✓ Relevance: {result['relevance']}/5")
        print(f"✓ Coherence: {result['coherence']}/5")
        print(f"✓ Overall: {result['overall_score']}/5")
        print(f"✓ Feedback: {result['feedback'][:50]}...")
        print("✅ LLM-as-Judge test PASSED")
        return True
    except Exception as e:
        print(f"❌ LLM-as-Judge test FAILED: {e}")
        return False
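
# A hedged sketch of how such a judge call could be structured. The model
# name, prompt wording, and JSON contract below are assumptions for
# illustration, not the evaluator's actual implementation.
def _reference_llm_judge(client, query, answer):
    import json
    prompt = (
        "Rate the answer to the query on accuracy, completeness, relevance, "
        "and coherence (1-5 each), plus an overall_score (1-5) and a short "
        f"feedback string. Respond as JSON.\nQuery: {query}\nAnswer: {answer}"
    )
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # assumed model; the real evaluator may differ
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
    )
    return json.loads(response.choices[0].message.content)
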
def test_create_test_cases():
    print("\n" + "="*60)
    print("TEST: Test Case Creation")
    print("="*60)
    try:
        queries = [
            ("What is your experience?", "Expected answer 1"),
            ("What skills do you have?", "Expected answer 2")
        ]
        test_cases = create_test_cases(queries)
        assert isinstance(test_cases, list), "Should return a list"
        assert len(test_cases) == 2, "Should create 2 test cases"
        assert "query" in test_cases[0], "Should have query field"
        assert "ground_truth" in test_cases[0], "Should have ground_truth field"
        print(f"✓ Created {len(test_cases)} test cases")
        print("✅ Test case creation test PASSED")
        return True
    except Exception as e:
        print(f"❌ Test case creation test FAILED: {e}")
        return False
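
# Judging by the assertions above, `create_test_cases` presumably maps each
# (query, expected_answer) pair to a dict; a hypothetical equivalent:
def _reference_create_test_cases(queries):
    return [{"query": q, "ground_truth": answer} for q, answer in queries]
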
def test_rag_evaluation():
    print("\n" + "="*60)
    print("TEST: RAG System Evaluation")
    print("="*60)
    try:
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        evaluator = RAGEvaluator(client)
        rag_system = RAGSystem(client, data_dir="data/test_eval")
        # A tiny two-document knowledge base keeps the end-to-end run fast.
        test_docs = {
            "summary": "Expert Python developer with 5 years experience",
            "projects": "Built ML systems and web applications"
        }
        rag_system.load_knowledge_base(test_docs, chunk_size=15, overlap=3)
        test_cases = create_test_cases([("What programming experience do you have?", "Python development")])
        system_prompt = "Answer questions about professional background."
        results = evaluator.evaluate_rag_system(test_cases, rag_system, system_prompt, method="hybrid")
        assert len(results) > 0, "Should produce evaluation results"
        assert "query" in results.columns, "Should have query column"
        assert "overall_score" in results.columns, "Should have overall_score column"
        print(f"✓ Evaluated {len(results)} queries")
        print(f"✓ Average score: {results['overall_score'].mean():.2f}/5")
        print("✅ RAG evaluation test PASSED")
        return True
    except Exception as e:
        print(f"❌ RAG evaluation test FAILED: {e}")
        return False

def run_all_tests():
    print("\n" + "="*70)
    print("RUNNING EVALUATION TESTS")
    print("="*70)
    tests = [
        test_mrr_calculation,
        test_ndcg_calculation,
        test_precision_recall,
        test_llm_as_judge,
        test_create_test_cases,
        test_rag_evaluation,
    ]
    results = [test() for test in tests]
    print("\n" + "="*70)
    print(f"RESULTS: {sum(results)}/{len(results)} tests passed")
    print("="*70)
    return all(results)


if __name__ == "__main__":
    success = run_all_tests()
    sys.exit(0 if success else 1)