# Spaces: Sleeping | File size: 5,962 Bytes | 9ae77d7
import json
import os
import time
from pathlib import Path
from dotenv import load_dotenv
# Import the agent logic from core.agent
from core.agent import init_agent, answer_query
# Load environment variables from a local .env file (python-dotenv).
# NOTE(review): this call runs after `core.agent` has already been imported
# above; if that module reads environment variables at import time they will
# not be populated yet -- confirm against core/agent.py.
load_dotenv()
# Directory two levels above this file (the parent of the folder that contains this script).
BASE_DIR = Path(__file__).resolve().parent.parent
# Input: JSON evaluation set loaded by run_evaluation().
EVAL_SET_PATH = BASE_DIR / "data" / "test_set" / "eval_set.json"
# Output: JSON summary report written by run_evaluation().
REPORT_PATH = BASE_DIR / "data" / "test_set" / "evaluation_report.json"
# Output directory for the per-question markdown logs written by save_detailed_log().
DEBUG_DIR = BASE_DIR / "evaluation_debug"
# Filler words ignored by fuzzy_match's keyword check. Built once at import
# time instead of on every call.
_FUZZY_STOP_WORDS = frozenset(
    {"and", "the", "a", "an", "is", "for", "vs", "to", "of", "with"}
)


def fuzzy_match(topic, answer, threshold=0.75):
    """Return True when *topic* appears to be covered by *answer*.

    The check is case-insensitive and deliberately more lenient than a strict
    substring test:

    1. If the whole topic occurs verbatim in the answer, it matches.
    2. Otherwise the topic is split on whitespace, stop words are dropped, and
       the topic matches when at least ``threshold`` of the remaining words
       occur somewhere in the answer. Words are matched as substrings, so
       "pipeline" is found inside "pipelines".

    Args:
        topic: Expected topic phrase, e.g. "RAG Pipelines".
        answer: Generated answer text. ``None`` is treated as an empty answer
            instead of raising ``AttributeError``.
        threshold: Minimum fraction (0..1) of significant topic words that
            must be present in the answer. Defaults to 0.75.

    Returns:
        bool: True on a direct or keyword match, False otherwise. A topic
        made up only of stop words never matches through the keyword path.
    """
    if answer is None:
        answer = ""

    topic_clean = topic.lower().strip()
    answer_clean = answer.lower().strip()

    # 1. Direct match.
    if topic_clean in answer_clean:
        return True

    # 2. Keyword subset check: catches "RAG Pipelines" vs "AI pipelines for RAG".
    words = [w for w in topic_clean.split() if w not in _FUZZY_STOP_WORDS]
    if not words:
        # Nothing significant to look for; also avoids dividing by zero below.
        return False

    matches = sum(1 for w in words if w in answer_clean)
    return (matches / len(words)) >= threshold
def save_detailed_log(qid, question, answer, chunks, retrieval_pass, missing_topics):
    """Write a markdown report for one evaluated question to ``DEBUG_DIR/<qid>.md``.

    The report is meant for manual inspection. It contains the question, the
    retrieval and generation PASS/FAIL status, any missing topics, the
    generated answer, and each retrieved chunk with its ``source`` metadata.
    """
    os.makedirs(DEBUG_DIR, exist_ok=True)

    retrieval_status = "PASS" if retrieval_pass else "FAIL"
    # Generation passes only when no expected topic is missing.
    generation_status = "FAIL" if missing_topics else "PASS"

    header = [
        f"# Evaluation Log: {qid}\n\n",
        f"## Question\n{question}\n\n",
        "## Status\n",
        f"- **Retrieval Mode:** {retrieval_status}\n",
        f"- **Generation Mode:** {generation_status}\n",
    ]
    if missing_topics:
        header.append(f"- **Missing Topics:** {', '.join(missing_topics)}\n")
    header.append(f"\n## ArunCore Answer\n{answer}\n\n")
    header.append("## Retrieved Chunks (Final Top 5)\n")

    with open(DEBUG_DIR / f"{qid}.md", "w", encoding="utf-8") as log_file:
        log_file.writelines(header)
        for number, doc in enumerate(chunks, start=1):
            log_file.write(f"### Chunk {number} | Source: {doc.metadata.get('source')}\n")
            log_file.write(f"```text\n{doc.page_content}\n```\n\n")
def _format_accuracy(passed, total):
    """Return passed/total as a two-decimal percentage string ("0.00%" when total is 0)."""
    if not total:
        return "0.00%"
    return f"{(passed / total) * 100:.2f}%"


def _retrieval_passed(expected_source, chunks):
    """Return True when expected_source is static or found in a chunk's ``source`` metadata."""
    # Sources under "static/" are counted as a pass without inspecting the chunks.
    if expected_source.startswith("static/"):
        return True
    wanted = expected_source.lower()
    return any(
        wanted in (doc.metadata.get("source") or "").lower() for doc in chunks
    )


def run_evaluation(cooldown_seconds=6.5):
    """Run every question of the eval set through the agent and write a report.

    Steps:
        1. Initialise the agent via ``init_agent()``; abort with a message on failure.
        2. Load the eval set from ``EVAL_SET_PATH``; abort if the file is missing.
        3. For each test case, query the agent and run two checks:
           retrieval (expected source among the retrieved chunks) and
           generation (every expected topic fuzzy-matches the answer).
           A markdown log per question is written via ``save_detailed_log``.
        4. Write a JSON summary to ``REPORT_PATH`` and print the accuracies.

    Test cases whose query raises, or that lack a ``question`` or
    ``expected_source``, are recorded with ``status: "ERROR"`` and still count
    towards ``total_questions``. An empty eval set yields "0.00%" accuracies
    instead of a division by zero.

    Args:
        cooldown_seconds: Pause before every query except the first, to stay
            under the provider rate limit. Values <= 0 disable the pause.
    """
    print("--- ArunCore Dual-Evaluation Pipeline (Fuzzy Match + Rate Limit Handling) ---")

    # 1. Initialize Agent
    print("Initializing Agent...")
    try:
        vectorstore, bm25_retriever, compressor, llm, prompt = init_agent()
    except Exception as e:
        print(f"Failed to initialize agent: {e}")
        return

    # 2. Load Eval Set
    if not EVAL_SET_PATH.exists():
        print(f"Eval set not found at {EVAL_SET_PATH}")
        return
    with open(EVAL_SET_PATH, "r", encoding="utf-8") as f:
        eval_set = json.load(f)

    results = []
    passed_retrieval = 0
    passed_generation = 0
    total = len(eval_set)
    print(f"Starting evaluation of {total} questions...\n")

    for i, test in enumerate(eval_set):
        qid = test.get("id", f"Q{i}")
        question = test.get("question")
        expected_source = test.get("expected_source")
        # A JSON null for expected_topics is treated like an empty list.
        expected_topics = test.get("expected_topics") or []

        # Malformed test cases are reported instead of crashing the whole run.
        if not question or expected_source is None:
            message = "Test case is missing 'question' or 'expected_source'"
            print(f"[{i+1}/{total}] Skipping {qid}: {message}")
            results.append({
                "id": qid,
                "status": "ERROR",
                "error": message
            })
            continue

        print(f"[{i+1}/{total}] Evaluating {qid}: {question[:60]}...")

        # We add a delay to satisfy the 10/min Cohere Trial Limit
        if i > 0 and cooldown_seconds > 0:
            print(f" (Rate limit cool-down: {cooldown_seconds}s)")
            time.sleep(cooldown_seconds)

        # Execute Agent
        try:
            response = answer_query(question, vectorstore, bm25_retriever, compressor, llm, prompt)
            answer = response["answer"]
            chunks = response["retrieved_chunks"]
        except Exception as e:
            # Per-question boundary: record the failure and move on.
            print(f" Error Querying Agent: {e}")
            results.append({
                "id": qid,
                "status": "ERROR",
                "error": str(e)
            })
            continue

        # --- Layer 1: Retrieval Check ---
        retrieval_pass = _retrieval_passed(expected_source, chunks)
        if retrieval_pass:
            passed_retrieval += 1

        # --- Layer 2: Generation Check ---
        # Fuzzy match for topics
        missing_topics = [
            topic for topic in expected_topics if not fuzzy_match(topic, answer)
        ]
        generation_pass = not missing_topics
        if generation_pass:
            passed_generation += 1

        # Log detailed human-readable file
        save_detailed_log(qid, question, answer, chunks, retrieval_pass, missing_topics)

        # Store result in summary list
        results.append({
            "id": qid,
            "retrieval": "PASS" if retrieval_pass else "FAIL",
            "generation": "PASS" if generation_pass else "FAIL",
            "missing": missing_topics
        })

    # 3. Final Report
    report = {
        "summary": {
            "total_questions": total,
            "retrieval_accuracy": _format_accuracy(passed_retrieval, total),
            "generation_accuracy": _format_accuracy(passed_generation, total),
        },
        "details": results
    }
    with open(REPORT_PATH, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=4)

    print("\n" + "="*40)
    print("EVALUATION COMPLETE")
    print(f"Retrieval Accuracy: {report['summary']['retrieval_accuracy']}")
    print(f"Generation Accuracy: {report['summary']['generation_accuracy']}")
    print(f"Detailed logs saved to: {DEBUG_DIR}")
    print("="*40)
# Script entry point: run the evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_evaluation()
|