Spaces:

neural-arun
/

ArunCore

Running

Neural Arun

ArunCore Deployment

9ae77d7 about 1 month ago

5.96 kB

	import json
	import os
	import time
	from pathlib import Path
	from dotenv import load_dotenv

	# Import the agent logic from core.agent
	from core.agent import init_agent, answer_query

	# Load environment variables
	load_dotenv()

	# Directory two levels above this file; every other path hangs off it.
	BASE_DIR = Path(__file__).resolve().parent.parent
	# Input: the JSON list of evaluation questions.
	EVAL_SET_PATH = BASE_DIR.joinpath("data", "test_set", "eval_set.json")
	# Output: the summary report, written next to the eval set.
	REPORT_PATH = EVAL_SET_PATH.with_name("evaluation_report.json")
	# Output: one Markdown log per evaluated question.
	DEBUG_DIR = BASE_DIR.joinpath("evaluation_debug")

	def fuzzy_match(topic, answer):
	    """Return True when *topic* appears, exactly or approximately, in *answer*.

	    Matching is case-insensitive and happens in two stages:

	    1. Direct match: the whole topic is a substring of the answer.
	    2. Keyword match: the topic is split on whitespace, stop words are
	       dropped, and the topic passes when at least 75% of the remaining
	       words occur in the answer. This catches reorderings such as
	       "RAG Pipelines" vs "AI pipelines for RAG".

	    Keyword lookups are substring checks, not whole-word checks, so a short
	    keyword also matches inside a longer word of the answer.

	    Args:
	        topic: The expected topic phrase.
	        answer: The generated answer text to search.

	    Returns:
	        True if either stage matches, False otherwise. A topic made up only
	        of stop words that fails the direct match returns False.
	    """
	    topic_clean = topic.lower().strip()
	    answer_clean = answer.lower().strip()

	    # 1. Direct match
	    if topic_clean in answer_clean:
	        return True

	    # 2. Keyword subset check on the significant words of the topic
	    stop_words = {"and", "the", "a", "an", "is", "for", "vs", "to", "of", "with"}
	    keywords = [word for word in topic_clean.split() if word not in stop_words]
	    if not keywords:
	        return False

	    hits = sum(1 for word in keywords if word in answer_clean)
	    # If 75% of the important words are there, count it as a pass
	    return hits / len(keywords) >= 0.75

	def save_detailed_log(qid, question, answer, chunks, retrieval_pass, missing_topics):
	    """Write a Markdown log of one evaluated question for manual inspection.

	    The file is created at ``DEBUG_DIR / f"{qid}.md"`` (the directory is
	    created if needed) and overwrites any earlier log for the same id.

	    Args:
	        qid: Question identifier; used as the file name stem.
	        question: The question text that was asked.
	        answer: The answer text that was generated.
	        chunks: Iterable of retrieved documents. Each must expose a
	            ``metadata`` mapping (read via ``.get('source')``) and a
	            ``page_content`` attribute. Every chunk passed in is written.
	        retrieval_pass: Truthy when the retrieval check passed.
	        missing_topics: Expected topics not found in the answer; an empty
	            list means the generation check passed.
	    """
	    debug_dir = Path(DEBUG_DIR)
	    debug_dir.mkdir(parents=True, exist_ok=True)
	    filepath = debug_dir / f"{qid}.md"

	    # Build the whole document first so that a malformed chunk raises before
	    # the file is opened, instead of leaving a truncated log behind.
	    parts = [
	        f"# Evaluation Log: {qid}\n\n",
	        f"## Question\n{question}\n\n",
	        "## Status\n",
	        f"- Retrieval Mode: {'PASS' if retrieval_pass else 'FAIL'}\n",
	        f"- Generation Mode: {'PASS' if not missing_topics else 'FAIL'}\n",
	    ]
	    if missing_topics:
	        parts.append(f"- Missing Topics: {', '.join(missing_topics)}\n")
	    parts.append(f"\n## ArunCore Answer\n{answer}\n\n")
	    parts.append("## Retrieved Chunks (Final Top 5)\n")
	    for number, doc in enumerate(chunks, start=1):
	        # A plain "|" here: "\|" is an invalid escape sequence in a Python
	        # string and would put a literal backslash into the heading.
	        parts.append(f"### Chunk {number} | Source: {doc.metadata.get('source')}\n")
	        parts.append(f"```text\n{doc.page_content}\n```\n\n")

	    with open(filepath, "w", encoding="utf-8") as f:
	        f.write("".join(parts))

	def _retrieval_passed(expected_source, chunks):
	    """Return True when the expected source is among the retrieved chunks."""
	    # Sources under "static/" are passed without inspecting the chunks.
	    if expected_source.startswith("static/"):
	        return True
	    wanted = expected_source.lower()
	    # `or ""` guards against a chunk whose "source" metadata is None.
	    return any(
	        wanted in (doc.metadata.get("source") or "").lower() for doc in chunks
	    )


	def _accuracy(passed, total):
	    """Format passed/total as a percentage string with two decimals."""
	    if not total:
	        return "0.00%"
	    return f"{(passed / total) * 100:.2f}%"


	def run_evaluation(cooldown_seconds=6.5):
	    """Run every question in the eval set through the agent and score it.

	    Each question is checked on two layers:

	    * Retrieval: the expected source must appear (case-insensitive
	      substring) in the ``source`` metadata of a retrieved chunk. Expected
	      sources starting with ``static/`` always pass.
	    * Generation: every expected topic must pass ``fuzzy_match`` against
	      the answer.

	    A per-question Markdown log is written through ``save_detailed_log``
	    and a JSON summary is written to ``REPORT_PATH``. Questions that raise
	    while querying the agent, or that have no question text, are recorded
	    with status "ERROR"; they still count toward the total used for the
	    accuracy percentages.

	    Failures to initialise the agent or to load the eval set, and an empty
	    eval set, are reported on stdout and end the run without a report.

	    Args:
	        cooldown_seconds: Pause inserted before every question after the
	            first. Values <= 0 disable the pause.

	    Returns:
	        None.
	    """
	    print("--- ArunCore Dual-Evaluation Pipeline (Fuzzy Match + Rate Limit Handling) ---")

	    # 1. Initialize Agent
	    print("Initializing Agent...")
	    try:
	        vectorstore, bm25_retriever, compressor, llm, prompt = init_agent()
	    except Exception as e:
	        print(f"Failed to initialize agent: {e}")
	        return

	    # 2. Load Eval Set
	    if not EVAL_SET_PATH.exists():
	        print(f"Eval set not found at {EVAL_SET_PATH}")
	        return

	    try:
	        with open(EVAL_SET_PATH, "r", encoding="utf-8") as f:
	            eval_set = json.load(f)
	    except (OSError, ValueError) as e:
	        # ValueError covers both invalid JSON and undecodable bytes.
	        print(f"Failed to load eval set: {e}")
	        return

	    total = len(eval_set)
	    if total == 0:
	        # Nothing to score; also avoids dividing by zero in the summary.
	        print("Eval set is empty; nothing to evaluate.")
	        return

	    results = []
	    passed_retrieval = 0
	    passed_generation = 0

	    print(f"Starting evaluation of {total} questions...\n")

	    for i, test in enumerate(eval_set):
	        qid = test.get("id", f"Q{i}")
	        question = test.get("question")
	        # A missing expected source becomes "", which is a substring of
	        # every source and therefore passes as soon as any chunk exists.
	        expected_source = test.get("expected_source") or ""
	        expected_topics = test.get("expected_topics", [])

	        if not question:
	            print(f"[{i+1}/{total}] Skipping {qid}: no question provided")
	            results.append({
	                "id": qid,
	                "status": "ERROR",
	                "error": "Missing 'question' field",
	            })
	            continue

	        print(f"[{i+1}/{total}] Evaluating {qid}: {question[:60]}...")

	        # We add a delay to satisfy the 10/min Cohere Trial Limit
	        if i > 0 and cooldown_seconds > 0:
	            print(f" (Rate limit cool-down: {cooldown_seconds}s)")
	            time.sleep(cooldown_seconds)

	        # Execute Agent
	        try:
	            response = answer_query(
	                question, vectorstore, bm25_retriever, compressor, llm, prompt
	            )
	            answer = response["answer"]
	            chunks = response["retrieved_chunks"]
	        except Exception as e:
	            # One failing question must not abort the whole run.
	            print(f" Error Querying Agent: {e}")
	            results.append({
	                "id": qid,
	                "status": "ERROR",
	                "error": str(e),
	            })
	            continue

	        # --- Layer 1: Retrieval Check ---
	        retrieval_pass = _retrieval_passed(expected_source, chunks)
	        if retrieval_pass:
	            passed_retrieval += 1

	        # --- Layer 2: Generation Check ---
	        # Fuzzy match for topics
	        missing_topics = [
	            topic for topic in expected_topics if not fuzzy_match(topic, answer)
	        ]
	        generation_pass = not missing_topics
	        if generation_pass:
	            passed_generation += 1

	        # Log detailed human-readable file
	        save_detailed_log(qid, question, answer, chunks, retrieval_pass, missing_topics)

	        # Store result in summary list
	        results.append({
	            "id": qid,
	            "retrieval": "PASS" if retrieval_pass else "FAIL",
	            "generation": "PASS" if generation_pass else "FAIL",
	            "missing": missing_topics,
	        })

	    # 3. Final Report
	    report = {
	        "summary": {
	            "total_questions": total,
	            "retrieval_accuracy": _accuracy(passed_retrieval, total),
	            "generation_accuracy": _accuracy(passed_generation, total),
	        },
	        "details": results,
	    }

	    with open(REPORT_PATH, "w", encoding="utf-8") as f:
	        json.dump(report, f, indent=4)

	    print("\n" + "=" * 40)
	    print("EVALUATION COMPLETE")
	    print(f"Retrieval Accuracy: {report['summary']['retrieval_accuracy']}")
	    print(f"Generation Accuracy: {report['summary']['generation_accuracy']}")
	    print(f"Detailed logs saved to: {DEBUG_DIR}")
	    print("=" * 40)

	# Run the evaluation only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    run_evaluation()