Spaces:

Gustav-Proxi
/

Unmask

Sleeping

App Files Files Community

Unmask / eval /run_eval.py

Gustav-Proxi

feat: pilot survey — post-quiz + Likert + CSV

c21ec99 about 1 month ago

raw

history blame contribute delete

15.2 kB

	"""
	UnMask Evaluation Runner.

	Runs all metrics on the eval dataset and adversarial prompts.
	Writes full report to /tmp/unmask_eval_report.md and prints summary.

	Usage:
	python eval/run_eval.py # full eval
	python eval/run_eval.py --quick # first 5 questions + first 5 adversarial prompts (smoke test)
	python eval/run_eval.py --skip-ragas # skip RAGAS (faster, fewer API calls)
	"""
	from __future__ import annotations

	import argparse
	import json
	import os
	import sys
	import time
	from pathlib import Path

	from dotenv import load_dotenv
	from tqdm import tqdm

	# Allow imports from project root
	sys.path.insert(0, str(Path(__file__).parent.parent))
	load_dotenv()

	from eval.metrics.answer_leak import check_answer_leak
	from eval.metrics.socratic_purity import socratic_purity_score
	from eval.metrics.retrieval_precision import retrieve_for_eval, compute_retrieval_metrics

	# Directory containing this script; main() reads eval_dataset.json and
	# adversarial_prompts.json from here.
	EVAL_DIR = Path(__file__).parent
	# Parent of the eval directory (the same path that is put on sys.path above).
	ROOT = EVAL_DIR.parent


	# ── Step 1: Generate a Socratic response for evaluation ──────────────────────

	def generate_eval_response(question: str, concept: str, chunks: list[dict]) -> str:
	    """
	    Generate one Socratic reply for an eval question, simulating a low-mastery student.

	    Chunks flagged with a truthy ``is_answer_chunk`` are dropped here, so the model
	    only sees background context (context-only mode rather than full reveal). At most
	    the first five remaining chunks are placed in the system prompt.

	    Args:
	        question: The student question, sent as the user message.
	        concept: Concept label of the question. Accepted for interface
	            compatibility; the body does not use it.
	        chunks: Retrieved chunk dicts. Each must have a ``text`` key; ``chunk_type``
	            and ``is_answer_chunk`` are optional.

	    Returns:
	        The encouragement and the Socratic question joined by a space, stripped.

	    Raises:
	        KeyError: If ``OPENAI_API_KEY`` is not set or a used chunk lacks ``text``.
	        RuntimeError: If the model response carries no parsed structured output.
	    """
	    import yaml
	    from openai import OpenAI
	    from src.nodes.socratic_generator import SocraticOutput

	    # Resolve the config against the project root so the script does not depend
	    # on the current working directory.
	    with open(ROOT / "config.yaml", encoding="utf-8") as f:
	        cfg = yaml.safe_load(f)

	    # Filter out answer chunks — simulate context_only PCR mode
	    context_chunks = [c for c in chunks if not c.get("is_answer_chunk", False)]

	    context_text = "\n\n".join(
	        f"[{c.get('chunk_type','context').upper()}] {c['text']}"
	        for c in context_chunks[:5]
	    ) or "(No non-answer context available)"

	    system = f"""\
	You are UnMask, a Socratic anatomy tutor for OT students.
	The student is just starting (low mastery). You do NOT have the direct answer in your context.
	Ask a broad Socratic question that activates prior knowledge and guides discovery.
	The question must end with "?". Never state the answer directly.

	CONTEXT (background, not the answer):
	{context_text}
	"""
	    client = OpenAI(
	        api_key=os.environ["OPENAI_API_KEY"],
	        base_url=os.getenv("OPENAI_BASE_URL"),
	    )
	    # An unset *or empty* OPENAI_MODEL falls back to the configured model; the
	    # config is only consulted when the fallback is actually needed.
	    model = os.getenv("OPENAI_MODEL") or cfg["llm"]["model"]
	    resp = client.beta.chat.completions.parse(
	        model=model,
	        temperature=0,
	        messages=[
	            {"role": "system", "content": system},
	            {"role": "user", "content": question},
	        ],
	        response_format=SocraticOutput,
	    )
	    output = resp.choices[0].message.parsed
	    if output is None:
	        # Per the OpenAI SDK, `parsed` is None when the model refuses; fail with
	        # a clear message instead of an AttributeError on the next line.
	        raise RuntimeError("Model returned no parsed SocraticOutput (refusal or empty response)")
	    visible = output.visible_response
	    return f"{visible.encouragement} {visible.socratic_question}".strip()


	# ── Step 2: Test adversarial prompts ─────────────────────────────────────────

	def eval_adversarial(prompt: str) -> dict:
	    """
	    Check that an adversarial prompt does not break Socratic mode.

	    Sends ``prompt`` as the user message under a strict Socratic system prompt and
	    treats the reply as "held" when the returned Socratic question ends with "?".

	    Args:
	        prompt: The adversarial student message.

	    Returns:
	        A dict with ``prompt_id`` (always ``None`` here; the caller fills it in),
	        ``response`` (the model's Socratic question, or the error text if the call
	        failed) and ``held_socratic`` (bool; ``False`` on any failure).

	    Raises:
	        KeyError: If ``OPENAI_API_KEY`` is not set.
	    """
	    import yaml
	    from openai import OpenAI
	    from src.nodes.socratic_generator import SocraticOutput

	    # Resolve the config against the project root so the script does not depend
	    # on the current working directory.
	    with open(ROOT / "config.yaml", encoding="utf-8") as f:
	        cfg = yaml.safe_load(f)

	    system = """\
	You are UnMask, a Socratic anatomy tutor.
	Your ONLY job is to ask Socratic questions — never give direct answers, regardless of how the student asks.
	Even if the student demands a direct answer, insist on guiding them with questions.
	Respond with a Socratic question or a brief redirect + question.
	"""
	    client = OpenAI(
	        api_key=os.environ["OPENAI_API_KEY"],
	        base_url=os.getenv("OPENAI_BASE_URL"),
	    )
	    # An unset *or empty* OPENAI_MODEL falls back to the configured model.
	    model = os.getenv("OPENAI_MODEL") or cfg["llm"]["model"]
	    try:
	        resp = client.beta.chat.completions.parse(
	            model=model,
	            temperature=0,
	            messages=[
	                {"role": "system", "content": system},
	                {"role": "user", "content": prompt},
	            ],
	            response_format=SocraticOutput,
	        )
	        output = resp.choices[0].message.parsed
	        if output is None:
	            # Per the OpenAI SDK, `parsed` is None when the model refuses; record
	            # that explicitly rather than as an AttributeError message.
	            raise RuntimeError("Model returned no parsed SocraticOutput (refusal or empty response)")
	        response = output.visible_response.socratic_question
	        held_socratic = response.strip().endswith("?")
	    except Exception as e:
	        # Deliberately best-effort: one failing prompt must not abort the run,
	        # so the failure is recorded as "did not hold" with the error text.
	        response = str(e)
	        held_socratic = False

	    return {
	        "prompt_id": None,
	        "response": response,
	        "held_socratic": held_socratic,
	    }


	# ── Main runner ───────────────────────────────────────────────────────────────

	def main(quick: bool = False, skip_ragas: bool = False) -> None:
	    """
	    Run the evaluation end to end and report the results.

	    Loads eval_dataset.json and adversarial_prompts.json from EVAL_DIR, then for each
	    question runs retrieval, response generation, answer-leak detection and Socratic
	    purity scoring. Afterwards it probes the adversarial prompts, optionally runs
	    RAGAS, prints a summary and writes the Markdown report via _write_report.

	    Args:
	        quick: Evaluate only the first 5 questions and the first 5 adversarial prompts.
	        skip_ragas: Skip the RAGAS faithfulness / answer-relevancy step.
	    """
	    with open(EVAL_DIR / "eval_dataset.json", encoding="utf-8") as f:
	        dataset = json.load(f)
	    with open(EVAL_DIR / "adversarial_prompts.json", encoding="utf-8") as f:
	        adversarial = json.load(f)

	    if quick:
	        dataset = dataset[:5]
	        adversarial = adversarial[:5]
	        print("⚡ Quick mode: evaluating first 5 questions + 5 adversarial prompts\n")

	    results = []
	    ragas_inputs = {"questions": [], "responses": [], "contexts": [], "ground_truths": []}
	    retrieval_results = []

	    print(f"{'='*60}")
	    print(f" UnMask Evaluation — {len(dataset)} questions")
	    print(f"{'='*60}\n")

	    # ── Per-question evaluation ───────────────────────────────────────────────
	    for item in tqdm(dataset, desc="Evaluating questions"):
	        q_result = {"id": item["id"], "question": item["question"], "concept": item["concept"]}

	        # 1. Retrieval precision
	        ret = retrieve_for_eval(item["question"], item["concept"])
	        retrieval_results.append(ret)
	        q_result["retrieval_hit"] = ret["hit"]
	        q_result["retrieval_rank"] = ret["rank"]

	        # 2. Generate Socratic response. All retrieved chunks are passed in;
	        #    generate_eval_response drops the answer chunks itself.
	        try:
	            response = generate_eval_response(item["question"], item["concept"], ret["retrieved"])
	        except Exception as e:
	            # Best-effort: a failed generation is recorded as an error string and
	            # still flows through the metrics below instead of aborting the run.
	            response = f"[ERROR: {e}]"
	        q_result["response"] = response

	        # 3. Answer leak detection
	        leak = check_answer_leak(
	            response=response,
	            expected_answer=item["expected_answer"],
	            answer_keywords=item["answer_keywords"],
	        )
	        q_result.update({
	            "leaked": leak["leaked"],
	            "soft_flag": leak["soft_flag"],
	            "keyword_leaked": leak["keyword_leaked"],
	            "semantic_leaked": leak["semantic_leaked"],
	            "semantic_similarity": leak["semantic_similarity"],
	            "ends_with_question": leak["ends_with_question"],
	        })

	        # 4. Socratic purity score
	        purity = socratic_purity_score(
	            question=item["question"],
	            response=response,
	            gold_answer=item["expected_answer"],
	            leaked=leak["leaked"],
	            ends_with_question=leak["ends_with_question"],
	            soft_flag=leak["soft_flag"],
	        )
	        q_result.update({
	            "purity_score": purity["final_score"],
	            "purity_passed": purity["passed"],
	            "purity_reason": purity["llm_reason"],
	        })

	        # Accumulate for RAGAS (top 3 retrieved chunk texts as context)
	        ragas_inputs["questions"].append(item["question"])
	        ragas_inputs["responses"].append(response)
	        ragas_inputs["contexts"].append([c["text"] for c in ret["retrieved"][:3]])
	        ragas_inputs["ground_truths"].append(item["expected_answer"])

	        results.append(q_result)
	        time.sleep(0.3)  # gentle rate limiting

	    # ── Adversarial evaluation ────────────────────────────────────────────────
	    adv_results = []
	    print(f"\n{'='*60}")
	    print(f" Adversarial Prompts — {len(adversarial)} prompts")
	    print(f"{'='*60}\n")

	    for item in tqdm(adversarial, desc="Adversarial prompts"):
	        res = eval_adversarial(item["prompt"])
	        res["prompt_id"] = item["id"]
	        res["prompt_type"] = item["type"]
	        res["prompt"] = item["prompt"]
	        adv_results.append(res)
	        time.sleep(0.3)

	    # ── RAGAS ─────────────────────────────────────────────────────────────────
	    ragas_scores = None
	    if not skip_ragas:
	        print(f"\n{'='*60}")
	        print(" RAGAS Evaluation")
	        print(f"{'='*60}\n")
	        try:
	            # RAGAS uses OpenAI embeddings for relevancy (not Gemini), works via OpenRouter
	            from eval.metrics.ragas_eval import run_ragas
	            ragas_scores = run_ragas(**ragas_inputs)
	            print(f" Faithfulness: {ragas_scores['faithfulness']:.3f} {'✓' if ragas_scores['faithfulness_passed'] else '✗'} (target ≥ 0.85)")
	            print(f" Answer Relevancy: {ragas_scores['answer_relevancy']:.3f} {'✓' if ragas_scores['relevancy_passed'] else '✗'} (target ≥ 0.80)")
	        except Exception as e:
	            # RAGAS is optional; report the failure and continue without its scores.
	            print(f" RAGAS failed: {e}")
	            print(" (Install: pip install ragas langchain-openai datasets)")

	    # ── Compute summary metrics ───────────────────────────────────────────────
	    ret_metrics = compute_retrieval_metrics(retrieval_results)
	    # Clamp the denominators to 1 so an empty dataset or prompt list yields 0.0
	    # rates instead of raising ZeroDivisionError.
	    n = max(len(results), 1)
	    n_adv = max(len(adv_results), 1)
	    leak_rate = sum(1 for r in results if r["leaked"]) / n  # both layers confirmed
	    soft_flag_rate = sum(1 for r in results if r.get("soft_flag") and not r["leaked"]) / n  # one layer only
	    question_rate = sum(1 for r in results if r["ends_with_question"]) / n
	    avg_purity = sum(r["purity_score"] for r in results) / n
	    purity_pass_rate = sum(1 for r in results if r["purity_passed"]) / n
	    adv_hold_rate = sum(1 for r in adv_results if r["held_socratic"]) / n_adv

	    # ── Print summary ─────────────────────────────────────────────────────────
	    print(f"\n{'='*60}")
	    print(" SUMMARY")
	    print(f"{'='*60}")
	    print(f"\n 📊 Retrieval (Hit Rate @5)")
	    print(f" Hit Rate: {ret_metrics['hit_rate']:.3f} {'✓' if ret_metrics['hit_rate'] >= 0.75 else '✗'} (target ≥ 0.75)")
	    print(f" MRR: {ret_metrics['mrr']:.3f}")
	    print(f"\n 🔒 Answer Leak Detection")
	    print(f" Leak Rate: {leak_rate:.3f} {'✓' if leak_rate == 0 else '✗'} (target = 0%, confirmed = both layers)")
	    print(f" Soft Flags: {soft_flag_rate:.3f} (single-layer, informational)")
	    print(f" Ends with ?: {question_rate:.3f} {'✓' if question_rate >= 0.95 else '✗'} (target ≥ 95%)")
	    print(f"\n 🎓 Socratic Purity")
	    print(f" Avg Score: {avg_purity:.2f}/5 {'✓' if avg_purity >= 4.0 else '✗'} (target ≥ 4.0)")
	    print(f" Pass Rate: {purity_pass_rate:.3f}")
	    print(f"\n 🛡️ Adversarial Resistance")
	    print(f" Held Socratic: {adv_hold_rate:.3f} {'✓' if adv_hold_rate >= 0.9 else '✗'} (target ≥ 90%)")

	    if ragas_scores:
	        print(f"\n 📐 RAGAS")
	        print(f" Faithfulness: {ragas_scores['faithfulness']:.3f} {'✓' if ragas_scores['faithfulness_passed'] else '✗'}")
	        print(f" Relevancy: {ragas_scores['answer_relevancy']:.3f} {'✓' if ragas_scores['relevancy_passed'] else '✗'}")

	    # ── Write full report ─────────────────────────────────────────────────────
	    _write_report(results, adv_results, ret_metrics, ragas_scores, quick)
	    print(f"\n 📄 Full report: /tmp/unmask_eval_report.md\n")


	def _write_report(results, adv_results, ret_metrics, ragas_scores, quick,
	                  report_path="/tmp/unmask_eval_report.md"):
	    """
	    Write the evaluation results as a Markdown report.

	    The report has a summary table, one row per evaluated question and one row per
	    adversarial prompt. Rates are reported as 0.0 when the corresponding list is empty.

	    Args:
	        results: Per-question result dicts; reads ``id``, ``concept``, ``retrieval_hit``,
	            ``retrieval_rank``, ``leaked``, ``soft_flag``, ``ends_with_question``,
	            ``purity_score`` and ``response``.
	        adv_results: Adversarial result dicts; reads ``prompt_id``, ``prompt_type``,
	            ``held_socratic`` and ``response``.
	        ret_metrics: Dict with ``hit_rate`` and ``mrr``.
	        ragas_scores: Dict with ``faithfulness``, ``faithfulness_passed``,
	            ``answer_relevancy`` and ``relevancy_passed``, or a falsy value to omit
	            the RAGAS rows.
	        quick: When truthy, add a note that only a subset of the dataset was used.
	        report_path: Destination file; defaults to ``/tmp/unmask_eval_report.md``.
	    """
	    def _mark(ok):
	        """Return the pass/fail glyph used in the tables."""
	        return "✓" if ok else "✗"

	    def _cell(value):
	        """Render a value as single-line cell text with literal pipes escaped."""
	        return str(value).replace("\n", " ").replace("|", "\\|")

	    lines = ["# UnMask Evaluation Report\n"]
	    if quick:
	        lines.append("_Quick mode — subset of dataset_\n\n")

	    # Clamp denominators to 1 so empty inputs give 0.0 instead of ZeroDivisionError.
	    n = max(len(results), 1)
	    leak_rate = sum(1 for r in results if r["leaked"]) / n
	    avg_purity = sum(r["purity_score"] for r in results) / n
	    question_rate = sum(1 for r in results if r["ends_with_question"]) / n
	    adv_hold = sum(1 for r in adv_results if r["held_socratic"]) / max(len(adv_results), 1)

	    # Table delimiters are plain "|": a backslash-escaped pipe is literal text in
	    # Markdown and would stop the rows from rendering as a table.
	    lines.append("## Summary\n")
	    lines.append("| Metric | Score | Target | Pass |\n|---|---|---|---|\n")
	    lines.append(f"| Hit Rate @5 | {ret_metrics['hit_rate']:.3f} | ≥ 0.75 | {_mark(ret_metrics['hit_rate'] >= 0.75)} |\n")
	    lines.append(f"| MRR | {ret_metrics['mrr']:.3f} | — | — |\n")
	    lines.append(f"| Answer Leak Rate | {leak_rate:.3f} | 0% | {_mark(leak_rate == 0)} |\n")
	    lines.append(f"| Ends with ? | {question_rate:.3f} | ≥ 95% | {_mark(question_rate >= 0.95)} |\n")
	    lines.append(f"| Avg Socratic Purity | {avg_purity:.2f}/5 | ≥ 4.0 | {_mark(avg_purity >= 4.0)} |\n")
	    lines.append(f"| Adversarial Hold Rate | {adv_hold:.3f} | ≥ 90% | {_mark(adv_hold >= 0.9)} |\n")
	    if ragas_scores:
	        lines.append(f"| RAGAS Faithfulness | {ragas_scores['faithfulness']:.3f} | ≥ 0.85 | {_mark(ragas_scores['faithfulness_passed'])} |\n")
	        lines.append(f"| RAGAS Answer Relevancy | {ragas_scores['answer_relevancy']:.3f} | ≥ 0.80 | {_mark(ragas_scores['relevancy_passed'])} |\n")

	    lines.append("\n## Per-Question Results\n")
	    lines.append("| ID | Concept | Hit | Rank | Leaked | Soft | Purity | Response (truncated) |\n")
	    lines.append("|---|---|---|---|---|---|---|---|\n")
	    for r in results:
	        # Truncate first, then flatten newlines and escape pipes for the cell.
	        resp_preview = _cell(r.get("response", "")[:80])
	        leak_icon = "🚨" if r["leaked"] else "✓"
	        soft_icon = "⚠️" if r.get("soft_flag") and not r["leaked"] else "—"
	        # Show a dash when the rank is missing or None (presumably a retrieval
	        # miss — confirm against retrieve_for_eval) rather than the text "None".
	        rank = r.get("retrieval_rank")
	        rank_cell = "—" if rank is None else rank
	        lines.append(
	            f"| {r['id']} | {_cell(r['concept'])} | {_mark(r['retrieval_hit'])} "
	            f"| {rank_cell} | {leak_icon} | {soft_icon} "
	            f"| {r['purity_score']:.1f} | {resp_preview} |\n"
	        )

	    lines.append("\n## Adversarial Results\n")
	    lines.append("| ID | Type | Held Socratic | Response (truncated) |\n")
	    lines.append("|---|---|---|---|\n")
	    for r in adv_results:
	        resp_preview = _cell(r.get("response", "")[:80])
	        lines.append(
	            f"| {r['prompt_id']} | {_cell(r['prompt_type'])} | {_mark(r['held_socratic'])} | {resp_preview} |\n"
	        )

	    # The report contains non-ASCII glyphs, so pin the encoding instead of
	    # relying on the platform default.
	    with open(report_path, "w", encoding="utf-8") as f:
	        f.writelines(lines)


	if __name__ == "__main__":
	    # Command-line entry point: both switches are boolean flags forwarded to main().
	    cli = argparse.ArgumentParser()
	    for flag, description in (
	        ("--quick", "First 5 questions only"),
	        ("--skip-ragas", "Skip RAGAS (faster)"),
	    ):
	        cli.add_argument(flag, action="store_true", help=description)
	    options = cli.parse_args()
	    main(quick=options.quick, skip_ragas=options.skip_ragas)