Spaces:

ABAO77
/

chatbot-rag-fi

Sleeping

App Files Files Community

chatbot-rag-fi / scripts /eval_prompt.py

ABAO77

Upload 147 files

0df80b4 verified about 1 month ago

raw

history blame contribute delete

9.48 kB

	"""
	Prompt evaluation script.
	Tests the 3-behavior framework: Clarify, Solve, Invite.
	Run: python scripts/eval_prompt.py
	"""
	import httpx
	import json
	import sys

	BASE_URL = "http://localhost:8000/api"

	TESTS = [
	{
	"id": "T1_vague",
	"label": "Vague first message → should clarify (1 question only)",
	"message": "I want help with my brand.",
	"history": [],
	"expect": {
	"should_clarify": True,
	"should_solve": False,
	"should_invite": False,
	"note": "Message is too vague. Agent should ask ONE short clarifying question, no contact push.",
	},
	},
	{
	"id": "T2_clear",
	"label": "Clear specific question → skip clarify, go straight to solution + invite",
	"message": "How do I build a content strategy for a B2B SaaS startup that is launching its first product?",
	"history": [],
	"expect": {
	"should_clarify": False,
	"should_solve": True,
	"should_invite": True,
	"note": "Intent is clear. Agent should answer with substance and end with specialist invite.",
	},
	},
	{
	"id": "T3_offtopic",
	"label": "Off-topic question → polite redirect + one question",
	"message": "What is the best programming language to learn in 2025?",
	"history": [],
	"expect": {
	"should_clarify": False,
	"should_solve": False,
	"should_invite": False,
	"note": "Out of scope. Agent should politely redirect to branding/marketing and ask one question.",
	},
	},
	{
	"id": "T4_after_clarify",
	"label": "Follow-up with context provided → should now solve + invite",
	"message": "We sell project management tools to enterprise companies. Our main problem is that no one knows us despite having a great product.",
	"history": [
	{"role": "user", "content": "I want help with my brand."},
	{"role": "assistant", "content": "What industry are you in and who is your main target audience?"},
	],
	"expect": {
	"should_clarify": False,
	"should_solve": True,
	"should_invite": True,
	"note": "Context is now clear. Should deliver a solid solution and invite, not ask more questions.",
	},
	},
	{
	"id": "T5_ceo_consent",
	"label": "User agrees to specialist contact → CEO escalation triggered",
	"message": "Yes, I would like to speak with someone from your team.",
	"history": [
	{"role": "user", "content": "How do I differentiate my brand?"},
	{"role": "assistant", "content": "Brand differentiation starts with a clear point of view. You need to identify what only you can say and say it consistently. If you would like a more tailored perspective from our team, I can make that happen."},
	],
	"expect": {
	"should_clarify": False,
	"should_solve": False,
	"should_invite": False,
	"ceo_triggered": True,
	"note": "User clearly consented. notify_ceo_interest should be called, contact details shared.",
	},
	},
	{
	"id": "T6_finnish",
	"label": "Finnish message → answer in Finnish",
	"message": "Miten rakennan vahvan brändin pienelle yritykselle?",
	"history": [],
	"expect": {
	"should_clarify": False,
	"should_solve": True,
	"should_invite": True,
	"language": "fi",
	"note": "Finnish input. Answer must be in Finnish, end with Finnish invite.",
	},
	},
	{
	"id": "T7_no_bullet_lists",
	"label": "Multi-point answer should be prose, not bullet points",
	"message": "What are the key elements of a strong brand identity?",
	"history": [],
	"expect": {
	"should_solve": True,
	"should_invite": True,
	"no_bullets": True,
	"note": "Answer must be written in prose paragraphs, no bullet points or numbered lists.",
	},
	},
	]


	def call_chat(message: str, history: list) -> str:
	"""Call /api/chat/stream and collect all token deltas into a full response string."""
	payload = {"message": message, "history": history}
	full_text = ""
	ceo_triggered = False
	current_event = None
	try:
	with httpx.Client(timeout=60) as client:
	with client.stream("POST", f"{BASE_URL}/chat/stream", json=payload) as resp:
	resp.raise_for_status()
	for line in resp.iter_lines():
	if not line or line.startswith(":"):
	continue
	if line.startswith("event:"):
	current_event = line[6:].strip()
	# CEO tool has needs_approval=True → creates an "interrupt" event
	if current_event in ("ceo_email_sent", "interrupt"):
	ceo_triggered = True
	elif line.startswith("data:"):
	raw = line[5:].strip()
	try:
	data = json.loads(raw)
	except json.JSONDecodeError:
	continue
	if "delta" in data:
	full_text += data["delta"]
	# interrupt payload may also confirm notify_ceo_interest was called
	if current_event == "interrupt":
	interruptions = data.get("interruptions", [])
	for item in interruptions:
	if item.get("name") == "notify_ceo_interest":
	ceo_triggered = True
	except Exception as e:
	return f"[ERROR: {e}]", False
	return full_text.strip(), ceo_triggered


	def grade(test: dict, response: str, ceo_triggered: bool) -> dict:
	expect = test["expect"]
	result = {"id": test["id"], "label": test["label"], "response_preview": response[:300], "issues": [], "pass": True}

	# Detect clarifying question (ends with ?, single question, short)
	question_marks = response.count("?")
	is_clarify = question_marks >= 1 and len(response) < 400

	# Detect substantial solution (longer response)
	is_solve = len(response) > 200

	# Detect invite sentence (loose keyword match)
	invite_keywords = ["specialist", "team", "arrange", "contact", "asiantuntija", "jatkaa", "onnistuu", "reach out"]
	is_invite = any(kw in response.lower() for kw in invite_keywords)

	# Detect bullet points
	has_bullets = any(line.strip().startswith(("-", "*", "•")) or (len(line) > 2 and line.strip()[0].isdigit() and line.strip()[1] == ".") for line in response.split("\n"))

	if expect.get("should_clarify") and not is_clarify:
	result["issues"].append("❌ Expected a clarifying question but got a long response")
	if expect.get("should_clarify") is False and is_clarify and not is_solve:
	result["issues"].append("❌ Asked clarifying question when context was already clear")
	if expect.get("should_solve") and not is_solve:
	result["issues"].append("❌ Expected a substantive solution but response is too short")
	if expect.get("should_invite") and not is_invite:
	result["issues"].append("❌ Missing specialist invite at the end")
	if expect.get("should_invite") is False and is_invite and not expect.get("ceo_triggered"):
	result["issues"].append("⚠️ Invite present when not expected (minor)")
	if expect.get("ceo_triggered") and not ceo_triggered:
	result["issues"].append("❌ CEO escalation not triggered after clear user consent")
	if expect.get("no_bullets") and has_bullets:
	result["issues"].append("❌ Response uses bullet points — must be prose only")
	if expect.get("language") == "fi" and not any(w in response for w in ["ja", "on", "tai", "myös", "Jos", "brändi"]):
	result["issues"].append("❌ Response does not appear to be in Finnish")

	if result["issues"]:
	result["pass"] = False
	return result


	def main():
	print(f"\n{'='*70}")
	print(" BRÄNDIVÄLKKY PROMPT EVALUATION SUITE")
	print(f"{'='*70}\n")

	results = []
	for test in TESTS:
	print(f"Running [{test['id']}] {test['label']}...")
	print(f" Note: {test['expect']['note']}")
	response, ceo_triggered = call_chat(test["message"], test["history"])
	graded = grade(test, response, ceo_triggered)
	results.append(graded)

	status = "✅ PASS" if graded["pass"] else "❌ FAIL"
	print(f" Status: {status}")
	if graded["issues"]:
	for issue in graded["issues"]:
	print(f" {issue}")
	print(f" Response preview: {graded['response_preview'][:200]!r}")
	print()

	passed = sum(1 for r in results if r["pass"])
	total = len(results)
	print(f"{'='*70}")
	print(f" RESULT: {passed}/{total} tests passed")
	print(f"{'='*70}\n")
	if passed < total:
	print("Failed tests:")
	for r in results:
	if not r["pass"]:
	print(f" - [{r['id']}] {r['label']}")
	for issue in r["issues"]:
	print(f" {issue}")
	return 0 if passed == total else 1


	if __name__ == "__main__":
	sys.exit(main())