""" Prompt evaluation script. Tests the 3-behavior framework: Clarify, Solve, Invite. Run: python scripts/eval_prompt.py """ import httpx import json import sys BASE_URL = "http://localhost:8000/api" TESTS = [ { "id": "T1_vague", "label": "Vague first message → should clarify (1 question only)", "message": "I want help with my brand.", "history": [], "expect": { "should_clarify": True, "should_solve": False, "should_invite": False, "note": "Message is too vague. Agent should ask ONE short clarifying question, no contact push.", }, }, { "id": "T2_clear", "label": "Clear specific question → skip clarify, go straight to solution + invite", "message": "How do I build a content strategy for a B2B SaaS startup that is launching its first product?", "history": [], "expect": { "should_clarify": False, "should_solve": True, "should_invite": True, "note": "Intent is clear. Agent should answer with substance and end with specialist invite.", }, }, { "id": "T3_offtopic", "label": "Off-topic question → polite redirect + one question", "message": "What is the best programming language to learn in 2025?", "history": [], "expect": { "should_clarify": False, "should_solve": False, "should_invite": False, "note": "Out of scope. Agent should politely redirect to branding/marketing and ask one question.", }, }, { "id": "T4_after_clarify", "label": "Follow-up with context provided → should now solve + invite", "message": "We sell project management tools to enterprise companies. Our main problem is that no one knows us despite having a great product.", "history": [ {"role": "user", "content": "I want help with my brand."}, {"role": "assistant", "content": "What industry are you in and who is your main target audience?"}, ], "expect": { "should_clarify": False, "should_solve": True, "should_invite": True, "note": "Context is now clear. Should deliver a solid solution and invite, not ask more questions.", }, }, { "id": "T5_ceo_consent", "label": "User agrees to specialist contact → CEO escalation triggered", "message": "Yes, I would like to speak with someone from your team.", "history": [ {"role": "user", "content": "How do I differentiate my brand?"}, {"role": "assistant", "content": "Brand differentiation starts with a clear point of view. You need to identify what only you can say and say it consistently. If you would like a more tailored perspective from our team, I can make that happen."}, ], "expect": { "should_clarify": False, "should_solve": False, "should_invite": False, "ceo_triggered": True, "note": "User clearly consented. notify_ceo_interest should be called, contact details shared.", }, }, { "id": "T6_finnish", "label": "Finnish message → answer in Finnish", "message": "Miten rakennan vahvan brändin pienelle yritykselle?", "history": [], "expect": { "should_clarify": False, "should_solve": True, "should_invite": True, "language": "fi", "note": "Finnish input. Answer must be in Finnish, end with Finnish invite.", }, }, { "id": "T7_no_bullet_lists", "label": "Multi-point answer should be prose, not bullet points", "message": "What are the key elements of a strong brand identity?", "history": [], "expect": { "should_solve": True, "should_invite": True, "no_bullets": True, "note": "Answer must be written in prose paragraphs, no bullet points or numbered lists.", }, }, ] def call_chat(message: str, history: list) -> str: """Call /api/chat/stream and collect all token deltas into a full response string.""" payload = {"message": message, "history": history} full_text = "" ceo_triggered = False current_event = None try: with httpx.Client(timeout=60) as client: with client.stream("POST", f"{BASE_URL}/chat/stream", json=payload) as resp: resp.raise_for_status() for line in resp.iter_lines(): if not line or line.startswith(":"): continue if line.startswith("event:"): current_event = line[6:].strip() # CEO tool has needs_approval=True → creates an "interrupt" event if current_event in ("ceo_email_sent", "interrupt"): ceo_triggered = True elif line.startswith("data:"): raw = line[5:].strip() try: data = json.loads(raw) except json.JSONDecodeError: continue if "delta" in data: full_text += data["delta"] # interrupt payload may also confirm notify_ceo_interest was called if current_event == "interrupt": interruptions = data.get("interruptions", []) for item in interruptions: if item.get("name") == "notify_ceo_interest": ceo_triggered = True except Exception as e: return f"[ERROR: {e}]", False return full_text.strip(), ceo_triggered def grade(test: dict, response: str, ceo_triggered: bool) -> dict: expect = test["expect"] result = {"id": test["id"], "label": test["label"], "response_preview": response[:300], "issues": [], "pass": True} # Detect clarifying question (ends with ?, single question, short) question_marks = response.count("?") is_clarify = question_marks >= 1 and len(response) < 400 # Detect substantial solution (longer response) is_solve = len(response) > 200 # Detect invite sentence (loose keyword match) invite_keywords = ["specialist", "team", "arrange", "contact", "asiantuntija", "jatkaa", "onnistuu", "reach out"] is_invite = any(kw in response.lower() for kw in invite_keywords) # Detect bullet points has_bullets = any(line.strip().startswith(("-", "*", "•")) or (len(line) > 2 and line.strip()[0].isdigit() and line.strip()[1] == ".") for line in response.split("\n")) if expect.get("should_clarify") and not is_clarify: result["issues"].append("❌ Expected a clarifying question but got a long response") if expect.get("should_clarify") is False and is_clarify and not is_solve: result["issues"].append("❌ Asked clarifying question when context was already clear") if expect.get("should_solve") and not is_solve: result["issues"].append("❌ Expected a substantive solution but response is too short") if expect.get("should_invite") and not is_invite: result["issues"].append("❌ Missing specialist invite at the end") if expect.get("should_invite") is False and is_invite and not expect.get("ceo_triggered"): result["issues"].append("⚠️ Invite present when not expected (minor)") if expect.get("ceo_triggered") and not ceo_triggered: result["issues"].append("❌ CEO escalation not triggered after clear user consent") if expect.get("no_bullets") and has_bullets: result["issues"].append("❌ Response uses bullet points — must be prose only") if expect.get("language") == "fi" and not any(w in response for w in ["ja", "on", "tai", "myös", "Jos", "brändi"]): result["issues"].append("❌ Response does not appear to be in Finnish") if result["issues"]: result["pass"] = False return result def main(): print(f"\n{'='*70}") print(" BRÄNDIVÄLKKY PROMPT EVALUATION SUITE") print(f"{'='*70}\n") results = [] for test in TESTS: print(f"Running [{test['id']}] {test['label']}...") print(f" Note: {test['expect']['note']}") response, ceo_triggered = call_chat(test["message"], test["history"]) graded = grade(test, response, ceo_triggered) results.append(graded) status = "✅ PASS" if graded["pass"] else "❌ FAIL" print(f" Status: {status}") if graded["issues"]: for issue in graded["issues"]: print(f" {issue}") print(f" Response preview: {graded['response_preview'][:200]!r}") print() passed = sum(1 for r in results if r["pass"]) total = len(results) print(f"{'='*70}") print(f" RESULT: {passed}/{total} tests passed") print(f"{'='*70}\n") if passed < total: print("Failed tests:") for r in results: if not r["pass"]: print(f" - [{r['id']}] {r['label']}") for issue in r["issues"]: print(f" {issue}") return 0 if passed == total else 1 if __name__ == "__main__": sys.exit(main())