Spaces:
Sleeping
Sleeping
| """ | |
| Prompt evaluation script. | |
| Tests the 3-behavior framework: Clarify, Solve, Invite. | |
| Run: python scripts/eval_prompt.py | |
| """ | |
| import httpx | |
| import json | |
| import sys | |
# Base URL of the locally running API that the evaluation requests are sent to.
BASE_URL = "http://localhost:8000/api"

# Evaluation cases. Each case carries the user "message", the prior "history"
# and an "expect" dict that grade() reads with these keys:
#   should_clarify / should_solve / should_invite - True: the behavior must be
#       detected; False: it must be absent; key omitted: not checked.
#   ceo_triggered - True when the CEO escalation must have been observed.
#   no_bullets    - True when the reply must not contain list formatting.
#   language      - "fi" when the reply must look Finnish.
#   note          - human-readable intent, printed while the suite runs.
TESTS = [
    {
        "id": "T1_vague",
        "label": "Vague first message → should clarify (1 question only)",
        "message": "I want help with my brand.",
        "history": [],
        "expect": {
            "should_clarify": True,
            "should_solve": False,
            "should_invite": False,
            "note": "Message is too vague. Agent should ask ONE short clarifying question, no contact push.",
        },
    },
    {
        "id": "T2_clear",
        "label": "Clear specific question → skip clarify, go straight to solution + invite",
        "message": "How do I build a content strategy for a B2B SaaS startup that is launching its first product?",
        "history": [],
        "expect": {
            "should_clarify": False,
            "should_solve": True,
            "should_invite": True,
            "note": "Intent is clear. Agent should answer with substance and end with specialist invite.",
        },
    },
    {
        "id": "T3_offtopic",
        "label": "Off-topic question → polite redirect + one question",
        "message": "What is the best programming language to learn in 2025?",
        "history": [],
        "expect": {
            # The expected reply is a short redirect that ends in one question.
            # grade() classifies exactly that shape as "clarify", so expecting
            # False here would fail the very behavior this case asks for.
            "should_clarify": True,
            "should_solve": False,
            "should_invite": False,
            "note": "Out of scope. Agent should politely redirect to branding/marketing and ask one question.",
        },
    },
    {
        "id": "T4_after_clarify",
        "label": "Follow-up with context provided → should now solve + invite",
        "message": "We sell project management tools to enterprise companies. Our main problem is that no one knows us despite having a great product.",
        "history": [
            {"role": "user", "content": "I want help with my brand."},
            {"role": "assistant", "content": "What industry are you in and who is your main target audience?"},
        ],
        "expect": {
            "should_clarify": False,
            "should_solve": True,
            "should_invite": True,
            "note": "Context is now clear. Should deliver a solid solution and invite, not ask more questions.",
        },
    },
    {
        "id": "T5_ceo_consent",
        "label": "User agrees to specialist contact → CEO escalation triggered",
        "message": "Yes, I would like to speak with someone from your team.",
        "history": [
            {"role": "user", "content": "How do I differentiate my brand?"},
            {"role": "assistant", "content": "Brand differentiation starts with a clear point of view. You need to identify what only you can say and say it consistently. If you would like a more tailored perspective from our team, I can make that happen."},
        ],
        "expect": {
            "should_clarify": False,
            "should_solve": False,
            "should_invite": False,
            "ceo_triggered": True,
            "note": "User clearly consented. notify_ceo_interest should be called, contact details shared.",
        },
    },
    {
        "id": "T6_finnish",
        "label": "Finnish message → answer in Finnish",
        "message": "Miten rakennan vahvan brändin pienelle yritykselle?",
        "history": [],
        "expect": {
            "should_clarify": False,
            "should_solve": True,
            "should_invite": True,
            "language": "fi",
            "note": "Finnish input. Answer must be in Finnish, end with Finnish invite.",
        },
    },
    {
        "id": "T7_no_bullet_lists",
        "label": "Multi-point answer should be prose, not bullet points",
        "message": "What are the key elements of a strong brand identity?",
        "history": [],
        "expect": {
            "should_solve": True,
            "should_invite": True,
            "no_bullets": True,
            "note": "Answer must be written in prose paragraphs, no bullet points or numbered lists.",
        },
    },
]
def call_chat(message: str, history: list) -> tuple:
    """Call /api/chat/stream and collect the streamed reply.

    POSTs ``message`` and ``history`` as a JSON body to
    ``{BASE_URL}/chat/stream`` and reads the response line by line as an
    event stream made of ``event:`` and ``data:`` lines.

    Args:
        message: The user message to send.
        history: Prior conversation turns, sent unchanged as ``history``.

    Returns:
        A ``(text, ceo_triggered)`` tuple. ``text`` is every ``delta`` string
        found in the ``data:`` payloads, concatenated and stripped.
        ``ceo_triggered`` is True when a ``ceo_email_sent`` or ``interrupt``
        event was seen, or when an interrupt payload lists
        ``notify_ceo_interest``. If anything raises, the result is
        ``("[ERROR: <exception>]", False)`` and any partial text is dropped.
    """
    payload = {"message": message, "history": history}
    chunks = []
    ceo_triggered = False
    current_event = None
    try:
        with httpx.Client(timeout=60) as client:
            with client.stream("POST", f"{BASE_URL}/chat/stream", json=payload) as resp:
                resp.raise_for_status()
                for line in resp.iter_lines():
                    # Skip blank separator lines and ":" comment lines.
                    if not line or line.startswith(":"):
                        continue
                    if line.startswith("event:"):
                        current_event = line[6:].strip()
                        # CEO tool has needs_approval=True → creates an "interrupt" event
                        if current_event in ("ceo_email_sent", "interrupt"):
                            ceo_triggered = True
                    elif line.startswith("data:"):
                        raw = line[5:].strip()
                        try:
                            data = json.loads(raw)
                        except json.JSONDecodeError:
                            # Non-JSON data lines carry nothing to collect.
                            continue
                        # Valid JSON that is not an object (a bare number,
                        # string or list) has no "delta" to read; skipping it
                        # keeps one odd payload from failing the whole call.
                        if not isinstance(data, dict):
                            continue
                        delta = data.get("delta")
                        if isinstance(delta, str):
                            chunks.append(delta)
                        # interrupt payload may also confirm notify_ceo_interest was called
                        if current_event == "interrupt":
                            for item in data.get("interruptions") or []:
                                if isinstance(item, dict) and item.get("name") == "notify_ceo_interest":
                                    ceo_triggered = True
    except Exception as e:
        # Deliberately broad: this is the harness boundary, and the caller
        # grades the returned text instead of handling exceptions.
        return f"[ERROR: {e}]", False
    return "".join(chunks).strip(), ceo_triggered
def grade(test: dict, response: str, ceo_triggered: bool) -> dict:
    """Grade one chat reply against a test case's heuristic expectations.

    Args:
        test: Test-case dict with "id", "label" and an "expect" dict. The
            expect keys read here are should_clarify, should_solve,
            should_invite, ceo_triggered, no_bullets and language.
        response: The full reply text collected from the chat endpoint.
        ceo_triggered: Whether the CEO escalation was observed in the stream.

    Returns:
        A dict with "id", "label", "response_preview" (first 300 characters
        of the reply), "issues" (list of message strings) and "pass".
        "pass" is False as soon as any issue is recorded, including the
        one marked "(minor)".
    """
    import re  # local import: the file's import block does not provide re

    expect = test["expect"]
    result = {"id": test["id"], "label": test["label"], "response_preview": response[:300], "issues": [], "pass": True}

    # Failures are reported by the caller as "[ERROR: ...]" text. Such a
    # reply, or an empty reply with no escalation, must never pass a case
    # whose expectations all happen to be False.
    if response.startswith("[ERROR:") or (not response.strip() and not ceo_triggered):
        result["issues"].append("❌ No usable response received from the chat endpoint")
        result["pass"] = False
        return result

    lowered = response.lower()

    # Detect clarifying question (contains a "?", short)
    is_clarify = "?" in response and len(response) < 400
    # Detect substantial solution (longer response)
    is_solve = len(response) > 200
    # Detect invite sentence (loose keyword match)
    invite_keywords = ["specialist", "team", "arrange", "contact", "asiantuntija", "jatkaa", "onnistuu", "reach out"]
    is_invite = any(kw in lowered for kw in invite_keywords)

    # Detect list formatting. A "-" or "*" marker, or a short number followed
    # by "." or ")", only counts when whitespace follows it. That keeps
    # "**bold**" openings and decimals such as "3.5" from being flagged, and
    # blank or one-character lines can no longer raise IndexError.
    list_item = re.compile(r"\s*(?:[-*]\s+|•|\d{1,3}[.)]\s+)")
    has_bullets = any(list_item.match(line) for line in response.splitlines())

    # Detect Finnish by whole-word matches. Substring matching let "on" hit
    # English words like "position"; "on" is dropped altogether because it is
    # also an English word on its own.
    finnish_markers = {"ja", "tai", "myös", "jos", "että", "sekä", "brändi"}
    looks_finnish = not finnish_markers.isdisjoint(re.findall(r"\w+", lowered))

    if expect.get("should_clarify") and not is_clarify:
        result["issues"].append("❌ Expected a clarifying question but got a long response")
    if expect.get("should_clarify") is False and is_clarify and not is_solve:
        result["issues"].append("❌ Asked clarifying question when context was already clear")
    if expect.get("should_solve") and not is_solve:
        result["issues"].append("❌ Expected a substantive solution but response is too short")
    if expect.get("should_invite") and not is_invite:
        result["issues"].append("❌ Missing specialist invite at the end")
    if expect.get("should_invite") is False and is_invite and not expect.get("ceo_triggered"):
        result["issues"].append("⚠️ Invite present when not expected (minor)")
    if expect.get("ceo_triggered") and not ceo_triggered:
        result["issues"].append("❌ CEO escalation not triggered after clear user consent")
    if expect.get("no_bullets") and has_bullets:
        result["issues"].append("❌ Response uses bullet points — must be prose only")
    if expect.get("language") == "fi" and not looks_finnish:
        result["issues"].append("❌ Response does not appear to be in Finnish")

    if result["issues"]:
        result["pass"] = False
    return result
def main():
    """Run every case in TESTS, print a report, and return an exit status.

    Each case is sent through call_chat and scored by grade. The per-case
    status, issues and a response preview are printed as the suite runs,
    followed by a pass count and a recap of the failed cases.

    Returns:
        0 when every case passed, otherwise 1.
    """
    print(f"\n{'='*70}")
    print(" BRÄNDIVÄLKKY PROMPT EVALUATION SUITE")
    print(f"{'='*70}\n")

    outcomes = []
    for case in TESTS:
        print(f"Running [{case['id']}] {case['label']}...")
        print(f" Note: {case['expect']['note']}")

        reply, escalated = call_chat(case["message"], case["history"])
        verdict = grade(case, reply, escalated)
        outcomes.append(verdict)

        if verdict["pass"]:
            status = "✅ PASS"
        else:
            status = "❌ FAIL"
        print(f" Status: {status}")
        # Iterating an empty issue list prints nothing, so no guard is needed.
        for issue in verdict["issues"]:
            print(f" {issue}")
        print(f" Response preview: {verdict['response_preview'][:200]!r}")
        print()

    failures = [outcome for outcome in outcomes if not outcome["pass"]]
    total = len(outcomes)
    passed = total - len(failures)

    print(f"{'='*70}")
    print(f" RESULT: {passed}/{total} tests passed")
    print(f"{'='*70}\n")

    if failures:
        print("Failed tests:")
        for failed in failures:
            print(f" - [{failed['id']}] {failed['label']}")
            for issue in failed["issues"]:
                print(f" {issue}")

    return 1 if failures else 0
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit status.
    exit_status = main()
    sys.exit(exit_status)