# chatbot-rag-fi / scripts/eval_prompt.py
# Hosting-page residue kept for reference: "ABAO77's picture", "Upload 147 files", "0df80b4 verified".
"""
Prompt evaluation script.
Tests the 3-behavior framework: Clarify, Solve, Invite.
Run: python scripts/eval_prompt.py
"""
import json
import re
import sys

import httpx
# Base URL of the chat API under test; call_chat() posts to f"{BASE_URL}/chat/stream".
BASE_URL = "http://localhost:8000/api"
# Evaluation cases. Each entry carries:
#   id / label - identifiers printed by main()
#   message    - the user message sent to the chat endpoint
#   history    - prior turns sent with the message ({"role", "content"} dicts)
#   expect     - flags read by grade() via expect.get(...): should_clarify,
#                should_solve, should_invite, ceo_triggered, no_bullets, language;
#                "note" is a human-readable explanation that main() prints.
# Flags may be omitted from "expect"; grade() then skips the corresponding check.
TESTS = [
    # Vague opener: a single short clarifying question is expected.
    {
        "id": "T1_vague",
        "label": "Vague first message → should clarify (1 question only)",
        "message": "I want help with my brand.",
        "history": [],
        "expect": {
            "should_clarify": True,
            "should_solve": False,
            "should_invite": False,
            "note": "Message is too vague. Agent should ask ONE short clarifying question, no contact push.",
        },
    },
    # Specific question with no history: answer directly and invite.
    {
        "id": "T2_clear",
        "label": "Clear specific question → skip clarify, go straight to solution + invite",
        "message": "How do I build a content strategy for a B2B SaaS startup that is launching its first product?",
        "history": [],
        "expect": {
            "should_clarify": False,
            "should_solve": True,
            "should_invite": True,
            "note": "Intent is clear. Agent should answer with substance and end with specialist invite.",
        },
    },
    # Out-of-scope question: all three behavior flags are False.
    {
        "id": "T3_offtopic",
        "label": "Off-topic question → polite redirect + one question",
        "message": "What is the best programming language to learn in 2025?",
        "history": [],
        "expect": {
            "should_clarify": False,
            "should_solve": False,
            "should_invite": False,
            "note": "Out of scope. Agent should politely redirect to branding/marketing and ask one question.",
        },
    },
    # Second turn: the history holds the vague opener and the clarifying question.
    {
        "id": "T4_after_clarify",
        "label": "Follow-up with context provided → should now solve + invite",
        "message": "We sell project management tools to enterprise companies. Our main problem is that no one knows us despite having a great product.",
        "history": [
            {"role": "user", "content": "I want help with my brand."},
            {"role": "assistant", "content": "What industry are you in and who is your main target audience?"},
        ],
        "expect": {
            "should_clarify": False,
            "should_solve": True,
            "should_invite": True,
            "note": "Context is now clear. Should deliver a solid solution and invite, not ask more questions.",
        },
    },
    # User accepts the invite from the previous assistant turn; only the
    # ceo_triggered flag is expected to be satisfied.
    {
        "id": "T5_ceo_consent",
        "label": "User agrees to specialist contact → CEO escalation triggered",
        "message": "Yes, I would like to speak with someone from your team.",
        "history": [
            {"role": "user", "content": "How do I differentiate my brand?"},
            {"role": "assistant", "content": "Brand differentiation starts with a clear point of view. You need to identify what only you can say and say it consistently. If you would like a more tailored perspective from our team, I can make that happen."},
        ],
        "expect": {
            "should_clarify": False,
            "should_solve": False,
            "should_invite": False,
            "ceo_triggered": True,
            "note": "User clearly consented. notify_ceo_interest should be called, contact details shared.",
        },
    },
    # Finnish input: adds the "language" expectation on top of solve + invite.
    {
        "id": "T6_finnish",
        "label": "Finnish message → answer in Finnish",
        "message": "Miten rakennan vahvan brändin pienelle yritykselle?",
        "history": [],
        "expect": {
            "should_clarify": False,
            "should_solve": True,
            "should_invite": True,
            "language": "fi",
            "note": "Finnish input. Answer must be in Finnish, end with Finnish invite.",
        },
    },
    # Formatting case: "should_clarify" is intentionally absent here, so grade()
    # applies neither clarify check.
    {
        "id": "T7_no_bullet_lists",
        "label": "Multi-point answer should be prose, not bullet points",
        "message": "What are the key elements of a strong brand identity?",
        "history": [],
        "expect": {
            "should_solve": True,
            "should_invite": True,
            "no_bullets": True,
            "note": "Answer must be written in prose paragraphs, no bullet points or numbered lists.",
        },
    },
]
def _collect_sse(lines) -> "tuple[str, bool]":
    """Fold an iterable of SSE text lines into ``(text, ceo_triggered)``.

    ``event:`` lines set the current event name; ``data:`` lines are parsed as
    JSON and their ``"delta"`` strings are concatenated. Blank lines and
    ``:`` comment lines are skipped. Malformed or unexpected payloads are
    ignored instead of aborting the whole response.
    """
    chunks = []
    ceo_triggered = False
    current_event = None
    for line in lines:
        if not line or line.startswith(":"):
            continue
        if line.startswith("event:"):
            current_event = line[len("event:"):].strip()
            # CEO tool has needs_approval=True -> the server emits an "interrupt" event.
            if current_event in ("ceo_email_sent", "interrupt"):
                ceo_triggered = True
        elif line.startswith("data:"):
            raw = line[len("data:"):].strip()
            try:
                data = json.loads(raw)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object (null, number, string, list)
            # carries nothing we read; skipping it avoids a TypeError below.
            if not isinstance(data, dict):
                continue
            delta = data.get("delta")
            if isinstance(delta, str):
                chunks.append(delta)
            # Any "interrupt" event already counts as an escalation above; the
            # payload check is kept as explicit confirmation of the tool name.
            if current_event == "interrupt":
                interruptions = data.get("interruptions")
                if isinstance(interruptions, list):
                    for item in interruptions:
                        if isinstance(item, dict) and item.get("name") == "notify_ceo_interest":
                            ceo_triggered = True
    return "".join(chunks).strip(), ceo_triggered


def call_chat(message: str, history: list) -> "tuple[str, bool]":
    """Call /api/chat/stream and collect the streamed reply.

    Args:
        message: User message to send.
        history: Prior conversation turns, sent as-is in the JSON payload.

    Returns:
        ``(text, ceo_triggered)``: the stripped concatenation of all token
        deltas, and whether a ``ceo_email_sent`` or ``interrupt`` event was
        seen. On any failure (connection, HTTP status, stream error) returns
        ``("[ERROR: <exception>]", False)`` instead of raising.
    """
    payload = {"message": message, "history": history}
    try:
        with httpx.Client(timeout=60) as client:
            with client.stream("POST", f"{BASE_URL}/chat/stream", json=payload) as resp:
                resp.raise_for_status()
                return _collect_sse(resp.iter_lines())
    except Exception as e:
        # Deliberate catch-all boundary: one failing request must not abort
        # the rest of the evaluation run. Partial text is discarded.
        return f"[ERROR: {e}]", False
# Prefix call_chat() uses when the request itself failed.
_ERROR_PREFIX = "[ERROR:"

# Loose keyword match for the specialist-invite sentence (English + Finnish).
_INVITE_KEYWORDS = ("specialist", "team", "arrange", "contact", "asiantuntija", "jatkaa", "onnistuu", "reach out")

# A list item: optional indent, then "-", "*", "•" or "<digits>." / "<digits>)",
# then whitespace and content. Requiring the whitespace keeps "**bold**"
# paragraph openers and decimals such as "1.5" from counting as bullets.
_BULLET_RE = re.compile(r"^\s*(?:[-*•]|\d+[.)])\s+\S")

# Runs of letters (Unicode-aware), used to compare whole words only.
_WORD_RE = re.compile(r"[^\W\d_]+")

# Lower-case Finnish marker words, matched as whole words. "on" is left out on
# purpose: it is also a common English word, so it cannot tell the two apart.
_FINNISH_MARKERS = frozenset({
    "ja", "tai", "myös", "jos", "että", "ei", "kun", "mutta", "sekä",
    "kuin", "niin", "voit", "ovat", "brändi", "brändin", "brändiä",
})


def grade(test: dict, response: str, ceo_triggered: bool) -> dict:
    """Grade one chat response against a test case's expectations.

    Args:
        test: Test case with "id", "label" and an "expect" dict of flags
            (should_clarify, should_solve, should_invite, ceo_triggered,
            no_bullets, language). Missing flags skip the related check.
        response: Full response text returned by call_chat().
        ceo_triggered: Whether the CEO escalation was observed in the stream.

    Returns:
        Dict with "id", "label", "response_preview" (first 300 characters),
        "issues" (list of messages) and "pass" (True only when no issue was
        recorded).
    """
    expect = test["expect"]
    result = {"id": test["id"], "label": test["label"], "response_preview": response[:300], "issues": [], "pass": True}

    # A transport failure is not a model answer; grading it with the text
    # heuristics below could let cases that expect "nothing" pass by accident.
    if response.startswith(_ERROR_PREFIX):
        result["issues"].append(f"❌ Request failed: {response}")
        result["pass"] = False
        return result

    lowered = response.lower()
    # Clarifying question: contains a question mark and stays short.
    is_clarify = response.count("?") >= 1 and len(response) < 400
    # Substantial solution: simply a longer response.
    is_solve = len(response) > 200
    is_invite = any(kw in lowered for kw in _INVITE_KEYWORDS)
    has_bullets = any(_BULLET_RE.match(line) for line in response.splitlines())

    if expect.get("should_clarify") and not is_clarify:
        result["issues"].append("❌ Expected a clarifying question but got a long response")
    if expect.get("should_clarify") is False and is_clarify and not is_solve:
        result["issues"].append("❌ Asked clarifying question when context was already clear")
    if expect.get("should_solve") and not is_solve:
        result["issues"].append("❌ Expected a substantive solution but response is too short")
    if expect.get("should_invite") and not is_invite:
        result["issues"].append("❌ Missing specialist invite at the end")
    if expect.get("should_invite") is False and is_invite and not expect.get("ceo_triggered"):
        result["issues"].append("⚠️ Invite present when not expected (minor)")
    if expect.get("ceo_triggered") and not ceo_triggered:
        result["issues"].append("❌ CEO escalation not triggered after clear user consent")
    if expect.get("no_bullets") and has_bullets:
        result["issues"].append("❌ Response uses bullet points — must be prose only")
    if expect.get("language") == "fi":
        # Whole-word comparison: substring matching would find "on"/"tai"
        # inside English words such as "content" or "contain".
        words = set(_WORD_RE.findall(lowered))
        if not words & _FINNISH_MARKERS:
            result["issues"].append("❌ Response does not appear to be in Finnish")

    if result["issues"]:
        result["pass"] = False
    return result
def main():
    """Run every case in TESTS against the chat endpoint and print a report.

    Returns:
        0 when all cases pass, 1 otherwise (used as the process exit status).
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print(" BRÄNDIVÄLKKY PROMPT EVALUATION SUITE")
    print(f"{rule}\n")

    outcomes = []
    for case in TESTS:
        print(f"Running [{case['id']}] {case['label']}...")
        print(f" Note: {case['expect']['note']}")
        reply, escalated = call_chat(case["message"], case["history"])
        verdict = grade(case, reply, escalated)
        outcomes.append(verdict)

        print(f" Status: {'✅ PASS' if verdict['pass'] else '❌ FAIL'}")
        # An empty issue list simply prints nothing.
        for problem in verdict["issues"]:
            print(f" {problem}")
        print(f" Response preview: {verdict['response_preview'][:200]!r}")
        print()

    failures = [verdict for verdict in outcomes if not verdict["pass"]]
    print(rule)
    print(f" RESULT: {len(outcomes) - len(failures)}/{len(outcomes)} tests passed")
    print(f"{rule}\n")

    if failures:
        print("Failed tests:")
        for verdict in failures:
            print(f" - [{verdict['id']}] {verdict['label']}")
            for problem in verdict["issues"]:
                print(f" {problem}")

    return 1 if failures else 0


if __name__ == "__main__":
    sys.exit(main())