# Page header captured with the file (not part of the script):
# Spaces: Sleeping · File size: 9,481 Bytes · 0df80b4
"""
Prompt evaluation script.
Tests the 3-behavior framework: Clarify, Solve, Invite.
Run: python scripts/eval_prompt.py
"""
import json
import re
import sys

import httpx
# Root of the chat API exercised by this script.
BASE_URL = "http://localhost:8000/api"

# Evaluation cases. Each case carries the user message, any prior turns, and
# an "expect" mapping that grade() reads via .get(), so a key that is absent
# simply disables the corresponding check.
TESTS = [
    # Too little context: one clarifying question, nothing else.
    {
        "id": "T1_vague",
        "label": "Vague first message → should clarify (1 question only)",
        "message": "I want help with my brand.",
        "history": [],
        "expect": {
            "should_clarify": True,
            "should_solve": False,
            "should_invite": False,
            "note": "Message is too vague. Agent should ask ONE short clarifying question, no contact push.",
        },
    },
    # Clear intent: answer directly and close with the invite.
    {
        "id": "T2_clear",
        "label": "Clear specific question → skip clarify, go straight to solution + invite",
        "message": "How do I build a content strategy for a B2B SaaS startup that is launching its first product?",
        "history": [],
        "expect": {
            "should_clarify": False,
            "should_solve": True,
            "should_invite": True,
            "note": "Intent is clear. Agent should answer with substance and end with specialist invite.",
        },
    },
    # Out of scope: redirect politely.
    {
        "id": "T3_offtopic",
        "label": "Off-topic question → polite redirect + one question",
        "message": "What is the best programming language to learn in 2025?",
        "history": [],
        "expect": {
            "should_clarify": False,
            "should_solve": False,
            "should_invite": False,
            "note": "Out of scope. Agent should politely redirect to branding/marketing and ask one question.",
        },
    },
    # Second turn, after the user supplied the missing context.
    {
        "id": "T4_after_clarify",
        "label": "Follow-up with context provided → should now solve + invite",
        "message": "We sell project management tools to enterprise companies. Our main problem is that no one knows us despite having a great product.",
        "history": [
            {"role": "user", "content": "I want help with my brand."},
            {"role": "assistant", "content": "What industry are you in and who is your main target audience?"},
        ],
        "expect": {
            "should_clarify": False,
            "should_solve": True,
            "should_invite": True,
            "note": "Context is now clear. Should deliver a solid solution and invite, not ask more questions.",
        },
    },
    # The user accepts the invite: escalation must fire.
    {
        "id": "T5_ceo_consent",
        "label": "User agrees to specialist contact → CEO escalation triggered",
        "message": "Yes, I would like to speak with someone from your team.",
        "history": [
            {"role": "user", "content": "How do I differentiate my brand?"},
            {"role": "assistant", "content": "Brand differentiation starts with a clear point of view. You need to identify what only you can say and say it consistently. If you would like a more tailored perspective from our team, I can make that happen."},
        ],
        "expect": {
            "should_clarify": False,
            "should_solve": False,
            "should_invite": False,
            "ceo_triggered": True,
            "note": "User clearly consented. notify_ceo_interest should be called, contact details shared.",
        },
    },
    # Language mirroring.
    {
        "id": "T6_finnish",
        "label": "Finnish message → answer in Finnish",
        "message": "Miten rakennan vahvan brändin pienelle yritykselle?",
        "history": [],
        "expect": {
            "should_clarify": False,
            "should_solve": True,
            "should_invite": True,
            "language": "fi",
            "note": "Finnish input. Answer must be in Finnish, end with Finnish invite.",
        },
    },
    # Formatting rule: prose only. This case sets no "should_clarify" key.
    {
        "id": "T7_no_bullet_lists",
        "label": "Multi-point answer should be prose, not bullet points",
        "message": "What are the key elements of a strong brand identity?",
        "history": [],
        "expect": {
            "should_solve": True,
            "should_invite": True,
            "no_bullets": True,
            "note": "Answer must be written in prose paragraphs, no bullet points or numbered lists.",
        },
    },
]
def call_chat(message: str, history: list) -> tuple[str, bool]:
    """Send one chat turn to ``/chat/stream`` and collect the streamed reply.

    The response is read line by line as a server-sent-event stream:
    ``event:`` lines name the current event and ``data:`` lines carry JSON
    payloads.

    Args:
        message: The user message for this turn.
        history: Earlier turns, sent unchanged under the ``history`` key.

    Returns:
        A ``(text, ceo_triggered)`` pair. ``text`` is every string ``delta``
        concatenated and stripped. ``ceo_triggered`` is True when a
        ``ceo_email_sent`` or ``interrupt`` event was seen, or when an
        interrupt payload lists ``notify_ceo_interest``. If the request or
        the stream raises, the pair is ``("[ERROR: <exception>]", False)``.
    """
    payload = {"message": message, "history": history}
    chunks = []
    ceo_triggered = False
    current_event = None
    try:
        with httpx.Client(timeout=60) as client:
            with client.stream("POST", f"{BASE_URL}/chat/stream", json=payload) as resp:
                resp.raise_for_status()
                for line in resp.iter_lines():
                    if not line:
                        # A blank line ends an SSE message; the event name
                        # must not leak into the next message's data lines.
                        current_event = None
                        continue
                    if line.startswith(":"):
                        # SSE comment / keep-alive line.
                        continue
                    if line.startswith("event:"):
                        current_event = line[6:].strip()
                        # CEO tool has needs_approval=True → creates an "interrupt" event
                        if current_event in ("ceo_email_sent", "interrupt"):
                            ceo_triggered = True
                    elif line.startswith("data:"):
                        try:
                            data = json.loads(line[5:].strip())
                        except json.JSONDecodeError:
                            continue
                        if not isinstance(data, dict):
                            # Valid JSON that is not an object (null, number,
                            # list) carries nothing we read; skipping it keeps
                            # the text collected so far instead of aborting.
                            continue
                        delta = data.get("delta")
                        if isinstance(delta, str):
                            chunks.append(delta)
                        # interrupt payload may also confirm notify_ceo_interest was called
                        if current_event == "interrupt":
                            interruptions = data.get("interruptions")
                            if isinstance(interruptions, list):
                                for item in interruptions:
                                    if isinstance(item, dict) and item.get("name") == "notify_ceo_interest":
                                        ceo_triggered = True
    except Exception as e:
        # Deliberate best effort: one failed request is reported as text so
        # the remaining evaluation cases still run.
        return f"[ERROR: {e}]", False
    return "".join(chunks).strip(), ceo_triggered
# A line that opens with a list marker ("-", "*", "•", "1." or "1)") followed
# by whitespace and then text. Requiring the whitespace keeps "**bold**" and
# "1.5 million" line starts from being mistaken for list items.
_BULLET_LINE = re.compile(r"^\s*(?:[-*•]|\d+[.)])\s+\S")
_WORD = re.compile(r"\w+")
# Whole-word Finnish markers. "on" is also an English word, so a single hit
# is not treated as evidence; see _MIN_FINNISH_MARKERS.
_FINNISH_MARKERS = frozenset({"ja", "on", "tai", "myös", "jos", "brändi"})
_MIN_FINNISH_MARKERS = 2
# Loose substring keywords (English and Finnish) that signal a specialist invite.
_INVITE_KEYWORDS = ("specialist", "team", "arrange", "contact", "asiantuntija", "jatkaa", "onnistuu", "reach out")
# Prefix call_chat() puts on the text it returns when the request failed.
_ERROR_PREFIX = "[ERROR:"


def grade(test: dict, response: str, ceo_triggered: bool) -> dict:
    """Score one response against the heuristics in ``test["expect"]``.

    Args:
        test: A case with ``id``, ``label`` and an ``expect`` mapping. Keys
            missing from ``expect`` disable the corresponding check.
        response: The full text returned by the agent.
        ceo_triggered: Whether the escalation event was observed.

    Returns:
        A dict with ``id``, ``label``, ``response_preview`` (first 300
        characters), ``issues`` (list of messages) and ``pass`` (True only
        when ``issues`` is empty).
    """
    expect = test["expect"]
    result = {"id": test["id"], "label": test["label"], "response_preview": response[:300], "issues": [], "pass": True}
    issues = result["issues"]

    # A failed request or an empty reply can never satisfy a case. Without
    # this guard, a case whose expectations are all False would pass while
    # the server is down.
    if not response.strip() or response.startswith(_ERROR_PREFIX):
        issues.append(f"❌ No usable response from the chat endpoint: {response[:200] or '(empty)'}")
        result["pass"] = False
        return result

    # Clarifying question: at least one "?" in a short reply.
    is_clarify = response.count("?") >= 1 and len(response) < 400
    # Substantial solution: judged by length alone.
    is_solve = len(response) > 200
    # Invite sentence: loose, case-insensitive keyword match.
    lowered = response.lower()
    is_invite = any(kw in lowered for kw in _INVITE_KEYWORDS)
    # Bullet or numbered list items, checked line by line.
    has_bullets = any(_BULLET_LINE.match(line) for line in response.split("\n"))
    # Finnish: count distinct marker words, matched as whole words.
    words = set(_WORD.findall(response.casefold()))
    looks_finnish = len(words & _FINNISH_MARKERS) >= _MIN_FINNISH_MARKERS

    if expect.get("should_clarify") and not is_clarify:
        issues.append("❌ Expected a clarifying question but got a long response")
    if expect.get("should_clarify") is False and is_clarify and not is_solve:
        issues.append("❌ Asked clarifying question when context was already clear")
    if expect.get("should_solve") and not is_solve:
        issues.append("❌ Expected a substantive solution but response is too short")
    if expect.get("should_invite") and not is_invite:
        issues.append("❌ Missing specialist invite at the end")
    if expect.get("should_invite") is False and is_invite and not expect.get("ceo_triggered"):
        issues.append("⚠️ Invite present when not expected (minor)")
    if expect.get("ceo_triggered") and not ceo_triggered:
        issues.append("❌ CEO escalation not triggered after clear user consent")
    if expect.get("no_bullets") and has_bullets:
        issues.append("❌ Response uses bullet points — must be prose only")
    if expect.get("language") == "fi" and not looks_finnish:
        issues.append("❌ Response does not appear to be in Finnish")

    if issues:
        result["pass"] = False
    return result
def main() -> int:
    """Run every case in TESTS, print a report, and return an exit code.

    Returns:
        0 when every case passed, 1 otherwise.
    """
    rule = "=" * 70

    print("\n" + rule)
    print(" BRÄNDIVÄLKKY PROMPT EVALUATION SUITE")
    print(rule + "\n")

    outcomes = []
    for case in TESTS:
        print(f"Running [{case['id']}] {case['label']}...")
        print(f" Note: {case['expect']['note']}")

        reply, escalated = call_chat(case["message"], case["history"])
        outcome = grade(case, reply, escalated)
        outcomes.append(outcome)

        if outcome["pass"]:
            status = "✅ PASS"
        else:
            status = "❌ FAIL"
        print(f" Status: {status}")
        # An empty issue list simply prints nothing.
        for problem in outcome["issues"]:
            print(f" {problem}")
        print(f" Response preview: {outcome['response_preview'][:200]!r}")
        print()

    failures = [outcome for outcome in outcomes if not outcome["pass"]]
    total = len(outcomes)
    passed = total - len(failures)

    print(rule)
    print(f" RESULT: {passed}/{total} tests passed")
    print(rule + "\n")

    if not failures:
        return 0

    # Repeat the failures at the end so they are easy to find in long output.
    print("Failed tests:")
    for outcome in failures:
        print(f" - [{outcome['id']}] {outcome['label']}")
        for problem in outcome["issues"]:
            print(f" {problem}")
    return 1
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())
|