# Listing metadata carried over from the file viewer: file size 9,481 bytes; identifier 0df80b4 (presumably a commit hash).
"""
Prompt evaluation script.
Tests the 3-behavior framework: Clarify, Solve, Invite.
Run: python scripts/eval_prompt.py
"""
import httpx
import json
import sys

# Root of the chat API; call_chat() appends "/chat/stream" to this.
BASE_URL = "http://localhost:8000/api"

# Evaluation cases. Each entry carries:
#   id / label  - identifiers printed in the report
#   message     - the user turn sent to the chat endpoint
#   history     - prior turns sent along with the message (may be empty)
#   expect      - flags read by grade() (should_clarify, should_solve,
#                 should_invite, and the optional ceo_triggered, language,
#                 no_bullets) plus a human-readable "note" printed by main()
TESTS = [
    # Vague opener: only a clarifying question is expected.
    {
        "id": "T1_vague",
        "label": "Vague first message → should clarify (1 question only)",
        "message": "I want help with my brand.",
        "history": [],
        "expect": {
            "should_clarify": True,
            "should_solve": False,
            "should_invite": False,
            "note": "Message is too vague. Agent should ask ONE short clarifying question, no contact push.",
        },
    },
    # Specific question with no history: answer plus invite expected.
    {
        "id": "T2_clear",
        "label": "Clear specific question → skip clarify, go straight to solution + invite",
        "message": "How do I build a content strategy for a B2B SaaS startup that is launching its first product?",
        "history": [],
        "expect": {
            "should_clarify": False,
            "should_solve": True,
            "should_invite": True,
            "note": "Intent is clear. Agent should answer with substance and end with specialist invite.",
        },
    },
    # Out-of-scope question: all three behavior flags are False.
    {
        "id": "T3_offtopic",
        "label": "Off-topic question → polite redirect + one question",
        "message": "What is the best programming language to learn in 2025?",
        "history": [],
        "expect": {
            "should_clarify": False,
            "should_solve": False,
            "should_invite": False,
            "note": "Out of scope. Agent should politely redirect to branding/marketing and ask one question.",
        },
    },
    # Second turn after a clarifying question: history holds the first exchange.
    {
        "id": "T4_after_clarify",
        "label": "Follow-up with context provided → should now solve + invite",
        "message": "We sell project management tools to enterprise companies. Our main problem is that no one knows us despite having a great product.",
        "history": [
            {"role": "user", "content": "I want help with my brand."},
            {"role": "assistant", "content": "What industry are you in and who is your main target audience?"},
        ],
        "expect": {
            "should_clarify": False,
            "should_solve": True,
            "should_invite": True,
            "note": "Context is now clear. Should deliver a solid solution and invite, not ask more questions.",
        },
    },
    # User accepts the specialist offer: the only positive expectation is ceo_triggered.
    {
        "id": "T5_ceo_consent",
        "label": "User agrees to specialist contact → CEO escalation triggered",
        "message": "Yes, I would like to speak with someone from your team.",
        "history": [
            {"role": "user", "content": "How do I differentiate my brand?"},
            {"role": "assistant", "content": "Brand differentiation starts with a clear point of view. You need to identify what only you can say and say it consistently. If you would like a more tailored perspective from our team, I can make that happen."},
        ],
        "expect": {
            "should_clarify": False,
            "should_solve": False,
            "should_invite": False,
            "ceo_triggered": True,
            "note": "User clearly consented. notify_ceo_interest should be called, contact details shared.",
        },
    },
    # Finnish input: adds the "language" expectation.
    {
        "id": "T6_finnish",
        "label": "Finnish message → answer in Finnish",
        "message": "Miten rakennan vahvan brändin pienelle yritykselle?",
        "history": [],
        "expect": {
            "should_clarify": False,
            "should_solve": True,
            "should_invite": True,
            "language": "fi",
            "note": "Finnish input. Answer must be in Finnish, end with Finnish invite.",
        },
    },
    # Formatting check: adds "no_bullets"; note that "should_clarify" is not set here.
    {
        "id": "T7_no_bullet_lists",
        "label": "Multi-point answer should be prose, not bullet points",
        "message": "What are the key elements of a strong brand identity?",
        "history": [],
        "expect": {
            "should_solve": True,
            "should_invite": True,
            "no_bullets": True,
            "note": "Answer must be written in prose paragraphs, no bullet points or numbered lists.",
        },
    },
]


def call_chat(message: str, history: list) -> tuple:
    """Send one chat turn to the streaming endpoint and collect the reply.

    POSTs ``{"message": ..., "history": ...}`` to ``{BASE_URL}/chat/stream``
    and reads the response line by line as an ``event:`` / ``data:`` stream.

    Args:
        message: The user message for this turn.
        history: Prior turns, forwarded to the server unchanged.

    Returns:
        A ``(text, ceo_triggered)`` pair. ``text`` is every string ``delta``
        value from the ``data:`` payloads, concatenated and stripped.
        ``ceo_triggered`` is True when a ``ceo_email_sent`` or ``interrupt``
        event was seen, or an interrupt payload lists ``notify_ceo_interest``.
        If the request or stream fails, the pair is
        ``("[ERROR: <exception>]", False)`` and nothing is raised.
    """
    payload = {"message": message, "history": history}
    chunks = []
    ceo_triggered = False
    current_event = None
    try:
        with httpx.Client(timeout=60) as client:
            with client.stream("POST", f"{BASE_URL}/chat/stream", json=payload) as resp:
                resp.raise_for_status()
                for line in resp.iter_lines():
                    # Skip blank separator lines and ":" comment/keep-alive lines.
                    if not line or line.startswith(":"):
                        continue
                    if line.startswith("event:"):
                        current_event = line[len("event:"):].strip()
                        # CEO tool has needs_approval=True → creates an "interrupt" event
                        if current_event in ("ceo_email_sent", "interrupt"):
                            ceo_triggered = True
                        continue
                    if not line.startswith("data:"):
                        continue
                    try:
                        data = json.loads(line[len("data:"):].strip())
                    except json.JSONDecodeError:
                        continue
                    # Valid JSON that is not an object (null, number, list, ...)
                    # carries nothing we read; skipping it keeps one odd frame
                    # from discarding the text collected so far.
                    if not isinstance(data, dict):
                        continue
                    delta = data.get("delta")
                    if isinstance(delta, str):
                        chunks.append(delta)
                    # interrupt payload may also confirm notify_ceo_interest was called
                    if current_event == "interrupt":
                        interruptions = data.get("interruptions")
                        if isinstance(interruptions, list):
                            for item in interruptions:
                                if isinstance(item, dict) and item.get("name") == "notify_ceo_interest":
                                    ceo_triggered = True
    except Exception as e:
        # Deliberate best-effort boundary: the harness reports the failure as
        # the response text instead of aborting the whole run.
        return f"[ERROR: {e}]", False
    return "".join(chunks).strip(), ceo_triggered


# Whole-word markers for the Finnish check. "on" is deliberately absent: it is
# also a very common English word and would make the check pass on English text.
_FINNISH_MARKERS = frozenset({"ja", "tai", "myös", "jos"})


def _is_list_item(line: str) -> bool:
    """Return True when *line* starts with a bullet marker or an ``N.`` number."""
    stripped = line.strip()
    if not stripped:
        return False
    if stripped.startswith(("-", "*", "•")):
        return True
    head, dot, _ = stripped.partition(".")
    return bool(dot) and head.isdigit()


def _looks_finnish(text: str) -> bool:
    """Return True when *text* contains a Finnish marker word or a "brändi" form."""
    # Replace every non-alphanumeric character with a space so punctuation
    # does not stick to words, then compare whole lowercase words.
    words = "".join(ch if ch.isalnum() else " " for ch in text.lower()).split()
    return any(w in _FINNISH_MARKERS or w.startswith("brändi") for w in words)


def grade(test: dict, response: str, ceo_triggered: bool) -> dict:
    """Score one response against the heuristics in ``test["expect"]``.

    Args:
        test: A test case with ``id``, ``label`` and an ``expect`` dict.
        response: The full response text returned for the test message.
        ceo_triggered: Whether the CEO escalation was observed in the stream.

    Returns:
        A dict with ``id``, ``label``, ``response_preview`` (first 300
        characters), ``issues`` (list of messages) and ``pass``. ``pass`` is
        False as soon as any issue is recorded, including the "minor" warning.
    """
    expect = test["expect"]
    result = {"id": test["id"], "label": test["label"], "response_preview": response[:300], "issues": [], "pass": True}

    # Clarifying question: at least one "?" in a response under 400 characters.
    question_marks = response.count("?")
    is_clarify = question_marks >= 1 and len(response) < 400

    # Substantial solution: judged by length only.
    is_solve = len(response) > 200

    # Invite sentence: loose, case-insensitive keyword match.
    invite_keywords = ["specialist", "team", "arrange", "contact", "asiantuntija", "jatkaa", "onnistuu", "reach out"]
    lowered = response.lower()
    is_invite = any(kw in lowered for kw in invite_keywords)

    # Bullet or numbered list lines; blank and whitespace-only lines are ignored.
    has_bullets = any(_is_list_item(line) for line in response.split("\n"))

    if expect.get("should_clarify") and not is_clarify:
        result["issues"].append("❌ Expected a clarifying question but got a long response")
    if expect.get("should_clarify") is False and is_clarify and not is_solve:
        result["issues"].append("❌ Asked clarifying question when context was already clear")
    if expect.get("should_solve") and not is_solve:
        result["issues"].append("❌ Expected a substantive solution but response is too short")
    if expect.get("should_invite") and not is_invite:
        result["issues"].append("❌ Missing specialist invite at the end")
    if expect.get("should_invite") is False and is_invite and not expect.get("ceo_triggered"):
        result["issues"].append("⚠️  Invite present when not expected (minor)")
    if expect.get("ceo_triggered") and not ceo_triggered:
        result["issues"].append("❌ CEO escalation not triggered after clear user consent")
    if expect.get("no_bullets") and has_bullets:
        result["issues"].append("❌ Response uses bullet points — must be prose only")
    if expect.get("language") == "fi" and not _looks_finnish(response):
        result["issues"].append("❌ Response does not appear to be in Finnish")

    if result["issues"]:
        result["pass"] = False
    return result


def main():
    """Run every case in TESTS, print a report, and return an exit status.

    Each case is sent through call_chat() and scored with grade(). Per-case
    status, issues and a response preview are printed as the run proceeds,
    followed by a summary and a recap of the failed cases.

    Returns:
        0 when every case passed, 1 otherwise.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print("  BRÄNDIVÄLKKY PROMPT EVALUATION SUITE")
    print(f"{rule}\n")

    results = []
    for case in TESTS:
        print(f"Running [{case['id']}] {case['label']}...")
        print(f"  Note: {case['expect']['note']}")
        reply, escalated = call_chat(case["message"], case["history"])
        outcome = grade(case, reply, escalated)
        results.append(outcome)

        if outcome["pass"]:
            status = "✅ PASS"
        else:
            status = "❌ FAIL"
        print(f"  Status: {status}")
        for issue in outcome["issues"]:
            print(f"    {issue}")
        print(f"  Response preview: {outcome['response_preview'][:200]!r}")
        print()

    # Summary: anything not marked as passing counts as a failure.
    failures = [entry for entry in results if not entry["pass"]]
    total = len(results)
    passed = total - len(failures)
    print(rule)
    print(f"  RESULT: {passed}/{total} tests passed")
    print(f"{rule}\n")
    if failures:
        print("Failed tests:")
        for failed in failures:
            print(f"  - [{failed['id']}] {failed['label']}")
            for issue in failed["issues"]:
                print(f"      {issue}")
    return 1 if failures else 0


# Script entry point: main()'s return value (0 or 1) becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())