# Source: Hugging Face Space ABAO77/chatbot-rag-fi (status: Sleeping)
# File size: 9,481 bytes · commit 0df80b4

"""
Prompt evaluation script.
Tests the 3-behavior framework: Clarify, Solve, Invite.
Run: python scripts/eval_prompt.py
"""
import httpx
import json
import sys

BASE_URL = "http://localhost:8000/api"

TESTS = [
    {
        "id": "T1_vague",
        "label": "Vague first message → should clarify (1 question only)",
        "message": "I want help with my brand.",
        "history": [],
        "expect": {
            "should_clarify": True,
            "should_solve": False,
            "should_invite": False,
            "note": "Message is too vague. Agent should ask ONE short clarifying question, no contact push.",
        },
    },
    {
        "id": "T2_clear",
        "label": "Clear specific question → skip clarify, go straight to solution + invite",
        "message": "How do I build a content strategy for a B2B SaaS startup that is launching its first product?",
        "history": [],
        "expect": {
            "should_clarify": False,
            "should_solve": True,
            "should_invite": True,
            "note": "Intent is clear. Agent should answer with substance and end with specialist invite.",
        },
    },
    {
        "id": "T3_offtopic",
        "label": "Off-topic question → polite redirect + one question",
        "message": "What is the best programming language to learn in 2025?",
        "history": [],
        "expect": {
            "should_clarify": False,
            "should_solve": False,
            "should_invite": False,
            "note": "Out of scope. Agent should politely redirect to branding/marketing and ask one question.",
        },
    },
    {
        "id": "T4_after_clarify",
        "label": "Follow-up with context provided → should now solve + invite",
        "message": "We sell project management tools to enterprise companies. Our main problem is that no one knows us despite having a great product.",
        "history": [
            {"role": "user", "content": "I want help with my brand."},
            {"role": "assistant", "content": "What industry are you in and who is your main target audience?"},
        ],
        "expect": {
            "should_clarify": False,
            "should_solve": True,
            "should_invite": True,
            "note": "Context is now clear. Should deliver a solid solution and invite, not ask more questions.",
        },
    },
    {
        "id": "T5_ceo_consent",
        "label": "User agrees to specialist contact → CEO escalation triggered",
        "message": "Yes, I would like to speak with someone from your team.",
        "history": [
            {"role": "user", "content": "How do I differentiate my brand?"},
            {"role": "assistant", "content": "Brand differentiation starts with a clear point of view. You need to identify what only you can say and say it consistently. If you would like a more tailored perspective from our team, I can make that happen."},
        ],
        "expect": {
            "should_clarify": False,
            "should_solve": False,
            "should_invite": False,
            "ceo_triggered": True,
            "note": "User clearly consented. notify_ceo_interest should be called, contact details shared.",
        },
    },
    {
        "id": "T6_finnish",
        "label": "Finnish message → answer in Finnish",
        "message": "Miten rakennan vahvan brändin pienelle yritykselle?",
        "history": [],
        "expect": {
            "should_clarify": False,
            "should_solve": True,
            "should_invite": True,
            "language": "fi",
            "note": "Finnish input. Answer must be in Finnish, end with Finnish invite.",
        },
    },
    {
        "id": "T7_no_bullet_lists",
        "label": "Multi-point answer should be prose, not bullet points",
        "message": "What are the key elements of a strong brand identity?",
        "history": [],
        "expect": {
            "should_solve": True,
            "should_invite": True,
            "no_bullets": True,
            "note": "Answer must be written in prose paragraphs, no bullet points or numbered lists.",
        },
    },
]


def call_chat(message: str, history: list) -> tuple[str, bool]:
    """Call /api/chat/stream and collect the streamed reply.

    Args:
        message: The user message, sent as the ``message`` field of the JSON body.
        history: Prior conversation turns, sent as the ``history`` field.

    Returns:
        A ``(response_text, ceo_triggered)`` pair. ``response_text`` is the
        concatenation of every ``delta`` received, stripped of surrounding
        whitespace. ``ceo_triggered`` is True when the stream contained a
        ``ceo_email_sent`` or ``interrupt`` event.

        Any exception (connection failure, HTTP error status, timeout, ...)
        is reported as ``("[ERROR: <details>]", False)`` instead of being
        raised, so one failing request does not abort the whole suite.
    """
    payload = {"message": message, "history": history}
    try:
        with httpx.Client(timeout=60) as client:
            with client.stream("POST", f"{BASE_URL}/chat/stream", json=payload) as resp:
                resp.raise_for_status()
                return _collect_sse(resp.iter_lines())
    except Exception as e:
        return f"[ERROR: {e}]", False


def _collect_sse(lines) -> tuple[str, bool]:
    """Fold an iterable of SSE text lines into ``(response_text, ceo_triggered)``."""
    chunks = []
    ceo_triggered = False
    current_event = None
    for line in lines:
        # Empty lines and ":"-prefixed lines carry no field for us to read.
        if not line or line.startswith(":"):
            continue
        if line.startswith("event:"):
            current_event = line[6:].strip()
            # CEO tool has needs_approval=True → creates an "interrupt" event
            if current_event in ("ceo_email_sent", "interrupt"):
                ceo_triggered = True
        elif line.startswith("data:"):
            try:
                data = json.loads(line[5:].strip())
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object (number, string, list) has no
            # fields to read; skip it rather than fail the whole request.
            if not isinstance(data, dict):
                continue
            delta = data.get("delta")
            if isinstance(delta, str):
                chunks.append(delta)
            # interrupt payload may also confirm notify_ceo_interest was called
            if current_event == "interrupt":
                for item in data.get("interruptions") or []:
                    if isinstance(item, dict) and item.get("name") == "notify_ceo_interest":
                        ceo_triggered = True
    return "".join(chunks).strip(), ceo_triggered


# Function words and "brand" forms that are Finnish but not English. "on" is
# deliberately excluded: it is also a very common English word.
_FINNISH_MARKERS = frozenset({
    "ja", "tai", "myös", "jos", "että", "sekä", "kun", "ovat", "voit",
    "brändi", "brändin", "brändiä",
})


def _is_list_item(line: str) -> bool:
    """Return True if *line* starts with a bullet or numbered-list marker.

    A marker only counts when followed by whitespace, so markdown bold
    ("**text**") and decimals ("1.5 million") are not treated as list items.
    """
    text = line.strip()
    if len(text) < 2:
        return False
    if text[0] in "-*•":
        return text[1].isspace()
    # Length of the leading run of digits, so "10." is handled like "1.".
    digits = len(text) - len(text.lstrip("0123456789"))
    return (
        digits > 0
        and len(text) > digits + 1
        and text[digits] in ".)"
        and text[digits + 1].isspace()
    )


def _looks_finnish(text: str) -> bool:
    """Return True if *text* contains at least one whole Finnish marker word."""
    # Replace every non-letter with a space so punctuation does not glue to words.
    words = "".join(ch if ch.isalpha() else " " for ch in text.lower()).split()
    return not _FINNISH_MARKERS.isdisjoint(words)


def grade(test: dict, response: str, ceo_triggered: bool) -> dict:
    """Grade one response against the expectations of a test case.

    All checks are heuristics on the response text:
      * clarify - at least one "?" and fewer than 400 characters
      * solve   - more than 200 characters
      * invite  - contains any of a loose set of keywords (substring match)
      * bullets - any line starting with a bullet or numbered-list marker
      * Finnish - contains a whole-word Finnish marker

    Args:
        test: A test case with "id", "label" and an "expect" dict of flags.
        response: The response text as returned by call_chat().
        ceo_triggered: Whether call_chat() saw the CEO escalation in the stream.

    Returns:
        A dict with "id", "label", "response_preview" (first 300 characters),
        "issues" (list of messages) and "pass" (True when there are no issues).
    """
    expect = test["expect"]
    result = {
        "id": test["id"],
        "label": test["label"],
        "response_preview": response[:300],
        "issues": [],
        "pass": True,
    }

    # call_chat() reports a failed request as "[ERROR: ...]" text. Grading that
    # as model output would let tests that expect nothing pass by accident.
    if response.startswith("[ERROR:"):
        result["issues"].append("❌ Request failed — no model response to grade")
        result["pass"] = False
        return result

    # Detect clarifying question (contains a question mark and is short)
    question_marks = response.count("?")
    is_clarify = question_marks >= 1 and len(response) < 400

    # Detect substantial solution (longer response)
    is_solve = len(response) > 200

    # Detect invite sentence (loose keyword match)
    invite_keywords = ["specialist", "team", "arrange", "contact", "asiantuntija", "jatkaa", "onnistuu", "reach out"]
    lowered = response.lower()
    is_invite = any(kw in lowered for kw in invite_keywords)

    # Detect bullet points / numbered lists
    has_bullets = any(_is_list_item(line) for line in response.splitlines())

    if expect.get("should_clarify") and not is_clarify:
        result["issues"].append("❌ Expected a clarifying question but got a long response")
    if expect.get("should_clarify") is False and is_clarify and not is_solve:
        result["issues"].append("❌ Asked clarifying question when context was already clear")
    if expect.get("should_solve") and not is_solve:
        result["issues"].append("❌ Expected a substantive solution but response is too short")
    if expect.get("should_invite") and not is_invite:
        result["issues"].append("❌ Missing specialist invite at the end")
    if expect.get("should_invite") is False and is_invite and not expect.get("ceo_triggered"):
        result["issues"].append("⚠️  Invite present when not expected (minor)")
    if expect.get("ceo_triggered") and not ceo_triggered:
        result["issues"].append("❌ CEO escalation not triggered after clear user consent")
    if expect.get("no_bullets") and has_bullets:
        result["issues"].append("❌ Response uses bullet points — must be prose only")
    if expect.get("language") == "fi" and not _looks_finnish(response):
        result["issues"].append("❌ Response does not appear to be in Finnish")

    if result["issues"]:
        result["pass"] = False
    return result


def main():
    """Run every case in TESTS, print a report, and return the exit status.

    Returns 0 when all cases pass and 1 otherwise.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print("  BRÄNDIVÄLKKY PROMPT EVALUATION SUITE")
    print(f"{rule}\n")

    outcomes = []
    for case in TESTS:
        print(f"Running [{case['id']}] {case['label']}...")
        print(f"  Note: {case['expect']['note']}")
        reply, escalated = call_chat(case["message"], case["history"])
        verdict = grade(case, reply, escalated)
        outcomes.append(verdict)

        if verdict["pass"]:
            status = "✅ PASS"
        else:
            status = "❌ FAIL"
        print(f"  Status: {status}")
        # An empty issue list simply prints nothing.
        for problem in verdict["issues"]:
            print(f"    {problem}")
        print(f"  Response preview: {verdict['response_preview'][:200]!r}")
        print()

    failures = [verdict for verdict in outcomes if not verdict["pass"]]
    total = len(outcomes)
    passed = total - len(failures)

    print(rule)
    print(f"  RESULT: {passed}/{total} tests passed")
    print(f"{rule}\n")

    if not failures:
        return 0

    print("Failed tests:")
    for verdict in failures:
        print(f"  - [{verdict['id']}] {verdict['label']}")
        for problem in verdict["issues"]:
            print(f"      {problem}")
    return 1


# Use main()'s return value (0 = all passed, 1 = at least one failure) as the
# process exit status when the file is run as a script.
if __name__ == "__main__":
    sys.exit(main())