Spaces:

GrowWithTalha
/

todoappapi

Running

File size: 12,234 Bytes

dc3879e

#!/usr/bin/env python3
"""Test script for AI chatbot prompts.

Sends test prompts to the chatbot API and generates a report on what worked.
Run from backend directory: PYTHONPATH=. uv run python scripts/test_chatbot_prompts.py

Usage:
    python scripts/test_chatbot_prompts.py [--base-url URL] [--user-id UUID] [--output FILE] [--timeout SECONDS]
"""
import argparse
import asyncio
import json
import sys
import uuid
from datetime import datetime
from pathlib import Path
from typing import Any

import httpx


# Test prompts organized by tool/functionality
TEST_CASES = [
    # 1. add_task tests
    {
        "category": "add_task",
        "prompt": "Add a task to buy groceries",
        "expected_indicators": ["added", "created", "task", "groceries"],
        "expected_tool": "add_task"
    },
    {
        "category": "add_task",
        "prompt": "Create a high priority task called 'Finish project report' due tomorrow",
        "expected_indicators": ["added", "created", "task", "high priority"],
        "expected_tool": "add_task"
    },

    # 2. list_tasks tests
    {
        "category": "list_tasks",
        "prompt": "What are my tasks?",
        "expected_indicators": ["task"],
        "expected_tool": "list_tasks"
    },
    {
        "category": "list_tasks",
        "prompt": "Show me my pending tasks",
        "expected_indicators": ["task", "pending"],
        "expected_tool": "list_tasks"
    },

    # 3. update_task tests (requires existing task)
    {
        "category": "update_task",
        "prompt": "Change my first task to high priority",
        "expected_indicators": ["updated", "changed", "priority"],
        "expected_tool": "update_task",
        "note": "Requires at least one existing task"
    },

    # 4. complete_task tests
    {
        "category": "complete_task",
        "prompt": "Mark my first task as complete",
        "expected_indicators": ["complete", "done", "marked"],
        "expected_tool": "complete_task",
        "note": "Requires at least one existing task"
    },
    {
        "category": "complete_task",
        "prompt": "Mark all my tasks as complete",
        "expected_indicators": ["complete", "marked"],
        "expected_tool": "complete_all_tasks"
    },

    # 5. delete_task tests
    {
        "category": "delete_task",
        "prompt": "Delete my last task",
        "expected_indicators": ["deleted", "removed"],
        "expected_tool": "delete_task",
        "note": "Requires at least one existing task"
    },
    {
        "category": "delete_all_tasks",
        "prompt": "Delete all my tasks",
        "expected_indicators": ["delete", "confirm"],
        "expected_tool": "delete_all_tasks"
    },

    # 6. Edge cases
    {
        "category": "edge_case",
        "prompt": "What are my tasks?",
        "expected_indicators": [],
        "expected_tool": None,
        "note": "Empty list - should handle gracefully"
    },

    # 7. Ambiguous references
    {
        "category": "ambiguous_reference",
        "prompt": "Show me my tasks",
        "expected_indicators": ["task"],
        "expected_tool": "list_tasks",
        "note": "Priming for ambiguous reference"
    },
]


class ChatbotTester:
    """Send the prompts in ``TEST_CASES`` to the chatbot API and report results.

    Prompts are sent one after another; the conversation id returned by the
    API is remembered and sent with the next request so the whole run shares
    one conversation. A test passes when the request succeeds and, if the
    case lists expected indicators, at least one of them occurs in the reply
    (case-insensitive substring match). ``expected_tool`` is recorded in the
    report but is not verified against the reply.
    """

    def __init__(self, base_url: str, user_id: str, timeout: float = 30.0):
        """Store connection settings and start with an empty session.

        Args:
            base_url: Root URL of the API; trailing slashes are removed.
            user_id: Identifier placed in the chat endpoint path.
            timeout: Per-request timeout in seconds passed to httpx.
        """
        self.base_url = base_url.rstrip("/")
        self.user_id = user_id
        self.timeout = timeout
        # Conversation id returned by the last successful reply (None = new).
        self.conversation_id: str | None = None
        # One result dict per executed test case, in execution order.
        self.results: list[dict[str, Any]] = []

    async def send_prompt(self, prompt: str) -> dict[str, Any]:
        """POST one prompt to ``{base_url}/api/{user_id}/chat``.

        Failures are never raised; they are reported through the returned
        dict so the caller can keep running the remaining test cases.

        Args:
            prompt: The user message to send.

        Returns:
            A dict with the keys ``success``, ``status_code``, ``response``,
            ``conversation_id`` and ``error``. On failure ``response`` is
            ``None`` and ``error`` holds a description.
        """
        url = f"{self.base_url}/api/{self.user_id}/chat"
        payload = {
            "message": prompt,
            "conversation_id": self.conversation_id,
        }

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            try:
                response = await client.post(url, json=payload)
                response.raise_for_status()
                data = response.json()
                if not isinstance(data, dict):
                    # Surfaces as a clear "Unexpected error" below instead of
                    # an opaque AttributeError from calling .get on a list.
                    raise ValueError(
                        f"expected a JSON object, got {type(data).__name__}"
                    )

                # Carry the conversation forward. If the reply omits the id,
                # keep the current one rather than silently starting over.
                self.conversation_id = (
                    data.get("conversation_id") or self.conversation_id
                )

                return {
                    "success": True,
                    "status_code": response.status_code,
                    "response": data.get("response", ""),
                    "conversation_id": self.conversation_id,
                    "error": None,
                }
            except httpx.HTTPStatusError as e:
                return {
                    "success": False,
                    "status_code": e.response.status_code,
                    "response": None,
                    "conversation_id": self.conversation_id,
                    "error": f"HTTP {e.response.status_code}: {e.response.text}",
                }
            except httpx.RequestError as e:
                return {
                    "success": False,
                    "status_code": None,
                    "response": None,
                    "conversation_id": self.conversation_id,
                    "error": f"Request error: {str(e)}",
                }
            except Exception as e:
                # Deliberate catch-all (e.g. invalid JSON): one bad reply must
                # not abort the whole run; it is recorded as a failed request.
                return {
                    "success": False,
                    "status_code": None,
                    "response": None,
                    "conversation_id": self.conversation_id,
                    "error": f"Unexpected error: {str(e)}",
                }

    def check_indicators(self, response_text: str, indicators: list[str]) -> bool:
        """Return True if any indicator occurs in ``response_text``.

        Matching is a case-insensitive substring test on both sides, so an
        indicator written with capitals still matches. An empty indicator
        list always passes.
        """
        if not indicators:
            return True
        haystack = response_text.lower()
        return any(ind.lower() in haystack for ind in indicators)

    async def run_test_case(self, test_case: dict[str, Any], index: int) -> dict[str, Any]:
        """Send one test case's prompt and evaluate the reply.

        Args:
            test_case: Dict with ``prompt`` and ``category`` and optionally
                ``expected_indicators``, ``expected_tool`` and ``note``.
            index: 1-based position used in console output and the report.

        Returns:
            A result dict with ``passed``, ``failure_reason`` and the
            request details (``response``, ``error``, ``status_code``).
        """
        prompt = test_case["prompt"]
        category = test_case["category"]
        expected_indicators = test_case.get("expected_indicators", [])
        expected_tool = test_case.get("expected_tool")

        print(f"\n[{index}] Testing: {category}")
        print(f"    Prompt: \"{prompt}\"")

        result = await self.send_prompt(prompt)
        response_text = result["response"]

        # Determine if test passed
        passed = False
        failure_reason = ""

        if not result["success"]:
            failure_reason = f"Request failed: {result['error']}"
        elif response_text is None:
            failure_reason = "No response received"
        elif not isinstance(response_text, str):
            # The API returned something other than text in "response";
            # fail the case instead of crashing on str-only operations.
            failure_reason = (
                f"Unexpected response type: {type(response_text).__name__}"
            )
        elif expected_indicators and not self.check_indicators(response_text, expected_indicators):
            haystack = response_text.lower()
            missing = [i for i in expected_indicators if i.lower() not in haystack]
            failure_reason = f"Missing indicators: {missing}"
        else:
            passed = True

        return {
            "index": index,
            "category": category,
            "prompt": prompt,
            "expected_tool": expected_tool,
            "passed": passed,
            "failure_reason": failure_reason,
            "response": result.get("response") if result["success"] else None,
            "error": result.get("error"),
            "status_code": result.get("status_code"),
            "note": test_case.get("note", ""),
        }

    async def run_all_tests(self) -> dict[str, Any]:
        """Run every case in ``TEST_CASES`` sequentially and build the report.

        Progress is printed per case. Results from any previous run on this
        instance are discarded first so the report covers only this run.

        Returns:
            The report dict produced by ``generate_report``.
        """
        rule = "=" * 60
        print(f"\n{rule}")
        print("Chatbot Test Suite")
        print(f"Target: {self.base_url}")
        print(f"User ID: {self.user_id}")
        print(f"Started at: {datetime.now().isoformat()}")
        print(rule)

        # Fresh list per run: avoids double counting on repeated runs and
        # keeps previously returned reports from being mutated.
        self.results = []
        start_time = datetime.now()

        for i, test_case in enumerate(TEST_CASES, 1):
            result = await self.run_test_case(test_case, i)
            self.results.append(result)

            status = "✓ PASS" if result["passed"] else "✗ FAIL"
            print(f"    {status}")

            if result["response"]:
                # str() is a no-op for text and makes non-text payloads
                # sliceable for the preview.
                text = str(result["response"])
                response_preview = text[:100]
                if len(text) > 100:
                    response_preview += "..."
                print(f"    Response: \"{response_preview}\"")
            elif result["error"]:
                print(f"    Error: {result['error']}")

        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()

        return self.generate_report(duration)

    def generate_report(self, duration: float) -> dict[str, Any]:
        """Summarize ``self.results`` into a report dict.

        Args:
            duration: Total run time in seconds, stored as-is in the summary.

        Returns:
            Dict with ``summary`` (totals, pass rate formatted like
            ``"66.7%"``, duration), ``by_category`` (passed/failed/total per
            category) and ``results`` (the raw per-case result list).
        """
        total = len(self.results)
        passed = sum(1 for r in self.results if r["passed"])
        failed = total - passed
        # Guard against division by zero when no tests were run.
        pass_rate = (passed / total * 100) if total > 0 else 0

        # Group by category
        by_category: dict[str, dict[str, int]] = {}
        for result in self.results:
            stats = by_category.setdefault(
                result["category"], {"passed": 0, "failed": 0, "total": 0}
            )
            stats["total"] += 1
            stats["passed" if result["passed"] else "failed"] += 1

        return {
            "summary": {
                "total": total,
                "passed": passed,
                "failed": failed,
                "pass_rate": f"{pass_rate:.1f}%",
                "duration_seconds": duration,
            },
            "by_category": by_category,
            "results": self.results,
        }

    def print_report(self, report: dict[str, Any]) -> None:
        """Print the summary, per-category counts and failed cases to stdout.

        Args:
            report: A report dict as returned by ``generate_report``.
        """
        rule = "=" * 60
        print(f"\n{rule}")
        print("TEST REPORT")
        print(rule)

        summary = report["summary"]
        print("\nSummary:")
        print(f"  Total Tests:  {summary['total']}")
        print(f"  Passed:       {summary['passed']} ✓")
        print(f"  Failed:       {summary['failed']} ✗")
        print(f"  Pass Rate:    {summary['pass_rate']}")
        print(f"  Duration:     {summary['duration_seconds']:.2f}s")

        print("\nResults by Category:")
        for cat, stats in report["by_category"].items():
            print(f"  {cat}:")
            print(f"    Passed: {stats['passed']}/{stats['total']}")

        if summary["failed"] > 0:
            print(f"\n{rule}")
            print("Failed Tests:")
            print(rule)
            for result in report["results"]:
                if not result["passed"]:
                    print(f"\n[{result['index']}] {result['category']}")
                    print(f"  Prompt: \"{result['prompt']}\"")
                    print(f"  Reason: {result['failure_reason']}")
                    if result["note"]:
                        print(f"  Note: {result['note']}")

        print(f"\n{rule}")

    def save_report(self, report: dict[str, Any], output_path: str) -> None:
        """Write the report as indented JSON to ``output_path``.

        The file is written as UTF-8 with non-ASCII characters kept as-is,
        so chatbot replies stay readable regardless of the platform's
        default encoding.

        Raises:
            OSError: If the file cannot be opened or written.
        """
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2, ensure_ascii=False)
        print(f"Report saved to: {output_path}")


async def main():
    """Parse CLI options, run the full prompt suite, then print and save the report.

    Terminates the process via ``sys.exit``: status 0 when every test passed,
    1 when at least one failed.
    """
    cli = argparse.ArgumentParser(description="Test chatbot with sample prompts")

    # (flag, add_argument keyword arguments) for each supported option.
    option_specs = (
        ("--base-url", {
            "default": "http://localhost:8000",
            "help": "Base URL of the chatbot API (default: http://localhost:8000)",
        }),
        ("--user-id", {
            # A fresh random user per invocation unless one is supplied.
            "default": str(uuid.uuid4()),
            "help": "User ID for testing (default: random UUID)",
        }),
        ("--output", {
            "default": "test_chatbot_report.json",
            "help": "Output file for JSON report (default: test_chatbot_report.json)",
        }),
        ("--timeout", {
            "type": float,
            "default": 30.0,
            "help": "Request timeout in seconds (default: 30.0)",
        }),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()

    runner = ChatbotTester(
        base_url=opts.base_url,
        user_id=opts.user_id,
        timeout=opts.timeout,
    )

    report = await runner.run_all_tests()
    runner.print_report(report)
    runner.save_report(report, opts.output)

    # Non-zero exit status signals failures to shells and CI.
    any_failed = report["summary"]["failed"] != 0
    sys.exit(1 if any_failed else 0)


# Script entry point: run the async main() on a new event loop. main() ends
# the process itself via sys.exit (0 if no test failed, otherwise 1).
if __name__ == "__main__":
    asyncio.run(main())