# todoappapi/scripts/test_chatbot_prompts.py
# Author: GrowWithTalha — "feat: sync backend changes from main repo" (commit dc3879e)
#!/usr/bin/env python3
"""Test script for AI chatbot prompts.
Sends test prompts to the chatbot API and generates a report on what worked.
Run from backend directory: PYTHONPATH=. uv run python scripts/test_chatbot_prompts.py
Usage:
python scripts/test_chatbot_prompts.py [--base-url URL] [--user-id UUID]
"""
import argparse
import asyncio
import json
import sys
import uuid
from datetime import datetime
from pathlib import Path
from typing import Any
import httpx
# Test prompts organized by tool/functionality.
# Each case is a dict with:
#   category            - label used to group results in the report
#   prompt              - message sent verbatim to the chat endpoint
#   expected_indicators - substrings, ANY of which must appear
#                         (case-insensitively) in the response for a pass;
#                         an empty list means any successful response passes
#   expected_tool       - tool the bot is expected to invoke (informational
#                         only; the checker does not verify tool invocation)
#   note                - optional caveat about the case's preconditions
TEST_CASES: list[dict[str, Any]] = [
# 1. add_task tests
{
"category": "add_task",
"prompt": "Add a task to buy groceries",
"expected_indicators": ["added", "created", "task", "groceries"],
"expected_tool": "add_task"
},
{
"category": "add_task",
"prompt": "Create a high priority task called 'Finish project report' due tomorrow",
"expected_indicators": ["added", "created", "task", "high priority"],
"expected_tool": "add_task"
},
# 2. list_tasks tests
{
"category": "list_tasks",
"prompt": "What are my tasks?",
"expected_indicators": ["task"],
"expected_tool": "list_tasks"
},
{
"category": "list_tasks",
"prompt": "Show me my pending tasks",
"expected_indicators": ["task", "pending"],
"expected_tool": "list_tasks"
},
# 3. update_task tests (requires existing task)
{
"category": "update_task",
"prompt": "Change my first task to high priority",
"expected_indicators": ["updated", "changed", "priority"],
"expected_tool": "update_task",
"note": "Requires at least one existing task"
},
# 4. complete_task tests
{
"category": "complete_task",
"prompt": "Mark my first task as complete",
"expected_indicators": ["complete", "done", "marked"],
"expected_tool": "complete_task",
"note": "Requires at least one existing task"
},
{
"category": "complete_task",
"prompt": "Mark all my tasks as complete",
"expected_indicators": ["complete", "marked"],
"expected_tool": "complete_all_tasks"
},
# 5. delete_task tests
{
"category": "delete_task",
"prompt": "Delete my last task",
"expected_indicators": ["deleted", "removed"],
"expected_tool": "delete_task",
"note": "Requires at least one existing task"
},
{
"category": "delete_all_tasks",
"prompt": "Delete all my tasks",
"expected_indicators": ["delete", "confirm"],
"expected_tool": "delete_all_tasks"
},
# 6. Edge cases
# NOTE(review): cases run sequentially in one conversation, so by this point
# earlier cases have created tasks — the "empty list" precondition likely
# does not hold. TODO confirm intended ordering.
{
"category": "edge_case",
"prompt": "What are my tasks?",
"expected_indicators": [],
"expected_tool": None,
"note": "Empty list - should handle gracefully"
},
# 7. Ambiguous references
{
"category": "ambiguous_reference",
"prompt": "Show me my tasks",
"expected_indicators": ["task"],
"expected_tool": "list_tasks",
"note": "Priming for ambiguous reference"
},
]
class ChatbotTester:
"""Test chatbot with various prompts."""
def __init__(self, base_url: str, user_id: str, timeout: float = 30.0):
self.base_url = base_url.rstrip("/")
self.user_id = user_id
self.timeout = timeout
self.conversation_id: str | None = None
self.results: list[dict[str, Any]] = []
async def send_prompt(self, prompt: str) -> dict[str, Any]:
"""Send a prompt to the chatbot API."""
url = f"{self.base_url}/api/{self.user_id}/chat"
payload = {
"message": prompt,
"conversation_id": self.conversation_id
}
async with httpx.AsyncClient(timeout=self.timeout) as client:
try:
response = await client.post(url, json=payload)
response.raise_for_status()
data = response.json()
# Update conversation_id for next request
self.conversation_id = data.get("conversation_id")
return {
"success": True,
"status_code": response.status_code,
"response": data.get("response", ""),
"conversation_id": data.get("conversation_id"),
"error": None
}
except httpx.HTTPStatusError as e:
return {
"success": False,
"status_code": e.response.status_code,
"response": None,
"conversation_id": self.conversation_id,
"error": f"HTTP {e.response.status_code}: {e.response.text}"
}
except httpx.RequestError as e:
return {
"success": False,
"status_code": None,
"response": None,
"conversation_id": self.conversation_id,
"error": f"Request error: {str(e)}"
}
except Exception as e:
return {
"success": False,
"status_code": None,
"response": None,
"conversation_id": self.conversation_id,
"error": f"Unexpected error: {str(e)}"
}
def check_indicators(self, response_text: str, indicators: list[str]) -> bool:
"""Check if expected indicators are present in response."""
if not indicators:
return True
response_lower = response_text.lower()
return any(ind in response_lower for ind in indicators)
async def run_test_case(self, test_case: dict[str, Any], index: int) -> dict[str, Any]:
"""Run a single test case."""
prompt = test_case["prompt"]
category = test_case["category"]
expected_indicators = test_case.get("expected_indicators", [])
expected_tool = test_case.get("expected_tool")
print(f"\n[{index}] Testing: {category}")
print(f" Prompt: \"{prompt}\"")
result = await self.send_prompt(prompt)
# Determine if test passed
passed = False
failure_reason = ""
if not result["success"]:
failure_reason = f"Request failed: {result['error']}"
elif result["response"] is None:
failure_reason = "No response received"
elif expected_indicators and not self.check_indicators(result["response"], expected_indicators):
missing = [i for i in expected_indicators if i not in result["response"].lower()]
failure_reason = f"Missing indicators: {missing}"
else:
passed = True
return {
"index": index,
"category": category,
"prompt": prompt,
"expected_tool": expected_tool,
"passed": passed,
"failure_reason": failure_reason,
"response": result.get("response") if result["success"] else None,
"error": result.get("error"),
"status_code": result.get("status_code"),
"note": test_case.get("note", "")
}
async def run_all_tests(self) -> dict[str, Any]:
"""Run all test cases."""
print(f"\n{'='*60}")
print(f"Chatbot Test Suite")
print(f"Target: {self.base_url}")
print(f"User ID: {self.user_id}")
print(f"Started at: {datetime.now().isoformat()}")
print(f"{'='*60}")
start_time = datetime.now()
for i, test_case in enumerate(TEST_CASES, 1):
result = await self.run_test_case(test_case, i)
self.results.append(result)
status = "✓ PASS" if result["passed"] else "✗ FAIL"
print(f" {status}")
if result["response"]:
response_preview = result["response"][:100]
if len(result["response"]) > 100:
response_preview += "..."
print(f" Response: \"{response_preview}\"")
elif result["error"]:
print(f" Error: {result['error']}")
end_time = datetime.now()
duration = (end_time - start_time).total_seconds()
return self.generate_report(duration)
def generate_report(self, duration: float) -> dict[str, Any]:
"""Generate test report."""
total = len(self.results)
passed = sum(1 for r in self.results if r["passed"])
failed = total - passed
pass_rate = (passed / total * 100) if total > 0 else 0
# Group by category
by_category: dict[str, dict[str, int]] = {}
for result in self.results:
cat = result["category"]
if cat not in by_category:
by_category[cat] = {"passed": 0, "failed": 0, "total": 0}
by_category[cat]["total"] += 1
if result["passed"]:
by_category[cat]["passed"] += 1
else:
by_category[cat]["failed"] += 1
return {
"summary": {
"total": total,
"passed": passed,
"failed": failed,
"pass_rate": f"{pass_rate:.1f}%",
"duration_seconds": duration
},
"by_category": by_category,
"results": self.results
}
def print_report(self, report: dict[str, Any]) -> None:
"""Print formatted report."""
print(f"\n{'='*60}")
print(f"TEST REPORT")
print(f"{'='*60}")
summary = report["summary"]
print(f"\nSummary:")
print(f" Total Tests: {summary['total']}")
print(f" Passed: {summary['passed']} ✓")
print(f" Failed: {summary['failed']} ✗")
print(f" Pass Rate: {summary['pass_rate']}")
print(f" Duration: {summary['duration_seconds']:.2f}s")
print(f"\nResults by Category:")
for cat, stats in report["by_category"].items():
print(f" {cat}:")
print(f" Passed: {stats['passed']}/{stats['total']}")
if summary["failed"] > 0:
print(f"\n{'='*60}")
print(f"Failed Tests:")
print(f"{'='*60}")
for result in report["results"]:
if not result["passed"]:
print(f"\n[{result['index']}] {result['category']}")
print(f" Prompt: \"{result['prompt']}\"")
print(f" Reason: {result['failure_reason']}")
if result["note"]:
print(f" Note: {result['note']}")
print(f"\n{'='*60}")
def save_report(self, report: dict[str, Any], output_path: str) -> None:
"""Save report to JSON file."""
with open(output_path, "w") as f:
json.dump(report, f, indent=2)
print(f"Report saved to: {output_path}")
async def main():
    """CLI entry point: parse options, run the suite, print and save the report.

    Exits with status 0 when every test passed, 1 otherwise.
    """
    cli = argparse.ArgumentParser(description="Test chatbot with sample prompts")
    cli.add_argument(
        "--base-url",
        default="http://localhost:8000",
        help="Base URL of the chatbot API (default: http://localhost:8000)",
    )
    cli.add_argument(
        "--user-id",
        default=str(uuid.uuid4()),
        help="User ID for testing (default: random UUID)",
    )
    cli.add_argument(
        "--output",
        default="test_chatbot_report.json",
        help="Output file for JSON report (default: test_chatbot_report.json)",
    )
    cli.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="Request timeout in seconds (default: 30.0)",
    )
    options = cli.parse_args()

    runner = ChatbotTester(
        base_url=options.base_url,
        user_id=options.user_id,
        timeout=options.timeout,
    )
    report = await runner.run_all_tests()
    runner.print_report(report)
    runner.save_report(report, options.output)

    # Non-zero exit status signals failures to CI.
    exit_code = 0 if report["summary"]["failed"] == 0 else 1
    sys.exit(exit_code)


if __name__ == "__main__":
    asyncio.run(main())