#!/usr/bin/env python3
"""Test script for AI chatbot prompts.

Sends test prompts to the chatbot API and generates a report on what worked.

Run from the backend directory:
    PYTHONPATH=. uv run python scripts/test_chatbot_prompts.py

Usage:
    python scripts/test_chatbot_prompts.py [--base-url URL] [--user-id UUID]
"""
import argparse
import asyncio
import json
import sys
import uuid
from datetime import datetime
from typing import Any

import httpx
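
# The script exercises a chat endpoint at POST /api/{user_id}/chat. Inferred
# from send_prompt() below, the assumed contract is:
#
#   request:  {"message": "<prompt>", "conversation_id": "<id or null>"}
#   response: {"response": "<assistant text>", "conversation_id": "<id>"}
#
# This shape is an assumption read off the code, not a verified schema; if the
# API nests the reply differently, adjust send_prompt() accordingly.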

# Test prompts organized by tool/functionality
TEST_CASES = [
    # 1. add_task tests
    {
        "category": "add_task",
        "prompt": "Add a task to buy groceries",
        "expected_indicators": ["added", "created", "task", "groceries"],
        "expected_tool": "add_task"
    },
    {
        "category": "add_task",
        "prompt": "Create a high priority task called 'Finish project report' due tomorrow",
        "expected_indicators": ["added", "created", "task", "high priority"],
        "expected_tool": "add_task"
    },
    # 2. list_tasks tests
    {
        "category": "list_tasks",
        "prompt": "What are my tasks?",
        "expected_indicators": ["task"],
        "expected_tool": "list_tasks"
    },
    {
        "category": "list_tasks",
        "prompt": "Show me my pending tasks",
        "expected_indicators": ["task", "pending"],
        "expected_tool": "list_tasks"
    },
    # 3. update_task tests (requires existing task)
    {
        "category": "update_task",
        "prompt": "Change my first task to high priority",
        "expected_indicators": ["updated", "changed", "priority"],
        "expected_tool": "update_task",
        "note": "Requires at least one existing task"
    },
    # 4. complete_task tests
    {
        "category": "complete_task",
        "prompt": "Mark my first task as complete",
        "expected_indicators": ["complete", "done", "marked"],
        "expected_tool": "complete_task",
        "note": "Requires at least one existing task"
    },
    {
        "category": "complete_task",
        "prompt": "Mark all my tasks as complete",
        "expected_indicators": ["complete", "marked"],
        "expected_tool": "complete_all_tasks"
    },
    # 5. delete_task tests
    {
        "category": "delete_task",
        "prompt": "Delete my last task",
        "expected_indicators": ["deleted", "removed"],
        "expected_tool": "delete_task",
        "note": "Requires at least one existing task"
    },
    {
        "category": "delete_all_tasks",
        "prompt": "Delete all my tasks",
        "expected_indicators": ["delete", "confirm"],
        "expected_tool": "delete_all_tasks"
    },
    # 6. Edge cases
    {
        "category": "edge_case",
        "prompt": "What are my tasks?",
        "expected_indicators": [],
        "expected_tool": None,
        "note": "Empty list - should handle gracefully"
    },
    # 7. Ambiguous references
    {
        "category": "ambiguous_reference",
        "prompt": "Show me my tasks",
        "expected_indicators": ["task"],
        "expected_tool": "list_tasks",
        "note": "Priming for ambiguous reference"
    },
]
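
# Note: run_all_tests() sends these prompts sequentially over a single shared
# conversation, so later cases depend on earlier ones. For example, the
# "edge_case" entry only sees an empty list if "Delete all my tasks" actually
# cleared the tasks created above (the bot may instead ask for confirmation).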


class ChatbotTester:
    """Test chatbot with various prompts."""

    def __init__(self, base_url: str, user_id: str, timeout: float = 30.0):
        self.base_url = base_url.rstrip("/")
        self.user_id = user_id
        self.timeout = timeout
        self.conversation_id: str | None = None
        self.results: list[dict[str, Any]] = []

    async def send_prompt(self, prompt: str) -> dict[str, Any]:
        """Send a prompt to the chatbot API."""
        url = f"{self.base_url}/api/{self.user_id}/chat"
        payload = {
            "message": prompt,
            "conversation_id": self.conversation_id
        }
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            try:
                response = await client.post(url, json=payload)
                response.raise_for_status()
                data = response.json()
                # Update conversation_id for the next request
                self.conversation_id = data.get("conversation_id")
                return {
                    "success": True,
                    "status_code": response.status_code,
                    "response": data.get("response", ""),
                    "conversation_id": data.get("conversation_id"),
                    "error": None
                }
            except httpx.HTTPStatusError as e:
                return {
                    "success": False,
                    "status_code": e.response.status_code,
                    "response": None,
                    "conversation_id": self.conversation_id,
                    "error": f"HTTP {e.response.status_code}: {e.response.text}"
                }
            except httpx.RequestError as e:
                return {
                    "success": False,
                    "status_code": None,
                    "response": None,
                    "conversation_id": self.conversation_id,
                    "error": f"Request error: {e}"
                }
            except Exception as e:
                return {
                    "success": False,
                    "status_code": None,
                    "response": None,
                    "conversation_id": self.conversation_id,
                    "error": f"Unexpected error: {e}"
                }

    def check_indicators(self, response_text: str, indicators: list[str]) -> bool:
        """Check whether at least one expected indicator appears in the response."""
        if not indicators:
            return True
        response_lower = response_text.lower()
        return any(ind in response_lower for ind in indicators)
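
    # Matching is deliberately loose: for the response "Task added!", the
    # indicator list ["added", "created"] passes because a single case-
    # insensitive match is enough, while ["deleted", "removed"] would fail.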

    async def run_test_case(self, test_case: dict[str, Any], index: int) -> dict[str, Any]:
        """Run a single test case."""
        prompt = test_case["prompt"]
        category = test_case["category"]
        expected_indicators = test_case.get("expected_indicators", [])
        expected_tool = test_case.get("expected_tool")

        print(f"\n[{index}] Testing: {category}")
        print(f"  Prompt: \"{prompt}\"")

        result = await self.send_prompt(prompt)

        # Determine if the test passed
        passed = False
        failure_reason = ""
        if not result["success"]:
            failure_reason = f"Request failed: {result['error']}"
        elif result["response"] is None:
            failure_reason = "No response received"
        elif expected_indicators and not self.check_indicators(result["response"], expected_indicators):
            missing = [i for i in expected_indicators if i not in result["response"].lower()]
            failure_reason = f"Missing indicators: {missing}"
        else:
            passed = True

        return {
            "index": index,
            "category": category,
            "prompt": prompt,
            "expected_tool": expected_tool,
            "passed": passed,
            "failure_reason": failure_reason,
            "response": result.get("response") if result["success"] else None,
            "error": result.get("error"),
            "status_code": result.get("status_code"),
            "note": test_case.get("note", "")
        }

    async def run_all_tests(self) -> dict[str, Any]:
        """Run all test cases."""
        print(f"\n{'='*60}")
        print("Chatbot Test Suite")
        print(f"Target: {self.base_url}")
        print(f"User ID: {self.user_id}")
        print(f"Started at: {datetime.now().isoformat()}")
        print(f"{'='*60}")

        start_time = datetime.now()
        for i, test_case in enumerate(TEST_CASES, 1):
            result = await self.run_test_case(test_case, i)
            self.results.append(result)
            status = "✓ PASS" if result["passed"] else "✗ FAIL"
            print(f"  {status}")
            if result["response"]:
                response_preview = result["response"][:100]
                if len(result["response"]) > 100:
                    response_preview += "..."
                print(f"  Response: \"{response_preview}\"")
            elif result["error"]:
                print(f"  Error: {result['error']}")
        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()

        return self.generate_report(duration)

    def generate_report(self, duration: float) -> dict[str, Any]:
        """Generate test report."""
        total = len(self.results)
        passed = sum(1 for r in self.results if r["passed"])
        failed = total - passed
        pass_rate = (passed / total * 100) if total > 0 else 0

        # Group results by category
        by_category: dict[str, dict[str, int]] = {}
        for result in self.results:
            cat = result["category"]
            if cat not in by_category:
                by_category[cat] = {"passed": 0, "failed": 0, "total": 0}
            by_category[cat]["total"] += 1
            if result["passed"]:
                by_category[cat]["passed"] += 1
            else:
                by_category[cat]["failed"] += 1

        return {
            "summary": {
                "total": total,
                "passed": passed,
                "failed": failed,
                "pass_rate": f"{pass_rate:.1f}%",
                "duration_seconds": duration
            },
            "by_category": by_category,
            "results": self.results
        }

    def print_report(self, report: dict[str, Any]) -> None:
        """Print formatted report."""
        print(f"\n{'='*60}")
        print("TEST REPORT")
        print(f"{'='*60}")

        summary = report["summary"]
        print("\nSummary:")
        print(f"  Total Tests: {summary['total']}")
        print(f"  Passed: {summary['passed']} ✓")
        print(f"  Failed: {summary['failed']} ✗")
        print(f"  Pass Rate: {summary['pass_rate']}")
        print(f"  Duration: {summary['duration_seconds']:.2f}s")

        print("\nResults by Category:")
        for cat, stats in report["by_category"].items():
            print(f"  {cat}:")
            print(f"    Passed: {stats['passed']}/{stats['total']}")

        if summary["failed"] > 0:
            print(f"\n{'='*60}")
            print("Failed Tests:")
            print(f"{'='*60}")
            for result in report["results"]:
                if not result["passed"]:
                    print(f"\n[{result['index']}] {result['category']}")
                    print(f"  Prompt: \"{result['prompt']}\"")
                    print(f"  Reason: {result['failure_reason']}")
                    if result["note"]:
                        print(f"  Note: {result['note']}")

        print(f"\n{'='*60}")

    def save_report(self, report: dict[str, Any], output_path: str) -> None:
        """Save report to a JSON file."""
        with open(output_path, "w") as f:
            json.dump(report, f, indent=2)
        print(f"Report saved to: {output_path}")


async def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description="Test chatbot with sample prompts")
    parser.add_argument(
        "--base-url",
        default="http://localhost:8000",
        help="Base URL of the chatbot API (default: http://localhost:8000)"
    )
    parser.add_argument(
        "--user-id",
        default=str(uuid.uuid4()),
        help="User ID for testing (default: random UUID)"
    )
    parser.add_argument(
        "--output",
        default="test_chatbot_report.json",
        help="Output file for JSON report (default: test_chatbot_report.json)"
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="Request timeout in seconds (default: 30.0)"
    )
    args = parser.parse_args()

    tester = ChatbotTester(
        base_url=args.base_url,
        user_id=args.user_id,
        timeout=args.timeout
    )
    report = await tester.run_all_tests()
    tester.print_report(report)
    tester.save_report(report, args.output)

    # Exit with a non-zero code if any tests failed
    sys.exit(0 if report["summary"]["failed"] == 0 else 1)


if __name__ == "__main__":
    asyncio.run(main())
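

# Example (illustrative) invocation against a locally running backend:
#   PYTHONPATH=. uv run python scripts/test_chatbot_prompts.py \
#       --base-url http://localhost:8000 --timeout 60 --output report.json
# The process exits 0 only when every test passes, so it can gate a CI step.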