Spaces:

Cuong2004
/

LocalMate

Sleeping

App Files Files Community

LocalMate / tests /test_react_comparison.py

Cuong2004

fix ReAct loop

0140f42 about 2 months ago

raw

history blame

11.6 kB

	"""
	LocalMate Agent Test Script - Single vs ReAct Mode Comparison

	Tests 10 queries in both modes:
	- Single mode: configurable delay between queries
	- ReAct mode: configurable delay between queries
	- Configurable delay between modes

	Generates detailed report with all step inputs/outputs.
	"""

	import asyncio
	import json
	import time
	from datetime import datetime
	import httpx

	# =============================================================================
	# CONFIGURATION - Adjust these values as needed
	# =============================================================================

	# API Settings
	API_BASE = "https://cuong2004-localmate.hf.space/api/v1"
	USER_ID = "test_comparison"

	# Delay Settings (in seconds)
	SINGLE_MODE_DELAY = 10 # Delay between queries in single mode
	REACT_MODE_DELAY = 60 # Delay between queries in ReAct mode
	MODE_SWITCH_DELAY = 60 # Delay between switching modes
	REQUEST_TIMEOUT = 120 # Timeout for each API request

	# =============================================================================

	# Test Cases - 10 queries covering different scenarios
	TEST_CASES = [
	# {
	# "id": 1,
	# "query": "Quán cafe view đẹp",
	# "description": "Simple text search - no location",
	# "expected_tools": ["retrieve_context_text"],
	# },
	{
	"id": 2,
	"query": "Nhà hàng gần bãi biển Mỹ Khê",
	"description": "Location-based search",
	"expected_tools": ["find_nearby_places"],
	},
	# {
	# "id": 3,
	# "query": "Quán cafe có không gian xanh mát gần Cầu Rồng",
	# "description": "Complex: location + feature (should use multiple tools in ReAct)",
	# "expected_tools": ["find_nearby_places", "retrieve_context_text"],
	# },
	# {
	# "id": 4,
	# "query": "Phở ngon giá rẻ",
	# "description": "Food-specific text search",
	# "expected_tools": ["retrieve_context_text"],
	# },
	# {
	# "id": 5,
	# "query": "Địa điểm checkin đẹp gần Bà Nà",
	# "description": "Location + activity type",
	# "expected_tools": ["find_nearby_places"],
	# },
	# {
	# "id": 6,
	# "query": "Quán ăn hải sản có view sông gần trung tâm",
	# "description": "Complex: location + category + feature",
	# "expected_tools": ["find_nearby_places", "retrieve_context_text"],
	# },
	# {
	# "id": 7,
	# "query": "Khách sạn 5 sao gần biển",
	# "description": "Hotel + location search",
	# "expected_tools": ["find_nearby_places"],
	# },
	# {
	# "id": 8,
	# "query": "Quán bar có view đẹp về đêm",
	# "description": "Nightlife text search",
	# "expected_tools": ["retrieve_context_text"],
	# },
	# {
	# "id": 9,
	# "query": "Cafe rooftop gần Sơn Trà có coffee ngon",
	# "description": "Complex: location + feature + quality",
	# "expected_tools": ["find_nearby_places", "retrieve_context_text"],
	# },
	# {
	# "id": 10,
	# "query": "Nhà hàng Việt Nam authentic gần Rex Hotel",
	# "description": "Specific location + category + style",
	# "expected_tools": ["find_nearby_places", "retrieve_context_text"],
	# },
	]


	async def run_test(client: httpx.AsyncClient, test_case: dict, react_mode: bool) -> dict:
	"""Run a single test case and return results."""
	start_time = time.time()

	try:
	response = await client.post(
	f"{API_BASE}/chat",
	json={
	"message": test_case["query"],
	"user_id": USER_ID,
	"provider": "MegaLLM",
	"react_mode": react_mode,
	"max_steps": 5,
	},
	timeout=float(REQUEST_TIMEOUT),
	)

	duration = (time.time() - start_time) * 1000

	if response.status_code == 200:
	data = response.json()
	return {
	"success": True,
	"test_id": test_case["id"],
	"query": test_case["query"],
	"description": test_case["description"],
	"react_mode": react_mode,
	"response": data.get("response", "")[:300],
	"workflow": data.get("workflow", {}),
	"tools_used": data.get("tools_used", []),
	"api_duration_ms": data.get("duration_ms", 0),
	"total_duration_ms": duration,
	}
	else:
	return {
	"success": False,
	"test_id": test_case["id"],
	"query": test_case["query"],
	"react_mode": react_mode,
	"error": f"HTTP {response.status_code}: {response.text[:200]}",
	"total_duration_ms": duration,
	}

	except Exception as e:
	return {
	"success": False,
	"test_id": test_case["id"],
	"query": test_case["query"],
	"react_mode": react_mode,
	"error": str(e),
	"total_duration_ms": (time.time() - start_time) * 1000,
	}


	def format_workflow_steps(workflow: dict) -> str:
	"""Format workflow steps for report."""
	steps = workflow.get("steps", [])
	if not steps:
	return "No steps recorded"

	lines = []
	for step in steps:
	tool = step.get("tool", "N/A")
	purpose = step.get("purpose", "")
	results = step.get("results", 0)
	lines.append(f" - {step.get('step', 'Unknown')}")
	lines.append(f" Tool: `{tool}` \| Results: {results}")

	return "\n".join(lines)


	def generate_report(single_results: list, react_results: list) -> str:
	"""Generate detailed markdown report."""
	timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

	report = f"""# LocalMate Agent Test Report

	Generated: {timestamp}

	## Summary

	\| Metric \| Single Mode \| ReAct Mode \|
	\|--------\|-------------\|------------\|
	\| Total Tests \| {len(single_results)} \| {len(react_results)} \|
	\| Success \| {sum(1 for r in single_results if r.get('success'))} \| {sum(1 for r in react_results if r.get('success'))} \|
	\| Avg Duration \| {sum(r.get('api_duration_ms', 0) for r in single_results if r.get('success')) / max(1, sum(1 for r in single_results if r.get('success'))):.0f}ms \| {sum(r.get('api_duration_ms', 0) for r in react_results if r.get('success')) / max(1, sum(1 for r in react_results if r.get('success'))):.0f}ms \|

	---

	## Detailed Results

	"""

	for i, (single, react) in enumerate(zip(single_results, react_results)):
	test_id = single.get("test_id", i + 1)
	query = single.get("query", "N/A")
	description = single.get("description", "")

	report += f"""### Test Case {test_id}: {description}

	Query: `{query}`

	#### Single Mode

	"""
	if single.get("success"):
	report += f"""- Status: ✅ Success
	- Duration: {single.get('api_duration_ms', 0):.0f}ms
	- Tools Used: {', '.join(single.get('tools_used', [])) or 'None'}

	Workflow:
	{format_workflow_steps(single.get('workflow', {}))}

	Response Preview:
	> {single.get('response', 'N/A')[:200]}...

	"""
	else:
	report += f"""- Status: ❌ Failed
	- Error: {single.get('error', 'Unknown')}

	"""

	report += """#### ReAct Mode

	"""
	if react.get("success"):
	workflow = react.get("workflow", {})
	report += f"""- Status: ✅ Success
	- Duration: {react.get('api_duration_ms', 0):.0f}ms
	- Tools Used: {', '.join(react.get('tools_used', [])) or 'None'}
	- Steps: {len(workflow.get('steps', []))}
	- Intent Detected: {workflow.get('intent_detected', 'N/A')}

	Workflow Steps:
	{format_workflow_steps(workflow)}

	Response Preview:
	> {react.get('response', 'N/A')[:200]}...

	"""
	else:
	report += f"""- Status: ❌ Failed
	- Error: {react.get('error', 'Unknown')}

	"""

	report += "---\n\n"

	# Comparison analysis
	report += """## Analysis

	### Tool Usage Comparison

	\| Test \| Single Mode Tools \| ReAct Mode Tools \| ReAct Steps \|
	\|------\|-------------------\|------------------\|-------------\|
	"""

	for single, react in zip(single_results, react_results):
	test_id = single.get("test_id", "?")
	single_tools = ", ".join(single.get("tools_used", [])) if single.get("success") else "❌"
	react_tools = ", ".join(react.get("tools_used", [])) if react.get("success") else "❌"
	react_steps = len(react.get("workflow", {}).get("steps", [])) if react.get("success") else 0
	report += f"\| {test_id} \| {single_tools} \| {react_tools} \| {react_steps} \|\n"

	report += """

	### Key Observations

	1. Multi-tool queries: ReAct mode can chain multiple tools for complex queries
	2. Single-tool queries: Both modes perform similarly for simple queries
	3. Reasoning steps: ReAct mode shows explicit reasoning before each tool call

	"""

	return report


	async def main():
	"""Main test runner."""
	print("=" * 60)
	print("LocalMate Agent Mode Comparison Test")
	print("=" * 60)
	print()

	single_results = []
	react_results = []

	async with httpx.AsyncClient() as client:
	# Test Single Mode
	print(f"📌 Running Single Mode Tests ({SINGLE_MODE_DELAY}s delay)...")
	print("-" * 40)

	for test in TEST_CASES:
	print(f" Test {test['id']}: {test['query'][:40]}...")
	result = await run_test(client, test, react_mode=False)
	single_results.append(result)

	status = "✅" if result.get("success") else "❌"
	tools = ", ".join(result.get("tools_used", [])) or "None"
	print(f" {status} Tools: {tools} \| {result.get('api_duration_ms', 0):.0f}ms")

	if test["id"] < len(TEST_CASES):
	await asyncio.sleep(SINGLE_MODE_DELAY)

	print()
	print(f"⏸️ Waiting {MODE_SWITCH_DELAY}s before ReAct mode...")
	await asyncio.sleep(MODE_SWITCH_DELAY)

	# Test ReAct Mode
	print()
	print(f"🧠 Running ReAct Mode Tests ({REACT_MODE_DELAY}s delay)...")
	print("-" * 40)

	for test in TEST_CASES:
	print(f" Test {test['id']}: {test['query'][:40]}...")
	result = await run_test(client, test, react_mode=True)
	react_results.append(result)

	status = "✅" if result.get("success") else "❌"
	tools = ", ".join(result.get("tools_used", [])) or "None"
	steps = len(result.get("workflow", {}).get("steps", []))
	print(f" {status} Tools: {tools} \| Steps: {steps} \| {result.get('api_duration_ms', 0):.0f}ms")

	if test["id"] < len(TEST_CASES):
	await asyncio.sleep(REACT_MODE_DELAY)

	# Generate report
	print()
	print("📝 Generating report...")
	report = generate_report(single_results, react_results)

	# Use absolute path based on script location
	import os
	script_dir = os.path.dirname(os.path.abspath(__file__))
	report_path = os.path.join(script_dir, "react_comparison_report.md")
	with open(report_path, "w", encoding="utf-8") as f:
	f.write(report)

	print(f"✅ Report saved to: {report_path}")
	print()
	print("=" * 60)
	print("Test Complete!")
	print("=" * 60)


	if __name__ == "__main__":
	asyncio.run(main())