LocalMate / tests /test_react_comparison.py
Cuong2004's picture
fix ReAct loop
0140f42
raw
history blame
11.6 kB
"""
LocalMate Agent Test Script - Single vs ReAct Mode Comparison
Runs every query currently enabled in TEST_CASES (10 are defined) in both modes:
- Single mode: configurable delay between queries
- ReAct mode: configurable delay between queries
- Configurable delay between modes
Generates a detailed Markdown report listing, for each workflow step, its name, tool and result count.
"""
import asyncio
import json
import time
from datetime import datetime
import httpx
# =============================================================================
# CONFIGURATION - Adjust these values as needed
# =============================================================================
# API Settings
# Base URL of the API under test; run_test() POSTs to f"{API_BASE}/chat".
API_BASE = "https://cuong2004-localmate.hf.space/api/v1"
# Sent as the "user_id" field of every chat request.
USER_ID = "test_comparison"
# Delay Settings (in seconds)
SINGLE_MODE_DELAY = 10 # Pause between consecutive queries in single mode
REACT_MODE_DELAY = 60 # Pause between consecutive queries in ReAct mode
MODE_SWITCH_DELAY = 60 # Pause after the single-mode run, before the ReAct run starts
REQUEST_TIMEOUT = 120 # Per-request timeout; converted to float and passed to client.post()
# =============================================================================
# Test Cases - 10 queries covering different scenarios.
# Individual cases are switched on/off by (un)commenting their entry; only
# case 2 is enabled below.
# Each entry is a dict with:
#   "id"             - identifier printed in the console output and the report
#   "query"          - message text sent to the chat endpoint
#   "description"    - short label used as the report heading for the case
#   "expected_tools" - tools the case is expected to trigger (informational:
#                      nothing visible in this script compares against it)
TEST_CASES = [
# {
# "id": 1,
# "query": "Quán cafe view đẹp",
# "description": "Simple text search - no location",
# "expected_tools": ["retrieve_context_text"],
# },
{
"id": 2,
"query": "Nhà hàng gần bãi biển Mỹ Khê",
"description": "Location-based search",
"expected_tools": ["find_nearby_places"],
},
# {
# "id": 3,
# "query": "Quán cafe có không gian xanh mát gần Cầu Rồng",
# "description": "Complex: location + feature (should use multiple tools in ReAct)",
# "expected_tools": ["find_nearby_places", "retrieve_context_text"],
# },
# {
# "id": 4,
# "query": "Phở ngon giá rẻ",
# "description": "Food-specific text search",
# "expected_tools": ["retrieve_context_text"],
# },
# {
# "id": 5,
# "query": "Địa điểm checkin đẹp gần Bà Nà",
# "description": "Location + activity type",
# "expected_tools": ["find_nearby_places"],
# },
# {
# "id": 6,
# "query": "Quán ăn hải sản có view sông gần trung tâm",
# "description": "Complex: location + category + feature",
# "expected_tools": ["find_nearby_places", "retrieve_context_text"],
# },
# {
# "id": 7,
# "query": "Khách sạn 5 sao gần biển",
# "description": "Hotel + location search",
# "expected_tools": ["find_nearby_places"],
# },
# {
# "id": 8,
# "query": "Quán bar có view đẹp về đêm",
# "description": "Nightlife text search",
# "expected_tools": ["retrieve_context_text"],
# },
# {
# "id": 9,
# "query": "Cafe rooftop gần Sơn Trà có coffee ngon",
# "description": "Complex: location + feature + quality",
# "expected_tools": ["find_nearby_places", "retrieve_context_text"],
# },
# {
# "id": 10,
# "query": "Nhà hàng Việt Nam authentic gần Rex Hotel",
# "description": "Specific location + category + style",
# "expected_tools": ["find_nearby_places", "retrieve_context_text"],
# },
]
async def run_test(client: httpx.AsyncClient, test_case: dict, react_mode: bool) -> dict:
    """Run a single test case against the chat endpoint and return its result.

    Args:
        client: Async HTTP client used to issue the POST request.
        test_case: Test definition. ``id`` and ``query`` are required;
            ``description`` is optional and defaults to an empty string.
        react_mode: Value sent as the ``react_mode`` field of the request.

    Returns:
        A dict that always contains ``success``, ``test_id``, ``query``,
        ``description``, ``react_mode`` and ``total_duration_ms``.
        Successful calls (HTTP 200) add ``response`` (first 300 characters),
        ``workflow``, ``tools_used`` and ``api_duration_ms``; failed calls add
        ``error``. Errors raised while sending the request or decoding the
        reply are reported in the result instead of propagating.
    """
    outcome = {
        "test_id": test_case["id"],
        "query": test_case["query"],
        "description": test_case.get("description", ""),
        "react_mode": react_mode,
    }
    # perf_counter is monotonic, so system clock adjustments cannot skew timings.
    started = time.perf_counter()
    try:
        response = await client.post(
            f"{API_BASE}/chat",
            json={
                "message": test_case["query"],
                "user_id": USER_ID,
                "provider": "MegaLLM",
                "react_mode": react_mode,
                "max_steps": 5,
            },
            timeout=float(REQUEST_TIMEOUT),
        )
        duration = (time.perf_counter() - started) * 1000
        if response.status_code != 200:
            return {
                "success": False,
                **outcome,
                "error": f"HTTP {response.status_code}: {response.text[:200]}",
                "total_duration_ms": duration,
            }
        data = response.json()
        # ``or`` fallbacks turn JSON nulls into the empty defaults, so later
        # join/len/format calls on the result never see None.
        return {
            "success": True,
            **outcome,
            "response": (data.get("response") or "")[:300],
            "workflow": data.get("workflow") or {},
            "tools_used": data.get("tools_used") or [],
            "api_duration_ms": data.get("duration_ms") or 0,
            "total_duration_ms": duration,
        }
    except Exception as e:
        # Deliberate catch-all: one failing query must not abort the whole run.
        # Some exceptions (timeouts in particular) carry an empty message, so
        # the exception type is always included in the reported error.
        detail = str(e)
        error = f"{type(e).__name__}: {detail}" if detail else type(e).__name__
        return {
            "success": False,
            **outcome,
            "error": error,
            "total_duration_ms": (time.perf_counter() - started) * 1000,
        }
def format_workflow_steps(workflow: dict) -> str:
    """Format the steps of a workflow as text lines for the report.

    Args:
        workflow: Workflow dict whose ``steps`` entry is a list of step dicts.
            ``None``, a missing ``steps`` key or a ``None``/empty ``steps``
            value are all treated as "no steps".

    Returns:
        ``"No steps recorded"`` when there are no steps; otherwise two lines
        per step (the step name, then its tool and result count) joined with
        newlines. Missing step fields fall back to ``Unknown``, ``N/A`` and 0.
    """
    steps = (workflow or {}).get("steps") or []
    if not steps:
        return "No steps recorded"
    lines = []
    for step in steps:
        lines.append(f" - {step.get('step', 'Unknown')}")
        lines.append(f" Tool: `{step.get('tool', 'N/A')}` | Results: {step.get('results', 0)}")
    return "\n".join(lines)
def generate_report(single_results: list, react_results: list) -> str:
    """Generate the detailed Markdown comparison report.

    Args:
        single_results: Result dicts from the single-mode run.
        react_results: Result dicts from the ReAct-mode run, in the same order.

    Returns:
        The report text: a summary table, one detail section per pair of
        results, a tool-usage comparison table and fixed closing observations.
        Results are paired positionally; if the lists differ in length the
        extra entries are counted in the summary but get no detail section.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    single_ok = sum(1 for r in single_results if r.get("success"))
    react_ok = sum(1 for r in react_results if r.get("success"))
    single_avg = _average_api_duration(single_results)
    react_avg = _average_api_duration(react_results)
    pairs = list(zip(single_results, react_results))

    # Collect fragments and join once instead of growing a string in a loop.
    parts = [
        "# LocalMate Agent Test Report\n",
        f"**Generated:** {timestamp}\n",
        "## Summary\n",
        "| Metric | Single Mode | ReAct Mode |\n",
        "|--------|-------------|------------|\n",
        f"| Total Tests | {len(single_results)} | {len(react_results)} |\n",
        f"| Success | {single_ok} | {react_ok} |\n",
        f"| Avg Duration | {single_avg:.0f}ms | {react_avg:.0f}ms |\n",
        "---\n",
        "## Detailed Results\n",
    ]

    for position, (single, react) in enumerate(pairs, start=1):
        # Heading data comes from the single-mode result of the pair.
        test_id = single.get("test_id", position)
        parts.append(f"### Test Case {test_id}: {single.get('description', '')}\n")
        parts.append(f"**Query:** `{single.get('query', 'N/A')}`\n")
        parts.append("#### Single Mode\n")
        parts.append(_mode_section(single, react=False))
        parts.append("#### ReAct Mode\n")
        parts.append(_mode_section(react, react=True))
        parts.append("---\n\n")

    # Comparison analysis
    parts.append("## Analysis\n")
    parts.append("### Tool Usage Comparison\n")
    parts.append("| Test | Single Mode Tools | ReAct Mode Tools | ReAct Steps |\n")
    parts.append("|------|-------------------|------------------|-------------|\n")
    for single, react in pairs:
        test_id = single.get("test_id", "?")
        single_tools = _table_tools(single)
        react_tools = _table_tools(react)
        if react.get("success"):
            react_workflow = _value_or(react, "workflow", {})
            react_steps = len(_value_or(react_workflow, "steps", []))
        else:
            react_steps = 0
        parts.append(f"| {test_id} | {single_tools} | {react_tools} | {react_steps} |\n")

    parts.append("\n")
    parts.append("### Key Observations\n")
    parts.append("1. **Multi-tool queries**: ReAct mode can chain multiple tools for complex queries\n")
    parts.append("2. **Single-tool queries**: Both modes perform similarly for simple queries\n")
    parts.append("3. **Reasoning steps**: ReAct mode shows explicit reasoning before each tool call\n")
    return "".join(parts)


def _value_or(mapping: dict, key: str, default):
    """Return ``mapping[key]``, using *default* when the key is absent or None."""
    value = mapping.get(key)
    return default if value is None else value


def _average_api_duration(results: list) -> float:
    """Return the mean ``api_duration_ms`` of successful results (0 if none)."""
    durations = [_value_or(r, "api_duration_ms", 0) for r in results if r.get("success")]
    return sum(durations) / max(1, len(durations))


def _table_tools(result: dict) -> str:
    """Return the tools cell for the comparison table (a cross mark on failure)."""
    if not result.get("success"):
        return "❌"
    return ", ".join(_value_or(result, "tools_used", []))


def _mode_section(result: dict, react: bool) -> str:
    """Render the status bullets for one result; *react* adds step/intent lines."""
    if not result.get("success"):
        return (
            "- **Status:** ❌ Failed\n"
            f"- **Error:** {result.get('error', 'Unknown')}\n"
        )
    workflow = _value_or(result, "workflow", {})
    tools = ", ".join(_value_or(result, "tools_used", [])) or "None"
    lines = [
        "- **Status:** ✅ Success",
        f"- **Duration:** {_value_or(result, 'api_duration_ms', 0):.0f}ms",
        f"- **Tools Used:** {tools}",
    ]
    if react:
        lines.append(f"- **Steps:** {len(_value_or(workflow, 'steps', []))}")
        lines.append(f"- **Intent Detected:** {_value_or(workflow, 'intent_detected', 'N/A')}")
        lines.append("**Workflow Steps:**")
    else:
        lines.append("**Workflow:**")
    lines.append(format_workflow_steps(workflow))
    lines.append("**Response Preview:**")
    lines.append(f"> {_value_or(result, 'response', 'N/A')[:200]}...")
    return "\n".join(lines) + "\n"
async def main():
    """Run every enabled test case in single mode, then ReAct mode, and save a report.

    The sequence is: single-mode run, a MODE_SWITCH_DELAY pause, ReAct-mode
    run, then the Markdown report is written to ``react_comparison_report.md``
    next to this script. Progress is printed to stdout throughout.
    """
    print("=" * 60)
    print("LocalMate Agent Mode Comparison Test")
    print("=" * 60)
    print()
    async with httpx.AsyncClient() as client:
        # Test Single Mode
        print(f"📌 Running Single Mode Tests ({SINGLE_MODE_DELAY}s delay)...")
        print("-" * 40)
        single_results = await _run_mode(client, react_mode=False, delay=SINGLE_MODE_DELAY)
        print()
        print(f"⏸️ Waiting {MODE_SWITCH_DELAY}s before ReAct mode...")
        await asyncio.sleep(MODE_SWITCH_DELAY)
        # Test ReAct Mode
        print()
        print(f"🧠 Running ReAct Mode Tests ({REACT_MODE_DELAY}s delay)...")
        print("-" * 40)
        react_results = await _run_mode(client, react_mode=True, delay=REACT_MODE_DELAY)
    # Generate report
    print()
    print("📝 Generating report...")
    report = generate_report(single_results, react_results)
    # Use absolute path based on script location
    import os
    script_dir = os.path.dirname(os.path.abspath(__file__))
    report_path = os.path.join(script_dir, "react_comparison_report.md")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
    print(f"✅ Report saved to: {report_path}")
    print()
    print("=" * 60)
    print("Test Complete!")
    print("=" * 60)


async def _run_mode(client, react_mode: bool, delay: float) -> list:
    """Run all TEST_CASES in one mode, pausing *delay* seconds between queries."""
    results = []
    last_index = len(TEST_CASES) - 1
    for index, test in enumerate(TEST_CASES):
        print(f" Test {test['id']}: {test['query'][:40]}...")
        result = await run_test(client, test, react_mode=react_mode)
        results.append(result)
        status = "✅" if result.get("success") else "❌"
        tools = ", ".join(result.get("tools_used") or []) or "None"
        api_ms = result.get("api_duration_ms") or 0
        summary = f" {status} Tools: {tools}"
        if react_mode:
            steps = len((result.get("workflow") or {}).get("steps") or [])
            summary += f" | Steps: {steps}"
        print(f"{summary} | {api_ms:.0f}ms")
        # Pause after every query except the last. This is decided by position
        # in the list, not by the case id: ids stop matching positions as soon
        # as some cases are disabled, which would silently skip the delay.
        if index < last_index:
            await asyncio.sleep(delay)
    return results


if __name__ == "__main__":
    asyncio.run(main())