|
|
""" |
|
|
LocalMate Agent Test Script - Single vs ReAct Mode Comparison |
|
|
|
|
|
Tests 10 queries in both modes: |
|
|
- Single mode: configurable delay between queries |
|
|
- ReAct mode: configurable delay between queries |
|
|
- Configurable delay between modes |
|
|
|
|
|
Generates detailed report with all step inputs/outputs. |
|
|
""" |
|
|
|
|
|
import asyncio |
|
|
import json |
|
|
import time |
|
|
from datetime import datetime |
|
|
import httpx |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Base URL of the LocalMate API; run_test posts to f"{API_BASE}/chat".
API_BASE = "https://cuong2004-localmate.hf.space/api/v1"
# Value sent as "user_id" with every chat request.
USER_ID = "test_comparison"

# The three delays are passed to asyncio.sleep, so they are in seconds.
SINGLE_MODE_DELAY = 10  # pause between consecutive single-mode queries
REACT_MODE_DELAY = 60  # pause between consecutive ReAct-mode queries
MODE_SWITCH_DELAY = 60  # pause between the single-mode pass and the ReAct pass
REQUEST_TIMEOUT = 120  # converted to float and passed as `timeout` to client.post
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TEST_CASES = [ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{ |
|
|
"id": 2, |
|
|
"query": "Nhà hàng gần bãi biển Mỹ Khê", |
|
|
"description": "Location-based search", |
|
|
"expected_tools": ["find_nearby_places"], |
|
|
}, |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
] |
|
|
|
|
|
|
|
|
def _base_result(test_case: dict, react_mode: bool, success: bool) -> dict:
    """Build the fields shared by every result record, successful or not."""
    return {
        "success": success,
        "test_id": test_case["id"],
        "query": test_case["query"],
        # Included on failures too, so the report can still title the case.
        "description": test_case.get("description", ""),
        "react_mode": react_mode,
    }


async def run_test(client: httpx.AsyncClient, test_case: dict, react_mode: bool) -> dict:
    """Run a single test case against the chat endpoint and return a result record.

    Args:
        client: Open HTTP client used to send the request.
        test_case: Test definition; must contain "id" and "query", and
            normally "description".
        react_mode: Value forwarded as the request's "react_mode" flag.

    Returns:
        A dict that always has "success", "test_id", "query", "description",
        "react_mode" and "total_duration_ms" (wall-clock milliseconds measured
        here). Successful records add "response" (first 300 characters),
        "workflow", "tools_used" and "api_duration_ms" (the payload's
        "duration_ms"). Failed records add "error".

    This function does not raise for request problems: a non-200 status, an
    undecodable body, or any exception from the request is reported as a
    failed record.
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments while the request is in flight.
    start_time = time.perf_counter()

    def elapsed_ms() -> float:
        return (time.perf_counter() - start_time) * 1000

    try:
        response = await client.post(
            f"{API_BASE}/chat",
            json={
                "message": test_case["query"],
                "user_id": USER_ID,
                "provider": "MegaLLM",
                "react_mode": react_mode,
                "max_steps": 5,
            },
            timeout=float(REQUEST_TIMEOUT),
        )
        duration = elapsed_ms()

        if response.status_code != 200:
            return {
                **_base_result(test_case, react_mode, success=False),
                "error": f"HTTP {response.status_code}: {response.text[:200]}",
                "total_duration_ms": duration,
            }

        data = response.json()
        # `or <default>` also covers keys that are present but null, which
        # .get(key, default) would pass through as None.
        return {
            **_base_result(test_case, react_mode, success=True),
            "response": (data.get("response") or "")[:300],
            "workflow": data.get("workflow") or {},
            "tools_used": data.get("tools_used") or [],
            "api_duration_ms": data.get("duration_ms") or 0,
            "total_duration_ms": duration,
        }

    except Exception as e:  # boundary of the test harness: record, don't abort the run
        # Some exceptions (timeouts in particular) stringify to "", so the
        # type name is kept to make the report entry meaningful.
        message = str(e)
        error = f"{type(e).__name__}: {message}" if message else type(e).__name__
        return {
            **_base_result(test_case, react_mode, success=False),
            "error": error,
            "total_duration_ms": elapsed_ms(),
        }
|
|
|
|
|
|
|
|
def format_workflow_steps(workflow: dict) -> str:
    """Format workflow steps for the report.

    Args:
        workflow: Mapping that may contain a "steps" list of step dicts.
            ``None`` or an empty mapping is treated as having no steps.

    Returns:
        Two lines per step (the step label, then its tool and result count)
        joined with newlines, or "No steps recorded" when there are no steps.
        Missing step fields fall back to "Unknown", "N/A" and 0.
    """
    # A null workflow in the API payload arrives here as None.
    steps = (workflow or {}).get("steps") or []
    if not steps:
        return "No steps recorded"

    lines = []
    for step in steps:
        tool = step.get("tool", "N/A")
        results = step.get("results", 0)
        lines.append(f" - {step.get('step', 'Unknown')}")
        lines.append(f" Tool: `{tool}` | Results: {results}")

    return "\n".join(lines)
|
|
|
|
|
|
|
|
def _summarize_mode(results: list) -> tuple:
    """Return (success count, mean api_duration_ms over successes) for one mode."""
    durations = [r.get("api_duration_ms") or 0 for r in results if r.get("success")]
    mean = sum(durations) / len(durations) if durations else 0
    return len(durations), mean


def _format_tools(result: dict) -> str:
    """Return the comma-separated tool names of a result, or 'None' if it has none."""
    return ", ".join(result.get("tools_used") or []) or "None"


def _render_mode_section(result: dict, include_react_details: bool) -> str:
    """Render the status bullets, workflow and response preview for one result."""
    if not result.get("success"):
        return (
            "- **Status:** ❌ Failed\n"
            f"- **Error:** {result.get('error', 'Unknown')}\n\n"
        )

    workflow = result.get("workflow") or {}
    duration = result.get("api_duration_ms") or 0
    response = result.get("response")
    if response is None:
        response = "N/A"

    lines = [
        "- **Status:** ✅ Success",
        f"- **Duration:** {duration:.0f}ms",
        f"- **Tools Used:** {_format_tools(result)}",
    ]
    if include_react_details:
        lines.append(f"- **Steps:** {len(workflow.get('steps') or [])}")
        lines.append(f"- **Intent Detected:** {workflow.get('intent_detected', 'N/A')}")
    lines += [
        "",
        "**Workflow Steps:**" if include_react_details else "**Workflow:**",
        format_workflow_steps(workflow),
        "",
        "**Response Preview:**",
        f"> {response[:200]}...",
        "",
        "",  # the two empty items produce the blank line that ends the section
    ]
    return "\n".join(lines)


def generate_report(single_results: list, react_results: list) -> str:
    """Generate the detailed markdown comparison report.

    Args:
        single_results: Result records from the single-mode pass.
        react_results: Result records from the ReAct-mode pass, in the same
            order; records are paired positionally with ``zip``, so extra
            records in the longer list are ignored in the per-test sections.

    Returns:
        The full report as one markdown string: a summary table, one section
        per test case with both modes, a tool-usage table and fixed closing
        observations. Averages use "api_duration_ms" of successful records
        only and are 0 when a mode has no successes.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    single_ok, single_avg = _summarize_mode(single_results)
    react_ok, react_avg = _summarize_mode(react_results)

    # Collect fragments and join once instead of growing a string with +=.
    parts = [f"""# LocalMate Agent Test Report

**Generated:** {timestamp}

## Summary

| Metric | Single Mode | ReAct Mode |
|--------|-------------|------------|
| Total Tests | {len(single_results)} | {len(react_results)} |
| Success | {single_ok} | {react_ok} |
| Avg Duration | {single_avg:.0f}ms | {react_avg:.0f}ms |

---

## Detailed Results

"""]

    for i, (single, react) in enumerate(zip(single_results, react_results)):
        test_id = single.get("test_id", i + 1)
        query = single.get("query", "N/A")
        # A failed single-mode record may lack the description; the paired
        # ReAct record describes the same test case.
        description = single.get("description") or react.get("description", "")

        parts.append(f"""### Test Case {test_id}: {description}

**Query:** `{query}`

#### Single Mode

""")
        parts.append(_render_mode_section(single, include_react_details=False))
        parts.append("#### ReAct Mode\n\n")
        parts.append(_render_mode_section(react, include_react_details=True))
        parts.append("---\n\n")

    parts.append("""## Analysis

### Tool Usage Comparison

| Test | Single Mode Tools | ReAct Mode Tools | ReAct Steps |
|------|-------------------|------------------|-------------|
""")

    for single, react in zip(single_results, react_results):
        test_id = single.get("test_id", "?")
        # Same 'None' placeholder as the detailed sections, so a successful
        # run without tools does not leave an empty table cell.
        single_tools = _format_tools(single) if single.get("success") else "❌"
        react_tools = _format_tools(react) if react.get("success") else "❌"
        if react.get("success"):
            react_steps = len((react.get("workflow") or {}).get("steps") or [])
        else:
            react_steps = 0
        parts.append(f"| {test_id} | {single_tools} | {react_tools} | {react_steps} |\n")

    parts.append("""

### Key Observations

1. **Multi-tool queries**: ReAct mode can chain multiple tools for complex queries
2. **Single-tool queries**: Both modes perform similarly for simple queries
3. **Reasoning steps**: ReAct mode shows explicit reasoning before each tool call

""")

    return "".join(parts)
|
|
|
|
|
|
|
|
async def _run_mode(client: httpx.AsyncClient, react_mode: bool, delay: float) -> list:
    """Run every TEST_CASES entry in one mode, printing progress, and return the results."""
    results = []
    last_index = len(TEST_CASES) - 1

    for index, test in enumerate(TEST_CASES):
        print(f" Test {test['id']}: {test['query'][:40]}...")
        result = await run_test(client, test, react_mode=react_mode)
        results.append(result)

        status = "✅" if result.get("success") else "❌"
        # `or` fallbacks keep the progress line printable when a field is null.
        tools = ", ".join(result.get("tools_used") or []) or "None"
        duration = result.get("api_duration_ms") or 0
        if react_mode:
            steps = len((result.get("workflow") or {}).get("steps") or [])
            print(f" {status} Tools: {tools} | Steps: {steps} | {duration:.0f}ms")
        else:
            print(f" {status} Tools: {tools} | {duration:.0f}ms")

        # Decide by position, not by test id: ids need not be 1..N in order,
        # and the pause is only pointless after the final query.
        if index < last_index:
            await asyncio.sleep(delay)

    return results


async def main():
    """Main test runner.

    Runs all test cases in single mode, waits MODE_SWITCH_DELAY seconds, runs
    them again in ReAct mode, then writes the markdown comparison report to
    "react_comparison_report.md" next to this script and prints its path.
    """
    import os  # kept local to this function; only needed to locate the report file

    print("=" * 60)
    print("LocalMate Agent Mode Comparison Test")
    print("=" * 60)
    print()

    async with httpx.AsyncClient() as client:
        print(f"📌 Running Single Mode Tests ({SINGLE_MODE_DELAY}s delay)...")
        print("-" * 40)
        single_results = await _run_mode(client, react_mode=False, delay=SINGLE_MODE_DELAY)

        print()
        print(f"⏸️ Waiting {MODE_SWITCH_DELAY}s before ReAct mode...")
        await asyncio.sleep(MODE_SWITCH_DELAY)

        print()
        print(f"🧠 Running ReAct Mode Tests ({REACT_MODE_DELAY}s delay)...")
        print("-" * 40)
        react_results = await _run_mode(client, react_mode=True, delay=REACT_MODE_DELAY)

    print()
    print("📝 Generating report...")
    report = generate_report(single_results, react_results)

    script_dir = os.path.dirname(os.path.abspath(__file__))
    report_path = os.path.join(script_dir, "react_comparison_report.md")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)

    print(f"✅ Report saved to: {report_path}")
    print()
    print("=" * 60)
    print("Test Complete!")
    print("=" * 60)
|
|
|
|
|
|
|
|
# Start the comparison run only when this file is executed as a script,
# so importing the module does not send any requests.
if __name__ == "__main__":
    asyncio.run(main())
|
|
|