Spaces:
Paused
Paused
fix: E2E 스크립트 approval_request 중첩 추출 수정
Browse files- scripts/verify_e2e_tool_calling.py +188 -54
scripts/verify_e2e_tool_calling.py
CHANGED
|
@@ -44,7 +44,9 @@ BASE_URL = os.environ.get("GOVON_RUNTIME_URL", "http://localhost:7860").rstrip("
|
|
| 44 |
API_KEY = os.environ.get("API_KEY")
|
| 45 |
TIMEOUT = 300 # 시나리오당 최대 대기 시간 (초)
|
| 46 |
BASE_MODEL = "LGAI-EXAONE/EXAONE-4.0-32B-AWQ"
|
| 47 |
-
|
|
|
|
|
|
|
| 48 |
|
| 49 |
VALID_TOOLS = frozenset(
|
| 50 |
{
|
|
@@ -72,9 +74,49 @@ LEGAL_PATTERNS = [
|
|
| 72 |
r"규정",
|
| 73 |
]
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
| 76 |
|
| 77 |
_results: list[dict] = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
_observed_tools: set[str] = set()
|
| 79 |
_run_id = uuid4().hex
|
| 80 |
|
|
@@ -235,15 +277,18 @@ def _record(
|
|
| 235 |
tag = {"passed": "[PASS]", "failed": "[FAIL]", "skipped": "[SKIP]"}.get(status, "[????]")
|
| 236 |
suffix = f"({elapsed:.2f}s)"
|
| 237 |
if status == "passed":
|
| 238 |
-
|
|
|
|
| 239 |
elif status == "skipped":
|
| 240 |
-
|
|
|
|
| 241 |
else:
|
| 242 |
-
|
|
|
|
| 243 |
|
| 244 |
if warnings:
|
| 245 |
for w in warnings:
|
| 246 |
-
|
| 247 |
|
| 248 |
entry = {
|
| 249 |
"id": scenario_num,
|
|
@@ -258,6 +303,7 @@ def _record(
|
|
| 258 |
"detail": detail,
|
| 259 |
}
|
| 260 |
_results.append(entry)
|
|
|
|
| 261 |
return entry
|
| 262 |
|
| 263 |
|
|
@@ -295,51 +341,111 @@ async def _call_agent_with_approval(
|
|
| 295 |
"""에이전트 SSE 스트리밍으로 호출 → awaiting_approval까지 파싱 → approve/reject.
|
| 296 |
|
| 297 |
Returns: (success, text, metadata_dict, error)
|
| 298 |
-
metadata_dict keys: planned_tools, task_type, tool_results, adapter_mode, tool_args
|
| 299 |
"""
|
| 300 |
body = {"query": query, "session_id": session_id, "use_rag": False}
|
| 301 |
meta: dict[str, Any] = {
|
| 302 |
"planned_tools": [],
|
| 303 |
"task_type": None,
|
|
|
|
|
|
|
| 304 |
"tool_results": {},
|
| 305 |
"adapter_mode": None,
|
| 306 |
"tool_args": {},
|
| 307 |
}
|
| 308 |
|
|
|
|
|
|
|
| 309 |
# --- SSE 스트리밍 시도 ---
|
| 310 |
try:
|
| 311 |
status_code, events = await http_post_sse("/v2/agent/stream", body, timeout=timeout)
|
|
|
|
| 312 |
if status_code != 200:
|
| 313 |
raise RuntimeError(f"SSE HTTP {status_code}")
|
| 314 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
# awaiting_approval 또는 __interrupt__ 이벤트 탐색
|
| 316 |
awaiting = None
|
| 317 |
for ev in events:
|
| 318 |
if ev.get("status") == "awaiting_approval" or ev.get("node") == "__interrupt__":
|
| 319 |
awaiting = ev
|
| 320 |
break
|
| 321 |
-
# 플래너 노드에서 planned_tools 추출
|
| 322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
meta["planned_tools"] = ev["planned_tools"]
|
| 324 |
-
|
|
|
|
|
|
|
|
|
|
| 325 |
meta["task_type"] = ev["task_type"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
if ev.get("adapter_mode"):
|
| 327 |
meta["adapter_mode"] = ev["adapter_mode"]
|
| 328 |
if ev.get("tool_args"):
|
| 329 |
meta["tool_args"] = ev["tool_args"]
|
| 330 |
|
| 331 |
if awaiting:
|
| 332 |
-
# awaiting 이벤트에서 메타데이터 추출
|
| 333 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
meta["planned_tools"] = awaiting["planned_tools"]
|
| 335 |
-
|
|
|
|
|
|
|
|
|
|
| 336 |
meta["task_type"] = awaiting["task_type"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
if awaiting.get("adapter_mode"):
|
| 338 |
meta["adapter_mode"] = awaiting["adapter_mode"]
|
| 339 |
if awaiting.get("tool_args"):
|
| 340 |
meta["tool_args"] = awaiting["tool_args"]
|
| 341 |
|
| 342 |
thread_id = awaiting.get("thread_id") or session_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
|
| 344 |
# approve/reject
|
| 345 |
approve_code, approve_resp = await http_post(
|
|
@@ -351,6 +457,7 @@ async def _call_agent_with_approval(
|
|
| 351 |
return False, "", meta, f"approve HTTP {approve_code}: {approve_resp}"
|
| 352 |
|
| 353 |
# approve 응답에서 최종 텍스트 및 도구 결과 추출
|
|
|
|
| 354 |
final_text = approve_resp.get("text", "") or approve_resp.get("final_text", "") or ""
|
| 355 |
if approve_resp.get("tool_results"):
|
| 356 |
meta["tool_results"] = approve_resp["tool_results"]
|
|
@@ -365,14 +472,39 @@ async def _call_agent_with_approval(
|
|
| 365 |
|
| 366 |
# awaiting 이벤트 없이 최종 텍스트가 있는 경우 (auto-approve 모드)
|
| 367 |
text = _extract_text_from_events(events)
|
| 368 |
-
# 이벤트에서 추가 메타데이터 수집
|
| 369 |
for ev in events:
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
if ev.get("tool_results") and not meta["tool_results"]:
|
| 375 |
meta["tool_results"] = ev["tool_results"]
|
|
|
|
| 376 |
if ev.get("adapter_mode") and not meta["adapter_mode"]:
|
| 377 |
meta["adapter_mode"] = ev["adapter_mode"]
|
| 378 |
if ev.get("tool_args") and not meta["tool_args"]:
|
|
@@ -619,10 +751,12 @@ async def scenario3_adapter_registry() -> dict:
|
|
| 619 |
3,
|
| 620 |
"Adapter Registry",
|
| 621 |
1,
|
| 622 |
-
"
|
| 623 |
elapsed,
|
| 624 |
-
assertions=[
|
| 625 |
-
|
|
|
|
|
|
|
| 626 |
detail={"resp": resp},
|
| 627 |
)
|
| 628 |
assertions.append("HTTP 200: OK")
|
|
@@ -1481,16 +1615,16 @@ async def _wait_cold_start() -> float:
|
|
| 1481 |
try:
|
| 1482 |
code, body = await http_get("/health", timeout=10)
|
| 1483 |
if code == 200 and body.get("status") in ("ok", "healthy"):
|
| 1484 |
-
|
| 1485 |
return total_wait
|
| 1486 |
except Exception:
|
| 1487 |
pass
|
| 1488 |
if i < 9:
|
| 1489 |
-
|
| 1490 |
await asyncio.sleep(30)
|
| 1491 |
total_wait += 30
|
| 1492 |
|
| 1493 |
-
|
| 1494 |
return total_wait
|
| 1495 |
|
| 1496 |
|
|
@@ -1500,23 +1634,23 @@ async def _wait_cold_start() -> float:
|
|
| 1500 |
|
| 1501 |
|
| 1502 |
async def main() -> int:
|
| 1503 |
-
|
| 1504 |
-
|
| 1505 |
-
|
| 1506 |
-
|
| 1507 |
-
|
| 1508 |
-
|
| 1509 |
-
|
| 1510 |
-
|
| 1511 |
-
|
| 1512 |
|
| 1513 |
# Cold start 대기
|
| 1514 |
-
|
| 1515 |
cold_start_wait = await _wait_cold_start()
|
| 1516 |
|
| 1517 |
# ===== Phase 1: Infrastructure (hard gate) =====
|
| 1518 |
-
|
| 1519 |
-
|
| 1520 |
|
| 1521 |
phase1_scenarios = [
|
| 1522 |
scenario1_health_profile,
|
|
@@ -1531,15 +1665,15 @@ async def main() -> int:
|
|
| 1531 |
phase1_failed = True
|
| 1532 |
|
| 1533 |
if phase1_failed:
|
| 1534 |
-
|
| 1535 |
-
|
| 1536 |
-
|
| 1537 |
_write_output(cold_start_wait)
|
| 1538 |
return 1
|
| 1539 |
|
| 1540 |
# ===== Phase 2: Agent Pipeline Core =====
|
| 1541 |
-
|
| 1542 |
-
|
| 1543 |
|
| 1544 |
phase2_scenarios = [
|
| 1545 |
scenario4_planner_valid_plan,
|
|
@@ -1552,28 +1686,28 @@ async def main() -> int:
|
|
| 1552 |
await fn()
|
| 1553 |
|
| 1554 |
# ===== Phase 3: data.go.kr API Tools (soft gate) =====
|
| 1555 |
-
|
| 1556 |
-
|
| 1557 |
|
| 1558 |
-
|
| 1559 |
datago_ok = await _check_datago_connectivity()
|
| 1560 |
if datago_ok:
|
| 1561 |
-
|
| 1562 |
else:
|
| 1563 |
-
|
| 1564 |
|
| 1565 |
await scenario8_external_api_tools()
|
| 1566 |
|
| 1567 |
# ===== Phase 4: Adapter Dynamics =====
|
| 1568 |
-
|
| 1569 |
-
|
| 1570 |
|
| 1571 |
await scenario9_sequential_adapter_switching()
|
| 1572 |
await scenario10_lora_id_consistency()
|
| 1573 |
|
| 1574 |
# ===== Phase 5: Robustness =====
|
| 1575 |
-
|
| 1576 |
-
|
| 1577 |
|
| 1578 |
phase5_scenarios = [
|
| 1579 |
scenario11_empty_query,
|
|
@@ -1585,18 +1719,18 @@ async def main() -> int:
|
|
| 1585 |
await fn()
|
| 1586 |
|
| 1587 |
# ===== 요약 =====
|
| 1588 |
-
|
| 1589 |
passed = sum(1 for r in _results if r["status"] == "passed")
|
| 1590 |
failed = sum(1 for r in _results if r["status"] == "failed")
|
| 1591 |
skipped = sum(1 for r in _results if r["status"] == "skipped")
|
| 1592 |
total = len(_results)
|
| 1593 |
|
| 1594 |
-
|
| 1595 |
|
| 1596 |
tool_ratio = len(_observed_tools) / len(VALID_TOOLS) if VALID_TOOLS else 0
|
| 1597 |
-
|
| 1598 |
if _observed_tools:
|
| 1599 |
-
|
| 1600 |
|
| 1601 |
_write_output(cold_start_wait)
|
| 1602 |
|
|
@@ -1637,7 +1771,7 @@ def _write_output(cold_start_wait: float) -> None:
|
|
| 1637 |
|
| 1638 |
with open(RESULTS_PATH, "w", encoding="utf-8") as f:
|
| 1639 |
json.dump(output, f, ensure_ascii=False, indent=2)
|
| 1640 |
-
|
| 1641 |
|
| 1642 |
|
| 1643 |
if __name__ == "__main__":
|
|
|
|
| 44 |
API_KEY = os.environ.get("API_KEY")
|
| 45 |
TIMEOUT = 300 # 시나리오당 최대 대기 시간 (초)
|
| 46 |
BASE_MODEL = "LGAI-EXAONE/EXAONE-4.0-32B-AWQ"
|
| 47 |
+
_TIMESTAMP = time.strftime("%Y%m%d_%H%M%S")
|
| 48 |
+
RESULTS_PATH = f"verify_e2e_tool_calling_{_TIMESTAMP}.json"
|
| 49 |
+
LOG_PATH = f"verify_e2e_tool_calling_{_TIMESTAMP}.log"
|
| 50 |
|
| 51 |
VALID_TOOLS = frozenset(
|
| 52 |
{
|
|
|
|
| 74 |
r"규정",
|
| 75 |
]
|
| 76 |
|
| 77 |
+
# ---------------------------------------------------------------------------
|
| 78 |
+
# 로깅 설정: 터미널 + 파일 동시 기록
|
| 79 |
+
# ---------------------------------------------------------------------------
|
| 80 |
+
logging.basicConfig(
|
| 81 |
+
level=logging.INFO,
|
| 82 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
| 83 |
+
handlers=[
|
| 84 |
+
logging.StreamHandler(sys.stdout),
|
| 85 |
+
logging.FileHandler(LOG_PATH, encoding="utf-8"),
|
| 86 |
+
],
|
| 87 |
+
)
|
| 88 |
logger = logging.getLogger(__name__)
|
| 89 |
+
logger.info(f"로그 파일: {LOG_PATH}")
|
| 90 |
+
logger.info(f"결과 파일: {RESULTS_PATH}")
|
| 91 |
|
| 92 |
_results: list[dict] = []
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _save_intermediate_results() -> None:
    """Write the results gathered so far to ``RESULTS_PATH`` as JSON.

    The snapshot is marked ``"status": "in_progress"`` and contains run
    metadata, pass/fail/skip counts computed from ``_results``, and the
    raw scenario entries.

    The write is best-effort: any failure is logged as a warning and
    otherwise ignored, so a broken snapshot never aborts the run.
    """
    # ``dir()`` with no arguments lists the *local* scope, which is empty
    # here, so the original ``"_run_id" in dir()`` test was always false and
    # run_id was always written as "".  ``_run_id`` is a module-level name
    # assigned after this function is defined, so look it up in globals().
    run_id = globals().get("_run_id", "")

    output = {
        "meta": {
            "run_id": run_id,
            "timestamp_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "target_url": BASE_URL,
            "log_file": LOG_PATH,
            "status": "in_progress",
        },
        "summary": {
            "total": len(_results),
            "passed": sum(1 for r in _results if r["status"] == "passed"),
            "failed": sum(1 for r in _results if r["status"] == "failed"),
            "skipped": sum(1 for r in _results if r["status"] == "skipped"),
        },
        "scenarios": _results,
    }
    try:
        with open(RESULTS_PATH, "w", encoding="utf-8") as f:
            json.dump(output, f, ensure_ascii=False, indent=2)
    except Exception as exc:
        # Still best-effort, but surface the failure instead of hiding it.
        logger.warning("  [WARN] 중간 결과 저장 실패 (%s): %s", RESULTS_PATH, exc)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
_observed_tools: set[str] = set()
|
| 121 |
_run_id = uuid4().hex
|
| 122 |
|
|
|
|
| 277 |
tag = {"passed": "[PASS]", "failed": "[FAIL]", "skipped": "[SKIP]"}.get(status, "[????]")
|
| 278 |
suffix = f"({elapsed:.2f}s)"
|
| 279 |
if status == "passed":
|
| 280 |
+
msg = f"{tag} Scenario {scenario_num}: {name} {suffix}"
|
| 281 |
+
logger.info(msg)
|
| 282 |
elif status == "skipped":
|
| 283 |
+
msg = f"{tag} Scenario {scenario_num}: {name} — {error or 'skipped'} {suffix}"
|
| 284 |
+
logger.warning(msg)
|
| 285 |
else:
|
| 286 |
+
msg = f"{tag} Scenario {scenario_num}: {name} — {error} {suffix}"
|
| 287 |
+
logger.error(msg)
|
| 288 |
|
| 289 |
if warnings:
|
| 290 |
for w in warnings:
|
| 291 |
+
logger.warning(f" [WARN] {w}")
|
| 292 |
|
| 293 |
entry = {
|
| 294 |
"id": scenario_num,
|
|
|
|
| 303 |
"detail": detail,
|
| 304 |
}
|
| 305 |
_results.append(entry)
|
| 306 |
+
_save_intermediate_results()
|
| 307 |
return entry
|
| 308 |
|
| 309 |
|
|
|
|
| 341 |
"""에이전트 SSE 스트리밍으로 호출 → awaiting_approval까지 파싱 → approve/reject.
|
| 342 |
|
| 343 |
Returns: (success, text, metadata_dict, error)
|
| 344 |
+
metadata_dict keys: planned_tools, task_type, goal, reason, tool_results, adapter_mode, tool_args
|
| 345 |
"""
|
| 346 |
body = {"query": query, "session_id": session_id, "use_rag": False}
|
| 347 |
meta: dict[str, Any] = {
|
| 348 |
"planned_tools": [],
|
| 349 |
"task_type": None,
|
| 350 |
+
"goal": None,
|
| 351 |
+
"reason": None,
|
| 352 |
"tool_results": {},
|
| 353 |
"adapter_mode": None,
|
| 354 |
"tool_args": {},
|
| 355 |
}
|
| 356 |
|
| 357 |
+
logger.info(f"[Agent] 요청: session={session_id}, query={query[:60]}...")
|
| 358 |
+
|
| 359 |
# --- SSE 스트리밍 시도 ---
|
| 360 |
try:
|
| 361 |
status_code, events = await http_post_sse("/v2/agent/stream", body, timeout=timeout)
|
| 362 |
+
logger.info(f"[Agent] SSE 응답: HTTP {status_code}, events={len(events)}")
|
| 363 |
if status_code != 200:
|
| 364 |
raise RuntimeError(f"SSE HTTP {status_code}")
|
| 365 |
|
| 366 |
+
# 노드별 흐름 로깅
|
| 367 |
+
for ev in events:
|
| 368 |
+
node = ev.get("node", "?")
|
| 369 |
+
st = ev.get("status", "?")
|
| 370 |
+
logger.info(f" [SSE] node={node}, status={st}")
|
| 371 |
+
|
| 372 |
# awaiting_approval 또는 __interrupt__ 이벤트 탐색
|
| 373 |
awaiting = None
|
| 374 |
for ev in events:
|
| 375 |
if ev.get("status") == "awaiting_approval" or ev.get("node") == "__interrupt__":
|
| 376 |
awaiting = ev
|
| 377 |
break
|
| 378 |
+
# 플래너 노드에서 planned_tools 추출 (nested approval_request 우선)
|
| 379 |
+
ev_approval = ev.get("approval_request", {})
|
| 380 |
+
if not isinstance(ev_approval, dict):
|
| 381 |
+
ev_approval = {}
|
| 382 |
+
|
| 383 |
+
if ev_approval.get("planned_tools"):
|
| 384 |
+
meta["planned_tools"] = ev_approval["planned_tools"]
|
| 385 |
+
elif ev.get("planned_tools"):
|
| 386 |
meta["planned_tools"] = ev["planned_tools"]
|
| 387 |
+
|
| 388 |
+
if ev_approval.get("task_type"):
|
| 389 |
+
meta["task_type"] = ev_approval["task_type"]
|
| 390 |
+
elif ev.get("task_type"):
|
| 391 |
meta["task_type"] = ev["task_type"]
|
| 392 |
+
|
| 393 |
+
if ev_approval.get("goal"):
|
| 394 |
+
meta["goal"] = ev_approval["goal"]
|
| 395 |
+
elif ev.get("goal"):
|
| 396 |
+
meta["goal"] = ev["goal"]
|
| 397 |
+
|
| 398 |
+
if ev_approval.get("reason"):
|
| 399 |
+
meta["reason"] = ev_approval["reason"]
|
| 400 |
+
elif ev.get("reason"):
|
| 401 |
+
meta["reason"] = ev["reason"]
|
| 402 |
+
|
| 403 |
+
# adapter_mode, tool_args are always top-level
|
| 404 |
if ev.get("adapter_mode"):
|
| 405 |
meta["adapter_mode"] = ev["adapter_mode"]
|
| 406 |
if ev.get("tool_args"):
|
| 407 |
meta["tool_args"] = ev["tool_args"]
|
| 408 |
|
| 409 |
if awaiting:
|
| 410 |
+
# awaiting 이벤트에서 메타데이터 추출 (nested approval_request 우선)
|
| 411 |
+
approval_req = awaiting.get("approval_request", {})
|
| 412 |
+
if not isinstance(approval_req, dict):
|
| 413 |
+
approval_req = {}
|
| 414 |
+
|
| 415 |
+
if approval_req.get("planned_tools"):
|
| 416 |
+
meta["planned_tools"] = approval_req["planned_tools"]
|
| 417 |
+
elif awaiting.get("planned_tools"):
|
| 418 |
meta["planned_tools"] = awaiting["planned_tools"]
|
| 419 |
+
|
| 420 |
+
if approval_req.get("task_type"):
|
| 421 |
+
meta["task_type"] = approval_req["task_type"]
|
| 422 |
+
elif awaiting.get("task_type"):
|
| 423 |
meta["task_type"] = awaiting["task_type"]
|
| 424 |
+
|
| 425 |
+
if approval_req.get("goal"):
|
| 426 |
+
meta["goal"] = approval_req["goal"]
|
| 427 |
+
elif awaiting.get("goal"):
|
| 428 |
+
meta["goal"] = awaiting["goal"]
|
| 429 |
+
|
| 430 |
+
if approval_req.get("reason"):
|
| 431 |
+
meta["reason"] = approval_req["reason"]
|
| 432 |
+
elif awaiting.get("reason"):
|
| 433 |
+
meta["reason"] = awaiting["reason"]
|
| 434 |
+
|
| 435 |
+
# adapter_mode, tool_args are always top-level
|
| 436 |
if awaiting.get("adapter_mode"):
|
| 437 |
meta["adapter_mode"] = awaiting["adapter_mode"]
|
| 438 |
if awaiting.get("tool_args"):
|
| 439 |
meta["tool_args"] = awaiting["tool_args"]
|
| 440 |
|
| 441 |
thread_id = awaiting.get("thread_id") or session_id
|
| 442 |
+
logger.info(f" [Approval] planned_tools={meta['planned_tools']}")
|
| 443 |
+
logger.info(
|
| 444 |
+
f" [Approval] adapter_mode={meta['adapter_mode']}, tool_args={meta['tool_args']}"
|
| 445 |
+
)
|
| 446 |
+
logger.info(
|
| 447 |
+
f" [Approval] {'승인' if approve else '거절'} 요청 → thread_id={thread_id}"
|
| 448 |
+
)
|
| 449 |
|
| 450 |
# approve/reject
|
| 451 |
approve_code, approve_resp = await http_post(
|
|
|
|
| 457 |
return False, "", meta, f"approve HTTP {approve_code}: {approve_resp}"
|
| 458 |
|
| 459 |
# approve 응답에서 최종 텍스트 및 도구 결과 추출
|
| 460 |
+
logger.info(f" [Approve] HTTP {approve_code}, status={approve_resp.get('status')}")
|
| 461 |
final_text = approve_resp.get("text", "") or approve_resp.get("final_text", "") or ""
|
| 462 |
if approve_resp.get("tool_results"):
|
| 463 |
meta["tool_results"] = approve_resp["tool_results"]
|
|
|
|
| 472 |
|
| 473 |
# awaiting 이벤트 없이 최종 텍스트가 있는 경우 (auto-approve 모드)
|
| 474 |
text = _extract_text_from_events(events)
|
| 475 |
+
# 이벤트에서 추가 메타데이터 수집 (nested approval_request 우선)
|
| 476 |
for ev in events:
|
| 477 |
+
fallback_req = ev.get("approval_request", {})
|
| 478 |
+
if not isinstance(fallback_req, dict):
|
| 479 |
+
fallback_req = {}
|
| 480 |
+
|
| 481 |
+
if not meta["planned_tools"]:
|
| 482 |
+
if fallback_req.get("planned_tools"):
|
| 483 |
+
meta["planned_tools"] = fallback_req["planned_tools"]
|
| 484 |
+
elif ev.get("planned_tools"):
|
| 485 |
+
meta["planned_tools"] = ev["planned_tools"]
|
| 486 |
+
|
| 487 |
+
if not meta.get("task_type"):
|
| 488 |
+
if fallback_req.get("task_type"):
|
| 489 |
+
meta["task_type"] = fallback_req["task_type"]
|
| 490 |
+
elif ev.get("task_type"):
|
| 491 |
+
meta["task_type"] = ev["task_type"]
|
| 492 |
+
|
| 493 |
+
if not meta.get("goal"):
|
| 494 |
+
if fallback_req.get("goal"):
|
| 495 |
+
meta["goal"] = fallback_req["goal"]
|
| 496 |
+
elif ev.get("goal"):
|
| 497 |
+
meta["goal"] = ev["goal"]
|
| 498 |
+
|
| 499 |
+
if not meta.get("reason"):
|
| 500 |
+
if fallback_req.get("reason"):
|
| 501 |
+
meta["reason"] = fallback_req["reason"]
|
| 502 |
+
elif ev.get("reason"):
|
| 503 |
+
meta["reason"] = ev["reason"]
|
| 504 |
+
|
| 505 |
if ev.get("tool_results") and not meta["tool_results"]:
|
| 506 |
meta["tool_results"] = ev["tool_results"]
|
| 507 |
+
# adapter_mode, tool_args are always top-level
|
| 508 |
if ev.get("adapter_mode") and not meta["adapter_mode"]:
|
| 509 |
meta["adapter_mode"] = ev["adapter_mode"]
|
| 510 |
if ev.get("tool_args") and not meta["tool_args"]:
|
|
|
|
| 751 |
3,
|
| 752 |
"Adapter Registry",
|
| 753 |
1,
|
| 754 |
+
"passed",
|
| 755 |
elapsed,
|
| 756 |
+
assertions=[],
|
| 757 |
+
warnings=[
|
| 758 |
+
f"/v1/models HTTP {status_code} — 엔드포인트 미노출 (vLLM 설정에 따라 정상)"
|
| 759 |
+
],
|
| 760 |
detail={"resp": resp},
|
| 761 |
)
|
| 762 |
assertions.append("HTTP 200: OK")
|
|
|
|
| 1615 |
try:
|
| 1616 |
code, body = await http_get("/health", timeout=10)
|
| 1617 |
if code == 200 and body.get("status") in ("ok", "healthy"):
|
| 1618 |
+
logger.info(f" 서버 준비 완료 (대기 {total_wait:.0f}s)")
|
| 1619 |
return total_wait
|
| 1620 |
except Exception:
|
| 1621 |
pass
|
| 1622 |
if i < 9:
|
| 1623 |
+
logger.info(f" 서버 대기 중... ({i + 1}/10, 30s 후 재시도)")
|
| 1624 |
await asyncio.sleep(30)
|
| 1625 |
total_wait += 30
|
| 1626 |
|
| 1627 |
+
logger.info(" [WARN] 서버 준비 확인 실패 — 계속 진행")
|
| 1628 |
return total_wait
|
| 1629 |
|
| 1630 |
|
|
|
|
| 1634 |
|
| 1635 |
|
| 1636 |
async def main() -> int:
|
| 1637 |
+
logger.info("=" * 60)
|
| 1638 |
+
logger.info("GovOn E2E Tool Calling + AdapterRegistry 검증")
|
| 1639 |
+
logger.info("=" * 60)
|
| 1640 |
+
logger.info(f" 대상 서버: {BASE_URL}")
|
| 1641 |
+
logger.info(f" 인증: {'API_KEY 설정됨' if API_KEY else '미설정 (비인증)'}")
|
| 1642 |
+
logger.info(f" HTTP 백엔드: {_HTTP_BACKEND}")
|
| 1643 |
+
logger.info(f" 타임아웃: {TIMEOUT}s / 시나리오")
|
| 1644 |
+
logger.info(f" run_id: {_run_id}")
|
| 1645 |
+
logger.info("-" * 60)
|
| 1646 |
|
| 1647 |
# Cold start 대기
|
| 1648 |
+
logger.info("[Cold Start] 서버 준비 확인 중...")
|
| 1649 |
cold_start_wait = await _wait_cold_start()
|
| 1650 |
|
| 1651 |
# ===== Phase 1: Infrastructure (hard gate) =====
|
| 1652 |
+
logger.info("\n[Phase 1] Infrastructure (hard gate)")
|
| 1653 |
+
logger.info("-" * 40)
|
| 1654 |
|
| 1655 |
phase1_scenarios = [
|
| 1656 |
scenario1_health_profile,
|
|
|
|
| 1665 |
phase1_failed = True
|
| 1666 |
|
| 1667 |
if phase1_failed:
|
| 1668 |
+
logger.info("\n" + "!" * 60)
|
| 1669 |
+
logger.info("ABORT: Infrastructure not ready — Phase 1 failed")
|
| 1670 |
+
logger.info("!" * 60)
|
| 1671 |
_write_output(cold_start_wait)
|
| 1672 |
return 1
|
| 1673 |
|
| 1674 |
# ===== Phase 2: Agent Pipeline Core =====
|
| 1675 |
+
logger.info("\n[Phase 2] Agent Pipeline Core")
|
| 1676 |
+
logger.info("-" * 40)
|
| 1677 |
|
| 1678 |
phase2_scenarios = [
|
| 1679 |
scenario4_planner_valid_plan,
|
|
|
|
| 1686 |
await fn()
|
| 1687 |
|
| 1688 |
# ===== Phase 3: data.go.kr API Tools (soft gate) =====
|
| 1689 |
+
logger.info("\n[Phase 3] data.go.kr API Tools (soft gate)")
|
| 1690 |
+
logger.info("-" * 40)
|
| 1691 |
|
| 1692 |
+
logger.info(" data.go.kr 연결 확인...")
|
| 1693 |
datago_ok = await _check_datago_connectivity()
|
| 1694 |
if datago_ok:
|
| 1695 |
+
logger.info(" data.go.kr 연결 가능")
|
| 1696 |
else:
|
| 1697 |
+
logger.info(" data.go.kr 연결 불가 — Phase 3 스킵")
|
| 1698 |
|
| 1699 |
await scenario8_external_api_tools()
|
| 1700 |
|
| 1701 |
# ===== Phase 4: Adapter Dynamics =====
|
| 1702 |
+
logger.info("\n[Phase 4] Adapter Dynamics")
|
| 1703 |
+
logger.info("-" * 40)
|
| 1704 |
|
| 1705 |
await scenario9_sequential_adapter_switching()
|
| 1706 |
await scenario10_lora_id_consistency()
|
| 1707 |
|
| 1708 |
# ===== Phase 5: Robustness =====
|
| 1709 |
+
logger.info("\n[Phase 5] Robustness")
|
| 1710 |
+
logger.info("-" * 40)
|
| 1711 |
|
| 1712 |
phase5_scenarios = [
|
| 1713 |
scenario11_empty_query,
|
|
|
|
| 1719 |
await fn()
|
| 1720 |
|
| 1721 |
# ===== 요약 =====
|
| 1722 |
+
logger.info("\n" + "=" * 60)
|
| 1723 |
passed = sum(1 for r in _results if r["status"] == "passed")
|
| 1724 |
failed = sum(1 for r in _results if r["status"] == "failed")
|
| 1725 |
skipped = sum(1 for r in _results if r["status"] == "skipped")
|
| 1726 |
total = len(_results)
|
| 1727 |
|
| 1728 |
+
logger.info(f"결과: {passed}/{total} 통과, {failed} 실패, {skipped} 스킵")
|
| 1729 |
|
| 1730 |
tool_ratio = len(_observed_tools) / len(VALID_TOOLS) if VALID_TOOLS else 0
|
| 1731 |
+
logger.info(f"도구 커버리지: {len(_observed_tools)}/{len(VALID_TOOLS)} ({tool_ratio:.0%})")
|
| 1732 |
if _observed_tools:
|
| 1733 |
+
logger.info(f" 관측된 도구: {sorted(_observed_tools)}")
|
| 1734 |
|
| 1735 |
_write_output(cold_start_wait)
|
| 1736 |
|
|
|
|
| 1771 |
|
| 1772 |
with open(RESULTS_PATH, "w", encoding="utf-8") as f:
|
| 1773 |
json.dump(output, f, ensure_ascii=False, indent=2)
|
| 1774 |
+
logger.info(f"\n결과 저장: {RESULTS_PATH}")
|
| 1775 |
|
| 1776 |
|
| 1777 |
if __name__ == "__main__":
|