umyunsang committed on
Commit
e21a580
·
verified ·
1 Parent(s): e7c7f3c

fix: E2E 스크립트 approval_request 중첩 추출 수정

Browse files
Files changed (1) hide show
  1. scripts/verify_e2e_tool_calling.py +188 -54
scripts/verify_e2e_tool_calling.py CHANGED
@@ -44,7 +44,9 @@ BASE_URL = os.environ.get("GOVON_RUNTIME_URL", "http://localhost:7860").rstrip("
44
  API_KEY = os.environ.get("API_KEY")
45
  TIMEOUT = 300 # 시나리오당 최대 대기 시간 (초)
46
  BASE_MODEL = "LGAI-EXAONE/EXAONE-4.0-32B-AWQ"
47
- RESULTS_PATH = "verify_e2e_tool_calling_results.json"
 
 
48
 
49
  VALID_TOOLS = frozenset(
50
  {
@@ -72,9 +74,49 @@ LEGAL_PATTERNS = [
72
  r"규정",
73
  ]
74
 
 
 
 
 
 
 
 
 
 
 
 
75
  logger = logging.getLogger(__name__)
 
 
76
 
77
  _results: list[dict] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  _observed_tools: set[str] = set()
79
  _run_id = uuid4().hex
80
 
@@ -235,15 +277,18 @@ def _record(
235
  tag = {"passed": "[PASS]", "failed": "[FAIL]", "skipped": "[SKIP]"}.get(status, "[????]")
236
  suffix = f"({elapsed:.2f}s)"
237
  if status == "passed":
238
- print(f"{tag} Scenario {scenario_num}: {name} {suffix}")
 
239
  elif status == "skipped":
240
- print(f"{tag} Scenario {scenario_num}: {name} — {error or 'skipped'} {suffix}")
 
241
  else:
242
- print(f"{tag} Scenario {scenario_num}: {name} — {error} {suffix}")
 
243
 
244
  if warnings:
245
  for w in warnings:
246
- print(f" [WARN] {w}")
247
 
248
  entry = {
249
  "id": scenario_num,
@@ -258,6 +303,7 @@ def _record(
258
  "detail": detail,
259
  }
260
  _results.append(entry)
 
261
  return entry
262
 
263
 
@@ -295,51 +341,111 @@ async def _call_agent_with_approval(
295
  """에이전트 SSE 스트리밍으로 호출 → awaiting_approval까지 파싱 → approve/reject.
296
 
297
  Returns: (success, text, metadata_dict, error)
298
- metadata_dict keys: planned_tools, task_type, tool_results, adapter_mode, tool_args
299
  """
300
  body = {"query": query, "session_id": session_id, "use_rag": False}
301
  meta: dict[str, Any] = {
302
  "planned_tools": [],
303
  "task_type": None,
 
 
304
  "tool_results": {},
305
  "adapter_mode": None,
306
  "tool_args": {},
307
  }
308
 
 
 
309
  # --- SSE 스트리밍 시도 ---
310
  try:
311
  status_code, events = await http_post_sse("/v2/agent/stream", body, timeout=timeout)
 
312
  if status_code != 200:
313
  raise RuntimeError(f"SSE HTTP {status_code}")
314
 
 
 
 
 
 
 
315
  # awaiting_approval 또는 __interrupt__ 이벤트 탐색
316
  awaiting = None
317
  for ev in events:
318
  if ev.get("status") == "awaiting_approval" or ev.get("node") == "__interrupt__":
319
  awaiting = ev
320
  break
321
- # 플래너 노드에서 planned_tools 추출
322
- if ev.get("planned_tools"):
 
 
 
 
 
 
323
  meta["planned_tools"] = ev["planned_tools"]
324
- if ev.get("task_type"):
 
 
 
325
  meta["task_type"] = ev["task_type"]
 
 
 
 
 
 
 
 
 
 
 
 
326
  if ev.get("adapter_mode"):
327
  meta["adapter_mode"] = ev["adapter_mode"]
328
  if ev.get("tool_args"):
329
  meta["tool_args"] = ev["tool_args"]
330
 
331
  if awaiting:
332
- # awaiting 이벤트에서 메타데이터 추출
333
- if awaiting.get("planned_tools"):
 
 
 
 
 
 
334
  meta["planned_tools"] = awaiting["planned_tools"]
335
- if awaiting.get("task_type"):
 
 
 
336
  meta["task_type"] = awaiting["task_type"]
 
 
 
 
 
 
 
 
 
 
 
 
337
  if awaiting.get("adapter_mode"):
338
  meta["adapter_mode"] = awaiting["adapter_mode"]
339
  if awaiting.get("tool_args"):
340
  meta["tool_args"] = awaiting["tool_args"]
341
 
342
  thread_id = awaiting.get("thread_id") or session_id
 
 
 
 
 
 
 
343
 
344
  # approve/reject
345
  approve_code, approve_resp = await http_post(
@@ -351,6 +457,7 @@ async def _call_agent_with_approval(
351
  return False, "", meta, f"approve HTTP {approve_code}: {approve_resp}"
352
 
353
  # approve 응답에서 최종 텍스트 및 도구 결과 추출
 
354
  final_text = approve_resp.get("text", "") or approve_resp.get("final_text", "") or ""
355
  if approve_resp.get("tool_results"):
356
  meta["tool_results"] = approve_resp["tool_results"]
@@ -365,14 +472,39 @@ async def _call_agent_with_approval(
365
 
366
  # awaiting 이벤트 없이 최종 텍스트가 있는 경우 (auto-approve 모드)
367
  text = _extract_text_from_events(events)
368
- # 이벤트에서 추가 메타데이터 수집
369
  for ev in events:
370
- if ev.get("planned_tools") and not meta["planned_tools"]:
371
- meta["planned_tools"] = ev["planned_tools"]
372
- if ev.get("task_type") and not meta["task_type"]:
373
- meta["task_type"] = ev["task_type"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
  if ev.get("tool_results") and not meta["tool_results"]:
375
  meta["tool_results"] = ev["tool_results"]
 
376
  if ev.get("adapter_mode") and not meta["adapter_mode"]:
377
  meta["adapter_mode"] = ev["adapter_mode"]
378
  if ev.get("tool_args") and not meta["tool_args"]:
@@ -619,10 +751,12 @@ async def scenario3_adapter_registry() -> dict:
619
  3,
620
  "Adapter Registry",
621
  1,
622
- "failed",
623
  elapsed,
624
- assertions=["HTTP 200"],
625
- error=f"HTTP {status_code}",
 
 
626
  detail={"resp": resp},
627
  )
628
  assertions.append("HTTP 200: OK")
@@ -1481,16 +1615,16 @@ async def _wait_cold_start() -> float:
1481
  try:
1482
  code, body = await http_get("/health", timeout=10)
1483
  if code == 200 and body.get("status") in ("ok", "healthy"):
1484
- print(f" 서버 준비 완료 (대기 {total_wait:.0f}s)")
1485
  return total_wait
1486
  except Exception:
1487
  pass
1488
  if i < 9:
1489
- print(f" 서버 대기 중... ({i + 1}/10, 30s 후 재시도)")
1490
  await asyncio.sleep(30)
1491
  total_wait += 30
1492
 
1493
- print(" [WARN] 서버 준비 확인 실패 — 계속 진행")
1494
  return total_wait
1495
 
1496
 
@@ -1500,23 +1634,23 @@ async def _wait_cold_start() -> float:
1500
 
1501
 
1502
  async def main() -> int:
1503
- print("=" * 60)
1504
- print("GovOn E2E Tool Calling + AdapterRegistry 검증")
1505
- print("=" * 60)
1506
- print(f" 대상 서버: {BASE_URL}")
1507
- print(f" 인증: {'API_KEY 설정됨' if API_KEY else '미설정 (비인증)'}")
1508
- print(f" HTTP 백엔드: {_HTTP_BACKEND}")
1509
- print(f" 타임아웃: {TIMEOUT}s / 시나리오")
1510
- print(f" run_id: {_run_id}")
1511
- print("-" * 60)
1512
 
1513
  # Cold start 대기
1514
- print("[Cold Start] 서버 준비 확인 중...")
1515
  cold_start_wait = await _wait_cold_start()
1516
 
1517
  # ===== Phase 1: Infrastructure (hard gate) =====
1518
- print("\n[Phase 1] Infrastructure (hard gate)")
1519
- print("-" * 40)
1520
 
1521
  phase1_scenarios = [
1522
  scenario1_health_profile,
@@ -1531,15 +1665,15 @@ async def main() -> int:
1531
  phase1_failed = True
1532
 
1533
  if phase1_failed:
1534
- print("\n" + "!" * 60)
1535
- print("ABORT: Infrastructure not ready — Phase 1 failed")
1536
- print("!" * 60)
1537
  _write_output(cold_start_wait)
1538
  return 1
1539
 
1540
  # ===== Phase 2: Agent Pipeline Core =====
1541
- print("\n[Phase 2] Agent Pipeline Core")
1542
- print("-" * 40)
1543
 
1544
  phase2_scenarios = [
1545
  scenario4_planner_valid_plan,
@@ -1552,28 +1686,28 @@ async def main() -> int:
1552
  await fn()
1553
 
1554
  # ===== Phase 3: data.go.kr API Tools (soft gate) =====
1555
- print("\n[Phase 3] data.go.kr API Tools (soft gate)")
1556
- print("-" * 40)
1557
 
1558
- print(" data.go.kr 연결 확인...")
1559
  datago_ok = await _check_datago_connectivity()
1560
  if datago_ok:
1561
- print(" data.go.kr 연결 가능")
1562
  else:
1563
- print(" data.go.kr 연결 불가 — Phase 3 스킵")
1564
 
1565
  await scenario8_external_api_tools()
1566
 
1567
  # ===== Phase 4: Adapter Dynamics =====
1568
- print("\n[Phase 4] Adapter Dynamics")
1569
- print("-" * 40)
1570
 
1571
  await scenario9_sequential_adapter_switching()
1572
  await scenario10_lora_id_consistency()
1573
 
1574
  # ===== Phase 5: Robustness =====
1575
- print("\n[Phase 5] Robustness")
1576
- print("-" * 40)
1577
 
1578
  phase5_scenarios = [
1579
  scenario11_empty_query,
@@ -1585,18 +1719,18 @@ async def main() -> int:
1585
  await fn()
1586
 
1587
  # ===== 요약 =====
1588
- print("\n" + "=" * 60)
1589
  passed = sum(1 for r in _results if r["status"] == "passed")
1590
  failed = sum(1 for r in _results if r["status"] == "failed")
1591
  skipped = sum(1 for r in _results if r["status"] == "skipped")
1592
  total = len(_results)
1593
 
1594
- print(f"결과: {passed}/{total} 통과, {failed} 실패, {skipped} 스킵")
1595
 
1596
  tool_ratio = len(_observed_tools) / len(VALID_TOOLS) if VALID_TOOLS else 0
1597
- print(f"도구 커버리지: {len(_observed_tools)}/{len(VALID_TOOLS)} ({tool_ratio:.0%})")
1598
  if _observed_tools:
1599
- print(f" 관측된 도구: {sorted(_observed_tools)}")
1600
 
1601
  _write_output(cold_start_wait)
1602
 
@@ -1637,7 +1771,7 @@ def _write_output(cold_start_wait: float) -> None:
1637
 
1638
  with open(RESULTS_PATH, "w", encoding="utf-8") as f:
1639
  json.dump(output, f, ensure_ascii=False, indent=2)
1640
- print(f"\n결과 저장: {RESULTS_PATH}")
1641
 
1642
 
1643
  if __name__ == "__main__":
 
44
  API_KEY = os.environ.get("API_KEY")
45
  TIMEOUT = 300 # 시나리오당 최대 대기 시간 (초)
46
  BASE_MODEL = "LGAI-EXAONE/EXAONE-4.0-32B-AWQ"
47
+ _TIMESTAMP = time.strftime("%Y%m%d_%H%M%S")
48
+ RESULTS_PATH = f"verify_e2e_tool_calling_{_TIMESTAMP}.json"
49
+ LOG_PATH = f"verify_e2e_tool_calling_{_TIMESTAMP}.log"
50
 
51
  VALID_TOOLS = frozenset(
52
  {
 
74
  r"규정",
75
  ]
76
 
77
+ # ---------------------------------------------------------------------------
78
+ # 로깅 설정: 터미널 + 파일 동시 기록
79
+ # ---------------------------------------------------------------------------
80
+ logging.basicConfig(
81
+ level=logging.INFO,
82
+ format="%(asctime)s [%(levelname)s] %(message)s",
83
+ handlers=[
84
+ logging.StreamHandler(sys.stdout),
85
+ logging.FileHandler(LOG_PATH, encoding="utf-8"),
86
+ ],
87
+ )
88
  logger = logging.getLogger(__name__)
89
+ logger.info(f"로그 파일: {LOG_PATH}")
90
+ logger.info(f"결과 파일: {RESULTS_PATH}")
91
 
92
  _results: list[dict] = []
93
+
94
+
95
def _save_intermediate_results() -> None:
    """Write a snapshot of the results collected so far to ``RESULTS_PATH``.

    The snapshot is a JSON document with three sections:

    * ``meta``: run id, current UTC timestamp, target URL, log file path and
      a fixed ``"in_progress"`` status marker.
    * ``summary``: total number of recorded scenarios and the per-status
      counts for ``passed`` / ``failed`` / ``skipped``.
    * ``scenarios``: the raw ``_results`` list.

    Writing is best-effort: a failed write is logged as a warning and never
    interrupts the run.
    """
    # ``dir()`` without arguments lists the *local* scope, so the previous
    # ``"_run_id" in dir()`` guard was always false inside this function and
    # the run id was always written as "". Look the name up in the module
    # globals instead; the default still covers a call made before
    # ``_run_id`` has been assigned further down the module.
    run_id = globals().get("_run_id", "")

    output = {
        "meta": {
            "run_id": run_id,
            "timestamp_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "target_url": BASE_URL,
            "log_file": LOG_PATH,
            "status": "in_progress",
        },
        "summary": {
            "total": len(_results),
            "passed": sum(1 for r in _results if r["status"] == "passed"),
            "failed": sum(1 for r in _results if r["status"] == "failed"),
            "skipped": sum(1 for r in _results if r["status"] == "skipped"),
        },
        "scenarios": _results,
    }
    try:
        with open(RESULTS_PATH, "w", encoding="utf-8") as f:
            json.dump(output, f, ensure_ascii=False, indent=2)
    except (OSError, TypeError, ValueError) as exc:
        # Covers filesystem errors and non-serializable / malformed scenario
        # details. Keep going, but leave a trace instead of failing silently.
        logger.warning(f"중간 결과 저장 실패 ({RESULTS_PATH}): {exc}")
118
+
119
+
120
  _observed_tools: set[str] = set()
121
  _run_id = uuid4().hex
122
 
 
277
  tag = {"passed": "[PASS]", "failed": "[FAIL]", "skipped": "[SKIP]"}.get(status, "[????]")
278
  suffix = f"({elapsed:.2f}s)"
279
  if status == "passed":
280
+ msg = f"{tag} Scenario {scenario_num}: {name} {suffix}"
281
+ logger.info(msg)
282
  elif status == "skipped":
283
+ msg = f"{tag} Scenario {scenario_num}: {name} — {error or 'skipped'} {suffix}"
284
+ logger.warning(msg)
285
  else:
286
+ msg = f"{tag} Scenario {scenario_num}: {name} — {error} {suffix}"
287
+ logger.error(msg)
288
 
289
  if warnings:
290
  for w in warnings:
291
+ logger.warning(f" [WARN] {w}")
292
 
293
  entry = {
294
  "id": scenario_num,
 
303
  "detail": detail,
304
  }
305
  _results.append(entry)
306
+ _save_intermediate_results()
307
  return entry
308
 
309
 
 
341
  """에이전트 SSE 스트리밍으로 호출 → awaiting_approval까지 파싱 → approve/reject.
342
 
343
  Returns: (success, text, metadata_dict, error)
344
+ metadata_dict keys: planned_tools, task_type, goal, reason, tool_results, adapter_mode, tool_args
345
  """
346
  body = {"query": query, "session_id": session_id, "use_rag": False}
347
  meta: dict[str, Any] = {
348
  "planned_tools": [],
349
  "task_type": None,
350
+ "goal": None,
351
+ "reason": None,
352
  "tool_results": {},
353
  "adapter_mode": None,
354
  "tool_args": {},
355
  }
356
 
357
+ logger.info(f"[Agent] 요청: session={session_id}, query={query[:60]}...")
358
+
359
  # --- SSE 스트리밍 시도 ---
360
  try:
361
  status_code, events = await http_post_sse("/v2/agent/stream", body, timeout=timeout)
362
+ logger.info(f"[Agent] SSE 응답: HTTP {status_code}, events={len(events)}")
363
  if status_code != 200:
364
  raise RuntimeError(f"SSE HTTP {status_code}")
365
 
366
+ # 노드별 흐름 로깅
367
+ for ev in events:
368
+ node = ev.get("node", "?")
369
+ st = ev.get("status", "?")
370
+ logger.info(f" [SSE] node={node}, status={st}")
371
+
372
  # awaiting_approval 또는 __interrupt__ 이벤트 탐색
373
  awaiting = None
374
  for ev in events:
375
  if ev.get("status") == "awaiting_approval" or ev.get("node") == "__interrupt__":
376
  awaiting = ev
377
  break
378
+ # 플래너 노드에서 planned_tools 추출 (nested approval_request 우선)
379
+ ev_approval = ev.get("approval_request", {})
380
+ if not isinstance(ev_approval, dict):
381
+ ev_approval = {}
382
+
383
+ if ev_approval.get("planned_tools"):
384
+ meta["planned_tools"] = ev_approval["planned_tools"]
385
+ elif ev.get("planned_tools"):
386
  meta["planned_tools"] = ev["planned_tools"]
387
+
388
+ if ev_approval.get("task_type"):
389
+ meta["task_type"] = ev_approval["task_type"]
390
+ elif ev.get("task_type"):
391
  meta["task_type"] = ev["task_type"]
392
+
393
+ if ev_approval.get("goal"):
394
+ meta["goal"] = ev_approval["goal"]
395
+ elif ev.get("goal"):
396
+ meta["goal"] = ev["goal"]
397
+
398
+ if ev_approval.get("reason"):
399
+ meta["reason"] = ev_approval["reason"]
400
+ elif ev.get("reason"):
401
+ meta["reason"] = ev["reason"]
402
+
403
+ # adapter_mode, tool_args are always top-level
404
  if ev.get("adapter_mode"):
405
  meta["adapter_mode"] = ev["adapter_mode"]
406
  if ev.get("tool_args"):
407
  meta["tool_args"] = ev["tool_args"]
408
 
409
  if awaiting:
410
+ # awaiting 이벤트에서 메타데이터 추출 (nested approval_request 우선)
411
+ approval_req = awaiting.get("approval_request", {})
412
+ if not isinstance(approval_req, dict):
413
+ approval_req = {}
414
+
415
+ if approval_req.get("planned_tools"):
416
+ meta["planned_tools"] = approval_req["planned_tools"]
417
+ elif awaiting.get("planned_tools"):
418
  meta["planned_tools"] = awaiting["planned_tools"]
419
+
420
+ if approval_req.get("task_type"):
421
+ meta["task_type"] = approval_req["task_type"]
422
+ elif awaiting.get("task_type"):
423
  meta["task_type"] = awaiting["task_type"]
424
+
425
+ if approval_req.get("goal"):
426
+ meta["goal"] = approval_req["goal"]
427
+ elif awaiting.get("goal"):
428
+ meta["goal"] = awaiting["goal"]
429
+
430
+ if approval_req.get("reason"):
431
+ meta["reason"] = approval_req["reason"]
432
+ elif awaiting.get("reason"):
433
+ meta["reason"] = awaiting["reason"]
434
+
435
+ # adapter_mode, tool_args are always top-level
436
  if awaiting.get("adapter_mode"):
437
  meta["adapter_mode"] = awaiting["adapter_mode"]
438
  if awaiting.get("tool_args"):
439
  meta["tool_args"] = awaiting["tool_args"]
440
 
441
  thread_id = awaiting.get("thread_id") or session_id
442
+ logger.info(f" [Approval] planned_tools={meta['planned_tools']}")
443
+ logger.info(
444
+ f" [Approval] adapter_mode={meta['adapter_mode']}, tool_args={meta['tool_args']}"
445
+ )
446
+ logger.info(
447
+ f" [Approval] {'승인' if approve else '거절'} 요청 → thread_id={thread_id}"
448
+ )
449
 
450
  # approve/reject
451
  approve_code, approve_resp = await http_post(
 
457
  return False, "", meta, f"approve HTTP {approve_code}: {approve_resp}"
458
 
459
  # approve 응답에서 최종 텍스트 및 도구 결과 추출
460
+ logger.info(f" [Approve] HTTP {approve_code}, status={approve_resp.get('status')}")
461
  final_text = approve_resp.get("text", "") or approve_resp.get("final_text", "") or ""
462
  if approve_resp.get("tool_results"):
463
  meta["tool_results"] = approve_resp["tool_results"]
 
472
 
473
  # awaiting 이벤트 없이 최종 텍스트가 있는 경우 (auto-approve 모드)
474
  text = _extract_text_from_events(events)
475
+ # 이벤트에서 추가 메타데이터 수집 (nested approval_request 우선)
476
  for ev in events:
477
+ fallback_req = ev.get("approval_request", {})
478
+ if not isinstance(fallback_req, dict):
479
+ fallback_req = {}
480
+
481
+ if not meta["planned_tools"]:
482
+ if fallback_req.get("planned_tools"):
483
+ meta["planned_tools"] = fallback_req["planned_tools"]
484
+ elif ev.get("planned_tools"):
485
+ meta["planned_tools"] = ev["planned_tools"]
486
+
487
+ if not meta.get("task_type"):
488
+ if fallback_req.get("task_type"):
489
+ meta["task_type"] = fallback_req["task_type"]
490
+ elif ev.get("task_type"):
491
+ meta["task_type"] = ev["task_type"]
492
+
493
+ if not meta.get("goal"):
494
+ if fallback_req.get("goal"):
495
+ meta["goal"] = fallback_req["goal"]
496
+ elif ev.get("goal"):
497
+ meta["goal"] = ev["goal"]
498
+
499
+ if not meta.get("reason"):
500
+ if fallback_req.get("reason"):
501
+ meta["reason"] = fallback_req["reason"]
502
+ elif ev.get("reason"):
503
+ meta["reason"] = ev["reason"]
504
+
505
  if ev.get("tool_results") and not meta["tool_results"]:
506
  meta["tool_results"] = ev["tool_results"]
507
+ # adapter_mode, tool_args are always top-level
508
  if ev.get("adapter_mode") and not meta["adapter_mode"]:
509
  meta["adapter_mode"] = ev["adapter_mode"]
510
  if ev.get("tool_args") and not meta["tool_args"]:
 
751
  3,
752
  "Adapter Registry",
753
  1,
754
+ "passed",
755
  elapsed,
756
+ assertions=[],
757
+ warnings=[
758
+ f"/v1/models HTTP {status_code} — 엔드포인트 미노출 (vLLM 설정에 따라 정상)"
759
+ ],
760
  detail={"resp": resp},
761
  )
762
  assertions.append("HTTP 200: OK")
 
1615
  try:
1616
  code, body = await http_get("/health", timeout=10)
1617
  if code == 200 and body.get("status") in ("ok", "healthy"):
1618
+ logger.info(f" 서버 준비 완료 (대기 {total_wait:.0f}s)")
1619
  return total_wait
1620
  except Exception:
1621
  pass
1622
  if i < 9:
1623
+ logger.info(f" 서버 대기 중... ({i + 1}/10, 30s 후 재시도)")
1624
  await asyncio.sleep(30)
1625
  total_wait += 30
1626
 
1627
+ logger.info(" [WARN] 서버 준비 확인 실패 — 계속 진행")
1628
  return total_wait
1629
 
1630
 
 
1634
 
1635
 
1636
  async def main() -> int:
1637
+ logger.info("=" * 60)
1638
+ logger.info("GovOn E2E Tool Calling + AdapterRegistry 검증")
1639
+ logger.info("=" * 60)
1640
+ logger.info(f" 대상 서버: {BASE_URL}")
1641
+ logger.info(f" 인증: {'API_KEY 설정됨' if API_KEY else '미설정 (비인증)'}")
1642
+ logger.info(f" HTTP 백엔드: {_HTTP_BACKEND}")
1643
+ logger.info(f" 타임아웃: {TIMEOUT}s / 시나리오")
1644
+ logger.info(f" run_id: {_run_id}")
1645
+ logger.info("-" * 60)
1646
 
1647
  # Cold start 대기
1648
+ logger.info("[Cold Start] 서버 준비 확인 중...")
1649
  cold_start_wait = await _wait_cold_start()
1650
 
1651
  # ===== Phase 1: Infrastructure (hard gate) =====
1652
+ logger.info("\n[Phase 1] Infrastructure (hard gate)")
1653
+ logger.info("-" * 40)
1654
 
1655
  phase1_scenarios = [
1656
  scenario1_health_profile,
 
1665
  phase1_failed = True
1666
 
1667
  if phase1_failed:
1668
+ logger.info("\n" + "!" * 60)
1669
+ logger.info("ABORT: Infrastructure not ready — Phase 1 failed")
1670
+ logger.info("!" * 60)
1671
  _write_output(cold_start_wait)
1672
  return 1
1673
 
1674
  # ===== Phase 2: Agent Pipeline Core =====
1675
+ logger.info("\n[Phase 2] Agent Pipeline Core")
1676
+ logger.info("-" * 40)
1677
 
1678
  phase2_scenarios = [
1679
  scenario4_planner_valid_plan,
 
1686
  await fn()
1687
 
1688
  # ===== Phase 3: data.go.kr API Tools (soft gate) =====
1689
+ logger.info("\n[Phase 3] data.go.kr API Tools (soft gate)")
1690
+ logger.info("-" * 40)
1691
 
1692
+ logger.info(" data.go.kr 연결 확인...")
1693
  datago_ok = await _check_datago_connectivity()
1694
  if datago_ok:
1695
+ logger.info(" data.go.kr 연결 가능")
1696
  else:
1697
+ logger.info(" data.go.kr 연결 불가 — Phase 3 스킵")
1698
 
1699
  await scenario8_external_api_tools()
1700
 
1701
  # ===== Phase 4: Adapter Dynamics =====
1702
+ logger.info("\n[Phase 4] Adapter Dynamics")
1703
+ logger.info("-" * 40)
1704
 
1705
  await scenario9_sequential_adapter_switching()
1706
  await scenario10_lora_id_consistency()
1707
 
1708
  # ===== Phase 5: Robustness =====
1709
+ logger.info("\n[Phase 5] Robustness")
1710
+ logger.info("-" * 40)
1711
 
1712
  phase5_scenarios = [
1713
  scenario11_empty_query,
 
1719
  await fn()
1720
 
1721
  # ===== 요약 =====
1722
+ logger.info("\n" + "=" * 60)
1723
  passed = sum(1 for r in _results if r["status"] == "passed")
1724
  failed = sum(1 for r in _results if r["status"] == "failed")
1725
  skipped = sum(1 for r in _results if r["status"] == "skipped")
1726
  total = len(_results)
1727
 
1728
+ logger.info(f"결과: {passed}/{total} 통과, {failed} 실패, {skipped} 스킵")
1729
 
1730
  tool_ratio = len(_observed_tools) / len(VALID_TOOLS) if VALID_TOOLS else 0
1731
+ logger.info(f"도구 커버리지: {len(_observed_tools)}/{len(VALID_TOOLS)} ({tool_ratio:.0%})")
1732
  if _observed_tools:
1733
+ logger.info(f" 관측된 도구: {sorted(_observed_tools)}")
1734
 
1735
  _write_output(cold_start_wait)
1736
 
 
1771
 
1772
  with open(RESULTS_PATH, "w", encoding="utf-8") as f:
1773
  json.dump(output, f, ensure_ascii=False, indent=2)
1774
+ logger.info(f"\n결과 저장: {RESULTS_PATH}")
1775
 
1776
 
1777
  if __name__ == "__main__":