insurance-chatbot / scripts /eval_tool_recall.py
κΉ€λ―Όκ²½
refactor: ToolCard μ •ν•©μ„± κ°œμ„  + μžλ™ 검증 λ„μž…
098e040
"""Tool Search Recall@k 평가 슀크립트.
Defines user utterances (query) -> the correct tool name (expected) and
measures whether the ChromaDB search places the correct tool within the top-k.

Metrics:
  -- Tool-Call queries (expected != None) --
  Hit@1       - fraction of queries whose top-1 result is the correct tool
  Recall@k    - fraction of queries with at least one correct tool in the top-k
  MRR         - Mean Reciprocal Rank (mean of 1/rank of the first correct hit)
  Tool Acc    - fraction of queries whose top-1 result is the correct tool (= Hit@1)
  -- No-Call queries (expected == None) --
  No-Call Acc - fraction of queries whose top-1 score is below the threshold
                (i.e. judged as needing no tool)
  -- Overall --
  Overall Acc - (Tool Acc correct count + No-Call Acc correct count) / total query count

Usage:
  python -m scripts.eval_tool_recall
  python -m scripts.eval_tool_recall --k 5 --verbose
  python -m scripts.eval_tool_recall --compare              # comparison table for k=1,3,5,7,10
  python -m scripts.eval_tool_recall --compare --ks 3 5 7   # custom k values
"""
from __future__ import annotations
import argparse
import sys
from dataclasses import dataclass, field
# ──────────────────────────────────────────────────────────────
# Test case definitions
#
# Format: (query, expected_tool_name | None)
# expected=None -> a query for which no tool should be called (No-Call)
# Confusion pairs are marked with comments.
# ──────────────────────────────────────────────────────────────
# Evaluation set: each entry is (user query, expected tool name), where an
# expected value of None marks a query for which no tool should be selected.
TEST_CASES: list[tuple[str, str | None]] = [
    # ══════════════════════════════════════════════════════════
    # TOOL-CALL cases (queries that must invoke a tool)
    # ══════════════════════════════════════════════════════════
    # ── product_search ──────────────────────────────────────
    ("우리 νšŒμ‚¬ μƒν’ˆ 뭐 μžˆμ–΄?", "product_search"),
    ("λΌμ΄λ‚˜μƒλͺ… 판맀 μƒν’ˆ λͺ©λ‘ μ•Œλ €μ€˜", "product_search"),
    ("μΉ˜μ•„λ³΄ν—˜ μžˆμ–΄?", "product_search"),
    ("μ•”λ³΄ν—˜ μƒν’ˆ 뭐가 μžˆμ–΄?", "product_search"),
    ("전체 μƒν’ˆ 리슀트 λ³΄μ—¬μ€˜", "product_search"),
    ("μ’…μ‹ λ³΄ν—˜ μƒν’ˆ μžˆμ–΄?", "product_search"),
    ("치맀 κ΄€λ ¨ μƒν’ˆ μžˆμ–΄?", "product_search"),
    ("κ°„νŽΈμ‹¬μ‚¬ μƒν’ˆ λͺ©λ‘", "product_search"),
    # Confusion pair: coverage_summary vs product_search
    ("μ–΄λ–€ λ³΄ν—˜ μƒν’ˆ νŒŒλŠ”μ§€ μ•Œκ³  μ‹Άμ–΄", "product_search"),
    # ── coverage_summary ────────────────────────────────────
    ("이 μƒν’ˆ 보μž₯이 뭐야?", "coverage_summary"),
    ("B00197011 보μž₯ λ‚΄μš© μ•Œλ €μ€˜", "coverage_summary"),
    ("이 λ³΄ν—˜ 뭘 보μž₯ν•΄μ€˜?", "coverage_summary"),
    # Confusion pair: coverage_summary vs product_search
    ("보μž₯ λ²”μœ„ 전체 λ³΄μ—¬μ€˜", "coverage_summary"),
    # ── coverage_detail ─────────────────────────────────────
    ("μ•” μ§„λ‹¨κΈˆμ΄ μ–Όλ§ˆμ•Ό?", "coverage_detail"),
    ("μΉ˜μ•„ 보μž₯이 ꡬ체적으둜 μ–΄λ–»κ²Œ 돼?", "coverage_detail"),
    ("μ‚¬λ§λ³΄ν—˜κΈˆ 상세 λ‚΄μš©", "coverage_detail"),
    # Confusion pair: coverage_detail vs coverage_summary
    ("이 μƒν’ˆμ—μ„œ μž…μ› 보μž₯만 λ”°λ‘œ 보고 μ‹Άμ–΄", "coverage_detail"),
    # ── premium_estimate ────────────────────────────────────
    ("이 μƒν’ˆ λ³΄ν—˜λ£Œ μ–Όλ§ˆμ•Ό?", "premium_estimate"),
    ("40μ„Έ 남성 λ³΄ν—˜λ£Œ κ³„μ‚°ν•΄μ€˜", "premium_estimate"),
    ("μ›” λ‚©μž…μ•‘μ΄ μ–Όλ§ˆλ‚˜ 돼?", "premium_estimate"),
    # Confusion pair: premium_estimate vs plan_options
    ("λ³΄ν—˜λ£Œ μ‚°μΆœν•΄μ€˜", "premium_estimate"),
    # ── plan_options ────────────────────────────────────────
    ("λ‚©μž… κΈ°κ°„ μ˜΅μ…˜ 뭐 μžˆμ–΄?", "plan_options"),
    ("10λ…„λ‚© 20λ…„λ‚© 쀑 선택 κ°€λŠ₯ν•΄?", "plan_options"),
    # Confusion pair: plan_options vs premium_estimate
    ("λ‚©μž… 방식 μ•Œλ €μ€˜", "plan_options"),
    # ── underwriting_precheck ───────────────────────────────
    ("당뇨 이λ ₯ μžˆμ–΄λ„ κ°€μž… κ°€λŠ₯ν•΄?", "underwriting_precheck"),
    ("κ³ ν˜ˆμ••μΈλ° μ•”λ³΄ν—˜ λ“€ 수 μžˆμ–΄?", "underwriting_precheck"),
    ("55μ„Έ 남성 κΈ°μ‘΄ 수술 이λ ₯ μžˆλŠ”λ° κ°€μž…λΌ?", "underwriting_precheck"),
    # Confusion pair: underwriting_precheck vs eligibility_by_product_rule
    ("병λ ₯ μžˆλŠ” 고객 인수 κ°€λŠ₯ μ—¬λΆ€ 확인", "underwriting_precheck"),
    # ── eligibility_by_product_rule ─────────────────────────
    ("이 μƒν’ˆ λͺ‡ μ‚΄κΉŒμ§€ κ°€μž… κ°€λŠ₯ν•΄?", "eligibility_by_product_rule"),
    ("κ°€μž… κ°€λŠ₯ λ‚˜μ΄ λ²”μœ„", "eligibility_by_product_rule"),
    ("μ–΄λ–€ μ±„λ„μ—μ„œ νŒ”μ•„?", "eligibility_by_product_rule"),
    # ── claim_guide ─────────────────────────────────────────
    ("λ³΄ν—˜κΈˆ 청ꡬ μ–΄λ–»κ²Œ ν•΄?", "claim_guide"),
    ("μ•” 진단 ν›„ 청ꡬ 절차", "claim_guide"),
    ("μž…μ›λΉ„ μ²­κ΅¬ν•˜λ €λ©΄?", "claim_guide"),
    # Confusion pair: claim_guide vs coverage_detail
    ("청ꡬ 방법 μ•Œλ €μ€˜", "claim_guide"),
    # ── underwriting_waiting_periods ────────────────────────
    ("면책기간이 μ–Όλ§ˆμ•Ό?", "underwriting_waiting_periods"),
    ("κ°€μž…ν•˜κ³  μ–Έμ œλΆ€ν„° 보μž₯돼?", "underwriting_waiting_periods"),
    ("보μž₯κ°œμ‹œμΌμ΄ μ–Έμ œμ•Ό?", "underwriting_waiting_periods"),
    # ── underwriting_exclusions ─────────────────────────────
    ("보μž₯ μ•ˆ λ˜λŠ” κ²½μš°κ°€ 뭐야?", "underwriting_exclusions"),
    ("λ©΄μ±… μ‚¬μœ  λͺ©λ‘", "underwriting_exclusions"),
    # ── rag_terms_query_engine ──────────────────────────────
    ("μ•½κ΄€μ—μ„œ λ©΄μ±… 쑰건 μ°Ύμ•„μ€˜", "rag_terms_query_engine"),
    ("약관상 μ•”μ˜ μ •μ˜", "rag_terms_query_engine"),
    # Confusion pair: rag_terms vs rag_product_info
    ("κ³ μ§€μ˜λ¬΄ κ·œμ •μ΄ 약관에 μ–΄λ–»κ²Œ λ‚˜μ™€ μžˆμ–΄?", "rag_terms_query_engine"),
    # ── rag_product_info_query_engine ───────────────────────
    ("μƒν’ˆμš”μ•½μ„œμ—μ„œ 보μž₯ λ‚΄μš© μ°Ύμ•„μ€˜", "rag_product_info_query_engine"),
    ("이 μƒν’ˆ μš”μ•½μ„œ λ‚΄μš©", "rag_product_info_query_engine"),
    # ── compliance ──────────────────────────────────────────
    ("이 문ꡬ 써도 돼?", "compliance_misleading_check"),
    ("이 μŠ€ν¬λ¦½νŠΈμ— κΈˆμΉ™μ–΄ μžˆμ–΄?", "compliance_misleading_check"),
    ("λ©΄μ±… κ΄€λ ¨ 쀀법 멘트 λ§Œλ“€μ–΄μ€˜", "compliance_phrase_generator"),
    ("TM λ…Ήμ·¨ κ³ μ§€ 멘트", "recording_notice_script"),
    ("κ°œμΈμ •λ³΄ λ§ˆμŠ€ν‚Ήν•΄μ€˜", "privacy_masking"),
    ("주민번호 μ§€μ›Œμ€˜", "privacy_masking"),
    # ── customer_db ─────────────────────────────────────────
    ("홍길동 고객 계약 쑰회", "customer_contract_lookup"),
    ("이 고객 쀑볡 κ°€μž… 돼?", "duplicate_enrollment_check"),
    # ── misc ────────────────────────────────────────────────
    ("κ°±μ‹ ν•˜λ©΄ λ³΄ν—˜λ£Œ μ–Όλ§ˆλ‚˜ 올라?", "renewal_premium_projection"),
    ("직업 μœ„ν—˜λ„ ν™•μΈν•΄μ€˜", "underwriting_high_risk_job_check"),
    ("μ†Œλ°©κ΄€λ„ κ°€μž… κ°€λŠ₯ν•΄?", "underwriting_high_risk_job_check"),
    ("이 병λ ₯ κ³ μ§€ν•΄μ•Ό ν•΄?", "underwriting_disclosure_risk_score"),
    ("ν•΄μ•½ν•˜λ©΄ 돈 μ–Όλ§ˆ λŒλ €λ°›μ•„?", "surrender_value_explain"),
    ("계약 ν•΄μ§€ν•˜κ³  μ‹Άμ–΄", "contract_manage"),
    ("μΉ˜μ•„ 보μž₯ μ—°κ°„ λͺ‡ κ°œκΉŒμ§€μ•Ό?", "benefit_limit_rules"),
    ("μ•” μ§„λ‹¨κΈˆ μ–Όλ§ˆ λ°›μ•„?", "benefit_amount_lookup"),
    ("ICD μ½”λ“œ C50 이 무슨 병이야?", "icd_mapping_lookup"),
    ("고객 λͺ©ν‘œμ— λ§žλŠ” νŠΉμ•½ μΆ”μ²œν•΄μ€˜", "rider_bundle_recommend"),
    ("동일 μΉ˜μ•„ 쀑볡 청ꡬ κ·œμΉ™", "multi_benefit_conflict_rule"),
    # ══════════════════════════════════════════════════════════
    # NO-CALL cases (queries that must not invoke a tool)
    #
    # General questions that are inside the insurance domain but need no
    # specific tool, or conversational utterances such as greetings,
    # thanks, and acknowledgements.
    # ══════════════════════════════════════════════════════════
    # ── General insurance knowledge (the LLM can answer directly without a tool) ──
    ("λ³΄ν—˜μ΄λž€ λ¬΄μ—‡μΈκ°€μš”?", None),
    ("μ’…μ‹ λ³΄ν—˜μ΄λž‘ μ •κΈ°λ³΄ν—˜ 차이가 뭐야?", None),
    ("μ‹€μ†λ³΄ν—˜ 뜻이 뭐야?", None),
    ("λ³΄ν—˜λ£Œμ™€ λ³΄ν—˜κΈˆμ˜ 차이", None),
    ("λ³΄ν—˜ κ°€μž… μ‹œ μ£Όμ˜μ‚¬ν•­μ΄ 뭐야?", None),
    # ── Conversational utterances (no tool needed) ──
    ("κ°μ‚¬ν•©λ‹ˆλ‹€ 잘 μ•Œκ² μŠ΅λ‹ˆλ‹€", None),
    ("λ„€ μ•Œκ² μ–΄μš”", None),
    ("방금 말씀해주신 λ‚΄μš© μš”μ•½ν•΄μ€˜", None),
    ("μ’€ 더 μ‰½κ²Œ μ„€λͺ…해쀄 수 μžˆμ–΄?", None),
    ("λ‹€λ₯Έ 건 μ—†μ–΄μš” κ°μ‚¬ν•©λ‹ˆλ‹€", None),
    # ── In-domain but ambiguous questions (cannot be mapped to a specific tool) ──
    ("λ³΄ν—˜ λ“€ λ•Œ 뭘 확인해야 ν• κΉŒ?", None),
    ("λ³΄ν—˜ μ„€κ³„μ‚¬ν•œν…Œ 뭘 물어봐야 ν•΄?", None),
    ("λ³΄ν—˜ ν•˜λ‚˜λ§Œ λ“€λ €λ©΄ 뭐가 μ’‹μ„κΉŒ?", None),
    ("λ³΄ν—˜ ν•΄μ§€ν•˜λ©΄ 뢈이읡이 μžˆλ‚˜μš”?", None),
    ("λ³΄ν—˜λ£Œλ₯Ό μ•„λΌλŠ” 방법이 μžˆμ„κΉŒ?", None),
]
# ──────────────────────────────────────────────────────────────
# Evaluation logic
# ──────────────────────────────────────────────────────────────
# Default score cut-off for no-call queries: a no-call query counts as correct
# when its top-1 score is strictly below this value, and as a misjudgement when
# the score is at or above it. Overridable from the CLI via --threshold.
DEFAULT_NO_CALL_THRESHOLD = 0.86
@dataclass
class EvalResult:
    """Search outcome recorded for one evaluated query."""

    # The user utterance that was searched for.
    query: str
    # Name of the tool that should be found, or None for a no-call query.
    expected: str | None
    # Candidate tool names in the order the search returned them.
    ranked: list[str]
    # Scores aligned with ``ranked``; empty when nothing was returned.
    scores: list[float] = field(default_factory=list)
    # 1-based position of ``expected`` inside ``ranked``; None when absent.
    hit_rank: int | None = None

    @property
    def is_no_call(self) -> bool:
        """Whether this query expects no tool at all."""
        if self.expected is None:
            return True
        return False

    @property
    def top_score(self) -> float:
        """Score of the first candidate, or 0.0 when there are no scores."""
        if not self.scores:
            return 0.0
        return self.scores[0]
def _reciprocal_rank(result: EvalResult) -> float:
    """Return 1/hit_rank for a result, or 0.0 when it has no (truthy) hit rank."""
    rank = result.hit_rank
    if not rank:
        return 0.0
    return 1.0 / rank
def _run_search(k: int) -> list[EvalResult]:
    """Run every entry of TEST_CASES through the tool search.

    Indexes all tools returned by ``get_all_tools()`` into the searcher
    returned by ``get_tool_search()``, then queries it once per test case.

    Args:
        k: Number of candidates requested from the searcher (``top_k``).

    Returns:
        One EvalResult per test case, in TEST_CASES order. ``hit_rank`` is
        the 1-based position of the expected tool among the returned
        candidates, or None when it is absent or the case is a no-call query.
    """
    import os

    # Make the project root importable for the ``app`` package. The path is
    # resolved to an absolute one so a relative ``__file__`` cannot yield an
    # empty entry, and it is added only once so repeated calls (e.g. in
    # compare mode) do not stack duplicate sys.path entries.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    if project_root not in sys.path:
        sys.path.insert(0, project_root)

    from app.tools import get_all_tools
    from app.tool_search.embedder import get_tool_search

    searcher = get_tool_search()
    all_tools = get_all_tools()
    searcher.index_tools(all_tools)

    results: list[EvalResult] = []
    for query, expected in TEST_CASES:
        candidates = searcher.search(query, top_k=k)
        ranked = [c.name for c in candidates]
        scores = [c.score for c in candidates]

        # No-call queries have nothing to look for, so they never get a rank.
        hit_rank = None
        if expected is not None and expected in ranked:
            hit_rank = ranked.index(expected) + 1

        results.append(
            EvalResult(
                query=query,
                expected=expected,
                ranked=ranked,
                scores=scores,
                hit_rank=hit_rank,
            )
        )
    return results
def _compute_metrics(results: list[EvalResult], k: int, threshold: float) -> dict:
    """Aggregate evaluation metrics from a list of results.

    Args:
        results: Per-query results to aggregate.
        k: The top-k value the results were produced with; echoed back as "k".
        threshold: A no-call query counts as correct when its top score is
            strictly below this value.

    Returns:
        A dict with raw counts ("tc_total", "nc_total", "total", "hit1",
        "recall", "nc_correct") and ratios ("tool_acc", "recall_at_k", "mrr",
        "no_call_acc", "overall_acc"). Ratios are 0.0 when their denominator
        is zero.
    """

    def _ratio(numerator, denominator):
        # Guard against empty groups instead of dividing by zero.
        return numerator / denominator if denominator else 0.0

    # Split the results into the two query groups in a single pass.
    tool_call = []
    no_call = []
    for res in results:
        (no_call if res.is_no_call else tool_call).append(res)

    tc_total, nc_total, total = len(tool_call), len(no_call), len(results)

    hit1 = len([res for res in tool_call if res.hit_rank == 1])
    recall = len([res for res in tool_call if res.hit_rank is not None])
    nc_correct = len([res for res in no_call if res.top_score < threshold])
    rr_sum = sum(_reciprocal_rank(res) for res in tool_call)

    metrics = {"k": k, "tc_total": tc_total, "nc_total": nc_total, "total": total}
    metrics["hit1"] = hit1
    metrics["recall"] = recall
    metrics["nc_correct"] = nc_correct
    metrics["tool_acc"] = _ratio(hit1, tc_total)
    metrics["recall_at_k"] = _ratio(recall, tc_total)
    metrics["mrr"] = _ratio(rr_sum, tc_total)
    metrics["no_call_acc"] = _ratio(nc_correct, nc_total)
    metrics["overall_acc"] = _ratio(hit1 + nc_correct, total)
    return metrics
# ──────────────────────────────────────────────────────────────
# Output
# ──────────────────────────────────────────────────────────────
def _print_single(results: list[EvalResult], k: int, threshold: float,
                  verbose: bool) -> None:
    """Print the detailed report for a single top-k value.

    Prints the aggregate metrics, the tool-call queries whose expected tool
    was not returned, the no-call queries whose top score reached the
    threshold, and the top-1 score distributions of both groups.

    Args:
        results: Per-query results to report on.
        k: The top-k value the results were produced with.
        threshold: No-call cut-off; a no-call query with a top score at or
            above it is listed as a misjudgement.
        verbose: When true, also print the per-query tables.
    """
    m = _compute_metrics(results, k, threshold)
    sep = "─" * 72
    print(f"\n{'=' * 72}")
    print(f" Tool Search 평가 (k={k}, threshold={threshold})")
    print(f" 쿼리 수: tool-call {m['tc_total']}개 + no-call {m['nc_total']}개 = 총 {m['total']}개")
    print(f"{'=' * 72}")
    print(f"\n ── Tool-Call μ§€ν‘œ ({m['tc_total']}개 쿼리) ──")
    print(f" Tool Acc (Hit@1) : {m['tool_acc']:.1%} ({m['hit1']}/{m['tc_total']})")
    print(f" Recall@{k:<2} : {m['recall_at_k']:.1%} ({m['recall']}/{m['tc_total']})")
    print(f" MRR : {m['mrr']:.4f}")
    print(f"\n ── No-Call μ§€ν‘œ ({m['nc_total']}개 쿼리, threshold={threshold}) ──")
    print(f" No-Call Acc : {m['no_call_acc']:.1%} ({m['nc_correct']}/{m['nc_total']})")
    print(f"\n ── μ’…ν•© ──")
    print(f" Overall Acc : {m['overall_acc']:.1%} ({m['hit1'] + m['nc_correct']}/{m['total']})")
    print(sep)

    # Misses: tool-call queries whose expected tool is not among the candidates.
    tool_call = [r for r in results if not r.is_no_call]
    misses = [r for r in tool_call if r.hit_rank is None]
    if misses:
        print(f"\n ❌ Tool-Call 미탐 ({len(misses)}개):")
        for r in misses:
            top3 = ", ".join(r.ranked[:3])
            print(f" [{r.expected}] '{r.query}'")
            print(f" β†’ top-3: {top3} (scores: {', '.join(f'{s:.3f}' for s in r.scores[:3])})")
    else:
        print(f"\n βœ… λͺ¨λ“  tool-call 쿼리가 top-{k} μ•ˆμ— μ •λ‹΅ 포함")

    # No-call misjudgements: a tool matched with a score at/above the threshold.
    no_call = [r for r in results if r.is_no_call]
    nc_fails = [r for r in no_call if r.top_score >= threshold]
    if nc_fails:
        print(f"\n ⚠️ No-Call 였판 ({len(nc_fails)}개 β€” top-1 score β‰₯ {threshold}):")
        for r in nc_fails:
            print(f" '{r.query}'")
            # With a threshold <= 0 a query without any candidate also lands
            # here, so fall back to '-' instead of indexing an empty list.
            print(f" β†’ top-1: {r.ranked[0] if r.ranked else '-'} (score={r.top_score:.3f})")
    else:
        print(f"\n βœ… λͺ¨λ“  no-call 쿼리가 threshold({threshold}) 미만")

    # No-call score distribution.
    if no_call:
        nc_scores = [r.top_score for r in no_call]
        print(f"\n πŸ“Š No-Call top-1 score 뢄포:")
        print(f" min={min(nc_scores):.3f} avg={sum(nc_scores)/len(nc_scores):.3f} max={max(nc_scores):.3f}")

    # Tool-call score distribution. Only results that actually have scores
    # contribute; skip the section when none do, since min()/max() of an
    # empty list would raise.
    tc_scores = [r.scores[0] for r in tool_call if r.scores]
    if tc_scores:
        print(f" πŸ“Š Tool-Call top-1 score 뢄포:")
        print(f" min={min(tc_scores):.3f} avg={sum(tc_scores)/len(tc_scores):.3f} max={max(tc_scores):.3f}")

    if verbose:
        # Forward the active threshold so the per-query marks agree with the
        # summary above.
        _print_verbose(results, threshold)
    print()


def _print_verbose(results: list[EvalResult], threshold: float | None = None) -> None:
    """Print per-query tables for the tool-call and no-call groups.

    Tool-call rows are ordered by hit rank with misses last; no-call rows are
    ordered by descending top score.

    Args:
        results: Per-query results to list.
        threshold: No-call cut-off used for the pass/fail mark of no-call
            rows. Defaults to DEFAULT_NO_CALL_THRESHOLD when omitted.
    """
    if threshold is None:
        threshold = DEFAULT_NO_CALL_THRESHOLD

    tool_call = [r for r in results if not r.is_no_call]
    no_call = [r for r in results if r.is_no_call]

    print(f"\n πŸ“‹ Tool-Call 전체 κ²°κ³Ό:")
    print(f" {'':>2} {'μˆœμœ„':>4} {'score':>6} {'μ •λ‹΅ 도ꡬ':<38} 쿼리")
    print(f" {'':>2} {'─'*4} {'─'*6} {'─'*38} {'─'*30}")
    # Misses have no rank; 9999 pushes them to the end of the table.
    for r in sorted(tool_call, key=lambda x: x.hit_rank or 9999):
        rank_str = f"#{r.hit_rank}" if r.hit_rank else "miss"
        score_str = f"{r.top_score:.3f}" if r.scores else " - "
        # Hit within the top 3 / hit further down / no hit at all.
        mark = "βœ…" if r.hit_rank and r.hit_rank <= 3 else ("⚠️" if r.hit_rank else "❌")
        print(f" {mark} {rank_str:>4} {score_str:>6} {r.expected:<38} {r.query}")

    print(f"\n πŸ“‹ No-Call 전체 κ²°κ³Ό:")
    print(f" {'':>2} {'score':>6} {'top-1 도ꡬ':<38} 쿼리")
    print(f" {'':>2} {'─'*6} {'─'*38} {'─'*30}")
    for r in sorted(no_call, key=lambda x: -x.top_score):
        score_str = f"{r.top_score:.3f}" if r.scores else " - "
        # Judge against the caller's threshold, not the module default.
        mark = "βœ…" if r.top_score < threshold else "❌"
        top1 = r.ranked[0] if r.ranked else "-"
        print(f" {mark} {score_str:>6} {top1:<38} {r.query}")
def _print_compare(ks: list[int], threshold: float) -> None:
    """Print a side-by-side metric table for several top-k values.

    Runs the full search once per entry of ``ks``, prints one metric column
    per k, then a per-k summary of misses and no-call misjudgements, and
    finally the top-1 score distributions taken from the last k in ``ks``.

    Args:
        ks: The top-k values to compare; must not be empty.
        threshold: No-call cut-off used for the metrics and failure lists.

    Raises:
        ValueError: If ``ks`` is empty.
    """
    # Fail early with a clear message instead of an IndexError further down.
    if not ks:
        raise ValueError("ks must contain at least one k value")

    print(f"\n{'=' * 72}")
    print(f" Tool Search 비ꡐ 평가 (threshold={threshold})")
    print(f"{'=' * 72}")

    results_cache: dict[int, list[EvalResult]] = {}
    metrics_list: list[dict] = []
    for k_val in ks:
        results = _run_search(k_val)
        results_cache[k_val] = results
        metrics_list.append(_compute_metrics(results, k_val, threshold))

    # Query counts do not depend on k, so the first entry is representative.
    m0 = metrics_list[0]
    print(f"\n 쿼리 수: tool-call {m0['tc_total']}개 + no-call {m0['nc_total']}개 = 총 {m0['total']}개\n")

    # Comparison table.
    k_header = "".join(f"{'k='+str(m['k']):>10}" for m in metrics_list)
    print(f" {'μ§€ν‘œ':<20}{k_header}")
    print(f" {'─'*20}{'─'*10*len(metrics_list)}")

    def _row(label: str, key: str, fmt: str = ".1%") -> str:
        # One table row: the label followed by the metric formatted per k.
        vals = "".join(f"{format(m[key], fmt):>10}" for m in metrics_list)
        return f" {label:<20}{vals}"

    print(_row("Tool Acc (Hit@1)", "tool_acc"))
    print(_row("Recall@k", "recall_at_k"))
    print(_row("MRR", "mrr", ".4f"))
    print(_row("No-Call Acc", "no_call_acc"))
    print(f" {'─'*20}{'─'*10*len(metrics_list)}")
    print(_row("Overall Acc", "overall_acc"))
    print()

    # Per-k summary of misses and no-call misjudgements.
    for k_val, results in results_cache.items():
        tool_misses = [r for r in results if not r.is_no_call and r.hit_rank is None]
        nc_fails = [r for r in results if r.is_no_call and r.top_score >= threshold]
        if tool_misses or nc_fails:
            print(f" k={k_val}: 미탐 {len(tool_misses)}건, no-call 였판 {len(nc_fails)}건")
            for r in tool_misses:
                print(f" ❌ [{r.expected}] '{r.query}' β†’ top-1: {r.ranked[0] if r.ranked else '-'}")
            for r in nc_fails:
                # Same empty-ranking fallback as the miss line above.
                print(f" ⚠️ '{r.query}' β†’ {r.ranked[0] if r.ranked else '-'}({r.top_score:.3f})")

    # Score distributions, based on the last requested k.
    last_results = results_cache[ks[-1]]
    tc_scores = [r.scores[0] for r in last_results if not r.is_no_call and r.scores]
    nc_scores = [r.top_score for r in last_results if r.is_no_call]
    # Both groups need at least one score; min()/max() of an empty list raises.
    if tc_scores and nc_scores:
        print(f"\n πŸ“Š 점수 뢄포 (k={ks[-1]} κΈ°μ€€):")
        print(f" Tool-Call top-1 : min={min(tc_scores):.3f} avg={sum(tc_scores)/len(tc_scores):.3f} max={max(tc_scores):.3f}")
        print(f" No-Call top-1 : min={min(nc_scores):.3f} avg={sum(nc_scores)/len(nc_scores):.3f} max={max(nc_scores):.3f}")
        gap = min(tc_scores) - max(nc_scores)
        print(f" 뢄리 λ§ˆμ§„ (tool min - no-call max) = {gap:+.3f}")
    print()
# ──────────────────────────────────────────────────────────────
# Entry point
# ──────────────────────────────────────────────────────────────
def run_eval(k: int = 10, verbose: bool = False, threshold: float = DEFAULT_NO_CALL_THRESHOLD) -> None:
    """Run the search for one top-k value and print its detailed report.

    Args:
        k: Number of candidates requested per query.
        verbose: When true, the report also lists every query.
        threshold: No-call cut-off passed on to the report.
    """
    _print_single(results=_run_search(k), k=k, threshold=threshold, verbose=verbose)
def _run_card_validation() -> bool:
    """Validate ToolCard consistency and print the outcome.

    Runs ``validate_confusion_pairs()`` and ``validate_duplicate_when_to_use()``
    from ``app.tool_search.tool_cards`` and prints every warning they return.

    Returns:
        True when neither validator reports a warning, False otherwise.
    """
    import os

    # This runs before any search, so the project root may not be importable
    # yet when the file is executed directly rather than via ``python -m``.
    # Resolve it to an absolute path and add it only once.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    if project_root not in sys.path:
        sys.path.insert(0, project_root)

    from app.tool_search.tool_cards import (
        validate_confusion_pairs,
        validate_duplicate_when_to_use,
    )

    print("=" * 60)
    print(" ToolCard μ •ν•©μ„± 검증")
    print("=" * 60)

    issues = validate_confusion_pairs() + validate_duplicate_when_to_use()
    if issues:
        for w in issues:
            print(f" ⚠️ {w}")
        print(f"\n 총 {len(issues)}건 경고\n")
        return False

    print(" βœ… ν˜Όλ™ 쌍 cross-reference 정상")
    print(" βœ… when_to_use 쀑볡 λ°œν™” μ—†μŒ\n")
    return True
if __name__ == "__main__":
    # Command-line entry point: parse the options, run the ToolCard
    # validation, then run either the comparison table or the single-k report.
    cli = argparse.ArgumentParser(description="Tool Search 평가 (Tool Acc + No-Call Acc)")
    cli.add_argument("--k", type=int, default=10, help="top-k (κΈ°λ³Έκ°’: 10)")
    cli.add_argument("--verbose", action="store_true", help="전체 κ²°κ³Ό 좜λ ₯")
    cli.add_argument(
        "--threshold",
        type=float,
        default=DEFAULT_NO_CALL_THRESHOLD,
        help=f"No-Call νŒμ • μž„κ³„κ°’ (κΈ°λ³Έκ°’: {DEFAULT_NO_CALL_THRESHOLD})",
    )
    cli.add_argument("--compare", action="store_true", help="μ—¬λŸ¬ k 에 λŒ€ν•œ λΉ„κ΅ν‘œ 좜λ ₯")
    cli.add_argument(
        "--ks",
        type=int,
        nargs="+",
        default=[1, 3, 5, 7, 10],
        help="비ꡐ할 k κ°’λ“€ (κΈ°λ³Έκ°’: 1 3 5 7 10)",
    )
    opts = cli.parse_args()

    # The validation result is not used to gate the evaluation run.
    _run_card_validation()

    if not opts.compare:
        run_eval(k=opts.k, verbose=opts.verbose, threshold=opts.threshold)
    else:
        _print_compare(opts.ks, opts.threshold)