Spaces:
Running
Running
"""Tool Search Recall@k evaluation script.

Defines pairs of user utterance (query) -> correct tool name (expected) and
measures whether the ChromaDB search includes the correct tool in its top-k.

Metrics:
    -- Tool-Call queries (expected != None) --
    Hit@1       - fraction of queries whose top-1 result is the correct tool
    Recall@k    - fraction of queries with at least one correct tool in the top-k
    MRR         - Mean Reciprocal Rank (mean of the reciprocal of the rank at
                  which the correct tool first appears)
    Tool Acc    - fraction of queries whose top-1 result is correct (= Hit@1)

    -- No-Call queries (expected == None) --
    No-Call Acc - fraction of queries whose top-1 score is below the threshold
                  (i.e. judged as not needing a tool)

    -- Overall --
    Overall Acc - (Tool Acc correct count + No-Call Acc correct count)
                  / total number of queries

Usage:
    python -m scripts.eval_tool_recall
    python -m scripts.eval_tool_recall --k 5 --verbose
    python -m scripts.eval_tool_recall --compare             # comparison table for k=1,3,5,7,10
    python -m scripts.eval_tool_recall --compare --ks 3 5 7  # custom k values
"""
| from __future__ import annotations | |
| import argparse | |
| import sys | |
| from dataclasses import dataclass, field | |
# ------------------------------------------------------------
# Test case definitions
#
# Format: (query, expected_tool_name | None)
#   expected=None -> a query for which no tool should be called (No-Call)
# Confusion pairs are marked with comments.
# ------------------------------------------------------------
# Each entry is (query, expected tool name); expected is None for queries that
# should not trigger any tool call.
# NOTE(review): the query strings below look like UTF-8 Korean text that was
# mis-decoded somewhere upstream -- confirm against the original file before
# relying on them as evaluation input.
TEST_CASES: list[tuple[str, str | None]] = [
    # ----------------------------------------------------------
    # TOOL-CALL cases (queries that should trigger a tool call)
    # ----------------------------------------------------------
    # -- product_search --
    ("μ°λ¦¬ νμ¬ μν λ μμ΄?", "product_search"),
    ("λΌμ΄λμλͺ νλ§€ μν λͺ©λ‘ μλ €μ€", "product_search"),
    ("μΉμ보ν μμ΄?", "product_search"),
    ("μ보ν μν λκ° μμ΄?", "product_search"),
    ("μ 체 μν 리μ€νΈ 보μ¬μ€", "product_search"),
    ("μ’ μ 보ν μν μμ΄?", "product_search"),
    ("μΉλ§€ κ΄λ ¨ μν μμ΄?", "product_search"),
    ("κ°νΈμ¬μ¬ μν λͺ©λ‘", "product_search"),
    # Confusion pair: coverage_summary vs product_search
    ("μ΄λ€ 보ν μν νλμ§ μκ³ μΆμ΄", "product_search"),
    # -- coverage_summary --
    ("μ΄ μν 보μ₯μ΄ λμΌ?", "coverage_summary"),
    ("B00197011 보μ₯ λ΄μ© μλ €μ€", "coverage_summary"),
    ("μ΄ λ³΄ν λ 보μ₯ν΄μ€?", "coverage_summary"),
    # Confusion pair: coverage_summary vs product_search
    ("보μ₯ λ²μ μ 체 보μ¬μ€", "coverage_summary"),
    # -- coverage_detail --
    ("μ μ§λ¨κΈμ΄ μΌλ§μΌ?", "coverage_detail"),
    ("μΉμ 보μ₯μ΄ κ΅¬μ²΄μ μΌλ‘ μ΄λ»κ² λΌ?", "coverage_detail"),
    ("μ¬λ§λ³΄νκΈ μμΈ λ΄μ©", "coverage_detail"),
    # Confusion pair: coverage_detail vs coverage_summary
    ("μ΄ μνμμ μ μ 보μ₯λ§ λ°λ‘ λ³΄κ³ μΆμ΄", "coverage_detail"),
    # -- premium_estimate --
    ("μ΄ μν 보νλ£ μΌλ§μΌ?", "premium_estimate"),
    ("40μΈ λ¨μ± 보νλ£ κ³μ°ν΄μ€", "premium_estimate"),
    ("μ λ©μ μ‘μ΄ μΌλ§λ λΌ?", "premium_estimate"),
    # Confusion pair: premium_estimate vs plan_options
    ("보νλ£ μ°μΆν΄μ€", "premium_estimate"),
    # -- plan_options --
    ("λ©μ κΈ°κ° μ΅μ λ μμ΄?", "plan_options"),
    ("10λ λ© 20λ λ© μ€ μ ν κ°λ₯ν΄?", "plan_options"),
    # Confusion pair: plan_options vs premium_estimate
    ("λ©μ λ°©μ μλ €μ€", "plan_options"),
    # -- underwriting_precheck --
    ("λΉλ¨ μ΄λ ₯ μμ΄λ κ°μ κ°λ₯ν΄?", "underwriting_precheck"),
    ("κ³ νμμΈλ° μ보ν λ€ μ μμ΄?", "underwriting_precheck"),
    ("55μΈ λ¨μ± κΈ°μ‘΄ μμ μ΄λ ₯ μλλ° κ°μ λΌ?", "underwriting_precheck"),
    # Confusion pair: underwriting_precheck vs eligibility_by_product_rule
    ("λ³λ ₯ μλ κ³ κ° μΈμ κ°λ₯ μ¬λΆ νμΈ", "underwriting_precheck"),
    # -- eligibility_by_product_rule --
    ("μ΄ μν λͺ μ΄κΉμ§ κ°μ κ°λ₯ν΄?", "eligibility_by_product_rule"),
    ("κ°μ κ°λ₯ λμ΄ λ²μ", "eligibility_by_product_rule"),
    ("μ΄λ€ μ±λμμ νμ?", "eligibility_by_product_rule"),
    # -- claim_guide --
    ("보νκΈ μ²κ΅¬ μ΄λ»κ² ν΄?", "claim_guide"),
    ("μ μ§λ¨ ν μ²κ΅¬ μ μ°¨", "claim_guide"),
    ("μ μλΉ μ²κ΅¬νλ €λ©΄?", "claim_guide"),
    # Confusion pair: claim_guide vs coverage_detail
    ("μ²κ΅¬ λ°©λ² μλ €μ€", "claim_guide"),
    # -- underwriting_waiting_periods --
    ("λ©΄μ± κΈ°κ°μ΄ μΌλ§μΌ?", "underwriting_waiting_periods"),
    ("κ°μ νκ³ μΈμ λΆν° 보μ₯λΌ?", "underwriting_waiting_periods"),
    ("보μ₯κ°μμΌμ΄ μΈμ μΌ?", "underwriting_waiting_periods"),
    # -- underwriting_exclusions --
    ("보μ₯ μ λλ κ²½μ°κ° λμΌ?", "underwriting_exclusions"),
    ("λ©΄μ± μ¬μ λͺ©λ‘", "underwriting_exclusions"),
    # -- rag_terms_query_engine --
    ("μ½κ΄μμ λ©΄μ± μ‘°κ±΄ μ°Ύμμ€", "rag_terms_query_engine"),
    ("μ½κ΄μ μμ μ μ", "rag_terms_query_engine"),
    # Confusion pair: rag_terms vs rag_product_info
    ("κ³ μ§μ무 κ·μ μ΄ μ½κ΄μ μ΄λ»κ² λμ μμ΄?", "rag_terms_query_engine"),
    # -- rag_product_info_query_engine --
    ("μνμμ½μμμ 보μ₯ λ΄μ© μ°Ύμμ€", "rag_product_info_query_engine"),
    ("μ΄ μν μμ½μ λ΄μ©", "rag_product_info_query_engine"),
    # -- compliance --
    ("μ΄ λ¬Έκ΅¬ μ¨λ λΌ?", "compliance_misleading_check"),
    ("μ΄ μ€ν¬λ¦½νΈμ κΈμΉμ΄ μμ΄?", "compliance_misleading_check"),
    ("λ©΄μ± κ΄λ ¨ μ€λ² λ©νΈ λ§λ€μ΄μ€", "compliance_phrase_generator"),
    ("TM λ Ήμ·¨ κ³ μ§ λ©νΈ", "recording_notice_script"),
    ("κ°μΈμ 보 λ§μ€νΉν΄μ€", "privacy_masking"),
    ("μ£Όλ―Όλ²νΈ μ§μμ€", "privacy_masking"),
    # -- customer_db --
    ("νκΈΈλ κ³ κ° κ³μ½ μ‘°ν", "customer_contract_lookup"),
    ("μ΄ κ³ κ° μ€λ³΅ κ°μ λΌ?", "duplicate_enrollment_check"),
    # -- misc --
    ("κ°±μ νλ©΄ 보νλ£ μΌλ§λ μ¬λΌ?", "renewal_premium_projection"),
    ("μ§μ μνλ νμΈν΄μ€", "underwriting_high_risk_job_check"),
    ("μλ°©κ΄λ κ°μ κ°λ₯ν΄?", "underwriting_high_risk_job_check"),
    ("μ΄ λ³λ ₯ κ³ μ§ν΄μΌ ν΄?", "underwriting_disclosure_risk_score"),
    ("ν΄μ½νλ©΄ λ μΌλ§ λλ €λ°μ?", "surrender_value_explain"),
    ("κ³μ½ ν΄μ§νκ³ μΆμ΄", "contract_manage"),
    ("μΉμ 보μ₯ μ°κ° λͺ κ°κΉμ§μΌ?", "benefit_limit_rules"),
    ("μ μ§λ¨κΈ μΌλ§ λ°μ?", "benefit_amount_lookup"),
    ("ICD μ½λ C50 μ΄ λ¬΄μ¨ λ³μ΄μΌ?", "icd_mapping_lookup"),
    ("κ³ κ° λͺ©νμ λ§λ νΉμ½ μΆμ²ν΄μ€", "rider_bundle_recommend"),
    ("λμΌ μΉμ μ€λ³΅ μ²κ΅¬ κ·μΉ", "multi_benefit_conflict_rule"),
    # ----------------------------------------------------------
    # NO-CALL cases (queries that must not trigger a tool call)
    #
    # General questions that are within the insurance domain but need no
    # specific tool, or conversational utterances such as greetings,
    # thanks, and acknowledgements.
    # ----------------------------------------------------------
    # -- General insurance knowledge (the LLM can answer directly without a tool) --
    ("보νμ΄λ 무μμΈκ°μ?", None),
    ("μ’ μ 보νμ΄λ μ 기보ν μ°¨μ΄κ° λμΌ?", None),
    ("μ€μ보ν λ»μ΄ λμΌ?", None),
    ("보νλ£μ 보νκΈμ μ°¨μ΄", None),
    ("보ν κ°μ μ μ£Όμμ¬νμ΄ λμΌ?", None),
    # -- Conversational utterances (no tool needed) --
    ("κ°μ¬ν©λλ€ μ μκ² μ΅λλ€", None),
    ("λ€ μκ² μ΄μ", None),
    ("λ°©κΈ λ§μν΄μ£Όμ λ΄μ© μμ½ν΄μ€", None),
    ("μ’ λ μ½κ² μ€λͺ ν΄μ€ μ μμ΄?", None),
    ("λ€λ₯Έ 건 μμ΄μ κ°μ¬ν©λλ€", None),
    # -- In-domain but ambiguous questions (cannot be mapped to a specific tool) --
    ("보ν λ€ λ λ νμΈν΄μΌ ν κΉ?", None),
    ("보ν μ€κ³μ¬νν λ λ¬Όμ΄λ΄μΌ ν΄?", None),
    ("보ν νλλ§ λ€λ €λ©΄ λκ° μ’μκΉ?", None),
    ("보ν ν΄μ§νλ©΄ λΆμ΄μ΅μ΄ μλμ?", None),
    ("보νλ£λ₯Ό μλΌλ λ°©λ²μ΄ μμκΉ?", None),
]
# ------------------------------------------------------------
# Evaluation logic
# ------------------------------------------------------------
# Default cutoff for no-call queries: a no-call query is counted as correct
# when its top-1 score is strictly below this value. Can be overridden from
# the command line with --threshold.
DEFAULT_NO_CALL_THRESHOLD = 0.86
@dataclass
class EvalResult:
    """Outcome of running one test query through the tool search.

    Attributes:
        query: The query text that was searched.
        expected: Name of the tool that should be found, or None when the
            query is a no-call case.
        ranked: Candidate tool names in the order the search returned them.
        scores: Candidate scores, parallel to ``ranked``.
        hit_rank: 1-based position of ``expected`` within ``ranked``; None
            when it was not found or when the query is a no-call case.
    """

    query: str
    expected: str | None
    ranked: list[str]
    scores: list[float] = field(default_factory=list)
    hit_rank: int | None = None

    # Both accessors are properties because every caller reads them as plain
    # attributes (`r.is_no_call`, `r.top_score < threshold`); as ordinary
    # methods the bound-method object would always be truthy / unorderable.
    @property
    def is_no_call(self) -> bool:
        """True when no tool is expected for this query."""
        return self.expected is None

    @property
    def top_score(self) -> float:
        """Score of the first candidate, or 0.0 when there are no scores."""
        return self.scores[0] if self.scores else 0.0
def _reciprocal_rank(result: EvalResult) -> float:
    """Return 1 / hit_rank for a result, or 0.0 when it has no hit rank."""
    rank = result.hit_rank
    if not rank:
        # Covers both "not found" (None) and any other falsy rank.
        return 0.0
    return 1.0 / rank
def _run_search(k: int) -> list[EvalResult]:
    """Run every entry of TEST_CASES through the tool search.

    Args:
        k: Number of candidates requested from the searcher for each query.

    Returns:
        One EvalResult per test case, in TEST_CASES order.
    """
    import os

    # Make the directory above this script's directory importable so that the
    # `app` package resolves. The path is made absolute so a relative
    # __file__ cannot collapse to "", and it is only prepended when it is not
    # already first, so repeated calls (e.g. --compare) do not pile up
    # duplicate sys.path entries.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    if not sys.path or sys.path[0] != project_root:
        sys.path.insert(0, project_root)

    from app.tools import get_all_tools
    from app.tool_search.embedder import get_tool_search

    searcher = get_tool_search()
    all_tools = get_all_tools()
    searcher.index_tools(all_tools)

    results: list[EvalResult] = []
    for query, expected in TEST_CASES:
        candidates = searcher.search(query, top_k=k)
        ranked = [c.name for c in candidates]
        scores = [c.score for c in candidates]

        # 1-based position of the first candidate matching the expected tool;
        # stays None for no-call queries and for misses.
        hit_rank = None
        if expected is not None:
            hit_rank = next(
                (pos for pos, name in enumerate(ranked, start=1) if name == expected),
                None,
            )

        results.append(EvalResult(
            query=query, expected=expected,
            ranked=ranked, scores=scores, hit_rank=hit_rank,
        ))
    return results
def _compute_metrics(results: list[EvalResult], k: int, threshold: float) -> dict:
    """Aggregate a list of results into the evaluation metrics.

    Args:
        results: Per-query results to aggregate.
        k: The top-k the results were produced with; echoed back under "k".
        threshold: A no-call query counts as correct when its top score is
            strictly below this value.

    Returns:
        A dict with the raw counts ("tc_total", "nc_total", "total", "hit1",
        "recall", "nc_correct") and the derived ratios ("tool_acc",
        "recall_at_k", "mrr", "no_call_acc", "overall_acc"). Every ratio is
        0.0 when its denominator is zero.
    """
    def _ratio(numerator, denominator) -> float:
        return numerator / denominator if denominator else 0.0

    # Split the results into the two query families.
    tool_call = []
    no_call = []
    for res in results:
        (no_call if res.is_no_call else tool_call).append(res)

    tc_total, nc_total, total = len(tool_call), len(no_call), len(results)

    hit1 = len([res for res in tool_call if res.hit_rank == 1])
    recall = len([res for res in tool_call if res.hit_rank is not None])
    nc_correct = len([res for res in no_call if res.top_score < threshold])
    rr_sum = sum(_reciprocal_rank(res) for res in tool_call)

    metrics = {"k": k}
    metrics["tc_total"] = tc_total
    metrics["nc_total"] = nc_total
    metrics["total"] = total
    metrics["hit1"] = hit1
    metrics["recall"] = recall
    metrics["nc_correct"] = nc_correct
    metrics["tool_acc"] = _ratio(hit1, tc_total)
    metrics["recall_at_k"] = _ratio(recall, tc_total)
    metrics["mrr"] = _ratio(rr_sum, tc_total)
    metrics["no_call_acc"] = _ratio(nc_correct, nc_total)
    metrics["overall_acc"] = _ratio(hit1 + nc_correct, total)
    return metrics
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # μΆλ ₯ | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _print_single(results: list[EvalResult], k: int, threshold: float,
                  verbose: bool) -> None:
    """Print the detailed report for a single k.

    Prints the metric summary, the tool-call misses, the no-call failures and
    the top-1 score distributions; with ``verbose`` also the per-query tables.

    Args:
        results: Per-query results produced with top-k = ``k``.
        k: The top-k the results were produced with.
        threshold: No-call cutoff; a no-call query fails when its top score
            is greater than or equal to this value.
        verbose: When True, also print every individual result.
    """
    m = _compute_metrics(results, k, threshold)
    sep = "β" * 72
    print(f"\n{'=' * 72}")
    print(f" Tool Search νκ° (k={k}, threshold={threshold})")
    print(f" 쿼리 μ: tool-call {m['tc_total']}κ° + no-call {m['nc_total']}κ° = μ΄ {m['total']}κ°")
    print(f"{'=' * 72}")
    print(f"\n ββ Tool-Call μ§ν ({m['tc_total']}κ° μΏΌλ¦¬) ββ")
    print(f" Tool Acc (Hit@1) : {m['tool_acc']:.1%} ({m['hit1']}/{m['tc_total']})")
    print(f" Recall@{k:<2} : {m['recall_at_k']:.1%} ({m['recall']}/{m['tc_total']})")
    print(f" MRR : {m['mrr']:.4f}")
    print(f"\n ββ No-Call μ§ν ({m['nc_total']}κ° μΏΌλ¦¬, threshold={threshold}) ββ")
    print(f" No-Call Acc : {m['no_call_acc']:.1%} ({m['nc_correct']}/{m['nc_total']})")
    print(f"\n ββ μ’ ν© ββ")
    print(f" Overall Acc : {m['overall_acc']:.1%} ({m['hit1'] + m['nc_correct']}/{m['total']})")
    print(sep)

    # Misses: tool-call queries whose expected tool is absent from the top-k.
    tool_call = [r for r in results if not r.is_no_call]
    misses = [r for r in tool_call if r.hit_rank is None]
    if misses:
        print(f"\n β Tool-Call λ―Έν ({len(misses)}κ°):")
        for r in misses:
            top3 = ", ".join(r.ranked[:3])
            top3_scores = ", ".join(f"{s:.3f}" for s in r.scores[:3])
            print(f" [{r.expected}] '{r.query}'")
            print(f" β top-3: {top3} (scores: {top3_scores})")
    else:
        print(f"\n β λͺ¨λ tool-call μΏΌλ¦¬κ° top-{k} μμ μ λ΅ ν¬ν¨")

    # No-call failures: a tool matched with a score at or above the threshold.
    no_call = [r for r in results if r.is_no_call]
    nc_fails = [r for r in no_call if r.top_score >= threshold]
    if nc_fails:
        print(f"\n β οΈ No-Call μ€ν ({len(nc_fails)}κ° β top-1 score β₯ {threshold}):")
        for r in nc_fails:
            # A result without candidates can still land here when the
            # threshold is <= 0, so do not index an empty list.
            top1 = r.ranked[0] if r.ranked else "-"
            print(f" '{r.query}'")
            print(f" β top-1: {top1} (score={r.top_score:.3f})")
    else:
        print(f"\n β λͺ¨λ no-call μΏΌλ¦¬κ° threshold({threshold}) λ―Έλ§")

    # No-call top-1 score distribution.
    if no_call:
        nc_scores = [r.top_score for r in no_call]
        print(f"\n π No-Call top-1 score λΆν¬:")
        print(f" min={min(nc_scores):.3f} avg={sum(nc_scores)/len(nc_scores):.3f} max={max(nc_scores):.3f}")

    # Tool-call top-1 score distribution. Guard on the collected scores, not
    # on the query list: min()/max() raise on an empty sequence when no
    # tool-call query returned any candidate.
    tc_scores = [r.scores[0] for r in tool_call if r.scores]
    if tc_scores:
        print(f" π Tool-Call top-1 score λΆν¬:")
        print(f" min={min(tc_scores):.3f} avg={sum(tc_scores)/len(tc_scores):.3f} max={max(tc_scores):.3f}")

    if verbose:
        # Pass the active threshold so the per-query marks agree with the
        # summary printed above.
        _print_verbose(results, threshold)
    print()


def _print_verbose(results: list[EvalResult],
                   threshold: float = DEFAULT_NO_CALL_THRESHOLD) -> None:
    """Print one line per query for both tool-call and no-call results.

    Args:
        results: Per-query results to list.
        threshold: No-call cutoff used for the pass/fail mark of no-call
            rows. Defaults to DEFAULT_NO_CALL_THRESHOLD so existing
            one-argument calls keep working.
    """
    tool_call = [r for r in results if not r.is_no_call]
    no_call = [r for r in results if r.is_no_call]

    print(f"\n π Tool-Call μ 체 κ²°κ³Ό:")
    print(f" {'':>2} {'μμ':>4} {'score':>6} {'μ λ΅ λꡬ':<38} 쿼리")
    print(f" {'':>2} {'β'*4} {'β'*6} {'β'*38} {'β'*30}")
    # Best ranks first; misses (no hit rank) sort to the end.
    for r in sorted(tool_call, key=lambda x: x.hit_rank or 9999):
        rank_str = f"#{r.hit_rank}" if r.hit_rank else "miss"
        score_str = f"{r.top_score:.3f}" if r.scores else " - "
        mark = "β " if r.hit_rank and r.hit_rank <= 3 else ("β οΈ" if r.hit_rank else "β")
        print(f" {mark} {rank_str:>4} {score_str:>6} {r.expected:<38} {r.query}")

    print(f"\n π No-Call μ 체 κ²°κ³Ό:")
    print(f" {'':>2} {'score':>6} {'top-1 λꡬ':<38} 쿼리")
    print(f" {'':>2} {'β'*6} {'β'*38} {'β'*30}")
    # Highest top score first, i.e. the riskiest no-call queries on top.
    for r in sorted(no_call, key=lambda x: -x.top_score):
        score_str = f"{r.top_score:.3f}" if r.scores else " - "
        mark = "β " if r.top_score < threshold else "β"
        top1 = r.ranked[0] if r.ranked else "-"
        print(f" {mark} {score_str:>6} {top1:<38} {r.query}")
def _print_compare(ks: list[int], threshold: float) -> None:
    """Run the evaluation for several k values and print a comparison table.

    After the table, prints a per-k summary of tool-call misses and no-call
    failures, then the top-1 score distributions taken from the results of
    the last k in ``ks``.

    Args:
        ks: The top-k values to evaluate, in column order. When empty the
            function returns without printing anything.
        threshold: No-call cutoff; a no-call query fails when its top score
            is greater than or equal to this value.
    """
    if not ks:
        # Nothing to compare; the table below needs at least one column.
        return

    print(f"\n{'=' * 72}")
    print(f" Tool Search λΉκ΅ νκ° (threshold={threshold})")
    print(f"{'=' * 72}")

    results_cache: dict[int, list[EvalResult]] = {}
    metrics_list: list[dict] = []
    for k_val in ks:
        results = _run_search(k_val)
        results_cache[k_val] = results
        metrics_list.append(_compute_metrics(results, k_val, threshold))

    m0 = metrics_list[0]
    print(f"\n 쿼리 μ: tool-call {m0['tc_total']}κ° + no-call {m0['nc_total']}κ° = μ΄ {m0['total']}κ°\n")

    # Comparison table: one row per metric, one column per k.
    k_header = "".join(f"{'k='+str(m['k']):>10}" for m in metrics_list)
    print(f" {'μ§ν':<20}{k_header}")
    print(f" {'β'*20}{'β'*10*len(metrics_list)}")

    def _row(label: str, key: str, fmt: str = ".1%") -> str:
        vals = "".join(f"{format(m[key], fmt):>10}" for m in metrics_list)
        return f" {label:<20}{vals}"

    print(_row("Tool Acc (Hit@1)", "tool_acc"))
    print(_row("Recall@k", "recall_at_k"))
    print(_row("MRR", "mrr", ".4f"))
    print(_row("No-Call Acc", "no_call_acc"))
    print(f" {'β'*20}{'β'*10*len(metrics_list)}")
    print(_row("Overall Acc", "overall_acc"))
    print()

    # Per-k summary of misses and no-call failures.
    for k_val, results in results_cache.items():
        tool_misses = [r for r in results if not r.is_no_call and r.hit_rank is None]
        nc_fails = [r for r in results if r.is_no_call and r.top_score >= threshold]
        if tool_misses or nc_fails:
            print(f" k={k_val}: λ―Έν {len(tool_misses)}건, no-call μ€ν {len(nc_fails)}건")
            for r in tool_misses:
                print(f" β [{r.expected}] '{r.query}' β top-1: {r.ranked[0] if r.ranked else '-'}")
            for r in nc_fails:
                # Same empty-candidate guard as for the misses above.
                top1 = r.ranked[0] if r.ranked else "-"
                print(f" β οΈ '{r.query}' β {top1}({r.top_score:.3f})")

    # Score distributions, based on the last k that was evaluated.
    last_results = results_cache[ks[-1]]
    tc_scores = [r.scores[0] for r in last_results if not r.is_no_call and r.scores]
    nc_scores = [r.top_score for r in last_results if r.is_no_call]
    # Require actual scores on both sides: min()/max() raise on an empty
    # sequence, which happens when no tool-call query returned a candidate.
    if tc_scores and nc_scores:
        print(f"\n π μ μ λΆν¬ (k={ks[-1]} κΈ°μ€):")
        print(f" Tool-Call top-1 : min={min(tc_scores):.3f} avg={sum(tc_scores)/len(tc_scores):.3f} max={max(tc_scores):.3f}")
        print(f" No-Call top-1 : min={min(nc_scores):.3f} avg={sum(nc_scores)/len(nc_scores):.3f} max={max(nc_scores):.3f}")
        gap = min(tc_scores) - max(nc_scores)
        print(f" λΆλ¦¬ λ§μ§ (tool min - no-call max) = {gap:+.3f}")
    print()
# ------------------------------------------------------------
# Entry points
# ------------------------------------------------------------
def run_eval(k: int = 10, verbose: bool = False, threshold: float = DEFAULT_NO_CALL_THRESHOLD) -> None:
    """Run the search once with top-k = ``k`` and print the single-k report.

    Args:
        k: Number of candidates requested per query.
        verbose: When True, also print every individual result.
        threshold: No-call cutoff forwarded to the report.
    """
    _print_single(_run_search(k), k, threshold, verbose)
def _run_card_validation() -> bool:
    """Validate the ToolCards and print the outcome.

    Collects the warnings returned by the two tool_cards validators, prints
    each of them followed by a count, or prints two confirmation lines when
    there are none.

    Returns:
        True when no warnings were reported, False otherwise.
    """
    from app.tool_search.tool_cards import (
        validate_confusion_pairs,
        validate_duplicate_when_to_use,
    )

    banner = "=" * 60
    print(banner)
    print(" ToolCard μ ν©μ± κ²μ¦")
    print(banner)

    problems = validate_confusion_pairs() + validate_duplicate_when_to_use()
    if not problems:
        print(" β νΌλ μ cross-reference μ μ")
        print(" β when_to_use μ€λ³΅ λ°ν μμ\n")
        return True

    for item in problems:
        print(f" β οΈ {item}")
    print(f"\n μ΄ {len(problems)}건 κ²½κ³ \n")
    return False
if __name__ == "__main__":
    # Command-line interface: single-k report by default, comparison table
    # across several k values with --compare.
    cli = argparse.ArgumentParser(description="Tool Search νκ° (Tool Acc + No-Call Acc)")
    cli.add_argument("--k", type=int, default=10, help="top-k (κΈ°λ³Έκ°: 10)")
    cli.add_argument("--verbose", action="store_true", help="μ 체 κ²°κ³Ό μΆλ ₯")
    cli.add_argument(
        "--threshold",
        type=float,
        default=DEFAULT_NO_CALL_THRESHOLD,
        help=f"No-Call νμ μκ³κ° (κΈ°λ³Έκ°: {DEFAULT_NO_CALL_THRESHOLD})",
    )
    cli.add_argument("--compare", action="store_true", help="μ¬λ¬ k μ λν λΉκ΅ν μΆλ ₯")
    cli.add_argument(
        "--ks",
        type=int,
        nargs="+",
        default=[1, 3, 5, 7, 10],
        help="λΉκ΅ν k κ°λ€ (κΈ°λ³Έκ°: 1 3 5 7 10)",
    )
    opts = cli.parse_args()

    # The validation result is only printed; it does not stop the evaluation.
    _run_card_validation()

    if not opts.compare:
        run_eval(k=opts.k, verbose=opts.verbose, threshold=opts.threshold)
    else:
        _print_compare(opts.ks, opts.threshold)