Spaces:
Running
Running
| """ | |
| Step 17: bge-m3 vs bge-m3+HyDE vs +HyDE+Rerank 비교 평가 (Top5) | |
| 세 검색 방식의 Top5 결과를 나란히 비교하는 Excel 출력. | |
| 실행: | |
| python scripts/17_eval_comparison.py | |
| 출력: | |
| data/generated/Menusearch_100_answer.xlsx (비교 시트로 교체) | |
| Excel 컬럼 구조: | |
| A: No | B: 쿼리 | C: 정답 메뉴 | |
| D~H : [bge-m3] Top1~5 | |
| I : [bge-m3] 평가결과 | |
| J : [bge-m3] Top1 유사도 | |
| K~O : [+HyDE] Top1~5 | |
| P : [+HyDE] 평가결과 | |
| Q : [+HyDE] Top1 유사도 | |
| R~V : [+HyDE+Rerank] Top1~5 | |
| W : [+HyDE+Rerank] 평가결과 | |
| X : [+HyDE+Rerank] Top1 유사도 | |
| Y : 변화 (+HyDE vs +HyDE+Rerank) | |
| """ | |
| import sys | |
| import re | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| import pandas as pd | |
| from openpyxl import load_workbook | |
| from openpyxl.styles import PatternFill, Font, Alignment, Border, Side | |
| from core.search_engine import MenuSearchEngine | |
| EXCEL_PATH = Path(__file__).parent.parent / "data" / "generated" / "Menusearch_100_answer.xlsx" | |
| # 결과 우선순위 (높을수록 좋음) | |
| RANK_MAP = {"Top1 정답": 3, "Top3 정답": 2, "Top5 정답": 1, "오답": 0, "평가제외": -1} | |
| # --------------------------------------------------------------------------- | |
| # 정답 비교 유틸 (16_eval_with_answer.py 와 동일) | |
| # --------------------------------------------------------------------------- | |
| def normalize_path(path: str) -> str: | |
| if not path: | |
| return "" | |
| path = re.sub(r'\s*>\s*', '>', path) | |
| path = re.sub(r'\(.*?\)', '', path) | |
| return path.strip().lower() | |
| def matches(pred: str, ans: str) -> bool: | |
| if not pred or not ans: | |
| return False | |
| if pred == ans: | |
| return True | |
| pred_end = pred.split(">")[-1].strip() | |
| ans_end = ans.split(">")[-1].strip() | |
| if pred_end and ans_end and pred_end == ans_end: | |
| pred_parts = pred.split(">") | |
| ans_parts = ans.split(">") | |
| overlap = sum(1 for p in pred_parts if p in ans_parts) | |
| return overlap >= min(2, len(ans_parts)) | |
| return False | |
| def evaluate(answer_cell, top_paths: list) -> tuple: | |
| if not answer_cell or pd.isna(answer_cell): | |
| return None, None, None | |
| ans_lower = str(answer_cell).lower() | |
| first_line = ans_lower.split("\n")[0] | |
| if ">" not in first_line and any( | |
| first_line.startswith(m) for m in ["hts", "계좌개설", "마이페이지"] | |
| ): | |
| return None, None, None | |
| valid_answers = [normalize_path(a) for a in str(answer_cell).split("\n") if a.strip()] | |
| norm_top = [normalize_path(p) for p in top_paths if p] | |
| def hit_at(k): | |
| return any(matches(p, a) for p in norm_top[:k] for a in valid_answers) | |
| return hit_at(1), hit_at(3), hit_at(5) | |
| def label_from_eval(a1, a3, a5) -> str: | |
| if a1 is None: | |
| return "평가제외" | |
| if a1: | |
| return "Top1 정답" | |
| if a3: | |
| return "Top3 정답" | |
| if a5: | |
| return "Top5 정답" | |
| return "오답" | |
| # --------------------------------------------------------------------------- | |
| # 검색 실행 (단일 모드) | |
| # --------------------------------------------------------------------------- | |
| def run_search(engine, df_src: pd.DataFrame, use_hyde: bool, use_reranker: bool = False) -> list: | |
| if use_reranker: | |
| mode = "+HyDE+Rerank" if use_hyde else "+Rerank" | |
| else: | |
| mode = "+HyDE" if use_hyde else "bge-m3" | |
| print(f"\n[검색] {mode} 모드 시작...") | |
| results = [] | |
| for _, row in df_src.iterrows(): | |
| query = str(row["쿼리"]).strip() | |
| answer_raw = row["정답 메뉴"] if pd.notna(row.get("정답 메뉴")) else None | |
| hits = engine.search(query, top_n=5, threshold=0.0, use_hyde=use_hyde, use_reranker=use_reranker) | |
| paths = [h.get("menu_path", "") for h in hits] | |
| while len(paths) < 5: | |
| paths.append("") | |
| sims = [round(h["similarity"], 4) for h in hits] | |
| while len(sims) < 5: | |
| sims.append(0.0) | |
| a1, a3, a5 = evaluate(answer_raw, paths) | |
| lbl = label_from_eval(a1, a3, a5) | |
| status = "[O]" if a1 else ("[ ]" if a1 is None else "[X]") | |
| print(f" {status} [{int(row['No']):3d}] {query[:28]:<28} -> {paths[0][:40]}") | |
| results.append({ | |
| "paths": paths, | |
| "sims": sims, | |
| "label": lbl, | |
| "a1": a1, "a3": a3, "a5": a5, | |
| }) | |
| return results | |
| # --------------------------------------------------------------------------- | |
| # 메인 | |
| # --------------------------------------------------------------------------- | |
| def main(): | |
| print("[17_eval] bge-m3 vs bge-m3+HyDE 비교 평가 시작") | |
| # ── 1. 원본 쿼리/정답 로드 ────────────────────────────────────────────── | |
| df_src = pd.read_excel(EXCEL_PATH, sheet_name="시트1", usecols=["No", "쿼리", "정답 메뉴"]) | |
| df_src = df_src[pd.to_numeric(df_src["No"], errors="coerce").notna()].reset_index(drop=True) | |
| print(f"[17_eval] {len(df_src)}개 쿼리 로드 완료") | |
| # ── 2. 검색엔진 초기화 ────────────────────────────────────────────────── | |
| print("[17_eval] 검색엔진 초기화 중...") | |
| engine = MenuSearchEngine.get_instance() | |
| print("[17_eval] 검색엔진 준비 완료") | |
| # ── 3. 세 모드 검색 실행 ──────────────────────────────────────────────── | |
| base_results = run_search(engine, df_src, use_hyde=False, use_reranker=False) | |
| hyde_results = run_search(engine, df_src, use_hyde=True, use_reranker=False) | |
| rerank_results = run_search(engine, df_src, use_hyde=True, use_reranker=True) | |
| # ── 4. 통계 계산 ──────────────────────────────────────────────────────── | |
| def calc_stats(results): | |
| a1_list = [r["a1"] for r in results] | |
| a3_list = [r["a3"] for r in results] | |
| a5_list = [r["a5"] for r in results] | |
| valid = sum(a is not None for a in a1_list) | |
| skip = len(results) - valid | |
| n1 = sum(a for a in a1_list if a is not None) | |
| n3 = sum(a for a in a3_list if a is not None) | |
| n5 = sum(a for a in a5_list if a is not None) | |
| r1 = n1 / valid if valid else 0 | |
| r3 = n3 / valid if valid else 0 | |
| r5 = n5 / valid if valid else 0 | |
| return valid, skip, n1, n3, n5, r1, r3, r5 | |
| b_valid, b_skip, b_n1, b_n3, b_n5, b_r1, b_r3, b_r5 = calc_stats(base_results) | |
| h_valid, h_skip, h_n1, h_n3, h_n5, h_r1, h_r3, h_r5 = calc_stats(hyde_results) | |
| rr_valid, rr_skip, rr_n1, rr_n3, rr_n5, rr_r1, rr_r3, rr_r5 = calc_stats(rerank_results) | |
| print("\n" + "=" * 80) | |
| print("== 비교 결과 요약 ==") | |
| print("=" * 80) | |
| print(f"{'':20s} {'bge-m3':>12s} {'+HyDE':>12s} {'+HyDE+Rerank':>14s}") | |
| print(f" {'Acc@1':<18s} {b_n1}/{b_valid}={b_r1:.1%} {h_n1}/{h_valid}={h_r1:.1%} {rr_n1}/{rr_valid}={rr_r1:.1%}") | |
| print(f" {'Acc@3':<18s} {b_n3}/{b_valid}={b_r3:.1%} {h_n3}/{h_valid}={h_r3:.1%} {rr_n3}/{rr_valid}={rr_r3:.1%}") | |
| print(f" {'Acc@5':<18s} {b_n5}/{b_valid}={b_r5:.1%} {h_n5}/{h_valid}={h_r5:.1%} {rr_n5}/{rr_valid}={rr_r5:.1%}") | |
| print("=" * 80) | |
| # ── 5. DataFrame 구성 ─────────────────────────────────────────────────── | |
| rows = [] | |
| for i, (_, src_row) in enumerate(df_src.iterrows()): | |
| br = base_results[i] | |
| hr = hyde_results[i] | |
| rr = rerank_results[i] | |
| hyde_rank = RANK_MAP.get(hr["label"], -1) | |
| rerank_rank = RANK_MAP.get(rr["label"], -1) | |
| if hr["label"] == "평가제외": | |
| change = "제외" | |
| elif rerank_rank > hyde_rank: | |
| change = "개선" | |
| elif rerank_rank < hyde_rank: | |
| change = "하락" | |
| else: | |
| change = "동일" | |
| rows.append({ | |
| "No": src_row["No"], | |
| "쿼리": str(src_row["쿼리"]).strip(), | |
| "정답 메뉴": src_row["정답 메뉴"] if pd.notna(src_row.get("정답 메뉴")) else "", | |
| # bge-m3 | |
| "bge Top1": br["paths"][0], "bge Top2": br["paths"][1], "bge Top3": br["paths"][2], | |
| "bge Top4": br["paths"][3], "bge Top5": br["paths"][4], | |
| "bge 결과": br["label"], "bge 유사도": br["sims"][0], | |
| # +HyDE | |
| "hyde Top1": hr["paths"][0], "hyde Top2": hr["paths"][1], "hyde Top3": hr["paths"][2], | |
| "hyde Top4": hr["paths"][3], "hyde Top5": hr["paths"][4], | |
| "hyde 결과": hr["label"], "hyde 유사도": hr["sims"][0], | |
| # +HyDE+Rerank | |
| "rr Top1": rr["paths"][0], "rr Top2": rr["paths"][1], "rr Top3": rr["paths"][2], | |
| "rr Top4": rr["paths"][3], "rr Top5": rr["paths"][4], | |
| "rr 결과": rr["label"], "rr 유사도": rr["sims"][0], | |
| # 변화 (+HyDE vs +HyDE+Rerank) | |
| "변화": change, | |
| }) | |
| df_out = pd.DataFrame(rows) | |
| df_out.columns = [ | |
| "No", "쿼리", "정답 메뉴", | |
| "[bge-m3] Top1", "[bge-m3] Top2", "[bge-m3] Top3", "[bge-m3] Top4", "[bge-m3] Top5", | |
| "[bge-m3] 결과", "[bge-m3] 유사도", | |
| "[+HyDE] Top1", "[+HyDE] Top2", "[+HyDE] Top3", "[+HyDE] Top4", "[+HyDE] Top5", | |
| "[+HyDE] 결과", "[+HyDE] 유사도", | |
| "[+HyDE+Rerank] Top1", "[+HyDE+Rerank] Top2", "[+HyDE+Rerank] Top3", | |
| "[+HyDE+Rerank] Top4", "[+HyDE+Rerank] Top5", | |
| "[+HyDE+Rerank] 결과", "[+HyDE+Rerank] 유사도", | |
| "변화", | |
| ] | |
| df_out.to_excel(EXCEL_PATH, sheet_name="시트1", index=False) | |
| # ── 6. Excel 스타일링 ──────────────────────────────────────────────────── | |
| wb = load_workbook(EXCEL_PATH) | |
| ws = wb["시트1"] | |
| # 열 너비 (A~Y, 25컬럼) | |
| col_widths = { | |
| "A": 5, "B": 34, "C": 34, | |
| "D": 28, "E": 28, "F": 28, "G": 28, "H": 28, "I": 12, "J": 11, | |
| "K": 28, "L": 28, "M": 28, "N": 28, "O": 28, "P": 12, "Q": 11, | |
| "R": 28, "S": 28, "T": 28, "U": 28, "V": 28, "W": 12, "X": 11, | |
| "Y": 9, | |
| } | |
| for col, w in col_widths.items(): | |
| ws.column_dimensions[col].width = w | |
| ws.row_dimensions[1].height = 28 | |
| HDR_BASE = "1F4E79" # bge-m3 (진파랑) | |
| HDR_HYDE = "1A5276" # +HyDE (중간파랑) | |
| HDR_RERANK = "145A32" # +Rerank (진초록) | |
| HDR_ETC = "2C3E50" # 공통 (차콜) | |
| FILL_MAP = { | |
| "Top1 정답": PatternFill("solid", start_color="C6EFCE"), | |
| "Top3 정답": PatternFill("solid", start_color="FFEB9C"), | |
| "Top5 정답": PatternFill("solid", start_color="FCE4D6"), | |
| "오답": PatternFill("solid", start_color="FFC7CE"), | |
| "평가제외": PatternFill("solid", start_color="F2F2F2"), | |
| } | |
| CHANGE_FILL = { | |
| "개선": PatternFill("solid", start_color="ABEBC6"), | |
| "하락": PatternFill("solid", start_color="F1948A"), | |
| "동일": PatternFill("solid", start_color="EBF5FB"), | |
| "제외": PatternFill("solid", start_color="F2F2F2"), | |
| } | |
| CHANGE_FC = {"개선": "1E8449", "하락": "922B21", "동일": "1A5276", "제외": "7F7F7F"} | |
| # 헤더 | |
| for cell in ws[1]: | |
| cl = cell.column_letter | |
| if cl in ("A", "B", "C", "Y"): | |
| bg = HDR_ETC | |
| elif cl in ("D", "E", "F", "G", "H", "I", "J"): | |
| bg = HDR_BASE | |
| elif cl in ("K", "L", "M", "N", "O", "P", "Q"): | |
| bg = HDR_HYDE | |
| else: | |
| bg = HDR_RERANK | |
| cell.fill = PatternFill("solid", start_color=bg) | |
| cell.font = Font(bold=True, color="FFFFFF", name="맑은 고딕", size=9) | |
| cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True) | |
| # 데이터 행 | |
| for row in ws.iter_rows(min_row=2, max_row=ws.max_row): | |
| rc = {cell.column_letter: cell for cell in row} | |
| bge_lbl = rc.get("I").value if rc.get("I") else "" | |
| hyde_lbl = rc.get("P").value if rc.get("P") else "" | |
| rerank_lbl = rc.get("W").value if rc.get("W") else "" | |
| change_val = rc.get("Y").value if rc.get("Y") else "" | |
| for cell in row: | |
| cell.font = Font(name="맑은 고딕", size=8) | |
| cell.alignment = Alignment(wrap_text=True, vertical="center") | |
| for col_letter, lbl in [("I", bge_lbl), ("P", hyde_lbl), ("W", rerank_lbl)]: | |
| if rc.get(col_letter) and lbl: | |
| rc[col_letter].fill = FILL_MAP.get(lbl, FILL_MAP["평가제외"]) | |
| rc[col_letter].font = Font(name="맑은 고딕", size=8, bold=True) | |
| rc[col_letter].alignment = Alignment(horizontal="center", vertical="center") | |
| for col_letter in ("J", "Q", "X"): | |
| if rc.get(col_letter): | |
| rc[col_letter].alignment = Alignment(horizontal="center", vertical="center") | |
| if rc.get("Y") and change_val: | |
| rc["Y"].fill = CHANGE_FILL.get(change_val, CHANGE_FILL["동일"]) | |
| rc["Y"].font = Font(name="맑은 고딕", size=8, bold=True, | |
| color=CHANGE_FC.get(change_val, "000000")) | |
| rc["Y"].alignment = Alignment(horizontal="center", vertical="center") | |
| if rc.get("A"): | |
| rc["A"].alignment = Alignment(horizontal="center", vertical="center") | |
| # ── 요약 행 ───────────────────────────────────────────────────────────── | |
| ws.append([]) | |
| ws.append([ | |
| "", "【요약】", | |
| f"유효 {b_valid}개 / 제외 {b_skip}개", | |
| f"Acc@1: {b_r1:.1%}", f"Acc@3: {b_r3:.1%}", f"Acc@5: {b_r5:.1%}", | |
| "", "", "", "", | |
| f"Acc@1: {h_r1:.1%}", f"Acc@3: {h_r3:.1%}", f"Acc@5: {h_r5:.1%}", | |
| "", "", "", "", | |
| f"Acc@1: {rr_r1:.1%}", f"Acc@3: {rr_r3:.1%}", f"Acc@5: {rr_r5:.1%}", | |
| "", "", "", "", | |
| "", | |
| ]) | |
| sum_fill = PatternFill("solid", start_color="D9E1F2") | |
| for cell in ws[ws.max_row]: | |
| cell.fill = sum_fill | |
| cell.font = Font(bold=True, name="맑은 고딕", size=10) | |
| cell.alignment = Alignment(horizontal="center", vertical="center") | |
| # 개선 델타 행 (vs bge-m3 baseline) | |
| ws.append([ | |
| "", "【개선폭 vs bge-m3】", "", | |
| "-", "-", "-", "", "", "", "", | |
| f"+{h_r1 - b_r1:.1%}", f"+{h_r3 - b_r3:.1%}", f"+{h_r5 - b_r5:.1%}", | |
| "", "", "", "", | |
| f"+{rr_r1 - b_r1:.1%}", f"+{rr_r3 - b_r3:.1%}", f"+{rr_r5 - b_r5:.1%}", | |
| "", "", "", "", | |
| "", | |
| ]) | |
| delta_fill = PatternFill("solid", start_color="E2EFDA") | |
| for cell in ws[ws.max_row]: | |
| cell.fill = delta_fill | |
| cell.font = Font(bold=True, name="맑은 고딕", size=10, color="375623") | |
| cell.alignment = Alignment(horizontal="center", vertical="center") | |
| wb.save(EXCEL_PATH) | |
| print(f"\n[17_eval] 저장 완료: {EXCEL_PATH}") | |
| print(f" bge-m3: Acc@1={b_r1:.1%} Acc@3={b_r3:.1%} Acc@5={b_r5:.1%}") | |
| print(f" +HyDE: Acc@1={h_r1:.1%} Acc@3={h_r3:.1%} Acc@5={h_r5:.1%}") | |
| print(f" +HyDE+Rerank: Acc@1={rr_r1:.1%} Acc@3={rr_r3:.1%} Acc@5={rr_r5:.1%}") | |
| print(f" 최종 개선폭: Acc@1 +{rr_r1-b_r1:.1%} Acc@3 +{rr_r3-b_r3:.1%} Acc@5 +{rr_r5-b_r5:.1%}") | |
| if __name__ == "__main__": | |
| main() | |