""" Step 17: bge-m3 vs bge-m3+HyDE vs +HyDE+Rerank 비교 평가 (Top5) 세 검색 방식의 Top5 결과를 나란히 비교하는 Excel 출력. 실행: python scripts/17_eval_comparison.py 출력: data/generated/Menusearch_100_answer.xlsx (비교 시트로 교체) Excel 컬럼 구조: A: No | B: 쿼리 | C: 정답 메뉴 D~H : [bge-m3] Top1~5 I : [bge-m3] 평가결과 J : [bge-m3] Top1 유사도 K~O : [+HyDE] Top1~5 P : [+HyDE] 평가결과 Q : [+HyDE] Top1 유사도 R~V : [+HyDE+Rerank] Top1~5 W : [+HyDE+Rerank] 평가결과 X : [+HyDE+Rerank] Top1 유사도 Y : 변화 (+HyDE vs +HyDE+Rerank) """ import sys import re from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent)) import pandas as pd from openpyxl import load_workbook from openpyxl.styles import PatternFill, Font, Alignment, Border, Side from core.search_engine import MenuSearchEngine EXCEL_PATH = Path(__file__).parent.parent / "data" / "generated" / "Menusearch_100_answer.xlsx" # 결과 우선순위 (높을수록 좋음) RANK_MAP = {"Top1 정답": 3, "Top3 정답": 2, "Top5 정답": 1, "오답": 0, "평가제외": -1} # --------------------------------------------------------------------------- # 정답 비교 유틸 (16_eval_with_answer.py 와 동일) # --------------------------------------------------------------------------- def normalize_path(path: str) -> str: if not path: return "" path = re.sub(r'\s*>\s*', '>', path) path = re.sub(r'\(.*?\)', '', path) return path.strip().lower() def matches(pred: str, ans: str) -> bool: if not pred or not ans: return False if pred == ans: return True pred_end = pred.split(">")[-1].strip() ans_end = ans.split(">")[-1].strip() if pred_end and ans_end and pred_end == ans_end: pred_parts = pred.split(">") ans_parts = ans.split(">") overlap = sum(1 for p in pred_parts if p in ans_parts) return overlap >= min(2, len(ans_parts)) return False def evaluate(answer_cell, top_paths: list) -> tuple: if not answer_cell or pd.isna(answer_cell): return None, None, None ans_lower = str(answer_cell).lower() first_line = ans_lower.split("\n")[0] if ">" not in first_line and any( first_line.startswith(m) for m in ["hts", "계좌개설", "마이페이지"] ): return None, None, None valid_answers = [normalize_path(a) for a in str(answer_cell).split("\n") if a.strip()] norm_top = [normalize_path(p) for p in top_paths if p] def hit_at(k): return any(matches(p, a) for p in norm_top[:k] for a in valid_answers) return hit_at(1), hit_at(3), hit_at(5) def label_from_eval(a1, a3, a5) -> str: if a1 is None: return "평가제외" if a1: return "Top1 정답" if a3: return "Top3 정답" if a5: return "Top5 정답" return "오답" # --------------------------------------------------------------------------- # 검색 실행 (단일 모드) # --------------------------------------------------------------------------- def run_search(engine, df_src: pd.DataFrame, use_hyde: bool, use_reranker: bool = False) -> list: if use_reranker: mode = "+HyDE+Rerank" if use_hyde else "+Rerank" else: mode = "+HyDE" if use_hyde else "bge-m3" print(f"\n[검색] {mode} 모드 시작...") results = [] for _, row in df_src.iterrows(): query = str(row["쿼리"]).strip() answer_raw = row["정답 메뉴"] if pd.notna(row.get("정답 메뉴")) else None hits = engine.search(query, top_n=5, threshold=0.0, use_hyde=use_hyde, use_reranker=use_reranker) paths = [h.get("menu_path", "") for h in hits] while len(paths) < 5: paths.append("") sims = [round(h["similarity"], 4) for h in hits] while len(sims) < 5: sims.append(0.0) a1, a3, a5 = evaluate(answer_raw, paths) lbl = label_from_eval(a1, a3, a5) status = "[O]" if a1 else ("[ ]" if a1 is None else "[X]") print(f" {status} [{int(row['No']):3d}] {query[:28]:<28} -> {paths[0][:40]}") results.append({ "paths": paths, "sims": sims, "label": lbl, "a1": a1, "a3": a3, "a5": a5, }) return results # --------------------------------------------------------------------------- # 메인 # --------------------------------------------------------------------------- def main(): print("[17_eval] bge-m3 vs bge-m3+HyDE 비교 평가 시작") # ── 1. 원본 쿼리/정답 로드 ────────────────────────────────────────────── df_src = pd.read_excel(EXCEL_PATH, sheet_name="시트1", usecols=["No", "쿼리", "정답 메뉴"]) df_src = df_src[pd.to_numeric(df_src["No"], errors="coerce").notna()].reset_index(drop=True) print(f"[17_eval] {len(df_src)}개 쿼리 로드 완료") # ── 2. 검색엔진 초기화 ────────────────────────────────────────────────── print("[17_eval] 검색엔진 초기화 중...") engine = MenuSearchEngine.get_instance() print("[17_eval] 검색엔진 준비 완료") # ── 3. 세 모드 검색 실행 ──────────────────────────────────────────────── base_results = run_search(engine, df_src, use_hyde=False, use_reranker=False) hyde_results = run_search(engine, df_src, use_hyde=True, use_reranker=False) rerank_results = run_search(engine, df_src, use_hyde=True, use_reranker=True) # ── 4. 통계 계산 ──────────────────────────────────────────────────────── def calc_stats(results): a1_list = [r["a1"] for r in results] a3_list = [r["a3"] for r in results] a5_list = [r["a5"] for r in results] valid = sum(a is not None for a in a1_list) skip = len(results) - valid n1 = sum(a for a in a1_list if a is not None) n3 = sum(a for a in a3_list if a is not None) n5 = sum(a for a in a5_list if a is not None) r1 = n1 / valid if valid else 0 r3 = n3 / valid if valid else 0 r5 = n5 / valid if valid else 0 return valid, skip, n1, n3, n5, r1, r3, r5 b_valid, b_skip, b_n1, b_n3, b_n5, b_r1, b_r3, b_r5 = calc_stats(base_results) h_valid, h_skip, h_n1, h_n3, h_n5, h_r1, h_r3, h_r5 = calc_stats(hyde_results) rr_valid, rr_skip, rr_n1, rr_n3, rr_n5, rr_r1, rr_r3, rr_r5 = calc_stats(rerank_results) print("\n" + "=" * 80) print("== 비교 결과 요약 ==") print("=" * 80) print(f"{'':20s} {'bge-m3':>12s} {'+HyDE':>12s} {'+HyDE+Rerank':>14s}") print(f" {'Acc@1':<18s} {b_n1}/{b_valid}={b_r1:.1%} {h_n1}/{h_valid}={h_r1:.1%} {rr_n1}/{rr_valid}={rr_r1:.1%}") print(f" {'Acc@3':<18s} {b_n3}/{b_valid}={b_r3:.1%} {h_n3}/{h_valid}={h_r3:.1%} {rr_n3}/{rr_valid}={rr_r3:.1%}") print(f" {'Acc@5':<18s} {b_n5}/{b_valid}={b_r5:.1%} {h_n5}/{h_valid}={h_r5:.1%} {rr_n5}/{rr_valid}={rr_r5:.1%}") print("=" * 80) # ── 5. DataFrame 구성 ─────────────────────────────────────────────────── rows = [] for i, (_, src_row) in enumerate(df_src.iterrows()): br = base_results[i] hr = hyde_results[i] rr = rerank_results[i] hyde_rank = RANK_MAP.get(hr["label"], -1) rerank_rank = RANK_MAP.get(rr["label"], -1) if hr["label"] == "평가제외": change = "제외" elif rerank_rank > hyde_rank: change = "개선" elif rerank_rank < hyde_rank: change = "하락" else: change = "동일" rows.append({ "No": src_row["No"], "쿼리": str(src_row["쿼리"]).strip(), "정답 메뉴": src_row["정답 메뉴"] if pd.notna(src_row.get("정답 메뉴")) else "", # bge-m3 "bge Top1": br["paths"][0], "bge Top2": br["paths"][1], "bge Top3": br["paths"][2], "bge Top4": br["paths"][3], "bge Top5": br["paths"][4], "bge 결과": br["label"], "bge 유사도": br["sims"][0], # +HyDE "hyde Top1": hr["paths"][0], "hyde Top2": hr["paths"][1], "hyde Top3": hr["paths"][2], "hyde Top4": hr["paths"][3], "hyde Top5": hr["paths"][4], "hyde 결과": hr["label"], "hyde 유사도": hr["sims"][0], # +HyDE+Rerank "rr Top1": rr["paths"][0], "rr Top2": rr["paths"][1], "rr Top3": rr["paths"][2], "rr Top4": rr["paths"][3], "rr Top5": rr["paths"][4], "rr 결과": rr["label"], "rr 유사도": rr["sims"][0], # 변화 (+HyDE vs +HyDE+Rerank) "변화": change, }) df_out = pd.DataFrame(rows) df_out.columns = [ "No", "쿼리", "정답 메뉴", "[bge-m3] Top1", "[bge-m3] Top2", "[bge-m3] Top3", "[bge-m3] Top4", "[bge-m3] Top5", "[bge-m3] 결과", "[bge-m3] 유사도", "[+HyDE] Top1", "[+HyDE] Top2", "[+HyDE] Top3", "[+HyDE] Top4", "[+HyDE] Top5", "[+HyDE] 결과", "[+HyDE] 유사도", "[+HyDE+Rerank] Top1", "[+HyDE+Rerank] Top2", "[+HyDE+Rerank] Top3", "[+HyDE+Rerank] Top4", "[+HyDE+Rerank] Top5", "[+HyDE+Rerank] 결과", "[+HyDE+Rerank] 유사도", "변화", ] df_out.to_excel(EXCEL_PATH, sheet_name="시트1", index=False) # ── 6. Excel 스타일링 ──────────────────────────────────────────────────── wb = load_workbook(EXCEL_PATH) ws = wb["시트1"] # 열 너비 (A~Y, 25컬럼) col_widths = { "A": 5, "B": 34, "C": 34, "D": 28, "E": 28, "F": 28, "G": 28, "H": 28, "I": 12, "J": 11, "K": 28, "L": 28, "M": 28, "N": 28, "O": 28, "P": 12, "Q": 11, "R": 28, "S": 28, "T": 28, "U": 28, "V": 28, "W": 12, "X": 11, "Y": 9, } for col, w in col_widths.items(): ws.column_dimensions[col].width = w ws.row_dimensions[1].height = 28 HDR_BASE = "1F4E79" # bge-m3 (진파랑) HDR_HYDE = "1A5276" # +HyDE (중간파랑) HDR_RERANK = "145A32" # +Rerank (진초록) HDR_ETC = "2C3E50" # 공통 (차콜) FILL_MAP = { "Top1 정답": PatternFill("solid", start_color="C6EFCE"), "Top3 정답": PatternFill("solid", start_color="FFEB9C"), "Top5 정답": PatternFill("solid", start_color="FCE4D6"), "오답": PatternFill("solid", start_color="FFC7CE"), "평가제외": PatternFill("solid", start_color="F2F2F2"), } CHANGE_FILL = { "개선": PatternFill("solid", start_color="ABEBC6"), "하락": PatternFill("solid", start_color="F1948A"), "동일": PatternFill("solid", start_color="EBF5FB"), "제외": PatternFill("solid", start_color="F2F2F2"), } CHANGE_FC = {"개선": "1E8449", "하락": "922B21", "동일": "1A5276", "제외": "7F7F7F"} # 헤더 for cell in ws[1]: cl = cell.column_letter if cl in ("A", "B", "C", "Y"): bg = HDR_ETC elif cl in ("D", "E", "F", "G", "H", "I", "J"): bg = HDR_BASE elif cl in ("K", "L", "M", "N", "O", "P", "Q"): bg = HDR_HYDE else: bg = HDR_RERANK cell.fill = PatternFill("solid", start_color=bg) cell.font = Font(bold=True, color="FFFFFF", name="맑은 고딕", size=9) cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True) # 데이터 행 for row in ws.iter_rows(min_row=2, max_row=ws.max_row): rc = {cell.column_letter: cell for cell in row} bge_lbl = rc.get("I").value if rc.get("I") else "" hyde_lbl = rc.get("P").value if rc.get("P") else "" rerank_lbl = rc.get("W").value if rc.get("W") else "" change_val = rc.get("Y").value if rc.get("Y") else "" for cell in row: cell.font = Font(name="맑은 고딕", size=8) cell.alignment = Alignment(wrap_text=True, vertical="center") for col_letter, lbl in [("I", bge_lbl), ("P", hyde_lbl), ("W", rerank_lbl)]: if rc.get(col_letter) and lbl: rc[col_letter].fill = FILL_MAP.get(lbl, FILL_MAP["평가제외"]) rc[col_letter].font = Font(name="맑은 고딕", size=8, bold=True) rc[col_letter].alignment = Alignment(horizontal="center", vertical="center") for col_letter in ("J", "Q", "X"): if rc.get(col_letter): rc[col_letter].alignment = Alignment(horizontal="center", vertical="center") if rc.get("Y") and change_val: rc["Y"].fill = CHANGE_FILL.get(change_val, CHANGE_FILL["동일"]) rc["Y"].font = Font(name="맑은 고딕", size=8, bold=True, color=CHANGE_FC.get(change_val, "000000")) rc["Y"].alignment = Alignment(horizontal="center", vertical="center") if rc.get("A"): rc["A"].alignment = Alignment(horizontal="center", vertical="center") # ── 요약 행 ───────────────────────────────────────────────────────────── ws.append([]) ws.append([ "", "【요약】", f"유효 {b_valid}개 / 제외 {b_skip}개", f"Acc@1: {b_r1:.1%}", f"Acc@3: {b_r3:.1%}", f"Acc@5: {b_r5:.1%}", "", "", "", "", f"Acc@1: {h_r1:.1%}", f"Acc@3: {h_r3:.1%}", f"Acc@5: {h_r5:.1%}", "", "", "", "", f"Acc@1: {rr_r1:.1%}", f"Acc@3: {rr_r3:.1%}", f"Acc@5: {rr_r5:.1%}", "", "", "", "", "", ]) sum_fill = PatternFill("solid", start_color="D9E1F2") for cell in ws[ws.max_row]: cell.fill = sum_fill cell.font = Font(bold=True, name="맑은 고딕", size=10) cell.alignment = Alignment(horizontal="center", vertical="center") # 개선 델타 행 (vs bge-m3 baseline) ws.append([ "", "【개선폭 vs bge-m3】", "", "-", "-", "-", "", "", "", "", f"+{h_r1 - b_r1:.1%}", f"+{h_r3 - b_r3:.1%}", f"+{h_r5 - b_r5:.1%}", "", "", "", "", f"+{rr_r1 - b_r1:.1%}", f"+{rr_r3 - b_r3:.1%}", f"+{rr_r5 - b_r5:.1%}", "", "", "", "", "", ]) delta_fill = PatternFill("solid", start_color="E2EFDA") for cell in ws[ws.max_row]: cell.fill = delta_fill cell.font = Font(bold=True, name="맑은 고딕", size=10, color="375623") cell.alignment = Alignment(horizontal="center", vertical="center") wb.save(EXCEL_PATH) print(f"\n[17_eval] 저장 완료: {EXCEL_PATH}") print(f" bge-m3: Acc@1={b_r1:.1%} Acc@3={b_r3:.1%} Acc@5={b_r5:.1%}") print(f" +HyDE: Acc@1={h_r1:.1%} Acc@3={h_r3:.1%} Acc@5={h_r5:.1%}") print(f" +HyDE+Rerank: Acc@1={rr_r1:.1%} Acc@3={rr_r3:.1%} Acc@5={rr_r5:.1%}") print(f" 최종 개선폭: Acc@1 +{rr_r1-b_r1:.1%} Acc@3 +{rr_r3-b_r3:.1%} Acc@5 +{rr_r5-b_r5:.1%}") if __name__ == "__main__": main()