# Source: AI_Menu_Search / scripts / 17_eval_comparison.py (uploaded by Juhaha)
# Commit fbd1091: HF Spaces demo deployment (Streamlit + embedded Qdrant, index generated at build time)
# File size: 15.9 kB
"""
Step 17: bge-m3 vs bge-m3+HyDE vs +HyDE+Rerank 비교 평가 (Top5)
세 검색 방식의 Top5 결과를 나란히 비교하는 Excel 출력.
실행:
python scripts/17_eval_comparison.py
출력:
data/generated/Menusearch_100_answer.xlsx (비교 시트로 교체)
Excel 컬럼 구조:
A: No | B: 쿼리 | C: 정답 메뉴
D~H : [bge-m3] Top1~5
I : [bge-m3] 평가결과
J : [bge-m3] Top1 유사도
K~O : [+HyDE] Top1~5
P : [+HyDE] 평가결과
Q : [+HyDE] Top1 유사도
R~V : [+HyDE+Rerank] Top1~5
W : [+HyDE+Rerank] 평가결과
X : [+HyDE+Rerank] Top1 유사도
Y : 변화 (+HyDE vs +HyDE+Rerank)
"""
import sys
import re
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import PatternFill, Font, Alignment, Border, Side
from core.search_engine import MenuSearchEngine
# Location of the evaluation workbook, resolved relative to the directory
# two levels above this script.
EXCEL_PATH = Path(__file__).parent.parent.joinpath(
    "data", "generated", "Menusearch_100_answer.xlsx"
)

# Ordinal score for each evaluation label (higher is better).
RANK_MAP = {
    "Top1 정답": 3,
    "Top3 정답": 2,
    "Top5 정답": 1,
    "오답": 0,
    "평가제외": -1,
}
# ---------------------------------------------------------------------------
# 정답 비교 유틸 (16_eval_with_answer.py 와 동일)
# ---------------------------------------------------------------------------
def normalize_path(path: str) -> str:
    """Canonicalise a menu path so two spellings of it compare equal.

    Parenthesised annotations are removed, whitespace around every ``>``
    separator is collapsed, and the result is trimmed and lower-cased.

    Args:
        path: Menu path such as ``"A > B (note)"``. Falsy values
            (``""``, ``None``) are accepted.

    Returns:
        The normalised path, or ``""`` when ``path`` is falsy.
    """
    if not path:
        return ""
    # Strip "(...)" annotations BEFORE collapsing separators. In the opposite
    # order the blank that preceded a mid-path annotation survives
    # ("a (x) > b" -> "a >b") and the segment no longer compares equal.
    # NOTE(review): the section header says these helpers mirror
    # 16_eval_with_answer.py -- apply the same ordering there if that holds.
    path = re.sub(r'\(.*?\)', '', path)
    path = re.sub(r'\s*>\s*', '>', path)
    return path.strip().lower()
def matches(pred: str, ans: str) -> bool:
    """Decide whether a predicted menu path counts as the given answer.

    Two paths match when they are identical, or when their last ``>``
    segments are equal (after stripping) and the number of predicted
    segments that also occur in the answer is at least
    ``min(2, number of answer segments)``.

    Args:
        pred: Predicted path.
        ans: Answer path.

    Returns:
        ``True`` on a match; ``False`` otherwise, including when either
        argument is empty.
    """
    if not (pred and ans):
        return False
    if pred == ans:
        return True

    pred_segments = pred.split(">")
    ans_segments = ans.split(">")
    pred_leaf = pred_segments[-1].strip()
    ans_leaf = ans_segments[-1].strip()

    # Without a shared, non-empty leaf the paths cannot match.
    if not pred_leaf or not ans_leaf or pred_leaf != ans_leaf:
        return False

    shared = len([segment for segment in pred_segments if segment in ans_segments])
    required = min(2, len(ans_segments))
    return shared >= required
def evaluate(answer_cell, top_paths: list) -> tuple:
    """Score ranked menu paths against an answer cell at cut-offs 1, 3 and 5.

    Args:
        answer_cell: Answer text; each non-blank line is one accepted path.
        top_paths: Ranked predicted paths; falsy entries are ignored.

    Returns:
        ``(hit@1, hit@3, hit@5)`` as booleans, or ``(None, None, None)``
        when the row is excluded: the cell is empty/NaN, or its first line
        contains no ``>`` and starts with one of the excluded prefixes.
    """
    excluded = (None, None, None)
    if not answer_cell or pd.isna(answer_cell):
        return excluded

    raw_text = str(answer_cell)
    head = raw_text.lower().split("\n")[0]
    if ">" not in head and head.startswith(("hts", "계좌개설", "마이페이지")):
        return excluded

    accepted = [normalize_path(line) for line in raw_text.split("\n") if line.strip()]
    candidates = [normalize_path(p) for p in top_paths if p]

    def hit_within(k):
        # True as soon as any of the first k candidates matches any accepted answer.
        for candidate in candidates[:k]:
            for answer in accepted:
                if matches(candidate, answer):
                    return True
        return False

    return tuple(hit_within(k) for k in (1, 3, 5))
def label_from_eval(a1, a3, a5) -> str:
    """Translate the three hit flags into a single result label.

    Args:
        a1: Hit within the top 1, or ``None`` when the row is excluded.
        a3: Hit within the top 3.
        a5: Hit within the top 5.

    Returns:
        The label of the tightest cut-off that hit, the miss label when
        none did, or the exclusion label when ``a1`` is ``None``.
    """
    if a1 is None:
        return "평가제외"
    # Ordered from the tightest cut-off to the loosest; the first hit wins.
    ladder = ((a1, "Top1 정답"), (a3, "Top3 정답"), (a5, "Top5 정답"))
    for hit, label in ladder:
        if hit:
            return label
    return "오답"
# ---------------------------------------------------------------------------
# 검색 실행 (단일 모드)
# ---------------------------------------------------------------------------
def run_search(engine, df_src: pd.DataFrame, use_hyde: bool, use_reranker: bool = False) -> list:
    """Run every query of ``df_src`` through ``engine`` in one mode and score it.

    Args:
        engine: Object exposing ``search(query, top_n, threshold, use_hyde,
            use_reranker)`` that yields dict-like hits with a ``similarity``
            entry and an optional ``menu_path`` entry.
        df_src: Frame with ``No``, ``쿼리`` and ``정답 메뉴`` columns.
        use_hyde: Forwarded to ``engine.search``.
        use_reranker: Forwarded to ``engine.search``.

    Returns:
        One dict per row with ``paths`` and ``sims`` (both padded to five
        entries), the result ``label`` and the raw flags ``a1``/``a3``/``a5``.
    """
    if not use_reranker:
        mode = "+HyDE" if use_hyde else "bge-m3"
    elif use_hyde:
        mode = "+HyDE+Rerank"
    else:
        mode = "+Rerank"
    print(f"\n[검색] {mode} 모드 시작...")

    collected = []
    for _, record in df_src.iterrows():
        query = str(record["쿼리"]).strip()
        raw_answer = record.get("정답 메뉴")
        answer_raw = raw_answer if pd.notna(raw_answer) else None

        hits = engine.search(
            query,
            top_n=5,
            threshold=0.0,
            use_hyde=use_hyde,
            use_reranker=use_reranker,
        )

        # Pad both lists to five entries so later code can index Top1..Top5 blindly.
        paths = [hit.get("menu_path", "") for hit in hits]
        paths += [""] * (5 - len(paths))
        sims = [round(hit["similarity"], 4) for hit in hits]
        sims += [0.0] * (5 - len(sims))

        a1, a3, a5 = evaluate(answer_raw, paths)
        lbl = label_from_eval(a1, a3, a5)

        if a1:
            status = "[O]"
        elif a1 is None:
            status = "[ ]"
        else:
            status = "[X]"
        print(f" {status} [{int(record['No']):3d}] {query[:28]:<28} -> {paths[0][:40]}")

        collected.append({
            "paths": paths,
            "sims": sims,
            "label": lbl,
            "a1": a1, "a3": a3, "a5": a5,
        })
    return collected
# ---------------------------------------------------------------------------
# 메인
# ---------------------------------------------------------------------------
def main():
    """Evaluate three search modes and write a side-by-side comparison sheet.

    Steps:
      1. Load the ``No`` / ``쿼리`` / ``정답 메뉴`` columns from ``EXCEL_PATH``,
         keeping only rows whose ``No`` is numeric.
      2. Obtain the search engine via ``MenuSearchEngine.get_instance()``.
      3. Run ``run_search`` for bge-m3, +HyDE and +HyDE+Rerank.
      4. Compute Acc@1/3/5 per mode over the non-excluded rows.
      5. Build a 25-column frame and write it to ``EXCEL_PATH``.
      6. Style the sheet with openpyxl and append a summary row and a
         delta-vs-baseline row, then save.

    Accuracy deltas are printed with an explicit sign, so a regression is
    shown as ``-1.0%`` rather than ``+-1.0%``.
    """
    print("[17_eval] bge-m3 vs bge-m3+HyDE 비교 평가 시작")

    # -- 1. Load source queries / answers ----------------------------------
    df_src = pd.read_excel(EXCEL_PATH, sheet_name="시트1", usecols=["No", "쿼리", "정답 메뉴"])
    # Rows whose "No" is not numeric are dropped; the summary rows appended
    # at the end of this function carry an empty "No".
    df_src = df_src[pd.to_numeric(df_src["No"], errors="coerce").notna()].reset_index(drop=True)
    print(f"[17_eval] {len(df_src)}개 쿼리 로드 완료")

    # -- 2. Initialise the search engine -----------------------------------
    print("[17_eval] 검색엔진 초기화 중...")
    engine = MenuSearchEngine.get_instance()
    print("[17_eval] 검색엔진 준비 완료")

    # -- 3. Run the three modes --------------------------------------------
    base_results = run_search(engine, df_src, use_hyde=False, use_reranker=False)
    hyde_results = run_search(engine, df_src, use_hyde=True, use_reranker=False)
    rerank_results = run_search(engine, df_src, use_hyde=True, use_reranker=True)

    # -- 4. Statistics -----------------------------------------------------
    def calc_stats(results):
        """Return (valid, skipped, n@1, n@3, n@5, acc@1, acc@3, acc@5)."""
        a1_list = [r["a1"] for r in results]
        a3_list = [r["a3"] for r in results]
        a5_list = [r["a5"] for r in results]
        # Excluded rows carry None flags and are left out of the denominator.
        valid = sum(a is not None for a in a1_list)
        skip = len(results) - valid
        n1 = sum(a for a in a1_list if a is not None)
        n3 = sum(a for a in a3_list if a is not None)
        n5 = sum(a for a in a5_list if a is not None)
        r1 = n1 / valid if valid else 0
        r3 = n3 / valid if valid else 0
        r5 = n5 / valid if valid else 0
        return valid, skip, n1, n3, n5, r1, r3, r5

    b_valid, b_skip, b_n1, b_n3, b_n5, b_r1, b_r3, b_r5 = calc_stats(base_results)
    h_valid, h_skip, h_n1, h_n3, h_n5, h_r1, h_r3, h_r5 = calc_stats(hyde_results)
    rr_valid, rr_skip, rr_n1, rr_n3, rr_n5, rr_r1, rr_r3, rr_r5 = calc_stats(rerank_results)

    print("\n" + "=" * 80)
    print("== 비교 결과 요약 ==")
    print("=" * 80)
    print(f"{'':20s} {'bge-m3':>12s} {'+HyDE':>12s} {'+HyDE+Rerank':>14s}")
    print(f" {'Acc@1':<18s} {b_n1}/{b_valid}={b_r1:.1%} {h_n1}/{h_valid}={h_r1:.1%} {rr_n1}/{rr_valid}={rr_r1:.1%}")
    print(f" {'Acc@3':<18s} {b_n3}/{b_valid}={b_r3:.1%} {h_n3}/{h_valid}={h_r3:.1%} {rr_n3}/{rr_valid}={rr_r3:.1%}")
    print(f" {'Acc@5':<18s} {b_n5}/{b_valid}={b_r5:.1%} {h_n5}/{h_valid}={h_r5:.1%} {rr_n5}/{rr_valid}={rr_r5:.1%}")
    print("=" * 80)

    # -- 5. Build the output frame -----------------------------------------
    rows = []
    for i, (_, src_row) in enumerate(df_src.iterrows()):
        br = base_results[i]
        hr = hyde_results[i]
        rr = rerank_results[i]

        # "변화" compares +HyDE against +HyDE+Rerank by label rank.
        hyde_rank = RANK_MAP.get(hr["label"], -1)
        rerank_rank = RANK_MAP.get(rr["label"], -1)
        if hr["label"] == "평가제외":
            change = "제외"
        elif rerank_rank > hyde_rank:
            change = "개선"
        elif rerank_rank < hyde_rank:
            change = "하락"
        else:
            change = "동일"

        rows.append({
            "No": src_row["No"],
            "쿼리": str(src_row["쿼리"]).strip(),
            "정답 메뉴": src_row["정답 메뉴"] if pd.notna(src_row.get("정답 메뉴")) else "",
            # bge-m3
            "bge Top1": br["paths"][0], "bge Top2": br["paths"][1], "bge Top3": br["paths"][2],
            "bge Top4": br["paths"][3], "bge Top5": br["paths"][4],
            "bge 결과": br["label"], "bge 유사도": br["sims"][0],
            # +HyDE
            "hyde Top1": hr["paths"][0], "hyde Top2": hr["paths"][1], "hyde Top3": hr["paths"][2],
            "hyde Top4": hr["paths"][3], "hyde Top5": hr["paths"][4],
            "hyde 결과": hr["label"], "hyde 유사도": hr["sims"][0],
            # +HyDE+Rerank
            "rr Top1": rr["paths"][0], "rr Top2": rr["paths"][1], "rr Top3": rr["paths"][2],
            "rr Top4": rr["paths"][3], "rr Top5": rr["paths"][4],
            "rr 결과": rr["label"], "rr 유사도": rr["sims"][0],
            # Change (+HyDE vs +HyDE+Rerank)
            "변화": change,
        })

    df_out = pd.DataFrame(rows)
    # Replace the short working keys with the display headers (same order).
    df_out.columns = [
        "No", "쿼리", "정답 메뉴",
        "[bge-m3] Top1", "[bge-m3] Top2", "[bge-m3] Top3", "[bge-m3] Top4", "[bge-m3] Top5",
        "[bge-m3] 결과", "[bge-m3] 유사도",
        "[+HyDE] Top1", "[+HyDE] Top2", "[+HyDE] Top3", "[+HyDE] Top4", "[+HyDE] Top5",
        "[+HyDE] 결과", "[+HyDE] 유사도",
        "[+HyDE+Rerank] Top1", "[+HyDE+Rerank] Top2", "[+HyDE+Rerank] Top3",
        "[+HyDE+Rerank] Top4", "[+HyDE+Rerank] Top5",
        "[+HyDE+Rerank] 결과", "[+HyDE+Rerank] 유사도",
        "변화",
    ]
    # NOTE: this writes to the same path the queries were read from.
    df_out.to_excel(EXCEL_PATH, sheet_name="시트1", index=False)

    # -- 6. Excel styling --------------------------------------------------
    wb = load_workbook(EXCEL_PATH)
    ws = wb["시트1"]

    # Column widths (A..Y, 25 columns)
    col_widths = {
        "A": 5, "B": 34, "C": 34,
        "D": 28, "E": 28, "F": 28, "G": 28, "H": 28, "I": 12, "J": 11,
        "K": 28, "L": 28, "M": 28, "N": 28, "O": 28, "P": 12, "Q": 11,
        "R": 28, "S": 28, "T": 28, "U": 28, "V": 28, "W": 12, "X": 11,
        "Y": 9,
    }
    for col, w in col_widths.items():
        ws.column_dimensions[col].width = w
    ws.row_dimensions[1].height = 28

    HDR_BASE = "1F4E79"    # bge-m3 (dark blue)
    HDR_HYDE = "1A5276"    # +HyDE (medium blue)
    HDR_RERANK = "145A32"  # +Rerank (dark green)
    HDR_ETC = "2C3E50"     # shared columns (charcoal)

    FILL_MAP = {
        "Top1 정답": PatternFill("solid", start_color="C6EFCE"),
        "Top3 정답": PatternFill("solid", start_color="FFEB9C"),
        "Top5 정답": PatternFill("solid", start_color="FCE4D6"),
        "오답": PatternFill("solid", start_color="FFC7CE"),
        "평가제외": PatternFill("solid", start_color="F2F2F2"),
    }
    CHANGE_FILL = {
        "개선": PatternFill("solid", start_color="ABEBC6"),
        "하락": PatternFill("solid", start_color="F1948A"),
        "동일": PatternFill("solid", start_color="EBF5FB"),
        "제외": PatternFill("solid", start_color="F2F2F2"),
    }
    CHANGE_FC = {"개선": "1E8449", "하락": "922B21", "동일": "1A5276", "제외": "7F7F7F"}

    # Header row: colour each column by the mode it belongs to.
    for cell in ws[1]:
        cl = cell.column_letter
        if cl in ("A", "B", "C", "Y"):
            bg = HDR_ETC
        elif cl in ("D", "E", "F", "G", "H", "I", "J"):
            bg = HDR_BASE
        elif cl in ("K", "L", "M", "N", "O", "P", "Q"):
            bg = HDR_HYDE
        else:
            bg = HDR_RERANK
        cell.fill = PatternFill("solid", start_color=bg)
        cell.font = Font(bold=True, color="FFFFFF", name="맑은 고딕", size=9)
        cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)

    # Data rows
    for row in ws.iter_rows(min_row=2, max_row=ws.max_row):
        rc = {cell.column_letter: cell for cell in row}
        bge_lbl = rc.get("I").value if rc.get("I") else ""
        hyde_lbl = rc.get("P").value if rc.get("P") else ""
        rerank_lbl = rc.get("W").value if rc.get("W") else ""
        change_val = rc.get("Y").value if rc.get("Y") else ""

        # Base style for every cell; specific columns are overridden below.
        for cell in row:
            cell.font = Font(name="맑은 고딕", size=8)
            cell.alignment = Alignment(wrap_text=True, vertical="center")

        # Result-label columns, filled by label.
        for col_letter, lbl in [("I", bge_lbl), ("P", hyde_lbl), ("W", rerank_lbl)]:
            if rc.get(col_letter) and lbl:
                rc[col_letter].fill = FILL_MAP.get(lbl, FILL_MAP["평가제외"])
                rc[col_letter].font = Font(name="맑은 고딕", size=8, bold=True)
                rc[col_letter].alignment = Alignment(horizontal="center", vertical="center")

        # Similarity columns are centred.
        for col_letter in ("J", "Q", "X"):
            if rc.get(col_letter):
                rc[col_letter].alignment = Alignment(horizontal="center", vertical="center")

        if rc.get("Y") and change_val:
            rc["Y"].fill = CHANGE_FILL.get(change_val, CHANGE_FILL["동일"])
            rc["Y"].font = Font(name="맑은 고딕", size=8, bold=True,
                                color=CHANGE_FC.get(change_val, "000000"))
            rc["Y"].alignment = Alignment(horizontal="center", vertical="center")

        if rc.get("A"):
            rc["A"].alignment = Alignment(horizontal="center", vertical="center")

    # -- Summary row -------------------------------------------------------
    ws.append([])
    ws.append([
        "", "【요약】",
        f"유효 {b_valid}개 / 제외 {b_skip}개",
        f"Acc@1: {b_r1:.1%}", f"Acc@3: {b_r3:.1%}", f"Acc@5: {b_r5:.1%}",
        "", "", "", "",
        f"Acc@1: {h_r1:.1%}", f"Acc@3: {h_r3:.1%}", f"Acc@5: {h_r5:.1%}",
        "", "", "", "",
        f"Acc@1: {rr_r1:.1%}", f"Acc@3: {rr_r3:.1%}", f"Acc@5: {rr_r5:.1%}",
        "", "", "", "",
        "",
    ])
    sum_fill = PatternFill("solid", start_color="D9E1F2")
    for cell in ws[ws.max_row]:
        cell.fill = sum_fill
        cell.font = Font(bold=True, name="맑은 고딕", size=10)
        cell.alignment = Alignment(horizontal="center", vertical="center")

    # Delta row (vs the bge-m3 baseline). The "+" format flag emits the
    # correct sign for both gains and regressions.
    ws.append([
        "", "【개선폭 vs bge-m3】", "",
        "-", "-", "-", "", "", "", "",
        f"{h_r1 - b_r1:+.1%}", f"{h_r3 - b_r3:+.1%}", f"{h_r5 - b_r5:+.1%}",
        "", "", "", "",
        f"{rr_r1 - b_r1:+.1%}", f"{rr_r3 - b_r3:+.1%}", f"{rr_r5 - b_r5:+.1%}",
        "", "", "", "",
        "",
    ])
    delta_fill = PatternFill("solid", start_color="E2EFDA")
    for cell in ws[ws.max_row]:
        cell.fill = delta_fill
        cell.font = Font(bold=True, name="맑은 고딕", size=10, color="375623")
        cell.alignment = Alignment(horizontal="center", vertical="center")

    wb.save(EXCEL_PATH)

    print(f"\n[17_eval] 저장 완료: {EXCEL_PATH}")
    print(f" bge-m3: Acc@1={b_r1:.1%} Acc@3={b_r3:.1%} Acc@5={b_r5:.1%}")
    print(f" +HyDE: Acc@1={h_r1:.1%} Acc@3={h_r3:.1%} Acc@5={h_r5:.1%}")
    print(f" +HyDE+Rerank: Acc@1={rr_r1:.1%} Acc@3={rr_r3:.1%} Acc@5={rr_r5:.1%}")
    print(f" 최종 개선폭: Acc@1 {rr_r1-b_r1:+.1%} Acc@3 {rr_r3-b_r3:+.1%} Acc@5 {rr_r5-b_r5:+.1%}")
# Run the comparison only when this file is executed as a script.
if __name__ == "__main__":
    main()