# NOTE: the following header lines are residue from the web file viewer this
# file was copied from; they are kept as comments so the module parses.
# fabagent / experiments /rag_eval /benchmark.py
# hee_!J
# feat: Hybrid RAG(BM25+FAISS+RRF) + Cross-encoder Rerank + RAGAS eval
# 744e87d
# Raw
# History Blame Contribute Delete
# 4.87 kB
"""RAGAS Eval - quantitative evaluation of answer quality per RAG backend.

Production-standard evaluation framework. Retrieval + generation quality is
scored by an LLM using faithfulness / answer_relevancy / context_precision.

For the same alarm and Tier 2 (cause analysis), each backend is run and then
evaluated:
- hybrid: BM25 + FAISS + RRF
- hybrid_rerank: hybrid + cross-encoder reranking

Run: python -m experiments.rag_eval.benchmark
Results: results.md
"""
import os
from pathlib import Path
from datasets import Dataset
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas import evaluate
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
Faithfulness,
LLMContextPrecisionWithoutReference,
ResponseRelevancy,
)
from agents.cause import _build_query, run_cause
from agents.detection import run_detection
from agents.rag.store import load_document, search
from data.demo import DEFAULT_ALARMS
# Directory containing this module; the generated results.md is written here.
OUT_DIR = Path(__file__).parent
# Retrieval backends to compare. Each name is exported through the RAG_BACKEND
# environment variable before a run ("hybrid" = BM25 + FAISS + RRF,
# "hybrid_rerank" = hybrid followed by cross-encoder reranking).
BACKENDS = ["hybrid", "hybrid_rerank"]
# Id of the entry in DEFAULT_ALARMS that every backend is evaluated on.
TARGET_ALARM = "A3"
def collect_samples():
    """Run Tier 2 once per backend and collect (question, contexts, answer) rows.

    The alarm whose id equals ``TARGET_ALARM`` is looked up in
    ``DEFAULT_ALARMS``, Tier 1 detection is run once, and the retrieval query
    is built from both. Then, for every name in ``BACKENDS``, the
    ``RAG_BACKEND`` environment variable is set to that name, the top-3
    documents are retrieved and loaded, and Tier 2 cause analysis is run.

    Returns:
        dict with the keys ``"question"``, ``"answer"``, ``"contexts"`` and
        ``"backend"``. Each value is a list with one entry per backend, and
        the lists are aligned by index. ``"answer"`` is a bullet list built
        from the ``name``/``pct``/``evidence`` fields of each cause;
        ``"contexts"`` holds the loaded documents for that backend.

    Raises:
        LookupError: if no alarm in ``DEFAULT_ALARMS`` has id ``TARGET_ALARM``.

    Side effects:
        Temporarily overrides ``RAG_BACKEND``; the previous value (or its
        absence) is restored before returning, even if a backend run fails.
    """
    alarm = next((a for a in DEFAULT_ALARMS if a["id"] == TARGET_ALARM), None)
    if alarm is None:
        # A bare StopIteration from next() gives no hint about what is missing.
        raise LookupError(f"alarm {TARGET_ALARM!r} not found in DEFAULT_ALARMS")

    tier1 = run_detection(alarm)
    query = _build_query(alarm, tier1)
    rows = {"question": [], "answer": [], "contexts": [], "backend": []}

    # Remember the caller's setting so the benchmark does not leak the last
    # backend into the rest of the process.
    previous_backend = os.environ.get("RAG_BACKEND")
    try:
        for backend in BACKENDS:
            os.environ["RAG_BACKEND"] = backend
            # cause.py calls search(), so results differ depending on the backend.
            # NOTE(review): the contexts scored below come from this direct
            # search() call, not from run_cause itself; this assumes run_cause
            # retrieves with the same query and top_k - confirm.
            doc_ids = search(query, top_k=3)
            contexts = [load_document(d) for d in doc_ids]
            tier2 = run_cause(alarm, tier1)
            answer = "\n".join(
                f"- {c['name']} ({c['pct']}%): {c['evidence']}" for c in tier2["causes"]
            )
            rows["question"].append(query)
            rows["answer"].append(answer)
            rows["contexts"].append(contexts)
            rows["backend"].append(backend)
            print(f" {backend}: {len(doc_ids)} docs, {len(tier2['causes'])} causes")
    finally:
        if previous_backend is None:
            os.environ.pop("RAG_BACKEND", None)
        else:
            os.environ["RAG_BACKEND"] = previous_backend
    return rows
def main():
    """Collect Tier 2 samples per backend, score them with RAGAS, write the report.

    Steps: gather the samples via ``collect_samples()``, evaluate them with
    the Faithfulness, ResponseRelevancy and
    LLMContextPrecisionWithoutReference metrics, print the result, then hand
    the per-sample scores (tagged with their backend) to ``write_results()``.
    """
    print(f"=== Tier 2 ๊ฒฐ๊ณผ ์ˆ˜์ง‘ (์•Œ๋žŒ {TARGET_ALARM}) ===")
    samples = collect_samples()

    print("\n=== RAGAS ํ‰๊ฐ€ ===")
    # The judge model is gpt-4o-mini: gpt-5-mini does not support
    # temperature 0.01, which causes a BadRequest during ragas evaluation.
    judge_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini", temperature=0))
    judge_embeddings = LangchainEmbeddingsWrapper(
        OpenAIEmbeddings(model="text-embedding-3-small")
    )

    # "backend" is not passed to RAGAS; it is re-attached to the score table
    # afterwards, relying on the rows coming back in the same order.
    eval_columns = ("question", "answer", "contexts")
    dataset = Dataset.from_dict({column: samples[column] for column in eval_columns})

    result = evaluate(
        dataset=dataset,
        metrics=[
            Faithfulness(llm=judge_llm),
            ResponseRelevancy(llm=judge_llm, embeddings=judge_embeddings),
            LLMContextPrecisionWithoutReference(llm=judge_llm),
        ],
    )

    print("\n--- ๊ฒฐ๊ณผ ---")
    print(result)

    scores = result.to_pandas()
    scores["backend"] = samples["backend"]
    write_results(scores)
def write_results(
    df,
    *,
    eval_llm_model="gpt-4o-mini",
    eval_embedding_model="text-embedding-3-small",
):
    """Render the RAGAS scores as a Markdown report and save it as results.md.

    Args:
        df: DataFrame with one row per evaluated sample. It must contain a
            "backend" column; every column that is not a known sample/input
            column is rendered as a metric column.
        eval_llm_model: Name of the judge LLM recorded in the report. The
            default mirrors the model hard-coded in ``main()``.
        eval_embedding_model: Name of the embedding model recorded in the
            report. The default mirrors the model hard-coded in ``main()``.

    Side effects:
        Overwrites ``OUT_DIR / "results.md"`` (UTF-8) and prints the saved path.
    """
    # Sample/input columns that must not appear in the score table. The
    # legacy RAGAS names (question/answer/contexts/ground_truth) and the
    # 0.2+ names (user_input/response/retrieved_contexts/reference) are both
    # listed, because the frame returned by to_pandas() uses whichever schema
    # the installed version works with.
    non_metric_cols = {
        "backend",
        "question",
        "answer",
        "contexts",
        "ground_truth",
        "user_input",
        "response",
        "retrieved_contexts",
        "reference",
    }
    metric_cols = [c for c in df.columns if c not in non_metric_cols]

    # NOTE(review): the non-ASCII report text below looks mis-encoded in this
    # copy of the file; it is reproduced unchanged apart from the model names.
    lines = [
        # Title and introduction.
        "# RAG Eval (RAGAS) - ๋ฐฑ์—”๋“œ๋ณ„ ๋‹ต ํ’ˆ์งˆ ์ •๋Ÿ‰ ๋น„๊ต",
        "",
        f"๊ฐ™์€ ์•Œ๋žŒ({TARGET_ALARM}, CMP)์— ๋Œ€ํ•ด ๋‘ retrieval ๋ฐฑ์—”๋“œ์˜ Tier 2(์›์ธ ๋ถ„์„) ๊ฒฐ๊ณผ๋ฅผ RAGAS๋กœ ํ‰๊ฐ€ํ•ฉ๋‹ˆ๋‹ค.",
        "",
        # Settings section: judge models and metric descriptions.
        "## ์„ค์ •",
        "",
        f"- ํ‰๊ฐ€ LLM: {eval_llm_model}",
        f"- ํ‰๊ฐ€ ์ž„๋ฒ ๋”ฉ: {eval_embedding_model}",
        "- Metric:",
        " - **Faithfulness**: ๋‹ต์ด ๊ฒ€์ƒ‰๋œ context์— ์ถฉ์‹คํ•œ๊ฐ€ (ํ™˜๊ฐ ์ธก์ •)",
        " - **Response Relevancy**: ๋‹ต์ด ์งˆ๋ฌธ์— ๊ด€๋ จ ์žˆ๋Š”๊ฐ€",
        " - **LLM Context Precision (no ref)**: ๊ฒ€์ƒ‰๋œ context ์ค‘ ๊ด€๋ จ๋œ ๊ฒƒ์˜ ๋น„์œจ",
        "",
        # Results section: table header, one column per metric.
        "## ๊ฒฐ๊ณผ",
        "",
        "| Backend | " + " | ".join(metric_cols) + " |",
        "|---|" + "|".join(["---"] * len(metric_cols)) + "|",
    ]

    # One table row per evaluated sample; floats are shown with 3 decimals.
    for _, row in df.iterrows():
        cells = [
            f"{row[c]:.3f}" if isinstance(row[c], float) else str(row[c])
            for c in metric_cols
        ]
        lines.append(f"| {row['backend']} | " + " | ".join(cells) + " |")

    lines += [
        "",
        # Interpretation section.
        "## ํ•ด์„",
        "",
        "- **Faithfulness ๋†’์Œ** = LLM์ด ๊ฒ€์ƒ‰๋œ ๋ฌธ์„œ์— ์ถฉ์‹คํžˆ ๊ทผ๊ฑฐ (ํ™˜๊ฐ ์ ์Œ)",
        "- **Response Relevancy ๋†’์Œ** = ๋‹ต์ด ์งˆ๋ฌธ์— ์ •ํ™•ํžˆ ๋‹ตํ•จ",
        "- **Context Precision ๋†’์Œ** = ๊ฒ€์ƒ‰๋œ ๋ฌธ์„œ๊ฐ€ ๋‹ต ์ƒ์„ฑ์— ์‹ค์ œ๋กœ ๊ธฐ์—ฌ",
        "",
        # Adoption / conclusion section.
        "## ์ฑ„ํƒ",
        "",
        "์ •๋Ÿ‰ ์ฐจ์ด๋ฅผ ๋ณด๊ณ  ์ ํ•ฉํ•œ backend ์ฑ„ํƒ. ์ผ๋ฐ˜์ ์œผ๋กœ Hybrid+Rerank๊ฐ€ ์ •๋ฐ€๋„์—์„œ ์šฐ์œ„.",
        "",
    ]
    (OUT_DIR / "results.md").write_text("\n".join(lines), encoding="utf-8")
    print(f"--- ์ €์žฅ: {OUT_DIR / 'results.md'} ---")
# Script entry point: run the benchmark only when this module is executed
# directly (e.g. `python -m experiments.rag_eval.benchmark`), not on import.
if __name__ == "__main__":
    main()