"""RAGAS Eval - quantitative evaluation of answer quality per RAG backend.

Production-standard evaluation framework. Retrieval and generation quality
are scored by an LLM using faithfulness / answer_relevancy /
context_precision.

The same alarm and Tier 2 (cause analysis) step are run once per backend and
then evaluated:
- hybrid: BM25 + FAISS + RRF
- hybrid_rerank: hybrid + cross-encoder reranking

Run: python -m experiments.rag_eval.benchmark
Output: results.md
"""
| import os |
| from pathlib import Path |
|
|
| from datasets import Dataset |
| from langchain_openai import ChatOpenAI, OpenAIEmbeddings |
| from ragas import evaluate |
| from ragas.embeddings import LangchainEmbeddingsWrapper |
| from ragas.llms import LangchainLLMWrapper |
| from ragas.metrics import ( |
| Faithfulness, |
| LLMContextPrecisionWithoutReference, |
| ResponseRelevancy, |
| ) |
|
|
| from agents.cause import _build_query, run_cause |
| from agents.detection import run_detection |
| from agents.rag.store import load_document, search |
| from data.demo import DEFAULT_ALARMS |
|
|
# Directory containing this script; the generated report is written next to it.
OUT_DIR: Path = Path(__file__).parent

# Retrieval backends compared by the benchmark, in evaluation order.
BACKENDS: list[str] = [
    "hybrid",
    "hybrid_rerank",
]

# Id of the alarm (looked up in DEFAULT_ALARMS) that every backend is run on.
TARGET_ALARM: str = "A3"
|
|
|
|
def collect_samples():
    """Run Tier 2 once per backend and collect (question, contexts, answer).

    The target alarm is looked up in ``DEFAULT_ALARMS`` by ``TARGET_ALARM``,
    Tier 1 detection is run once, and the retrieval query is built from its
    result. For every name in ``BACKENDS`` the ``RAG_BACKEND`` environment
    variable is set, the top-3 documents for the query are retrieved and
    loaded, and Tier 2 cause analysis is executed.

    Returns:
        dict: parallel lists under the keys ``"question"``, ``"answer"``,
        ``"contexts"`` and ``"backend"``, one entry per backend.

    Raises:
        LookupError: if no alarm with id ``TARGET_ALARM`` exists.
    """
    # Use a default with next() so a missing alarm produces a descriptive
    # error instead of a bare StopIteration leaking out of this function.
    alarm = next((a for a in DEFAULT_ALARMS if a["id"] == TARGET_ALARM), None)
    if alarm is None:
        raise LookupError(f"alarm {TARGET_ALARM!r} not found in DEFAULT_ALARMS")

    tier1 = run_detection(alarm)
    query = _build_query(alarm, tier1)

    rows = {"question": [], "answer": [], "contexts": [], "backend": []}

    # The backend is selected through a process-wide environment variable;
    # remember the caller's value so it can be put back afterwards.
    previous_backend = os.environ.get("RAG_BACKEND")
    try:
        for backend in BACKENDS:
            # NOTE(review): presumably search() and run_cause() read
            # RAG_BACKEND at call time rather than at import time -- confirm.
            os.environ["RAG_BACKEND"] = backend

            # Contexts come from a separate search() call, not from the
            # run_cause() output.
            # NOTE(review): assumes run_cause() retrieves the same documents
            # internally for this query -- confirm.
            doc_ids = search(query, top_k=3)
            contexts = [load_document(d) for d in doc_ids]

            tier2 = run_cause(alarm, tier1)
            # Flatten the structured causes into one bullet list, which is
            # the "answer" text that RAGAS scores.
            answer = "\n".join(
                f"- {c['name']} ({c['pct']}%): {c['evidence']}" for c in tier2["causes"]
            )

            rows["question"].append(query)
            rows["answer"].append(answer)
            rows["contexts"].append(contexts)
            rows["backend"].append(backend)
            print(f" {backend}: {len(doc_ids)} docs, {len(tier2['causes'])} causes")
    finally:
        # Restore the environment even if a backend run fails part-way.
        if previous_backend is None:
            os.environ.pop("RAG_BACKEND", None)
        else:
            os.environ["RAG_BACKEND"] = previous_backend
    return rows
|
|
|
|
def main():
    """Collect Tier 2 samples per backend, score them with RAGAS, write the report.

    Prints progress banners and the raw RAGAS result, then hands a DataFrame
    (RAGAS output plus a ``backend`` column) to ``write_results``.
    """
    # NOTE(review): the Korean text in the banners below appears mis-encoded
    # in this copy of the file; it is reproduced unchanged.
    print(f"=== Tier 2 ๊ฒฐ๊ณผ ์์ง (์๋ {TARGET_ALARM}) ===")
    samples = collect_samples()

    print("\n=== RAGAS ํ๊ฐ ===")

    # Evaluation model and embeddings, wrapped for use by the RAGAS metrics.
    judge_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini", temperature=0))
    judge_embeddings = LangchainEmbeddingsWrapper(
        OpenAIEmbeddings(model="text-embedding-3-small")
    )

    # Only these three columns go to RAGAS; "backend" is bookkeeping that is
    # attached to the result frame afterwards.
    eval_columns = ("question", "answer", "contexts")
    eval_dataset = Dataset.from_dict({col: samples[col] for col in eval_columns})

    scores = evaluate(
        dataset=eval_dataset,
        metrics=[
            Faithfulness(llm=judge_llm),
            ResponseRelevancy(llm=judge_llm, embeddings=judge_embeddings),
            LLMContextPrecisionWithoutReference(llm=judge_llm),
        ],
    )

    print("\n--- ๊ฒฐ๊ณผ ---")
    print(scores)

    # Assigning the backend list relies on the result rows keeping the same
    # order as the collected samples.
    frame = scores.to_pandas()
    frame["backend"] = samples["backend"]
    write_results(frame)
|
|
|
|
# Columns of the evaluation frame that carry sample data rather than metric
# scores. Both the legacy names (question/answer/contexts/ground_truth) and
# the names reported by newer Ragas releases (user_input/response/
# retrieved_contexts/reference) are listed, so raw text never ends up in the
# markdown score table.
_NON_METRIC_COLUMNS = frozenset(
    {
        "question",
        "answer",
        "contexts",
        "ground_truth",
        "user_input",
        "response",
        "retrieved_contexts",
        "reference",
        "backend",
    }
)


def write_results(
    df,
    *,
    out_path=None,
    alarm_id=None,
    eval_llm_name="gpt-4o-mini",
    eval_emb_name="text-embedding-3-small",
):
    """Render the per-backend RAGAS scores as a markdown report and save it.

    Args:
        df: DataFrame with one row per backend. It must contain a ``backend``
            column; every column not listed in ``_NON_METRIC_COLUMNS`` is
            treated as a metric and becomes a table column.
        out_path: Destination file. Defaults to ``OUT_DIR / "results.md"``.
        alarm_id: Alarm id shown in the report intro. Defaults to
            ``TARGET_ALARM``.
        eval_llm_name: Evaluation LLM name written to the settings section.
            The default matches the model that ``main`` evaluates with.
        eval_emb_name: Evaluation embedding model name written to the
            settings section.

    The report is written as UTF-8 and the output path is printed.
    """
    # Resolve the defaults at call time so the module-level constants are
    # only needed when the caller does not override them.
    out_path = OUT_DIR / "results.md" if out_path is None else Path(out_path)
    if alarm_id is None:
        alarm_id = TARGET_ALARM

    metric_cols = [c for c in df.columns if c not in _NON_METRIC_COLUMNS]

    # NOTE(review): the Korean report text below appears mis-encoded in this
    # copy of the file; the literals are kept unchanged and should be
    # restored from the original source.
    lines = [
        "# RAG Eval (RAGAS) - ๋ฐฑ์๋๋ณ ๋ต ํ์ง ์ ๋ ๋น๊ต",
        "",
        f"๊ฐ์ ์๋({alarm_id}, CMP)์ ๋ํด ๋ retrieval ๋ฐฑ์๋์ Tier 2(์์ธ ๋ถ์) ๊ฒฐ๊ณผ๋ฅผ RAGAS๋ก ํ๊ฐํฉ๋๋ค.",
        "",
        "## ์ค์ ",
        "",
        # The model names are parameters so the report cannot drift from the
        # models that were actually used for the evaluation.
        f"- ํ๊ฐ LLM: {eval_llm_name}",
        f"- ํ๊ฐ ์๋ฒ ๋ฉ: {eval_emb_name}",
        "- Metric:",
        " - **Faithfulness**: ๋ต์ด ๊ฒ์๋ context์ ์ถฉ์คํ๊ฐ (ํ๊ฐ ์ธก์ )",
        " - **Response Relevancy**: ๋ต์ด ์ง๋ฌธ์ ๊ด๋ จ ์๋๊ฐ",
        " - **LLM Context Precision (no ref)**: ๊ฒ์๋ context ์ค ๊ด๋ จ๋ ๊ฒ์ ๋น์จ",
        "",
        "## ๊ฒฐ๊ณผ",
        "",
        "| Backend | " + " | ".join(metric_cols) + " |",
        "|---|" + "|".join(["---"] * len(metric_cols)) + "|",
    ]
    for _, row in df.iterrows():
        # Scores are shown with three decimals; anything else is stringified.
        cells = [
            f"{row[c]:.3f}" if isinstance(row[c], float) else str(row[c])
            for c in metric_cols
        ]
        lines.append(f"| {row['backend']} | " + " | ".join(cells) + " |")

    lines += [
        "",
        "## ํด์",
        "",
        "- **Faithfulness ๋์** = LLM์ด ๊ฒ์๋ ๋ฌธ์์ ์ถฉ์คํ ๊ทผ๊ฑฐ (ํ๊ฐ ์ ์)",
        "- **Response Relevancy ๋์** = ๋ต์ด ์ง๋ฌธ์ ์ ํํ ๋ตํจ",
        "- **Context Precision ๋์** = ๊ฒ์๋ ๋ฌธ์๊ฐ ๋ต ์์ฑ์ ์ค์ ๋ก ๊ธฐ์ฌ",
        "",
        "## ์ฑํ",
        "",
        "์ ๋ ์ฐจ์ด๋ฅผ ๋ณด๊ณ ์ ํฉํ backend ์ฑํ. ์ผ๋ฐ์ ์ผ๋ก Hybrid+Rerank๊ฐ ์ ๋ฐ๋์์ ์ฐ์.",
        "",
    ]
    out_path.write_text("\n".join(lines), encoding="utf-8")
    print(f"--- ์ ์ฅ: {out_path} ---")
|
|
|
|
# Script entry point: run the benchmark only when executed directly
# (e.g. via `python -m ...`), not when the module is imported.
if __name__ == "__main__":
    main()
|
|