Spaces:

KSLab
/

RAG_Evaluator_C

Sleeping

App Files Files Community

MonicaChen0330 committed on May 26, 2025

Commit

ceb051e

verified ·

1 Parent(s): 7fb47ff

Create app.py

Browse files

Files changed (1) hide show

app.py +288 -0

app.py ADDED Viewed

	@@ -0,0 +1,288 @@

+import os
+import sys
+import math
+from openai import OpenAI
+import requests
+import gradio as gr
+import pandas as pd
+import concurrent.futures
+from datasets import Dataset
+from tqdm import tqdm
+from ragas import evaluate, SingleTurnSample
+from ragas.llms import LangchainLLMWrapper
+from ragas.embeddings import LangchainEmbeddingsWrapper
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from ragas.metrics import (
+    ResponseRelevancy, LLMContextPrecisionWithReference, LLMContextRecall,
+    ContextEntityRecall, Faithfulness, NoiseSensitivity, SemanticSimilarity, FactualCorrectness
+)
# Force UTF-8 on stdout so the Chinese log messages print correctly.
# Guarded because stdout may have been replaced by a stream object that
# does not provide `reconfigure`.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

# Ground Truth CSV: downloaded once from the URL held in the GT_URL
# environment variable (a Google Drive link, per the original note) and
# cached on disk at `gt_path`, which the evaluation code reads later.
gt_url = os.environ.get("GT_URL")
gt_path = "tender_groundtruth.csv"


def _download_ground_truth(url, path, timeout=60):
    """Fetch ``url`` and write the response body to ``path``.

    Args:
        url: Address of the Ground Truth CSV.
        path: Local file the response body is written to.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang application start-up.

    Returns:
        True when the server answered 200 and the file was written,
        False otherwise (the reason is printed).
    """
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A network failure must not crash the app at import time; the
        # evaluation step reports the missing file on its own.
        print("下載失敗：", exc)
        return False
    print("HTTP 狀態碼：", r.status_code)
    if r.status_code != 200:
        print("下載失敗內容預覽：", r.text[:500])
        return False
    with open(path, "wb") as f:
        f.write(r.content)
    return True


# Download only when a URL is configured and no cached copy exists yet.
if gt_url and not os.path.exists(gt_path):
    print("嘗試下載 Ground Truth...")
    _download_ground_truth(gt_url, gt_path)
# Evaluation logs are recorded in the lab's Google Sheet through the
# endpoint configured in the G_SHEET_URL environment variable.
def log_to_google_sheet(question, answer, contexts, scores):
    """Post one evaluated sample and its scores to the logging endpoint.

    Logging is best-effort: every failure is printed and swallowed so that
    a logging problem never interrupts an evaluation run.

    Args:
        question: The evaluated question text.
        answer: The answer produced by the RAG system under test.
        contexts: The retrieved contexts for the question.
        scores: Mapping of metric label (e.g. ``"Faithfulness"``) to score;
            missing labels are sent as ``None``.

    Returns:
        None.
    """
    url = os.environ.get("G_SHEET_URL")
    if not url:
        print("G_SHEET_URL 未設定，略過記錄")
        return
    try:
        payload = {
            "question": question,
            "answer": answer,
            "contexts": contexts,
            "faithfulness": scores.get("Faithfulness"),
            "answer_relevancy": scores.get("Answer Relevancy"),
            "semantic_similarity": scores.get("Semantic Similarity"),
            "context_precision": scores.get("Context Precision"),
            "context_recall": scores.get("Context Recall"),
            "context_entity_recall": scores.get("Context Entity Recall"),
        }
        # The timeout keeps a stalled endpoint from blocking the
        # evaluation loop, which calls this once per sample.
        response = requests.post(url, json=payload, timeout=15)
    except Exception as e:
        # Deliberately broad: this is a best-effort logging boundary.
        print("寫入 Google Sheet 失敗:", str(e))
        return
    # Only report success when the endpoint actually accepted the record.
    if response.ok:
        print("成功寫入 Google Sheet:", response.status_code)
    else:
        print("寫入 Google Sheet 失敗:", response.status_code)
def validate_openai_key(api_key, model="gpt-3.5-turbo"):
    """Check that an OpenAI API key works by sending a one-token request.

    Args:
        api_key: The key entered by the user. Surrounding whitespace (a
            common copy-paste artefact) is ignored.
        model: Chat model used for the probe request.

    Returns:
        ``None`` when the key works. Otherwise a ``(DataFrame, None)``
        tuple whose frame has a single ``錯誤訊息`` cell describing the
        problem, matching the ``(table, file)`` shape of the UI outputs.
    """
    def failure(message):
        return pd.DataFrame([{"錯誤訊息": message}]), None

    # Reject a missing key up front. Passing ``None`` to the client would
    # make it fall back to the OPENAI_API_KEY environment variable, i.e.
    # validate some other key than the one the user supplied.
    key = api_key.strip() if isinstance(api_key, str) else ""
    if not key:
        return failure("請輸入 OpenAI API Key")

    try:
        client = OpenAI(api_key=key)
        client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "hi"}],
            max_tokens=1,
        )
    except Exception as e:
        err_msg = str(e)
        # Known error fragments, checked in order, mapped to a
        # user-facing explanation.
        known_errors = (
            ("Incorrect API key provided", " 您輸入的 OpenAI API Key 有誤，請確認是否貼錯、字數不符或格式異常。"),
            ("exceeded your current quota", "您的 OpenAI 帳戶額度已用盡，請前往帳戶頁面檢查餘額。"),
            ("Rate limit", "OpenAI 請求頻率過高，請稍後再試"),
        )
        for fragment, message in known_errors:
            if fragment in err_msg:
                return failure(message)
        return failure(f"API Key 錯誤：{err_msg}")
    return None
# Label shown in the result table -> column name in the ragas result frame.
_SCORE_COLUMNS = {
    "Faithfulness": "faithfulness",
    "Answer Relevancy": "answer_relevancy",
    "Semantic Similarity": "semantic_similarity",
    "Context Precision": "llm_context_precision_with_reference",
    "Context Recall": "context_recall",
    "Context Entity Recall": "context_entity_recall",
}


def _parse_context_cell(value):
    """Turn one ``Context`` CSV cell into a Python object without running code.

    Returns the parsed literal, or ``None`` when the cell is empty or is not
    a valid Python literal.
    """
    import ast

    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return None
    try:
        # SECURITY: the cell comes from a user upload. The previous `eval`
        # would execute arbitrary expressions; `literal_eval` only accepts
        # literals such as ['A', 'B'].
        return ast.literal_eval(value.strip())
    except (ValueError, SyntaxError, MemoryError, RecursionError):
        return None


def RAG_evaluation(uploaded_file, user_api_key):
    """Score an uploaded RAG result CSV with ragas against the Ground Truth.

    The upload is joined to the Ground Truth file on ``Question``, evaluated
    in batches of 10, logged per sample via ``log_to_google_sheet``, and
    written to ``<upload name>_result.csv`` with a trailing ``Average`` row.

    Args:
        uploaded_file: The uploaded file; either an object exposing the
            path as ``.name`` or the path itself.
        user_api_key: OpenAI API key used for the LLM and embedding calls.

    Returns:
        ``(score_df, output_path)`` on success, or ``(error_df, None)``
        where ``error_df`` has a single ``錯誤訊息`` cell. Exceptions are
        reported this way rather than raised.
    """
    try:
        api_key = user_api_key.strip() if isinstance(user_api_key, str) else user_api_key
        # Check the key before starting any long-running work.
        validation_result = validate_openai_key(api_key)
        if validation_result:
            return validation_result
        print("評估開始")
        if not os.path.exists(gt_path):
            print("找不到 Ground Truth！")
            return pd.DataFrame([{"錯誤訊息": "找不到 Ground Truth 檔案，無法進行評估"}]), None
        gt_df = pd.read_csv(gt_path)

        upload_path = getattr(uploaded_file, "name", uploaded_file)
        df = pd.read_csv(upload_path, encoding="utf-8-sig")
        # Same header normalisation as the pre-check in check_csv_and_run.
        df.columns = [str(col).strip() for col in df.columns]
        df["Context"] = df["Context"].map(_parse_context_cell)
        print(f"上傳檔案筆數：{len(df)}，GT 檔案筆數：{len(gt_df)}")

        merged_df = pd.merge(df, gt_df[["Question", "Answer"]], on="Question", suffixes=("", "_GroundTruth"))
        merged_df = merged_df.rename(columns={"Answer_GroundTruth": "GroundTruth"})
        print(f"成功合併筆數：{len(merged_df)} / {len(df)}")
        if len(merged_df) < len(df):
            missing = df[~df["Question"].isin(merged_df["Question"])]
            print("未合併題目：", missing["Question"].tolist())
        if merged_df.empty:
            return pd.DataFrame([{"錯誤訊息": "合併後無資料，請確認題目與 GT 是否對應"}]), None

        # The key is handed to the clients directly instead of being written
        # to os.environ: the environment is process-wide, so on a shared
        # deployment it would leak one user's key into other requests.
        llm_wrapper = LangchainLLMWrapper(
            ChatOpenAI(model="gpt-4o-mini-2024-07-18", api_key=api_key)
        )
        embedding_wrapper = LangchainEmbeddingsWrapper(
            OpenAIEmbeddings(model="text-embedding-3-large", api_key=api_key)
        )

        batch_size = 10
        records = []
        for batch_start in tqdm(range(0, len(merged_df), batch_size), desc="RAGAS Batch Evaluating"):
            batch_df = merged_df.iloc[batch_start:batch_start + batch_size]
            # Remember exactly which rows are evaluated so each score is
            # paired with the row it was computed for, even when some rows
            # of the batch are skipped.
            kept_rows = []
            for _, row in batch_df.iterrows():
                if not isinstance(row["Context"], list):
                    print(f"Context 非 list，跳過。值：{row['Question']}")
                    continue
                kept_rows.append(row)
            if not kept_rows:
                continue
            try:
                samples = [
                    SingleTurnSample(
                        user_input=row["Question"],
                        response=row["Answer"],
                        retrieved_contexts=row["Context"],
                        reference=row["GroundTruth"],
                    )
                    for row in kept_rows
                ]
                dataset = Dataset.from_list([s.to_dict() for s in samples])
                result = evaluate(
                    dataset=dataset,
                    metrics=[
                        LLMContextPrecisionWithReference(),
                        LLMContextRecall(),
                        ContextEntityRecall(),
                        Faithfulness(),
                        ResponseRelevancy(),
                        SemanticSimilarity(),
                    ],
                    llm=llm_wrapper,
                    embeddings=embedding_wrapper,
                    show_progress=True,
                )
                score_rows = result.to_pandas().to_dict("records")
                for input_row, scores in zip(kept_rows, score_rows):
                    record = {"Question": input_row["Question"]}
                    for label, column in _SCORE_COLUMNS.items():
                        val = scores.get(column)
                        # NaN/inf cannot be shown or logged meaningfully.
                        if isinstance(val, float) and not math.isfinite(val):
                            val = ""
                        record[label] = val
                    records.append(record)
                    log_to_google_sheet(
                        question=input_row["Question"],
                        answer=input_row["Answer"],
                        contexts=input_row["Context"],
                        scores=record,
                    )
            except Exception as e:
                # One failing batch must not abort the remaining batches.
                print(f"批次評估失敗（第 {batch_start+1} 筆起）：{e}")
                continue

        if not records:
            return pd.DataFrame([{"錯誤訊息": "沒有任何資料完成評估，請檢查輸入資料與 API Key 額度"}]), None

        score_df = pd.DataFrame(records).fillna("")
        print("完成評估筆數：", len(score_df))
        # Blank cells turn a column into object dtype; coerce back to numbers
        # so a metric with a few missing scores still gets an average.
        numeric_cols = score_df.drop(columns=["Question"]).apply(pd.to_numeric, errors="coerce")
        if numeric_cols.notna().any().any():
            avg_row = numeric_cols.mean().to_dict()
            avg_row["Question"] = "Average"
            score_df = pd.concat([score_df, pd.DataFrame([avg_row])], ignore_index=True).fillna("")

        original_name = os.path.basename(upload_path)
        filename = os.path.splitext(original_name)[0]
        output_path = f"{filename}_result.csv"
        score_df.to_csv(output_path, index=False, encoding="utf-8-sig")
        print("評估結果已儲存：", output_path)
        return score_df, output_path
    except Exception as e:
        print("評估函式整體錯誤：", str(e))
        return pd.DataFrame([{"錯誤訊息": f"系統錯誤：{str(e)}"}]), None
# Validate the upload and the key, then run the RAG evaluation.
def check_csv_and_run(file, key):
    """Validate the uploaded CSV and API key, then delegate to RAG_evaluation.

    Checks, in order: a file was supplied; the key is non-blank; the CSV has
    exactly the columns Question / Context / Answer; it has at least one
    row; no row with a Question lacks an Answer or Context; every Context
    cell is a Python list literal.

    Args:
        file: The uploaded file; either an object exposing the path as
            ``.name`` or the path itself. ``None`` when nothing was uploaded.
        key: The OpenAI API key entered by the user.

    Returns:
        Whatever ``RAG_evaluation`` returns when validation passes,
        otherwise ``(error_df, None)`` where ``error_df`` has a single
        ``錯誤訊息`` cell. Exceptions are reported this way, never raised.
    """
    import ast  # function-scope import keeps this block self-contained

    def failure(message):
        return pd.DataFrame([{"錯誤訊息": message}]), None

    if file is None:
        return failure("請上傳檔案！")
    if not key or key.strip() == "":
        return failure("請輸入 OpenAI API Key")
    key = key.strip()  # drop copy-paste whitespace before the key is used

    try:
        df = pd.read_csv(getattr(file, "name", file), encoding="utf-8-sig")
        df.columns = [str(col).strip() for col in df.columns]

        required_columns = {"Question", "Context", "Answer"}
        actual_columns = set(df.columns)
        if actual_columns != required_columns:
            return failure(f"欄位錯誤：應包含欄位 {required_columns}，實際為 {actual_columns}")
        if df.shape[0] == 0:
            return failure("檔案中沒有資料列!")

        invalid_rows = df[df["Question"].notnull() & (df["Answer"].isnull() | df["Context"].isnull())]
        if len(invalid_rows) > 0:
            missing_questions = "\n".join(f"- {q}" for q in invalid_rows["Question"].tolist())
            return failure(f"發現 {len(invalid_rows)} 筆資料中 Answer 或 Context 為空：\n{missing_questions}")

        # Every Context cell must be a list literal such as ['A', 'B'].
        # SECURITY: cells come from a user upload, so they are parsed with
        # ast.literal_eval (literals only) rather than eval, which would
        # execute arbitrary code.
        for i, val in df["Context"].dropna().items():
            try:
                parsed = ast.literal_eval(val)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
                return failure(f"Context 欄格式解析錯誤，請確認其為有效的 list 格式，例如 ['A', 'B']：{str(e)}")
            if not isinstance(parsed, list):
                return failure(f"第 {i + 1} 筆 Context 欄格式錯誤，請確認其內容應為 list")

        # The upload is well-formed: run the evaluation.
        try:
            return RAG_evaluation(file, key)
        except Exception as e:
            return failure(f"系統錯誤：{str(e)}")
    except Exception as e:
        return failure(f"評估失敗：{str(e)}")
# Gradio interface: upload + API key inputs, a result table, and a
# download slot for the generated CSV. This deployment is mirror C; the
# notice below links to the main page and the other mirrors.
with gr.Blocks() as demo:
    gr.Markdown("""
## 📐 RAG系統評估工具 (分流C)
### 📄 使用說明
請上傳您 RAG 系統產出的結果檔案（需包含欄位：Question、Context、Answer），並填入您的 OpenAI API Key，以進行評估。
#### ⏳ 完整評估通常需耗時 1 小時以上。若無即時回應，請耐心等候，系統並未當機，謝謝您的理解。
🚦 注意：本工具部署於 Hugging Face Public Space，若同時有多位使用者使用，系統會將您的評估請求排入佇列。
為避免長時間等待，建議您**先僅送出 1 筆資料進行測試**，若進度條顯示之預估等待時間超過 2 小時（7000 秒以上），可能是其他使用者正在使用。
本頁為分流 C，您可以考慮改用其他分流或稍後再試，感謝您的耐心與配合!
- 🔁 [主頁面 (Main)](https://huggingface.co/spaces/KSLab/RAG_Evaluator)
- 🔁 [分流 A](https://huggingface.co/spaces/KSLab/RAG_Evaluator_A)
- 🔁 [分流 B](https://huggingface.co/spaces/KSLab/RAG_Evaluator_B)
""")
    file_input = gr.File(label="上傳 Evaluation_Dataset.csv")
    api_key_input = gr.Textbox(label="OpenAI API Key", type="password")
    submit_btn = gr.Button("開始評估")
    result_output = gr.Dataframe(label="評估結果")
    download_link = gr.File(label="下載評估結果(CSV)")
    # Frequently asked questions shown below the results.
    gr.Markdown("""
    ---
    ### ❓ 常見問題 & 解答
    **Q: 什麼是「指令集」？**
    A: 「指令集」是我們用來描述老師在課堂上所設計的各種學習活動操作流程。在與教學系統互動時，老師通常會透過一系列結構化的指令來引導學生完成任務，因此我們將這些可重複使用的操作流程統稱為「指令集」。
                指令集也如同RESTful API一樣，我們有先盡力的與老師們溝通他們的需求，不過這些需求都只能視為一個草案，最終仍需要仰賴得標業者與老師們收斂，並且確定最終的版本來加以實作。
    """)
    # The button goes through check_csv_and_run so the upload and key are
    # validated before the evaluation starts.
    submit_btn.click(
        fn=check_csv_and_run,
        inputs=[file_input, api_key_input],
        outputs=[result_output, download_link],
    )
demo.launch()