Spaces:

KSLab
/

RAG_Evaluator

Sleeping

App Files Files Community

MonicaChen0330 committed on May 15, 2025

Commit

995479c

verified ·

1 Parent(s): 5b51eba

Optimized: batch evaluation

Browse files

Files changed (1) hide show

app.py +65 -50

app.py CHANGED Viewed

@@ -14,11 +14,12 @@ from ragas.metrics import (
     ContextEntityRecall, Faithfulness, NoiseSensitivity, SemanticSimilarity, FactualCorrectness
 )
 sys.stdout.reconfigure(encoding="utf-8")
-# 嘗試從Google Drive下載 Ground Truth
 gt_url = os.environ.get("GT_URL")
-gt_path = "ragas_groundtruth.csv"
 if gt_url and not os.path.exists(gt_path):
     print("嘗試下載 Ground Truth...")
@@ -30,6 +31,24 @@ if gt_url and not os.path.exists(gt_path):
         with open(gt_path, "wb") as f:
             f.write(r.content)
 def RAG_evaluation(uploaded_file, user_api_key):
     try:
         os.environ["OPENAI_API_KEY"] = user_api_key
@@ -55,20 +74,27 @@ def RAG_evaluation(uploaded_file, user_api_key):
         llm_wrapper = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini-2024-07-18"))
         embedding_wrapper = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-large"))
         records = []
-        for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Evaluating"):
-            try:
                 if not isinstance(row["Context"], list):
-                    print(f"第 {idx + 1} 筆 Context 非 list，跳過。值：{row['Context']}")
                     continue
                 sample = SingleTurnSample(
                     user_input=row["Question"],
                     response=row["Answer"],
                     retrieved_contexts=row["Context"],
-                    reference=row["GroundTruth"]
                 )
-                dataset = Dataset.from_list([sample.to_dict()])
                 result = evaluate(
                     dataset=dataset,
                     metrics=[
@@ -81,28 +107,32 @@ def RAG_evaluation(uploaded_file, user_api_key):
                     show_progress=False
                 )
-                score_row = result.to_pandas().iloc[0].to_dict()
-                records.append({
-                    "Question": row["Question"],
-                    "Faithfulness": score_row.get("faithfulness"),
-                    "Answer Relevancy": score_row.get("answer_relevancy"),
-                    "Semantic Similarity": score_row.get("semantic_similarity"),
-                    # "Factual Correctness": score_row.get("factual_correctness"),
-                    "Context Precision": score_row.get("llm_context_precision_with_reference"),
-                    "Context Recall": score_row.get("context_recall"),
-                    "Context Entity Recall": score_row.get("context_entity_recall"),
-                    # "noise_sensitivity_relevant": score_row.get("noise_sensitivity_relevant")
-                })
-                log_to_google_sheet(
-                    question=row["Question"],
-                    answer=row["Answer"],
-                    contexts=row["Context"],
-                    scores=score_row
-                )
             except Exception as e:
-                print(f"第 {idx + 1} 筆評估失敗：{e}")
                 continue
         score_df = pd.DataFrame(records).fillna("")
@@ -123,9 +153,9 @@ def RAG_evaluation(uploaded_file, user_api_key):
     except Exception as e:
         print("評估函式整體錯誤：", str(e))
         return pd.DataFrame([{"錯誤訊息": f"系統錯誤：{str(e)}"}]), None
 def check_csv_and_run(file, key):
-    print("開始檢查CSV檔案格式並執行評估")
     if file is None:
         return pd.DataFrame([{"錯誤訊息": "請上傳檔案！"}]), None
@@ -166,31 +196,13 @@ def check_csv_and_run(file, key):
     except Exception as e:
         return pd.DataFrame([{"錯誤訊息": f"RAG 評估失敗：{str(e)}"}]), None
-def log_to_google_sheet(question, answer, contexts, scores):
-    url = os.environ.get("G_SHEET_URL")
-    if not url:
-        print("G_SHEET_URL 未設定，略過記錄")
-        return
-    try:
-        payload = {
-            "question": question,
-            "answer": answer,
-            "contexts": contexts,
-            "scores": scores
-        }
-        response = requests.post(url, json=payload)
-        print("成功寫入 Google Sheet:", response.status_code)
-    except Exception as e:
-        print("寫入 Google Sheet 失敗:", str(e))
 # Gradio 介面
 with gr.Blocks() as demo:
     gr.Markdown("## 📐 RAG系統評估工具")
     gr.Markdown("""
     ### 📄 使用說明
     請上傳您RAG系統產出的結果檔案（包含 Question, Context, Answer 欄位），並填入您的OpenAI API Key，以評估您的RAG系統。
-    #### ⏳ 評估需要時間，請耐心等候。
     """)
     file_input = gr.File(label="上傳 Evaluation_Dataset.csv")
@@ -198,7 +210,10 @@ with gr.Blocks() as demo:
     submit_btn = gr.Button("開始評估")
     result_output = gr.Dataframe(label="評估結果")
-    download_link = gr.File(label="下載結果檔案(CSV)")
     submit_btn.click(
         fn=check_csv_and_run,

     ContextEntityRecall, Faithfulness, NoiseSensitivity, SemanticSimilarity, FactualCorrectness
 )
+# 設定輸出編碼為 UTF-8（解決中文顯示問題）
 sys.stdout.reconfigure(encoding="utf-8")
+# 支援從Google Drive下載 Ground Truth
 gt_url = os.environ.get("GT_URL")
+gt_path = "tender_groundtruth.csv"
 if gt_url and not os.path.exists(gt_path):
     print("嘗試下載 Ground Truth...")
         with open(gt_path, "wb") as f:
             f.write(r.content)
+# 綁定實驗室Google帳號(Python TA)Google Sheet，以記錄評估logs
+def log_to_google_sheet(question, answer, contexts, scores):
+    url = os.environ.get("G_SHEET_URL")
+    if not url:
+        print("G_SHEET_URL 未設定，略過記錄")
+        return
+    try:
+        payload = {
+            "question": question,
+            "answer": answer,
+            "contexts": contexts,
+            "scores": scores
+        }
+        response = requests.post(url, json=payload)
+        print("成功寫入 Google Sheet:", response.status_code)
+    except Exception as e:
+        print("寫入 Google Sheet 失敗:", str(e))
 def RAG_evaluation(uploaded_file, user_api_key):
     try:
         os.environ["OPENAI_API_KEY"] = user_api_key
         llm_wrapper = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini-2024-07-18"))
         embedding_wrapper = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-large"))
+        batch_size = 10
         records = []
+        for batch_start in tqdm(range(0, len(merged_df), batch_size), desc="RAGAS Batch Evaluating"):
+            batch_df = merged_df.iloc[batch_start:batch_start + batch_size]
+            samples = []
+            for _, row in batch_df.iterrows():
                 if not isinstance(row["Context"], list):
+                    print(f"Context 非 list，跳過。值：{row['Question']}")
                     continue
                 sample = SingleTurnSample(
                     user_input=row["Question"],
                     response=row["Answer"],
                     retrieved_contexts=row["Context"],
+                    reference=row["GroundTruth"],
                 )
+                samples.append(sample)
+            try:
+                dataset = Dataset.from_list([s.to_dict() for s in samples])
                 result = evaluate(
                     dataset=dataset,
                     metrics=[
                     show_progress=False
                 )
+                result_df = result.to_pandas()
+                for i, row in enumerate(result_df.itertuples()):
+                    input_row = batch_df.iloc[i]
+                    record = {
+                        "Question": input_row["Question"],
+                        "Faithfulness": getattr(row, "faithfulness", None),
+                        "Answer Relevancy": getattr(row, "answer_relevancy", None),
+                        "Semantic Similarity": getattr(row, "semantic_similarity", None),
+                        # "Factual Correctness": getattr(row, "factual_correctness", None),
+                        "Context Precision": getattr(row, "llm_context_precision_with_reference", None),
+                        "Context Recall": getattr(row, "context_recall", None),
+                        "Context Entity Recall": getattr(row, "context_entity_recall", None),
+                        # "Noise Sensitivity": getattr(row, "noise_sensitivity_relevant", None)
+                    }
+                    records.append(record)
+                    log_to_google_sheet(
+                        question=input_row["Question"],
+                        answer=input_row["Answer"],
+                        contexts=input_row["Context"],
+                        scores=record
+                    )
             except Exception as e:
+                print(f"批次評估失敗（第 {batch_start+1} 筆起）：{e}")
                 continue
         score_df = pd.DataFrame(records).fillna("")
     except Exception as e:
         print("評估函式整體錯誤：", str(e))
         return pd.DataFrame([{"錯誤訊息": f"系統錯誤：{str(e)}"}]), None
+# handle exception並執行RAG評估
 def check_csv_and_run(file, key):
     if file is None:
         return pd.DataFrame([{"錯誤訊息": "請上傳檔案！"}]), None
     except Exception as e:
         return pd.DataFrame([{"錯誤訊息": f"RAG 評估失敗：{str(e)}"}]), None
 # Gradio 介面
 with gr.Blocks() as demo:
     gr.Markdown("## 📐 RAG系統評估工具")
     gr.Markdown("""
     ### 📄 使用說明
     請上傳您RAG系統產出的結果檔案（包含 Question, Context, Answer 欄位），並填入您的OpenAI API Key，以評估您的RAG系統。
+    #### ⏳ 完整評估需要數小時，無即時回應並不是當機，請耐心等候。
     """)
     file_input = gr.File(label="上傳 Evaluation_Dataset.csv")
     submit_btn = gr.Button("開始評估")
     result_output = gr.Dataframe(label="評估結果")
+    download_link = gr.File(label="下載評估結果(CSV)")
+    def wrapped_fn(file, key):
+        """Forward (file, key) to RAG_evaluation and return its result unchanged."""
+        # NOTE(review): the click handler visible below is wired to
+        # check_csv_and_run, not to this wrapper — confirm it is still used.
+        return RAG_evaluation(file, key)
     submit_btn.click(
         fn=check_csv_and_run,