Spaces:
Sleeping
Sleeping
| import os | |
| import sys | |
| import math | |
| from openai import OpenAI | |
| import requests | |
| import gradio as gr | |
| import pandas as pd | |
| import concurrent.futures | |
| from datasets import Dataset | |
| from tqdm import tqdm | |
| from ragas import evaluate, SingleTurnSample | |
| from ragas.llms import LangchainLLMWrapper | |
| from ragas.embeddings import LangchainEmbeddingsWrapper | |
| from langchain_openai import ChatOpenAI, OpenAIEmbeddings | |
| from ragas.metrics import ( | |
| ResponseRelevancy, LLMContextPrecisionWithReference, LLMContextRecall, | |
| ContextEntityRecall, Faithfulness, NoiseSensitivity, SemanticSimilarity, FactualCorrectness | |
| ) | |
# Force UTF-8 on stdout so the Chinese log messages print correctly.
# Guarded because a replaced stdout object may not provide reconfigure().
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

# Download the ground-truth CSV (from Google Drive) if it is not already on disk.
gt_url = os.environ.get("GT_URL")
gt_path = "tender_groundtruth.csv"
if gt_url and not os.path.exists(gt_path):
    print("嘗試下載 Ground Truth...")
    try:
        # The timeout keeps a stalled download from blocking start-up forever.
        r = requests.get(gt_url, timeout=60)
    except requests.RequestException as e:
        # Best effort: the app still starts; RAG_evaluation reports the
        # missing ground-truth file when it is actually needed.
        print("下載失敗內容預覽:", str(e)[:500])
    else:
        print("HTTP 狀態碼:", r.status_code)
        if r.status_code != 200:
            print("下載失敗內容預覽:", r.text[:500])
        else:
            with open(gt_path, "wb") as f:
                f.write(r.content)
# Evaluation logs are recorded in a Google Sheet bound to the lab's Google
# account (Python TA).
def log_to_google_sheet(question, answer, contexts, scores):
    """Post one evaluated sample and its scores to the logging endpoint.

    The endpoint URL is read from the ``G_SHEET_URL`` environment variable;
    when it is not set the call is skipped. Logging is best effort: any
    failure is printed and never raised to the caller.

    Args:
        question: Question text of the sample.
        answer: Answer produced by the evaluated system.
        contexts: Retrieved contexts of the sample.
        scores: Mapping of display metric names (e.g. ``"Faithfulness"``)
            to their values; missing metrics are sent as ``None``.

    Returns:
        None.
    """
    url = os.environ.get("G_SHEET_URL")
    if not url:
        print("G_SHEET_URL 未設定,略過記錄")
        return
    try:
        payload = {
            "question": question,
            "answer": answer,
            "contexts": contexts,
            "faithfulness": scores.get("Faithfulness"),
            "answer_relevancy": scores.get("Answer Relevancy"),
            "semantic_similarity": scores.get("Semantic Similarity"),
            "context_precision": scores.get("Context Precision"),
            "context_recall": scores.get("Context Recall"),
            "context_entity_recall": scores.get("Context Entity Recall"),
        }
        # The timeout keeps a slow logging endpoint from stalling evaluation.
        response = requests.post(url, json=payload, timeout=30)
        # Only report success for a 2xx/3xx answer; an HTTP error status
        # means the row was not written.
        if response.ok:
            print("成功寫入 Google Sheet:", response.status_code)
        else:
            print("寫入 Google Sheet 失敗:", response.status_code)
    except Exception as e:
        print("寫入 Google Sheet 失敗:", str(e))
def fetch_sheet_content():
    """Load the announcement and FAQ texts shown on the page.

    Reads the CSV located at the ``ANNOUNCEMENT_URL`` environment variable
    and takes the first row of its ``Announcement`` and ``FAQ`` columns.
    Literal ``\\n`` sequences and real newlines become ``<br>``.

    Each column falls back to its own default independently, so an empty or
    missing cell in one column no longer discards the other column.

    Returns:
        A ``(announcement, faq)`` tuple of strings; defaults are used when
        the URL is unset, the sheet cannot be read, or a value is missing.
    """
    DEFAULT_ANNOUNCEMENT = "尚無公告"
    DEFAULT_FAQ = "尚無常見問題"

    url = os.environ.get("ANNOUNCEMENT_URL")
    if not url:
        print("Warning: 未設定 ANNOUNCEMENT_URL")
        return DEFAULT_ANNOUNCEMENT, DEFAULT_FAQ

    try:
        df = pd.read_csv(url)
    except Exception as e:
        print("載入 Sheet 錯誤:", e)
        return DEFAULT_ANNOUNCEMENT, DEFAULT_FAQ

    def _first_cell(column, default):
        # Missing column or a sheet without data rows: nothing to show.
        if column not in df.columns or df.empty:
            return default
        value = df[column].iloc[0]
        # pandas reads an empty cell as NaN, which has no .strip().
        if pd.isna(value):
            return default
        text = str(value).strip()
        text = text.replace("\\n", "<br>").replace("\n", "<br>")
        return text or default

    return (
        _first_cell("Announcement", DEFAULT_ANNOUNCEMENT),
        _first_cell("FAQ", DEFAULT_FAQ),
    )
def validate_openai_key(api_key):
    """Probe the OpenAI API with a one-token request to check *api_key*.

    Returns:
        ``None`` when the probe request succeeds. Otherwise a
        ``(DataFrame, None)`` tuple whose single ``錯誤訊息`` cell holds a
        user-facing message chosen from the text of the raised exception.
    """
    # (substring looked for in the exception text, message shown to the user);
    # the first matching entry wins.
    known_failures = (
        ("Incorrect API key provided", " 您輸入的 OpenAI API Key 有誤,請確認是否貼錯、字數不符或格式異常。"),
        ("exceeded your current quota", "您的 OpenAI 帳戶額度已用盡,請前往帳戶頁面檢查餘額。"),
        ("Rate limit", "OpenAI 請求頻率過高,請稍後再試"),
    )
    try:
        OpenAI(api_key=api_key).chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "hi"}],
            max_tokens=1,
        )
    except Exception as exc:
        detail = str(exc)
        user_message = next(
            (text for marker, text in known_failures if marker in detail),
            f"API Key 錯誤:{detail}",
        )
        return pd.DataFrame([{"錯誤訊息": user_message}]), None
    return None
def RAG_evaluation(uploaded_file, user_api_key):
    """Score an uploaded RAG result file against the ground truth with RAGAS.

    Args:
        uploaded_file: Upload handle whose ``name`` attribute is the path of
            a CSV with ``Question``, ``Context`` and ``Answer`` columns. Each
            ``Context`` cell must be the text of a Python list literal.
        user_api_key: OpenAI API key. It is checked with
            ``validate_openai_key`` and then exported as ``OPENAI_API_KEY``.

    Returns:
        ``(score_df, output_path)`` on success: one row per evaluated
        question plus an ``Average`` row, and the path of the CSV written to
        the working directory. On failure a one-row DataFrame with a
        ``錯誤訊息`` column and ``None``; an empty DataFrame and ``None`` when
        the ground-truth file is missing.
    """
    # Function-scope import: only the Context parsing below needs it.
    import ast

    try:
        # Reject an unusable OpenAI API key before doing any work.
        validation_result = validate_openai_key(user_api_key)
        if validation_result:
            return validation_result
        os.environ["OPENAI_API_KEY"] = user_api_key
        print("評估開始")
        if not os.path.exists(gt_path):
            print("找不到 Ground Truth!")
            return pd.DataFrame(), None
        gt_df = pd.read_csv(gt_path)

        # Read the upload the same way check_csv_and_run validates it
        # (BOM-tolerant, header names stripped) so both agree on the columns.
        df = pd.read_csv(uploaded_file.name, encoding="utf-8-sig")
        df.columns = [col.strip() for col in df.columns]
        # SECURITY: the file is untrusted user input, so it is parsed with
        # ast.literal_eval (literals only) instead of eval, which would run
        # arbitrary code. Non-string cells (e.g. NaN) are left unchanged and
        # skipped later because they are not lists.
        df["Context"] = df["Context"].map(
            lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
        )
        print(f"上傳檔案筆數:{len(df)},GT 檔案筆數:{len(gt_df)}")

        merged_df = pd.merge(df, gt_df[["Question", "Answer"]], on="Question", suffixes=("", "_GroundTruth"))
        merged_df = merged_df.rename(columns={"Answer_GroundTruth": "GroundTruth"})
        print(f"成功合併筆數:{len(merged_df)} / {len(df)}")
        if len(merged_df) < len(df):
            missing = df[~df["Question"].isin(merged_df["Question"])]
            print("未合併題目:", missing["Question"].tolist())
        if merged_df.empty:
            return pd.DataFrame([{"錯誤訊息": "合併後無資料,請確認題目與 GT 是否對應"}]), None

        llm_wrapper = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini-2024-07-18"))
        embedding_wrapper = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-large"))
        batch_size = 10
        records = []
        for batch_start in tqdm(range(0, len(merged_df), batch_size), desc="RAGAS Batch Evaluating"):
            batch_df = merged_df.iloc[batch_start:batch_start + batch_size]
            samples = []
            # Input rows that produced a sample, in sample order. Results are
            # paired with these rather than with batch_df positions, which
            # drift as soon as one row is skipped.
            kept_rows = []
            for _, row in batch_df.iterrows():
                if not isinstance(row["Context"], list):
                    print(f"Context 非 list,跳過。值:{row['Question']}")
                    continue
                samples.append(
                    SingleTurnSample(
                        user_input=row["Question"],
                        response=row["Answer"],
                        retrieved_contexts=row["Context"],
                        reference=row["GroundTruth"],
                    )
                )
                kept_rows.append(row)
            if not samples:
                # Every row of this batch was skipped; nothing to evaluate.
                continue
            try:
                dataset = Dataset.from_list([s.to_dict() for s in samples])
                # NoiseSensitivity and FactualCorrectness are currently not
                # evaluated.
                result = evaluate(
                    dataset=dataset,
                    metrics=[
                        LLMContextPrecisionWithReference(),  # context precision
                        LLMContextRecall(),  # context recall
                        ContextEntityRecall(),
                        Faithfulness(),  # faithfulness
                        ResponseRelevancy(),  # answer relevancy
                        SemanticSimilarity(),  # semantic similarity
                    ],
                    llm=llm_wrapper,
                    embeddings=embedding_wrapper,
                    show_progress=True,
                )
                result_df = result.to_pandas()
                for input_row, row in zip(kept_rows, result_df.itertuples()):
                    record = {
                        "Question": input_row["Question"],
                        "Faithfulness": getattr(row, "faithfulness", None),
                        "Answer Relevancy": getattr(row, "answer_relevancy", None),
                        "Semantic Similarity": getattr(row, "semantic_similarity", None),
                        "Context Precision": getattr(row, "llm_context_precision_with_reference", None),
                        "Context Recall": getattr(row, "context_recall", None),
                        "Context Entity Recall": getattr(row, "context_entity_recall", None),
                    }
                    # Blank out NaN/inf scores so the record stays displayable
                    # and can be sent as JSON.
                    record = {
                        key: "" if isinstance(val, float) and not math.isfinite(val) else val
                        for key, val in record.items()
                    }
                    records.append(record)
                    log_to_google_sheet(
                        question=input_row["Question"],
                        answer=input_row["Answer"],
                        contexts=input_row["Context"],
                        scores=record,
                    )
            except Exception as e:
                # One failing batch must not abort the remaining batches.
                print(f"批次評估失敗(第 {batch_start+1} 筆起):{e}")
                continue

        score_df = pd.DataFrame(records).fillna("")
        print("完成評估筆數:", len(score_df))
        if score_df.empty:
            # Without any record there is no "Question" column to work with.
            return pd.DataFrame([{"錯誤訊息": "沒有任何資料完成評估,請確認 Context 格式或稍後再試"}]), None

        # Blank cells turn a column into object dtype; coerce back to numbers
        # so a single missing score does not drop the column from the average.
        metric_values = score_df.drop(columns=["Question"]).apply(pd.to_numeric, errors="coerce")
        averages = metric_values.mean()
        if averages.notna().any():
            avg_row = {name: "" if pd.isna(value) else value for name, value in averages.items()}
            avg_row["Question"] = "Average"
            score_df = pd.concat([score_df, pd.DataFrame([avg_row])], ignore_index=True)

        original_name = os.path.basename(uploaded_file.name)
        filename = os.path.splitext(original_name)[0]
        output_path = f"{filename}_result.csv"
        score_df.to_csv(output_path, index=False, encoding="utf-8-sig")
        print("評估結果已儲存:", output_path)
        return score_df, output_path
    except Exception as e:
        print("評估函式整體錯誤:", str(e))
        return pd.DataFrame([{"錯誤訊息": f"系統錯誤:{str(e)}"}]), None
# Validate the upload and the API key, then run the RAG evaluation.
def check_csv_and_run(file, key):
    """Validate the uploaded CSV and API key, then call ``RAG_evaluation``.

    Args:
        file: Upload handle whose ``name`` attribute is the CSV path, or
            ``None`` when nothing was uploaded.
        key: OpenAI API key entered by the user.

    Returns:
        Whatever ``RAG_evaluation`` returns when validation passes; otherwise
        a ``(DataFrame, None)`` tuple whose ``錯誤訊息`` cell explains the
        first problem found.
    """
    # Function-scope import: only the Context check below needs it.
    import ast

    if file is None:
        return pd.DataFrame([{"錯誤訊息": "請上傳檔案!"}]), None
    if not key or key.strip() == "":
        return pd.DataFrame([{"錯誤訊息": "請輸入 OpenAI API Key"}]), None
    try:
        df = pd.read_csv(file.name, encoding="utf-8-sig")
        df.columns = [col.strip() for col in df.columns]
        required_columns = {"Question", "Context", "Answer"}
        actual_columns = set(df.columns)
        # The column set must match exactly; extra columns are rejected too.
        if actual_columns != required_columns:
            return pd.DataFrame([{"錯誤訊息": f"欄位錯誤:應包含欄位 {required_columns},實際為 {actual_columns}"}]), None
        if df.shape[0] == 0:
            return pd.DataFrame([{"錯誤訊息": "檔案中沒有資料列!"}]), None
        invalid_rows = df[df["Question"].notnull() & (df["Answer"].isnull() | df["Context"].isnull())]
        if len(invalid_rows) > 0:
            missing_questions = "\n".join(f"- {q}" for q in invalid_rows["Question"].tolist())
            return pd.DataFrame([{"錯誤訊息": f"發現 {len(invalid_rows)} 筆資料中 Answer 或 Context 為空:\n{missing_questions}"}]), None
        # Every Context cell must be the text of a list literal.
        # SECURITY: the file is untrusted user input, so it is parsed with
        # ast.literal_eval (literals only) instead of eval, which would
        # execute arbitrary code from the upload.
        try:
            for i, val in df["Context"].dropna().items():
                if not isinstance(ast.literal_eval(val), list):
                    return pd.DataFrame([{"錯誤訊息": f"第 {i + 1} 筆 Context 欄格式錯誤,請確認其內容應為 list"}]), None
        except Exception as e:
            return pd.DataFrame([{"錯誤訊息": f"Context 欄格式解析錯誤,請確認其為有效的 list 格式,例如 ['A', 'B']:{str(e)}"}]), None
        # The uploaded file passed validation: run the evaluation.
        try:
            return RAG_evaluation(file, key)
        except Exception as e:
            error_message = str(e)
            return pd.DataFrame([{"錯誤訊息": f"系統錯誤:{error_message}"}]), None
    except Exception as e:
        return pd.DataFrame([{"錯誤訊息": f"評估失敗:{str(e)}"}]), None
# Gradio interface: static instructions, upload form, result widgets, and the
# announcement / FAQ areas. Components are created in the order written here.
with gr.Blocks() as demo:
    gr.Markdown("""
# 📐 RAG系統評估工具 (Science_Evaluation_0)
### 📄 使用說明
- 請上傳您 RAG 系統產出的結果檔案(需包含欄位:Question、Context、Answer),並填入您的 OpenAI API Key,以進行評估。
- ⏳ 完整評估**通常需耗時 1 小時以上**,若無即時回應,請**耐心等候**,系統並未當機,謝謝您的理解。
### 🚦 分流措施
本工具部署於 Hugging Face Public Space,若同時有多位使用者使用,系統會將您的評估請求**排入佇列**。
為避免長時間等待,建議您**先僅送出 1 筆資料進行測試**,若進度條顯示之預估**等待時間超過 2 小時(7000 秒以上),可能是其他使用者正在使用**。
本頁為**分流 B**,您可以考慮改用其他分流或稍後再試,感謝您的耐心與配合!
- 🔁 [主頁面 (Main)](https://huggingface.co/spaces/KSLab/RAG_Evaluator)
- 🔁 [分流 A](https://huggingface.co/spaces/KSLab/RAG_Evaluator_A)
- 🔁 [分流 C](https://huggingface.co/spaces/KSLab/RAG_Evaluator_C)
### 📢 系統公告
""")
    # Placeholder filled with the announcement text by the load event below.
    announcement_display = gr.Markdown()
    file_input = gr.File(label="上傳 Evaluation_Dataset.csv")
    api_key_input = gr.Textbox(label="OpenAI API Key", type="password")
    submit_btn = gr.Button("開始評估")
    result_output = gr.Dataframe(label="評估結果")
    download_link = gr.File(label="下載評估結果(CSV)")
    # Frequently asked questions section.
    gr.Markdown("""
---
### ❓ 常見問題 & 解答
""")
    # Placeholder filled with the FAQ text by the load event below.
    faq_display = gr.Markdown()

    # Load the announcement and the FAQ.
    def load_sheet():
        return fetch_sheet_content()

    # The two values returned by load_sheet go to the two Markdown placeholders.
    demo.load(fn=load_sheet, inputs=[], outputs=[announcement_display, faq_display])

    # NOTE(review): wrapped_fn is not referenced anywhere in the visible code;
    # the button below is wired to check_csv_and_run instead.
    def wrapped_fn(file, key):
        return RAG_evaluation(file, key)

    # Clicking the button validates the inputs and runs the evaluation; the
    # returned pair fills the result table and the download widget.
    submit_btn.click(
        fn=check_csv_and_run,
        inputs=[file_input, api_key_input],
        outputs=[result_output, download_link],
    )

# Start the app as soon as the module is executed.
demo.launch()