Spaces:

KSLab
/

RAG_Evaluator_B

Sleeping

App Files Files Community

RAG_Evaluator_B / app.py

shihman

Update app.py

a6d9330 verified 5 months ago

raw

history blame contribute delete

13.9 kB

	import ast
	import concurrent.futures
	import math
	import os
	import sys

	import gradio as gr
	import pandas as pd
	import requests
	from datasets import Dataset
	from langchain_openai import ChatOpenAI, OpenAIEmbeddings
	from openai import OpenAI
	from ragas import evaluate, SingleTurnSample
	from ragas.embeddings import LangchainEmbeddingsWrapper
	from ragas.llms import LangchainLLMWrapper
	from ragas.metrics import (
	    ResponseRelevancy, LLMContextPrecisionWithReference, LLMContextRecall,
	    ContextEntityRecall, Faithfulness, NoiseSensitivity, SemanticSimilarity, FactualCorrectness
	)
	from tqdm import tqdm

	# Force UTF-8 on stdout so the Chinese log messages print correctly.
	# Guarded because some hosts replace sys.stdout with an object that has no
	# ``reconfigure`` method, which would otherwise abort start-up.
	if hasattr(sys.stdout, "reconfigure"):
	    sys.stdout.reconfigure(encoding="utf-8")

	# Download the Ground Truth CSV from the URL in the GT_URL environment
	# variable, unless a local copy already exists.
	gt_url = os.environ.get("GT_URL")
	gt_path = "tender_groundtruth.csv"

	if gt_url and not os.path.exists(gt_path):
	    print("嘗試下載 Ground Truth...")
	    try:
	        # The timeout keeps a stalled server from hanging start-up forever.
	        r = requests.get(gt_url, timeout=60)
	    except requests.RequestException as e:
	        # Best effort: a network failure must not crash the app at import
	        # time; the file simply stays absent.
	        print("下載失敗內容預覽：", str(e))
	    else:
	        print("HTTP 狀態碼：", r.status_code)
	        if r.status_code != 200:
	            print("下載失敗內容預覽：", r.text[:500])
	        else:
	            with open(gt_path, "wb") as f:
	                f.write(r.content)

	# Evaluation logs are recorded in the Google Sheet bound to the lab's
	# Google account (Python TA).
	def log_to_google_sheet(question, answer, contexts, scores):
	    """POST one evaluation record to the logging endpoint.

	    The endpoint is read from the ``G_SHEET_URL`` environment variable.
	    Logging is best effort: when the variable is unset the call is skipped,
	    and every failure is printed instead of raised.

	    Args:
	        question: Question text of the evaluated row.
	        answer: Answer text of the evaluated row.
	        contexts: Retrieved contexts of the evaluated row.
	        scores: Mapping from metric display name (for example
	            ``"Faithfulness"``) to its score; absent metrics are sent as
	            ``None``.

	    Returns:
	        None.
	    """
	    url = os.environ.get("G_SHEET_URL")
	    if not url:
	        print("G_SHEET_URL 未設定，略過記錄")
	        return

	    try:
	        payload = {
	            "question": question,
	            "answer": answer,
	            "contexts": contexts,
	            "faithfulness": scores.get("Faithfulness"),
	            "answer_relevancy": scores.get("Answer Relevancy"),
	            "semantic_similarity": scores.get("Semantic Similarity"),
	            "context_precision": scores.get("Context Precision"),
	            "context_recall": scores.get("Context Recall"),
	            "context_entity_recall": scores.get("Context Entity Recall"),
	        }
	        # The timeout stops a slow logging endpoint from stalling the
	        # evaluation loop that calls this once per row.
	        response = requests.post(url, json=payload, timeout=30)
	        if response.ok:
	            print("成功寫入 Google Sheet:", response.status_code)
	        else:
	            # A 4xx/5xx reply means nothing was recorded; do not report it
	            # as a success.
	            print("寫入 Google Sheet 失敗:", response.status_code)
	    except Exception as e:
	        print("寫入 Google Sheet 失敗:", str(e))

	def _first_cell_as_html(df, column, default):
	    """Return the first cell of ``column`` as ``<br>``-separated text, or ``default``."""
	    if column not in df.columns or df.empty:
	        return default
	    value = df[column].iloc[0]
	    # An empty CSV cell is read as NaN (a float), which has no ``strip``.
	    if pd.isna(value):
	        return default
	    text = str(value).strip()
	    # Both literal "\n" sequences and real newlines become HTML line breaks.
	    text = text.replace("\\n", "<br>").replace("\n", "<br>")
	    return text or default


	def fetch_sheet_content():
	    """Load the announcement and FAQ text shown in the UI.

	    Reads the CSV located by the ``ANNOUNCEMENT_URL`` environment variable
	    and takes the first row of its ``Announcement`` and ``FAQ`` columns.
	    Each column is handled independently, so a missing or empty cell in one
	    column only falls back to that column's default text.

	    Returns:
	        A ``(announcement, faq)`` tuple of strings with newlines converted
	        to ``<br>``. Defaults are returned when the variable is unset, the
	        CSV cannot be loaded, or a value is missing.
	    """
	    DEFAULT_ANNOUNCEMENT = "尚無公告"
	    DEFAULT_FAQ = "尚無常見問題"

	    try:
	        url = os.environ.get("ANNOUNCEMENT_URL")
	        if not url:
	            print("Warning: 未設定 ANNOUNCEMENT_URL")
	            return DEFAULT_ANNOUNCEMENT, DEFAULT_FAQ

	        df = pd.read_csv(url)

	        announcement = _first_cell_as_html(df, "Announcement", DEFAULT_ANNOUNCEMENT)
	        faq = _first_cell_as_html(df, "FAQ", DEFAULT_FAQ)
	        return announcement, faq

	    except Exception as e:
	        print("載入 Sheet 錯誤：", e)
	        return DEFAULT_ANNOUNCEMENT, DEFAULT_FAQ


	def validate_openai_key(api_key):
	    """Check ``api_key`` by sending a one-token chat request to OpenAI.

	    Args:
	        api_key: The OpenAI API key to probe.

	    Returns:
	        ``None`` when the request succeeds. Otherwise a ``(DataFrame, None)``
	        pair whose single ``錯誤訊息`` cell holds a user-facing message chosen
	        from the text of the raised exception.
	    """
	    # Substring of the exception text -> message shown to the user.
	    # Checked in order; the first match wins.
	    known_failures = (
	        ("Incorrect API key provided", " 您輸入的 OpenAI API Key 有誤，請確認是否貼錯、字數不符或格式異常。"),
	        ("exceeded your current quota", "您的 OpenAI 帳戶額度已用盡，請前往帳戶頁面檢查餘額。"),
	        ("Rate limit", "OpenAI 請求頻率過高，請稍後再試"),
	    )

	    try:
	        probe_client = OpenAI(api_key=api_key)
	        probe_client.chat.completions.create(
	            model="gpt-3.5-turbo",
	            messages=[{"role": "user", "content": "hi"}],
	            max_tokens=1,
	        )
	    except Exception as exc:
	        detail = str(exc)
	        user_message = f"API Key 錯誤：{detail}"
	        for marker, text in known_failures:
	            if marker in detail:
	                user_message = text
	                break
	        return pd.DataFrame([{"錯誤訊息": user_message}]), None

	    return None

	def _parse_context_cell(raw):
	    """Parse a Context cell as a Python literal; return it unchanged on failure."""
	    if not isinstance(raw, str):
	        return raw
	    try:
	        # SECURITY: the previous code ran ``eval`` on this user-uploaded
	        # text, which executes arbitrary code. ``ast.literal_eval`` accepts
	        # only literals such as ['A', 'B'].
	        return ast.literal_eval(raw)
	    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
	        return raw


	def _build_samples(batch_df):
	    """Return the rows of ``batch_df`` whose Context is a list, with their samples."""
	    kept_rows = []
	    samples = []
	    for _, row in batch_df.iterrows():
	        if not isinstance(row["Context"], list):
	            print(f"Context 非 list，跳過。值：{row['Question']}")
	            continue
	        kept_rows.append(row)
	        samples.append(
	            SingleTurnSample(
	                user_input=row["Question"],
	                response=row["Answer"],
	                retrieved_contexts=row["Context"],
	                reference=row["GroundTruth"],
	            )
	        )
	    return kept_rows, samples


	def _clean_score(value):
	    """Replace a NaN/inf score with an empty string; pass anything else through."""
	    if isinstance(value, float) and not math.isfinite(value):
	        return ""
	    return value


	def _append_average_row(score_df):
	    """Append an "Average" row with the mean of each metric's numeric scores."""
	    metric_cols = [col for col in score_df.columns if col != "Question"]
	    # Blank cells (failed metrics) become NaN and are ignored by ``mean``,
	    # so one failed row no longer removes a whole column from the average.
	    numeric = score_df[metric_cols].apply(pd.to_numeric, errors="coerce")
	    means = numeric.mean()
	    if not means.notna().any():
	        return score_df
	    avg_row = {col: ("" if pd.isna(val) else val) for col, val in means.items()}
	    avg_row["Question"] = "Average"
	    return pd.concat([score_df, pd.DataFrame([avg_row])], ignore_index=True)


	def RAG_evaluation(uploaded_file, user_api_key):
	    """Score an uploaded RAG result file against the ground truth with RAGAS.

	    The uploaded CSV is joined to the ground-truth CSV on ``Question``, then
	    evaluated in batches of 10 rows. Each scored row is also sent to
	    ``log_to_google_sheet``. The scores, plus an "Average" row, are written
	    to ``<uploaded name>_result.csv`` in the working directory.

	    Args:
	        uploaded_file: Object whose ``name`` attribute is the path of the
	            uploaded CSV (columns ``Question``, ``Context``, ``Answer``).
	        user_api_key: OpenAI API key used for the evaluation.

	    Returns:
	        ``(score_df, output_path)`` on success. On any failure, a
	        ``(DataFrame, None)`` pair whose ``錯誤訊息`` cell describes the
	        problem; exceptions are caught and reported, not raised.
	    """
	    try:
	        # Check that the OpenAI API key works before doing anything costly.
	        validation_result = validate_openai_key(user_api_key)
	        if validation_result is not None:
	            return validation_result

	        # NOTE(review): this is process-global, so concurrent requests with
	        # different keys would overwrite each other — confirm the queue
	        # runs one evaluation at a time.
	        os.environ["OPENAI_API_KEY"] = user_api_key
	        print("評估開始")

	        if not os.path.exists(gt_path):
	            print("找不到 Ground Truth！")
	            return pd.DataFrame([{"錯誤訊息": "找不到 Ground Truth 檔案，無法進行評估"}]), None

	        gt_df = pd.read_csv(gt_path)
	        # Read exactly as the upload validator does (BOM-tolerant, stripped
	        # headers) so a file that passed validation cannot fail here on a
	        # "\ufeffQuestion" or " Context" column name.
	        df = pd.read_csv(uploaded_file.name, encoding="utf-8-sig")
	        df.columns = [str(col).strip() for col in df.columns]
	        df["Context"] = df["Context"].map(_parse_context_cell)
	        print(f"上傳檔案筆數：{len(df)}，GT 檔案筆數：{len(gt_df)}")

	        merged_df = pd.merge(df, gt_df[["Question", "Answer"]], on="Question", suffixes=("", "_GroundTruth"))
	        merged_df = merged_df.rename(columns={"Answer_GroundTruth": "GroundTruth"})
	        print(f"成功合併筆數：{len(merged_df)} / {len(df)}")
	        if len(merged_df) < len(df):
	            missing = df[~df["Question"].isin(merged_df["Question"])]
	            print("未合併題目：", missing["Question"].tolist())
	        if merged_df.empty:
	            return pd.DataFrame([{"錯誤訊息": "合併後無資料，請確認題目與 GT 是否對應"}]), None

	        llm_wrapper = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini-2024-07-18"))
	        embedding_wrapper = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-large"))

	        batch_size = 10
	        records = []
	        for batch_start in tqdm(range(0, len(merged_df), batch_size), desc="RAGAS Batch Evaluating"):
	            batch_df = merged_df.iloc[batch_start:batch_start + batch_size]

	            # kept_rows stays index-aligned with samples, so a skipped row
	            # can no longer shift scores onto the wrong question.
	            kept_rows, samples = _build_samples(batch_df)
	            if not samples:
	                continue

	            try:
	                dataset = Dataset.from_list([s.to_dict() for s in samples])
	                result = evaluate(
	                    dataset=dataset,
	                    metrics=[
	                        LLMContextPrecisionWithReference(),  # context precision
	                        LLMContextRecall(),  # context recall
	                        ContextEntityRecall(),
	                        Faithfulness(),  # faithfulness
	                        ResponseRelevancy(),  # answer relevancy
	                        SemanticSimilarity(),  # semantic similarity
	                    ],
	                    llm=llm_wrapper,
	                    embeddings=embedding_wrapper,
	                    show_progress=True,
	                )

	                result_df = result.to_pandas()

	                for input_row, row in zip(kept_rows, result_df.itertuples()):
	                    record = {
	                        "Question": input_row["Question"],
	                        "Faithfulness": _clean_score(getattr(row, "faithfulness", None)),
	                        "Answer Relevancy": _clean_score(getattr(row, "answer_relevancy", None)),
	                        "Semantic Similarity": _clean_score(getattr(row, "semantic_similarity", None)),
	                        "Context Precision": _clean_score(getattr(row, "llm_context_precision_with_reference", None)),
	                        "Context Recall": _clean_score(getattr(row, "context_recall", None)),
	                        "Context Entity Recall": _clean_score(getattr(row, "context_entity_recall", None)),
	                    }
	                    records.append(record)

	                    log_to_google_sheet(
	                        question=input_row["Question"],
	                        answer=input_row["Answer"],
	                        contexts=input_row["Context"],
	                        scores=record,
	                    )

	            except Exception as e:
	                # One failed batch must not abort the remaining batches.
	                print(f"批次評估失敗（第 {batch_start+1} 筆起）：{e}")
	                continue

	        if not records:
	            # Without this guard the code below raised KeyError on the
	            # missing "Question" column and surfaced an opaque message.
	            return pd.DataFrame([{"錯誤訊息": "沒有任何資料完成評估，請確認 Context 格式或稍後再試"}]), None

	        score_df = pd.DataFrame(records).fillna("")
	        print("完成評估筆數：", len(score_df))

	        score_df = _append_average_row(score_df)

	        original_name = os.path.basename(uploaded_file.name)
	        filename = os.path.splitext(original_name)[0]
	        output_path = f"{filename}_result.csv"
	        score_df.to_csv(output_path, index=False, encoding="utf-8-sig")
	        print("評估結果已儲存：", output_path)

	        return score_df, output_path

	    except Exception as e:
	        print("評估函式整體錯誤：", str(e))
	        return pd.DataFrame([{"錯誤訊息": f"系統錯誤：{str(e)}"}]), None

	# Validate the upload, report problems as messages, then run the evaluation.
	def check_csv_and_run(file, key):
	    """Validate the uploaded CSV and API key, then run ``RAG_evaluation``.

	    Checks, in order: a file was uploaded; the key is non-blank; the CSV has
	    exactly the columns ``Question``, ``Context``, ``Answer``; it has at
	    least one row; no row with a Question lacks an Answer or Context; and
	    every Context cell parses to a Python list.

	    Args:
	        file: Uploaded file object whose ``name`` attribute is the CSV path,
	            or ``None`` when nothing was uploaded.
	        key: OpenAI API key as typed by the user.

	    Returns:
	        The ``(DataFrame, path_or_None)`` pair from ``RAG_evaluation`` when
	        validation passes; otherwise ``(DataFrame, None)`` with the problem
	        described in the ``錯誤訊息`` column. Never raises.
	    """
	    def error_result(message):
	        return pd.DataFrame([{"錯誤訊息": message}]), None

	    if file is None:
	        return error_result("請上傳檔案！")

	    if not key or not key.strip():
	        return error_result("請輸入 OpenAI API Key")

	    try:
	        df = pd.read_csv(file.name, encoding="utf-8-sig")
	        df.columns = [col.strip() for col in df.columns]

	        required_columns = {"Question", "Context", "Answer"}
	        actual_columns = set(df.columns)

	        if actual_columns != required_columns:
	            return error_result(f"欄位錯誤：應包含欄位 {required_columns}，實際為 {actual_columns}")

	        if df.shape[0] == 0:
	            return error_result("檔案中沒有資料列!")

	        invalid_rows = df[df["Question"].notnull() & (df["Answer"].isnull() | df["Context"].isnull())]
	        if len(invalid_rows) > 0:
	            missing_questions = "\n".join(f"- {q}" for q in invalid_rows["Question"].tolist())
	            return error_result(f"發現 {len(invalid_rows)} 筆資料中 Answer 或 Context 為空：\n{missing_questions}")

	        # Every Context cell must be the text of a Python list.
	        # SECURITY: this used to call ``eval`` on the uploaded text, which
	        # executes arbitrary code; ``ast.literal_eval`` only accepts
	        # literals and raises on anything else.
	        for i, val in df["Context"].dropna().items():
	            try:
	                parsed = ast.literal_eval(val)
	            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
	                return error_result(f"Context 欄格式解析錯誤，請確認其為有效的 list 格式，例如 ['A', 'B']：{str(e)}")
	            if not isinstance(parsed, list):
	                return error_result(f"第 {i + 1} 筆 Context 欄格式錯誤，請確認其內容應為 list")
	    except Exception as e:
	        return error_result(f"評估失敗：{str(e)}")

	    # The upload is valid: run the evaluation. The key is stripped so that
	    # whitespace picked up when pasting does not make a good key fail.
	    try:
	        return RAG_evaluation(file, key.strip())
	    except Exception as e:
	        return error_result(f"系統錯誤：{str(e)}")

	# Gradio UI: instructions, announcement area, upload form, results and FAQ.
	with gr.Blocks() as demo:
	    gr.Markdown("""
	# 📐 RAG系統評估工具 (Science_Evaluation_0)

	### 📄 使用說明
	- 請上傳您 RAG 系統產出的結果檔案（需包含欄位：Question、Context、Answer），並填入您的 OpenAI API Key，以進行評估。
	- ⏳ 完整評估通常需耗時 1 小時以上，若無即時回應，請耐心等候，系統並未當機，謝謝您的理解。

	### 🚦 分流措施
	本工具部署於 Hugging Face Public Space，若同時有多位使用者使用，系統會將您的評估請求排入佇列。
	為避免長時間等待，建議您先僅送出 1 筆資料進行測試，若進度條顯示之預估等待時間超過 2 小時（7000 秒以上），可能是其他使用者正在使用。

	本頁為分流 B，您可以考慮改用其他分流或稍後再試，感謝您的耐心與配合!
	- 🔁 [主頁面 (Main)](https://huggingface.co/spaces/KSLab/RAG_Evaluator)
	- 🔁 [分流 A](https://huggingface.co/spaces/KSLab/RAG_Evaluator_A)
	- 🔁 [分流 C](https://huggingface.co/spaces/KSLab/RAG_Evaluator_C)

	### 📢 系統公告
	""")
	    # Filled with the announcement text when the page loads.
	    announcement_display = gr.Markdown()

	    file_input = gr.File(label="上傳 Evaluation_Dataset.csv")
	    api_key_input = gr.Textbox(label="OpenAI API Key", type="password")
	    submit_btn = gr.Button("開始評估")

	    result_output = gr.Dataframe(label="評估結果")
	    download_link = gr.File(label="下載評估結果(CSV)")

	    # Frequently asked questions
	    gr.Markdown("""
	---
	### ❓ 常見問題 & 解答
	""")
	    # Filled with the FAQ text when the page loads.
	    faq_display = gr.Markdown()

	    # Load the announcement and FAQ each time the page is opened.
	    def load_sheet():
	        return fetch_sheet_content()
	    demo.load(fn=load_sheet, inputs=[], outputs=[announcement_display, faq_display])

	    # The button always goes through check_csv_and_run so the upload is
	    # validated before evaluation; the unused ``wrapped_fn`` shortcut that
	    # called RAG_evaluation directly was dead code and has been removed.
	    submit_btn.click(
	        fn=check_csv_and_run,
	        inputs=[file_input, api_key_input],
	        outputs=[result_output, download_link],
	    )

	demo.launch()