Spaces:

MaroueneA
/

llm-evaluation

Build error

App Files Files Community

llm-evaluation / app.py

MaroueneA

Update app.py

a982ad4 verified about 1 year ago

raw

history blame contribute delete

7.34 kB

	import gradio as gr
	import pandas as pd
	import gspread
	from oauth2client.service_account import ServiceAccountCredentials

	# Location of the evaluation data in Google Sheets.
	SPREADSHEET_NAME = "dataset"  # The name of your Google Sheet
	WORKSHEET_NAME = "sheet1"  # The tab/worksheet name

	# Shared module state, filled in by init_gsheets():
	#   df            - DataFrame holding every data row of the worksheet
	#   current_index - index label of the row under review, None when none is left
	#   ws            - worksheet handle used to write feedback back to the sheet
	df = current_index = ws = None

	def init_gsheets():
	    """
	    Authenticate with Google Sheets and load the whole worksheet into memory.

	    Reads service-account credentials from ``creds.json``, opens worksheet
	    ``WORKSHEET_NAME`` of spreadsheet ``SPREADSHEET_NAME`` and stores the
	    handle in the module-level ``ws``. Every cell value is loaded into the
	    module-level ``df`` (first sheet row = column headers), and
	    ``current_index`` is set to the index label of the first row whose
	    "Human judges quality" cell is empty, or to ``None`` if there is none.

	    Raises:
	        ValueError: If the worksheet has no header row at all.
	        KeyError: If the "Human judges quality" column is missing.
	    """
	    global df, current_index, ws

	    # Scopes for Google Sheets
	    scope = [
	        "https://spreadsheets.google.com/feeds",
	        "https://www.googleapis.com/auth/spreadsheets",
	        "https://www.googleapis.com/auth/drive",
	    ]

	    creds = ServiceAccountCredentials.from_json_keyfile_name("creds.json", scope)
	    gc = gspread.authorize(creds)
	    sh = gc.open(SPREADSHEET_NAME)
	    ws = sh.worksheet(WORKSHEET_NAME)

	    # Read all values from the sheet; an empty sheet yields no usable header
	    # row, so fail with a clear message instead of an IndexError/KeyError.
	    data = ws.get_all_values()
	    if not data or not data[0]:
	        raise ValueError(
	            f"Worksheet {WORKSHEET_NAME!r} of {SPREADSHEET_NAME!r} is empty: "
	            "a header row is required."
	        )

	    header, *rows = data  # row 1 = headers
	    # Keep plain object columns so later per-cell assignments of non-string
	    # values (ints, None) are accepted even when pandas would otherwise
	    # infer a strict string dtype for all-text columns.
	    df = pd.DataFrame(rows, columns=header, dtype=object)

	    review_column = "Human judges quality"
	    if review_column not in df.columns:
	        raise KeyError(
	            f"Column {review_column!r} not found in worksheet {WORKSHEET_NAME!r}"
	        )

	    # A row counts as unreviewed while its "Human judges quality" cell is empty.
	    unreviewed = df.index[df[review_column] == ""]
	    current_index = unreviewed[0] if len(unreviewed) > 0 else None

	def get_prompt_data():
	    """
	    Fetch what should be displayed for the row currently under review.

	    Returns a 4-tuple ``(prompt, llm1_response, llm2_response, all_done)``.
	    When ``current_index`` is ``None`` the prompt slot carries a "done"
	    message, both responses are empty strings and ``all_done`` is ``True``.
	    """
	    if current_index is not None:
	        record = df.loc[current_index]
	        shown = tuple(
	            record[column]
	            for column in ("Prompt", "LLM1 response", "LLM2 response")
	        )
	        return (*shown, False)

	    # Nothing left to review.
	    return "All rows have been reviewed.", "", "", True  # all_done = True

	def save_and_load_next(
	    preference,
	    factual_accuracy,
	    relevance,
	    llm1_issues,
	    llm2_issues,
	    llm1_tunisian_score,
	    llm2_tunisian_score
	):
	    """
	    Save the feedback for the current row and advance to the next one.

	    1) Stores the feedback in the in-memory DataFrame
	    2) Writes the same values to the matching cells in Google Sheets
	    3) Moves ``current_index`` to the next unreviewed row

	    Args:
	        preference: Selected answer for "Human judges quality". Required,
	            because an empty value in that column is what marks a row as
	            still unreviewed.
	        factual_accuracy: Selected answer for "Human judges correctness".
	        relevance: Selected answer for "Human judges relevance".
	        llm1_issues: Issue labels flagged for LLM1 (joined with ", ").
	        llm2_issues: Issue labels flagged for LLM2 (joined with ", ").
	        llm1_tunisian_score: Tunisian usage score given to LLM1.
	        llm2_tunisian_score: Tunisian usage score given to LLM2.

	    Returns:
	        A 4-tuple ``(prompt, llm1_response, llm2_response, status_message)``
	        describing what to display next.
	    """
	    global df, current_index, ws

	    # If we're out of rows, just return "all done"
	    if current_index is None:
	        return (
	            "All rows have been reviewed.",  # prompt
	            "",  # LLM1 resp
	            "",  # LLM2 resp
	            "No more rows to review!"  # status message
	        )

	    # Without a preference the row would be skipped in memory (None != "")
	    # while its sheet cell stays empty, so it would silently come back as
	    # unreviewed on the next start. Keep the reviewer on the same row instead.
	    if not preference:
	        prompt, llm1resp, llm2resp, _ = get_prompt_data()
	        return (
	            prompt,
	            llm1resp,
	            llm2resp,
	            "Please select which response you prefer before submitting."
	        )

	    # Column name -> value, in the order the cells are written.
	    feedback = {
	        "Human judges quality": preference,
	        "Human judges correctness": factual_accuracy,
	        "Human judges relevance": relevance,
	        "Human LLM1 flagged issues": ", ".join(llm1_issues) if llm1_issues else "",
	        "Human LLM2 flagged issues": ", ".join(llm2_issues) if llm2_issues else "",
	        "Human LLM1 Tunisian usage score": llm1_tunisian_score,
	        "Human LLM2 Tunisian usage score": llm2_tunisian_score,
	    }

	    # 1. Update the in-memory DataFrame
	    for column_name, value in feedback.items():
	        df.at[current_index, column_name] = value

	    # 2. Write updates back to Google Sheets
	    sheet_row = current_index + 2  # row 1 is headers
	    headers = list(df.columns)
	    for column_name, value in feedback.items():
	        col_index = headers.index(column_name) + 1  # 1-based indexing
	        ws.update_cell(sheet_row, col_index, value)

	    # 3. Move to the next unreviewed row
	    unreviewed_rows = df[df["Human judges quality"] == ""]
	    if len(unreviewed_rows) == 0:
	        current_index = None
	        return (
	            "All rows have been reviewed.",  # prompt
	            "",  # LLM1 resp
	            "",  # LLM2 resp
	            "All rows have been reviewed. Thank you!"  # status message
	        )

	    current_index = unreviewed_rows.index[0]
	    prompt, llm1resp, llm2resp, _ = get_prompt_data()
	    return (
	        prompt,
	        llm1resp,
	        llm2resp,
	        "Feedback saved! Moving to the next prompt..."
	    )

	def on_load():
	    """
	    Produce the initial display values when the interface loads.

	    Returns ``(prompt, llm1_response, llm2_response, status_message)``; the
	    status is empty while a row is available and a 'done' notice otherwise.
	    """
	    prompt, first_answer, second_answer, finished = get_prompt_data()
	    status = "No next prompt. All done." if finished else ""
	    return prompt, first_answer, second_answer, status

	# ---- Load the sheet data before the interface is built
	init_gsheets()


	def _comparison_radio(label):
	    """Create a radio asking which of the two responses wins on one criterion."""
	    return gr.Radio(["LLM1", "LLM2", "Tie", "Both are bad"], label=label)


	def _issues_checkboxes(label):
	    """Create the checkbox group used to flag problems in one response."""
	    return gr.CheckboxGroup(
	        [
	            "Hate Speech",
	            "Not Arabic",
	            "Inappropriate Content",
	            "Sexual Content",
	            "Untruthful Info",
	            "Violent Content",
	            "Personal Information",
	        ],
	        label=label,
	    )


	def _tunisian_score_radio(label):
	    """Create a 0-2 radio, preselected to 0, rating Tunisian Arabic usage."""
	    return gr.Radio([0, 1, 2], label=label, value=0)


	# ---- Gradio interface: read-only prompt/response panes plus the feedback form
	with gr.Blocks() as demo:
	    gr.Markdown("# LLM Responses Evaluation (Google Sheets)")

	    prompt_text = gr.Textbox(label="Prompt", interactive=False)
	    llm1_text = gr.Textbox(label="LLM1 Response", interactive=False)
	    llm2_text = gr.Textbox(label="LLM2 Response", interactive=False)
	    status_msg = gr.Markdown()

	    preference = _comparison_radio("Which response do you prefer?")
	    factual_accuracy = _comparison_radio("Which response is more factually accurate?")
	    relevance = _comparison_radio("Which response better addresses the prompt?")

	    llm1_issues = _issues_checkboxes("Does Response 1 contain any issues?")
	    llm2_issues = _issues_checkboxes("Does Response 2 contain any issues?")

	    llm1_tunisian_score = _tunisian_score_radio("Rate LLM1's use of Tunisian Arabic")
	    llm2_tunisian_score = _tunisian_score_radio("Rate LLM2's use of Tunisian Arabic")

	    submit_btn = gr.Button("Submit Feedback")

	    # Order must match the parameters of save_and_load_next.
	    feedback_inputs = [
	        preference,
	        factual_accuracy,
	        relevance,
	        llm1_issues,
	        llm2_issues,
	        llm1_tunisian_score,
	        llm2_tunisian_score,
	    ]
	    # Both callbacks return (prompt, LLM1 response, LLM2 response, status).
	    display_outputs = [prompt_text, llm1_text, llm2_text, status_msg]

	    # One click saves the feedback and shows the next prompt.
	    submit_btn.click(
	        fn=save_and_load_next,
	        inputs=feedback_inputs,
	        outputs=display_outputs,
	    )

	    # When the page first loads, show the first unreviewed prompt.
	    demo.load(fn=on_load, inputs=[], outputs=display_outputs)

	demo.launch()