# Hugging Face Space: human-evaluation UI for comparing two LLM responses.
| import gradio as gr | |
| import pandas as pd | |
| import os | |
| import shutil | |
| import time | |
# ---------------------------------------------------------------------------
# File locations
# ---------------------------------------------------------------------------
# Read-only reference dataset shipped with the Space.
initial_file = './Updated_HumanEvaluations_columns_and_rows_shuffled.xlsx'
# Local working directory holding the copy that accumulates annotations.
storage_dir = './data'
# The dataset being modified by annotators.
storage_file = f'{storage_dir}/Updated_HumanEvaluations.xlsx'

# Make sure the storage directory exists before any file access.
os.makedirs(storage_dir, exist_ok=True)

print("π Checking if dataset exists:", os.path.exists(storage_file))
print("π Using dataset from:", storage_file)

# First run: seed the working copy from the reference dataset.
if not os.path.exists(storage_file):
    print("β οΈ No existing dataset found, copying from reference dataset...")
    shutil.copy(initial_file, storage_file)

# Human-judgement columns are forced to ``str`` so that mixed text/number
# entries do not trip pandas' per-column type inference on load.
_judgement_dtypes = {
    'Human judges quality': str,
    'Human judges correctness': str,
    'Human judges relevance': str,
    'Human LLM1 flagged issues': str,
    'Human LLM2 flagged issues': str,
    'Human LLM1 Tunisian usage score': str,
    'Human LLM2 Tunisian usage score': str,
}
df = pd.read_excel(storage_file, dtype=_judgement_dtypes)

print("β Dataset loaded successfully!")
print(df.head())  # quick sanity-check of the loaded rows
def get_next_prompt():
    """Return ``(prompt, LLM1 response, LLM2 response)`` for the first row
    still missing any human judgement.

    The dataset is re-read from ``storage_file`` first so that writes made
    by a previous submission (or another session) are picked up.  When every
    row has been annotated, a sentinel triple ``("No more prompts!", "", "")``
    is returned instead.
    """
    global df
    print("π Checking for next unfilled prompt...")
    # β Reload dataset to ensure latest version is used
    df = pd.read_excel(storage_file)
    # A row counts as "unfilled" if ANY of these judgement columns is NaN.
    # (Replaces the original hard-to-read 7-way ``pd.isna(...) or ...`` chain.)
    feedback_columns = [
        'Human judges quality',
        'Human judges correctness',
        'Human judges relevance',
        'Human LLM1 Tunisian usage score',
        'Human LLM2 Tunisian usage score',
        'Human LLM1 flagged issues',
        'Human LLM2 flagged issues',
    ]
    for index, row in df.iterrows():
        print(f"π Checking row {index} for missing values...")
        if row[feedback_columns].isna().any():
            print(f"β Found next unfilled prompt at index {index}")
            return row['Prompt'], row['LLM1 response'], row['LLM2 response']
    print("β οΈ No more unfilled prompts!")
    return "No more prompts!", "", ""
def submit_feedback(prefer, accuracy, relevance, llm1_flags, llm2_flags, llm1_tunisian, llm2_tunisian):
    """Store one annotator's feedback on the first still-unfilled row,
    persist the dataset to ``storage_file``, and return the next prompt.

    Parameters mirror the Gradio widgets: three pairwise-comparison radio
    values, two checkbox-group lists of flagged issues, and two 0-2
    Tunisian-usage scores.  Returns the ``(prompt, response1, response2)``
    triple from :func:`get_next_prompt`.
    """
    global df
    print("π‘ Receiving feedback submission...")
    # Same "unfilled" criterion as get_next_prompt: any judgement column NaN.
    feedback_columns = [
        'Human judges quality',
        'Human judges correctness',
        'Human judges relevance',
        'Human LLM1 Tunisian usage score',
        'Human LLM2 Tunisian usage score',
        'Human LLM1 flagged issues',
        'Human LLM2 flagged issues',
    ]
    for index, row in df.iterrows():
        if row[feedback_columns].isna().any():
            print(f"π’ Updating row index {index} with feedback...")
            # β Convert values explicitly before saving; empty/unset widgets
            # are recorded as "N/A" / "None" / 0 so the row reads as filled.
            df.at[index, 'Human judges quality'] = str(prefer) if prefer else "N/A"
            df.at[index, 'Human judges correctness'] = str(accuracy) if accuracy else "N/A"
            df.at[index, 'Human judges relevance'] = str(relevance) if relevance else "N/A"
            df.at[index, 'Human LLM1 flagged issues'] = ", ".join(llm1_flags) if llm1_flags else "None"
            df.at[index, 'Human LLM2 flagged issues'] = ", ".join(llm2_flags) if llm2_flags else "None"
            df.at[index, 'Human LLM1 Tunisian usage score'] = int(llm1_tunisian) if llm1_tunisian else 0
            df.at[index, 'Human LLM2 Tunisian usage score'] = int(llm2_tunisian) if llm2_tunisian else 0
            try:
                # BUG FIX: the previous ``df.fillna("N/A", inplace=True)``
                # stamped "N/A" into the judgement columns of every OTHER
                # unfilled row too, so after the first submission no row was
                # ever NaN again and get_next_prompt reported "No more
                # prompts!" for the rest of the dataset.  Only the current
                # row is written here; its judgement columns are all set
                # explicitly above, so no blanket fillna is needed.

                # β Save dataset with explicit encoding
                df.to_excel(storage_file, index=False, engine="openpyxl")
                time.sleep(1)  # β Allow time for saving
                print("β Dataset saved successfully at:", storage_file)
                # β Reload dataset to verify update
                df = pd.read_excel(storage_file)
                print("π Reloaded dataset preview:")
                # β Show the updated row to confirm changes
                print(df.iloc[index])
            except Exception as e:
                print("β ERROR Saving File:", str(e))
            break  # β Move to the next prompt after saving
    return get_next_prompt()
# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
# Shared choice lists for the judgement widgets.
_comparison_choices = ["LLM1", "LLM2", "Tie", "Both are bad"]
_issue_choices = ["Hate Speech", "Not Arabic", "Inappropriate Content",
                  "Sexual Content", "Untruthful Info", "Violent Content",
                  "Personal Information"]

with gr.Blocks() as demo:
    gr.Markdown("## LLM Response Evaluation")

    # Read-only display of the current prompt and the two model responses.
    prompt_box = gr.Textbox(label="Prompt", interactive=False)
    response1_box = gr.Textbox(label="Response 1", interactive=False)
    response2_box = gr.Textbox(label="Response 2", interactive=False)

    # Pairwise-comparison judgements.
    prefer_choice = gr.Radio(
        _comparison_choices, label="Which response do you prefer?")
    accuracy_choice = gr.Radio(
        _comparison_choices, label="Which response is more factually accurate?")
    relevance_choice = gr.Radio(
        _comparison_choices, label="Which response better addresses the prompt?")

    # Per-response content flags.
    llm1_issue_flags = gr.CheckboxGroup(
        _issue_choices, label="Does Response 1 contain any issues?")
    llm2_issue_flags = gr.CheckboxGroup(
        _issue_choices, label="Does Response 2 contain any issues?")

    # 0-2 dialect-usage ratings.
    llm1_dialect_score = gr.Radio(
        [0, 1, 2], label="Rate LLM1's use of Tunisian Arabic")
    llm2_dialect_score = gr.Radio(
        [0, 1, 2], label="Rate LLM2's use of Tunisian Arabic")

    submit_btn = gr.Button("Submit and Get New Prompt")
    submit_btn.click(
        submit_feedback,
        inputs=[prefer_choice, accuracy_choice, relevance_choice,
                llm1_issue_flags, llm2_issue_flags,
                llm1_dialect_score, llm2_dialect_score],
        outputs=[prompt_box, response1_box, response2_box],
    )

    # Populate the first prompt as soon as the page loads.
    demo.load(get_next_prompt,
              outputs=[prompt_box, response1_box, response2_box])

if __name__ == "__main__":
    demo.launch()