# NOTE: this file was recovered from a Hugging Face Spaces page scrape;
# the original page header ("Spaces: Sleeping") carried no code content.
import os

import gradio as gr
import pandas as pd
from datasets import load_dataset, Dataset
from huggingface_hub import HfApi

# Read the Hugging Face token from the environment variable.
# None if unset -- push_to_hub will then fail, so configure HF_TOKEN
# in the Space's secrets.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Authenticate with Hugging Face.
api = HfApi(token=HF_TOKEN)
# Load the feedback dataset from the Hugging Face Hub. On any failure
# (missing file, network error, ...) fall back to an empty DataFrame
# with the expected schema so the app can still start.
try:
    dataset = load_dataset("MaroueneA/feedback-dataset", data_files="dataset.csv")  # Replace with your dataset file
    df = dataset["train"].to_pandas()
    if "CurrentPromptIndex" not in df.columns:
        # Row 0 of this column persists the index of the prompt being shown.
        df["CurrentPromptIndex"] = 0  # Initialize the column as an integer
    else:
        # Older rows may carry NaN here; normalise to a real integer.
        df["CurrentPromptIndex"] = df["CurrentPromptIndex"].fillna(0).astype(int)
except Exception as e:
    print(f"Error loading dataset: {e}")
    # Column names must match the accessors used elsewhere in this file,
    # which read "LLM1 response"/"LLM2 response" (lowercase "response");
    # the previous fallback used "LLM1 Response" and raised KeyError.
    df = pd.DataFrame(columns=[
        "Prompt", "LLM1 response", "LLM2 response",
        "Human judges quality", "Human judges correctness",
        "Human judges relevance",
        "Human LLM1 Tunisian usage score", "Human LLM2 Tunisian usage score",
        "Human LLM1 flagged issues", "Human LLM2 flagged issues",
        "Evaluated", "CurrentPromptIndex",
    ])
# Function to save feedback to the dataset
def save_feedback(prompt_idx, preference, factual_accuracy, relevance,
                  llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian):
    """Record one human evaluation into row ``prompt_idx`` of the global
    ``df`` and push the updated dataset back to the Hugging Face Hub.

    Args:
        prompt_idx: row index of the prompt that was evaluated.
        preference / factual_accuracy / relevance: judge choices
            ("LLM1", "LLM2", "Tie", "Both are bad").
        llm1_issues / llm2_issues: lists of flagged issue labels
            (may be None when nothing was ticked -- TODO confirm with Gradio).
        llm1_tunisian / llm2_tunisian: integer dialect-usage scores (0-2).
    """
    idx = int(prompt_idx)  # .iloc can hand us a numpy integer
    df.at[idx, "Human judges quality"] = str(preference)
    df.at[idx, "Human judges correctness"] = str(factual_accuracy)
    df.at[idx, "Human judges relevance"] = str(relevance)
    df.at[idx, "Human LLM1 Tunisian usage score"] = int(llm1_tunisian)
    df.at[idx, "Human LLM2 Tunisian usage score"] = int(llm2_tunisian)
    # Guard the joins: an unticked CheckboxGroup may deliver None.
    df.at[idx, "Human LLM1 flagged issues"] = ", ".join(llm1_issues or [])
    df.at[idx, "Human LLM2 flagged issues"] = ", ".join(llm2_issues or [])
    df.at[idx, "Evaluated"] = True
    # Persist: convert the updated DataFrame back to a Hugging Face Dataset
    # and push it to the Hub (one network round-trip per submission).
    updated_dataset = Dataset.from_pandas(df)
    updated_dataset.push_to_hub("MaroueneA/feedback-dataset", token=HF_TOKEN)
# Function to get the next unevaluated prompt
def get_next_prompt():
    """Return the first not-yet-evaluated row of the global ``df``.

    Also records that row's index in ``df.at[0, "CurrentPromptIndex"]`` so
    the scan position survives restarts.  Returns ``None`` when ``df`` is
    empty or every prompt has been evaluated.
    """
    # Guard: a failed dataset load leaves df empty, and .iloc[0] below
    # would raise IndexError (this function runs at module import time).
    if df.empty:
        return None
    # Current scan position, stored as row 0 of the index column.
    current_prompt_idx = int(df["CurrentPromptIndex"].iloc[0])
    # Scan forward for the next unevaluated prompt.
    for idx in range(current_prompt_idx, len(df)):
        if not df.at[idx, "Evaluated"]:
            df.at[0, "CurrentPromptIndex"] = idx  # remember where we are
            return df.iloc[idx]
    return None
# Seed the UI with the first unevaluated prompt (placeholders if none exist).
current_prompt = get_next_prompt()
if current_prompt is None:
    initial_prompt, initial_llm1, initial_llm2 = "No prompts available.", "", ""
else:
    initial_prompt = current_prompt["Prompt"]
    initial_llm1 = current_prompt["LLM1 response"]
    initial_llm2 = current_prompt["LLM2 response"]
# Function to submit feedback and get the next prompt
def submit_feedback(preference, factual_accuracy, relevance,
                    llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian):
    """Save the judge's feedback for the prompt currently on screen, then
    load the next unevaluated prompt.

    Returns (prompt, llm1_response, llm2_response, status_message) for the
    four Gradio output components.
    """
    # Save feedback for the CURRENT prompt BEFORE advancing.  The previous
    # version called get_next_prompt() first; since the current prompt was
    # still unevaluated it was returned again (same prompt redisplayed) and
    # the following submission's feedback landed on the wrong row.
    current_idx = int(df["CurrentPromptIndex"].iloc[0])
    save_feedback(current_idx, preference, factual_accuracy, relevance,
                  llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian)
    # Advance: this also updates CurrentPromptIndex inside df, so no manual
    # increment is needed here.
    next_prompt = get_next_prompt()
    if next_prompt is None:
        return "No more prompts available.", "", "", "Feedback saved successfully!"
    print(f"Updated Prompt Index: {df['CurrentPromptIndex'].iloc[0]}")
    return next_prompt["Prompt"], next_prompt["LLM1 response"], next_prompt["LLM2 response"], "Feedback saved successfully!"
# Gradio interface: one screen showing the prompt, both model responses,
# the judge inputs, and a submit button that saves and advances.
with gr.Blocks() as demo:
    with gr.Row():
        prompt = gr.Textbox(label="Prompt", value=initial_prompt, interactive=False)
    with gr.Row():
        llm1_response = gr.Textbox(label="LLM1 Response", value=initial_llm1, interactive=False)
        llm2_response = gr.Textbox(label="LLM2 Response", value=initial_llm2, interactive=False)
    with gr.Row():
        preference = gr.Radio(["LLM1", "LLM2", "Tie", "Both are bad"], label="Which response do you prefer?")
        factual_accuracy = gr.Radio(["LLM1", "LLM2", "Tie", "Both are bad"], label="Which response is more factually accurate?")
        relevance = gr.Radio(["LLM1", "LLM2", "Tie", "Both are bad"], label="Which response better addresses the prompt?")
    with gr.Row():
        llm1_issues = gr.CheckboxGroup(
            ["Hate Speech", "Not Arabic", "Inappropriate Content", "Sexual Content", "Untruthful Info", "Violent Content", "Personal Information"],
            label="Does Response 1 contain any issues?"
        )
        llm2_issues = gr.CheckboxGroup(
            ["Hate Speech", "Not Arabic", "Inappropriate Content", "Sexual Content", "Untruthful Info", "Violent Content", "Personal Information"],
            label="Does Response 2 contain any issues?"
        )
    with gr.Row():
        llm1_tunisian = gr.Radio([0, 1, 2], label="Rate LLM1's use of Tunisian Arabic")
        llm2_tunisian = gr.Radio([0, 1, 2], label="Rate LLM2's use of Tunisian Arabic")
    with gr.Row():
        submit_btn = gr.Button("Submit Feedback and Next Prompt")
    # Name the status box instead of instantiating it anonymously inside the
    # outputs list; it is created at the same point, so layout is unchanged,
    # but the component is now referenceable.
    status = gr.Textbox(label="Status")
    # Submit feedback and load the next prompt.
    submit_btn.click(
        submit_feedback,
        inputs=[preference, factual_accuracy, relevance, llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian],
        outputs=[prompt, llm1_response, llm2_response, status],
    )

# Launch the app
demo.launch()