# feedback-app / app.py
# (Hugging Face Space page header captured by the scrape — kept as comments,
#  since the raw lines are not valid Python.)
# Author: MaroueneA — commit 2a15eee (verified): "Update app.py"
import gradio as gr
from datasets import load_dataset, Dataset
import pandas as pd
import os
from huggingface_hub import HfApi
# Hugging Face access token, supplied by the Space's secret environment
# variable (None when the secret is not configured).
HF_TOKEN = os.getenv("HF_TOKEN")

# Hub client authenticated with that token.
api = HfApi(token=HF_TOKEN)
# Load the evaluation dataset from the Hugging Face Hub. If anything goes
# wrong (network, missing repo, bad token), fall back to an empty DataFrame
# with the expected schema so the app can still start and render.
try:
    dataset = load_dataset("MaroueneA/feedback-dataset", data_files="dataset.csv")
    df = dataset["train"].to_pandas()
    if "CurrentPromptIndex" not in df.columns:
        # Row 0 of this column acts as a persisted cursor into the prompt list.
        df["CurrentPromptIndex"] = 0
    else:
        # Older rows may carry NaN; normalize to int so cursor arithmetic works.
        df["CurrentPromptIndex"] = df["CurrentPromptIndex"].fillna(0).astype(int)
except Exception as e:
    print(f"Error loading dataset: {e}")
    # NOTE(review): the response columns must match the access sites, which read
    # "LLM1 response"/"LLM2 response" (lowercase "response"). The original
    # fallback declared "LLM1 Response"/"LLM2 Response", which those lookups
    # would never find — fixed here for consistency.
    df = pd.DataFrame(columns=[
        "Prompt", "LLM1 response", "LLM2 response",
        "Human judges quality", "Human judges correctness",
        "Human judges relevance",
        "Human LLM1 Tunisian usage score", "Human LLM2 Tunisian usage score",
        "Human LLM1 flagged issues", "Human LLM2 flagged issues",
        "Evaluated", "CurrentPromptIndex",
    ])
# Function to save feedback to the dataset
def save_feedback(prompt_idx, preference, factual_accuracy, relevance,
                  llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian):
    """Record one human evaluation in the shared DataFrame and push it to the Hub.

    Args:
        prompt_idx: Row label of the prompt being rated (coerced to int so
            ``.at`` gets a concrete integer label even if a numpy/float value
            leaks in from the cursor column).
        preference, factual_accuracy, relevance: Radio selections (stored as str).
        llm1_issues, llm2_issues: Lists of flagged issue labels; may be empty
            or None when nothing was checked.
        llm1_tunisian, llm2_tunisian: 0-2 Tunisian-Arabic usage scores.
    """
    idx = int(prompt_idx)
    df.at[idx, "Human judges quality"] = str(preference)
    df.at[idx, "Human judges correctness"] = str(factual_accuracy)
    df.at[idx, "Human judges relevance"] = str(relevance)
    df.at[idx, "Human LLM1 Tunisian usage score"] = int(llm1_tunisian)
    df.at[idx, "Human LLM2 Tunisian usage score"] = int(llm2_tunisian)
    # A Gradio CheckboxGroup can deliver None when untouched; store "" then
    # instead of raising TypeError inside join().
    df.at[idx, "Human LLM1 flagged issues"] = ", ".join(llm1_issues or [])
    df.at[idx, "Human LLM2 flagged issues"] = ", ".join(llm2_issues or [])
    df.at[idx, "Evaluated"] = True

    # Persist: convert the full table back to a datasets.Dataset and push it.
    updated_dataset = Dataset.from_pandas(df)
    updated_dataset.push_to_hub("MaroueneA/feedback-dataset", token=HF_TOKEN)
# Function to get the next unevaluated prompt
def get_next_prompt():
    """Return the next unevaluated prompt row (a pandas Series), or None.

    Side effect: advances the cursor persisted in df.at[0, "CurrentPromptIndex"]
    to the index of the row it returns.
    """
    # BUG FIX: the fallback DataFrame is empty, and .iloc[0] on an empty
    # column raises IndexError — which crashed the app at startup. Bail out
    # early when there are no rows at all.
    if df.empty:
        return None
    start = int(df["CurrentPromptIndex"].iloc[0])
    # Scan forward from the persisted cursor for the first unevaluated row.
    for idx in range(start, len(df)):
        if not df.at[idx, "Evaluated"]:
            df.at[0, "CurrentPromptIndex"] = idx  # remember where we are
            return df.iloc[idx]
    return None
# Seed the UI widgets with the first unevaluated prompt, or placeholder
# text when the dataset holds nothing left to rate.
first_row = get_next_prompt()
if first_row is None:
    initial_prompt, initial_llm1, initial_llm2 = "No prompts available.", "", ""
else:
    initial_prompt = first_row["Prompt"]
    initial_llm1 = first_row["LLM1 response"]
    initial_llm2 = first_row["LLM2 response"]
# Function to submit feedback and get the next prompt
def submit_feedback(preference, factual_accuracy, relevance, llm1_issues,
                    llm2_issues, llm1_tunisian, llm2_tunisian):
    """Save feedback for the currently displayed prompt, then load the next one.

    Returns a (prompt, llm1_response, llm2_response, status) tuple for the UI.
    """
    # BUG FIX: the original called get_next_prompt() *before* saving. Since the
    # displayed row was not yet marked Evaluated, that call returned the same
    # row and advanced the cursor — so the same prompt was redisplayed, later
    # submissions wrote feedback to the wrong row, and the final prompt's
    # feedback was dropped by the early return. Save first, then advance.
    current_idx = int(df["CurrentPromptIndex"].iloc[0])
    save_feedback(current_idx, preference, factual_accuracy, relevance,
                  llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian)

    # Move the persisted cursor past the row we just evaluated.
    df.at[0, "CurrentPromptIndex"] = current_idx + 1
    print(f"Updated Prompt Index: {df['CurrentPromptIndex'].iloc[0]}")

    next_prompt = get_next_prompt()
    if next_prompt is None:
        return "No more prompts available.", "", "", "Feedback saved successfully!"
    return (next_prompt["Prompt"], next_prompt["LLM1 response"],
            next_prompt["LLM2 response"], "Feedback saved successfully!")
# Gradio interface
# Single-page evaluation UI: shows one prompt with two model responses and
# collects comparative judgments, issue flags, and dialect-usage scores.
with gr.Blocks() as demo:
    with gr.Row():
        # Read-only display of the prompt under evaluation.
        prompt = gr.Textbox(label="Prompt", value=initial_prompt, interactive=False)
    with gr.Row():
        # The two model responses being compared, side by side.
        llm1_response = gr.Textbox(label="LLM1 Response", value=initial_llm1, interactive=False)
        llm2_response = gr.Textbox(label="LLM2 Response", value=initial_llm2, interactive=False)
    with gr.Row():
        # Head-to-head judgments: overall quality, correctness, relevance.
        preference = gr.Radio(["LLM1", "LLM2", "Tie", "Both are bad"], label="Which response do you prefer?")
        factual_accuracy = gr.Radio(["LLM1", "LLM2", "Tie", "Both are bad"], label="Which response is more factually accurate?")
        relevance = gr.Radio(["LLM1", "LLM2", "Tie", "Both are bad"], label="Which response better addresses the prompt?")
    with gr.Row():
        # Safety/quality issue flags, one checkbox group per model response.
        llm1_issues = gr.CheckboxGroup(
            ["Hate Speech", "Not Arabic", "Inappropriate Content", "Sexual Content", "Untruthful Info", "Violent Content", "Personal Information"],
            label="Does Response 1 contain any issues?"
        )
        llm2_issues = gr.CheckboxGroup(
            ["Hate Speech", "Not Arabic", "Inappropriate Content", "Sexual Content", "Untruthful Info", "Violent Content", "Personal Information"],
            label="Does Response 2 contain any issues?"
        )
    with gr.Row():
        # 0-2 score for how well each model uses Tunisian Arabic.
        llm1_tunisian = gr.Radio([0, 1, 2], label="Rate LLM1's use of Tunisian Arabic")
        llm2_tunisian = gr.Radio([0, 1, 2], label="Rate LLM2's use of Tunisian Arabic")
    with gr.Row():
        submit_btn = gr.Button("Submit Feedback and Next Prompt")
    # Submit feedback and load the next prompt
    # NOTE(review): the status Textbox is created inline in `outputs`, so it is
    # rendered after the button rather than declared with the other widgets.
    submit_btn.click(
        submit_feedback,
        inputs=[preference, factual_accuracy, relevance, llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian],
        outputs=[prompt, llm1_response, llm2_response, gr.Textbox(label="Status")]
    )
# Launch the app
demo.launch()