import gradio as gr
from datasets import load_dataset, Dataset
import pandas as pd
import os
from huggingface_hub import HfApi

# Read the Hugging Face token from the environment variable.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Authenticate with Hugging Face.
api = HfApi(token=HF_TOKEN)

# Load the evaluation dataset from the Hub; on failure fall back to an empty
# frame with the expected schema so the UI can still start.
try:
    dataset = load_dataset("MaroueneA/feedback-dataset", data_files="dataset.csv")  # Replace with your dataset file
    df = dataset["train"].to_pandas()
    if "CurrentPromptIndex" not in df.columns:
        df["CurrentPromptIndex"] = 0  # Initialize the cursor column as an integer
    else:
        # Fill NaN values in the "CurrentPromptIndex" column with 0.
        df["CurrentPromptIndex"] = df["CurrentPromptIndex"].fillna(0).astype(int)
except Exception as e:
    print(f"Error loading dataset: {e}")
    # NOTE(review): column casing must match the hub dataset exactly; the read
    # paths below use "LLM1 response"/"LLM2 response" (lowercase r), so the
    # fallback schema uses the same names — confirm against the hub CSV.
    df = pd.DataFrame(columns=[
        "Prompt", "LLM1 response", "LLM2 response",
        "Human judges quality", "Human judges correctness", "Human judges relevance",
        "Human LLM1 Tunisian usage score", "Human LLM2 Tunisian usage score",
        "Human LLM1 flagged issues", "Human LLM2 flagged issues",
        "Evaluated", "CurrentPromptIndex",
    ])


def save_feedback(prompt_idx, preference, factual_accuracy, relevance,
                  llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian):
    """Record one evaluator's feedback into row `prompt_idx` of the global
    DataFrame, mark the row evaluated, and push the whole dataset back to the
    Hugging Face Hub.

    `llm1_issues` / `llm2_issues` are lists of flagged-issue labels; they may
    be None or empty when nothing was flagged.
    """
    prompt_idx = int(prompt_idx)
    df.at[prompt_idx, "Human judges quality"] = str(preference)
    df.at[prompt_idx, "Human judges correctness"] = str(factual_accuracy)
    df.at[prompt_idx, "Human judges relevance"] = str(relevance)
    df.at[prompt_idx, "Human LLM1 Tunisian usage score"] = int(llm1_tunisian)
    df.at[prompt_idx, "Human LLM2 Tunisian usage score"] = int(llm2_tunisian)
    # Guard against None from an untouched CheckboxGroup (join would raise).
    df.at[prompt_idx, "Human LLM1 flagged issues"] = ", ".join(llm1_issues or [])
    df.at[prompt_idx, "Human LLM2 flagged issues"] = ", ".join(llm2_issues or [])
    df.at[prompt_idx, "Evaluated"] = True
    # Convert the updated DataFrame back to a Hugging Face Dataset and push it.
    updated_dataset = Dataset.from_pandas(df)
    updated_dataset.push_to_hub("MaroueneA/feedback-dataset", token=HF_TOKEN)


def get_next_prompt():
    """Return the next unevaluated row (as a pandas Series), or None when all
    rows are evaluated.

    Side effect: advances the persisted cursor (row 0 of "CurrentPromptIndex")
    to the returned row's index.
    """
    current_prompt_idx = int(df["CurrentPromptIndex"].iloc[0])
    for idx in range(current_prompt_idx, len(df)):
        evaluated = df.at[idx, "Evaluated"]
        # BUGFIX: a never-evaluated row may hold NaN, and `not NaN` is False,
        # so such rows were silently skipped. Test for NaN explicitly.
        if pd.isna(evaluated) or not evaluated:
            df.at[0, "CurrentPromptIndex"] = idx  # Persist the cursor position
            return df.iloc[idx]
    return None


# Initialize the first prompt shown in the UI.
current_prompt = get_next_prompt()
if current_prompt is not None:
    initial_prompt = current_prompt["Prompt"]
    initial_llm1 = current_prompt["LLM1 response"]
    initial_llm2 = current_prompt["LLM2 response"]
else:
    initial_prompt = "No prompts available."
    initial_llm1 = ""
    initial_llm2 = ""


def submit_feedback(preference, factual_accuracy, relevance,
                    llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian):
    """Save feedback for the prompt currently on screen, then advance to and
    return the next unevaluated prompt.

    Returns a 4-tuple: (prompt_text, llm1_text, llm2_text, status_message).
    """
    current_idx = int(df["CurrentPromptIndex"].iloc[0])
    if current_idx >= len(df):
        return "No more prompts available.", "", "", "Feedback saved successfully!"
    # BUGFIX: the original fetched the next prompt first (which moved the
    # cursor) and then saved feedback at the *new* cursor position, writing
    # the evaluation onto the wrong row. Save against the current row BEFORE
    # advancing; get_next_prompt() then skips it because it is now Evaluated.
    save_feedback(current_idx, preference, factual_accuracy, relevance,
                  llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian)
    next_prompt = get_next_prompt()
    if next_prompt is None:
        return "No more prompts available.", "", "", "Feedback saved successfully!"
    print(f"Updated Prompt Index: {df['CurrentPromptIndex'].iloc[0]}")
    return (next_prompt["Prompt"], next_prompt["LLM1 response"],
            next_prompt["LLM2 response"], "Feedback saved successfully!")
# Gradio interface: shows one prompt with both model responses, collects the
# evaluator's judgments, and advances on submit.
with gr.Blocks() as demo:
    with gr.Row():
        prompt = gr.Textbox(label="Prompt", value=initial_prompt, interactive=False)
    with gr.Row():
        llm1_response = gr.Textbox(label="LLM1 Response", value=initial_llm1, interactive=False)
        llm2_response = gr.Textbox(label="LLM2 Response", value=initial_llm2, interactive=False)
    with gr.Row():
        preference = gr.Radio(["LLM1", "LLM2", "Tie", "Both are bad"],
                              label="Which response do you prefer?")
        factual_accuracy = gr.Radio(["LLM1", "LLM2", "Tie", "Both are bad"],
                                    label="Which response is more factually accurate?")
        relevance = gr.Radio(["LLM1", "LLM2", "Tie", "Both are bad"],
                             label="Which response better addresses the prompt?")
    with gr.Row():
        llm1_issues = gr.CheckboxGroup(
            ["Hate Speech", "Not Arabic", "Inappropriate Content", "Sexual Content",
             "Untruthful Info", "Violent Content", "Personal Information"],
            label="Does Response 1 contain any issues?"
        )
        llm2_issues = gr.CheckboxGroup(
            ["Hate Speech", "Not Arabic", "Inappropriate Content", "Sexual Content",
             "Untruthful Info", "Violent Content", "Personal Information"],
            label="Does Response 2 contain any issues?"
        )
    with gr.Row():
        llm1_tunisian = gr.Radio([0, 1, 2], label="Rate LLM1's use of Tunisian Arabic")
        llm2_tunisian = gr.Radio([0, 1, 2], label="Rate LLM2's use of Tunisian Arabic")
    with gr.Row():
        submit_btn = gr.Button("Submit Feedback and Next Prompt")
    # FIX: the status box was an anonymous gr.Textbox created inline inside
    # `outputs=` — unnamed, user-editable, and rendered wherever the
    # constructor happened to execute. Declare it explicitly instead.
    status = gr.Textbox(label="Status", interactive=False)

    # Submit feedback and load the next prompt.
    submit_btn.click(
        submit_feedback,
        inputs=[preference, factual_accuracy, relevance,
                llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian],
        outputs=[prompt, llm1_response, llm2_response, status],
    )

# Launch the app
demo.launch()