# NOTE: this file was recovered from a Hugging Face Spaces page scrape;
# the original page header ("Spaces: Sleeping") carried no code content.
import os

import gradio as gr
import pandas as pd
from datasets import load_dataset, Dataset
from huggingface_hub import HfApi

# Read the Hugging Face token from the environment variable.
# None if unset -- push_to_hub will then fail, so configure HF_TOKEN
# in the Space's secrets.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Authenticate with Hugging Face.
api = HfApi(token=HF_TOKEN)
# Load the feedback dataset from the Hugging Face Hub. On any failure
# (missing file, network error, ...) fall back to an empty DataFrame
# with the expected schema so the app can still start.
try:
    dataset = load_dataset("MaroueneA/feedback-dataset", data_files="dataset.csv")  # Replace with your dataset file
    df = dataset["train"].to_pandas()
    if "CurrentPromptIndex" not in df.columns:
        # Row 0 of this column persists the index of the prompt being shown.
        df["CurrentPromptIndex"] = 0  # Initialize the column as an integer
    else:
        # Older rows may carry NaN here; normalise to a real integer.
        df["CurrentPromptIndex"] = df["CurrentPromptIndex"].fillna(0).astype(int)
except Exception as e:
    print(f"Error loading dataset: {e}")
    # Column names must match the accessors used elsewhere in this file,
    # which read "LLM1 response"/"LLM2 response" (lowercase "response");
    # the previous fallback used "LLM1 Response" and raised KeyError.
    df = pd.DataFrame(columns=[
        "Prompt", "LLM1 response", "LLM2 response",
        "Human judges quality", "Human judges correctness",
        "Human judges relevance",
        "Human LLM1 Tunisian usage score", "Human LLM2 Tunisian usage score",
        "Human LLM1 flagged issues", "Human LLM2 flagged issues",
        "Evaluated", "CurrentPromptIndex",
    ])
# Function to save feedback to the dataset
def save_feedback(prompt_idx, preference, factual_accuracy, relevance,
                  llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian):
    """Record one human evaluation into row ``prompt_idx`` of the global
    ``df`` and push the updated dataset back to the Hugging Face Hub.

    Args:
        prompt_idx: row index of the prompt that was evaluated.
        preference / factual_accuracy / relevance: judge choices
            ("LLM1", "LLM2", "Tie", "Both are bad").
        llm1_issues / llm2_issues: lists of flagged issue labels
            (may be None when nothing was ticked -- TODO confirm with Gradio).
        llm1_tunisian / llm2_tunisian: integer dialect-usage scores (0-2).
    """
    idx = int(prompt_idx)  # .iloc can hand us a numpy integer
    df.at[idx, "Human judges quality"] = str(preference)
    df.at[idx, "Human judges correctness"] = str(factual_accuracy)
    df.at[idx, "Human judges relevance"] = str(relevance)
    df.at[idx, "Human LLM1 Tunisian usage score"] = int(llm1_tunisian)
    df.at[idx, "Human LLM2 Tunisian usage score"] = int(llm2_tunisian)
    # Guard the joins: an unticked CheckboxGroup may deliver None.
    df.at[idx, "Human LLM1 flagged issues"] = ", ".join(llm1_issues or [])
    df.at[idx, "Human LLM2 flagged issues"] = ", ".join(llm2_issues or [])
    df.at[idx, "Evaluated"] = True
    # Persist: convert the updated DataFrame back to a Hugging Face Dataset
    # and push it to the Hub (one network round-trip per submission).
    updated_dataset = Dataset.from_pandas(df)
    updated_dataset.push_to_hub("MaroueneA/feedback-dataset", token=HF_TOKEN)
# Function to get the next unevaluated prompt
def get_next_prompt():
    """Return the first not-yet-evaluated row of the global ``df``.

    Also records that row's index in ``df.at[0, "CurrentPromptIndex"]`` so
    the scan position survives restarts.  Returns ``None`` when ``df`` is
    empty or every prompt has been evaluated.
    """
    # Guard: a failed dataset load leaves df empty, and .iloc[0] below
    # would raise IndexError (this function runs at module import time).
    if df.empty:
        return None
    # Current scan position, stored as row 0 of the index column.
    current_prompt_idx = int(df["CurrentPromptIndex"].iloc[0])
    # Scan forward for the next unevaluated prompt.
    for idx in range(current_prompt_idx, len(df)):
        if not df.at[idx, "Evaluated"]:
            df.at[0, "CurrentPromptIndex"] = idx  # remember where we are
            return df.iloc[idx]
    return None
# Seed the UI with the first unevaluated prompt (placeholders if none exist).
current_prompt = get_next_prompt()
if current_prompt is None:
    initial_prompt, initial_llm1, initial_llm2 = "No prompts available.", "", ""
else:
    initial_prompt = current_prompt["Prompt"]
    initial_llm1 = current_prompt["LLM1 response"]
    initial_llm2 = current_prompt["LLM2 response"]
# Function to submit feedback and get the next prompt
def submit_feedback(preference, factual_accuracy, relevance,
                    llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian):
    """Save the judge's feedback for the prompt currently on screen, then
    load the next unevaluated prompt.

    Returns (prompt, llm1_response, llm2_response, status_message) for the
    four Gradio output components.
    """
    # Save feedback for the CURRENT prompt BEFORE advancing.  The previous
    # version called get_next_prompt() first; since the current prompt was
    # still unevaluated it was returned again (same prompt redisplayed) and
    # the following submission's feedback landed on the wrong row.
    current_idx = int(df["CurrentPromptIndex"].iloc[0])
    save_feedback(current_idx, preference, factual_accuracy, relevance,
                  llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian)
    # Advance: this also updates CurrentPromptIndex inside df, so no manual
    # increment is needed here.
    next_prompt = get_next_prompt()
    if next_prompt is None:
        return "No more prompts available.", "", "", "Feedback saved successfully!"
    print(f"Updated Prompt Index: {df['CurrentPromptIndex'].iloc[0]}")
    return next_prompt["Prompt"], next_prompt["LLM1 response"], next_prompt["LLM2 response"], "Feedback saved successfully!"
# Gradio interface: one screen showing the prompt, both model responses,
# the judge inputs, and a submit button that saves and advances.
with gr.Blocks() as demo:
    with gr.Row():
        prompt = gr.Textbox(label="Prompt", value=initial_prompt, interactive=False)
    with gr.Row():
        llm1_response = gr.Textbox(label="LLM1 Response", value=initial_llm1, interactive=False)
        llm2_response = gr.Textbox(label="LLM2 Response", value=initial_llm2, interactive=False)
    with gr.Row():
        preference = gr.Radio(["LLM1", "LLM2", "Tie", "Both are bad"], label="Which response do you prefer?")
        factual_accuracy = gr.Radio(["LLM1", "LLM2", "Tie", "Both are bad"], label="Which response is more factually accurate?")
        relevance = gr.Radio(["LLM1", "LLM2", "Tie", "Both are bad"], label="Which response better addresses the prompt?")
    with gr.Row():
        llm1_issues = gr.CheckboxGroup(
            ["Hate Speech", "Not Arabic", "Inappropriate Content", "Sexual Content", "Untruthful Info", "Violent Content", "Personal Information"],
            label="Does Response 1 contain any issues?"
        )
        llm2_issues = gr.CheckboxGroup(
            ["Hate Speech", "Not Arabic", "Inappropriate Content", "Sexual Content", "Untruthful Info", "Violent Content", "Personal Information"],
            label="Does Response 2 contain any issues?"
        )
    with gr.Row():
        llm1_tunisian = gr.Radio([0, 1, 2], label="Rate LLM1's use of Tunisian Arabic")
        llm2_tunisian = gr.Radio([0, 1, 2], label="Rate LLM2's use of Tunisian Arabic")
    with gr.Row():
        submit_btn = gr.Button("Submit Feedback and Next Prompt")
    # Name the status box instead of instantiating it anonymously inside the
    # outputs list; it is created at the same point, so layout is unchanged,
    # but the component is now referenceable.
    status = gr.Textbox(label="Status")
    # Submit feedback and load the next prompt.
    submit_btn.click(
        submit_feedback,
        inputs=[preference, factual_accuracy, relevance, llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian],
        outputs=[prompt, llm1_response, llm2_response, status],
    )

# Launch the app
demo.launch()