# Hugging Face Space: human-evaluation UI for comparing two LLM responses.
| import gradio as gr | |
| import pandas as pd | |
| import os | |
| import shutil | |
| import time | |
# ---------------------------------------------------------------------------
# File locations
# ---------------------------------------------------------------------------
# Read-only reference dataset shipped with the Space.
initial_file = './Updated_HumanEvaluations_columns_and_rows_shuffled.xlsx'
# Local working directory holding the copy that accumulates annotations.
storage_dir = './data'
# The dataset being modified by annotators.
storage_file = f'{storage_dir}/Updated_HumanEvaluations.xlsx'

# Make sure the storage directory exists before any file access.
os.makedirs(storage_dir, exist_ok=True)

print("π Checking if dataset exists:", os.path.exists(storage_file))
print("π Using dataset from:", storage_file)

# First run: seed the working copy from the reference dataset.
if not os.path.exists(storage_file):
    print("β οΈ No existing dataset found, copying from reference dataset...")
    shutil.copy(initial_file, storage_file)

# Human-judgement columns are forced to ``str`` so that mixed text/number
# entries do not trip pandas' per-column type inference on load.
_judgement_dtypes = {
    'Human judges quality': str,
    'Human judges correctness': str,
    'Human judges relevance': str,
    'Human LLM1 flagged issues': str,
    'Human LLM2 flagged issues': str,
    'Human LLM1 Tunisian usage score': str,
    'Human LLM2 Tunisian usage score': str,
}
df = pd.read_excel(storage_file, dtype=_judgement_dtypes)

print("β Dataset loaded successfully!")
print(df.head())  # quick sanity-check of the loaded rows
def get_next_prompt():
    """Return ``(prompt, LLM1 response, LLM2 response)`` for the first row
    still missing any human judgement.

    The dataset is re-read from ``storage_file`` first so that writes made
    by a previous submission (or another session) are picked up.  When every
    row has been annotated, a sentinel triple ``("No more prompts!", "", "")``
    is returned instead.
    """
    global df
    print("π Checking for next unfilled prompt...")
    # β Reload dataset to ensure latest version is used
    df = pd.read_excel(storage_file)
    # A row counts as "unfilled" if ANY of these judgement columns is NaN.
    # (Replaces the original hard-to-read 7-way ``pd.isna(...) or ...`` chain.)
    feedback_columns = [
        'Human judges quality',
        'Human judges correctness',
        'Human judges relevance',
        'Human LLM1 Tunisian usage score',
        'Human LLM2 Tunisian usage score',
        'Human LLM1 flagged issues',
        'Human LLM2 flagged issues',
    ]
    for index, row in df.iterrows():
        print(f"π Checking row {index} for missing values...")
        if row[feedback_columns].isna().any():
            print(f"β Found next unfilled prompt at index {index}")
            return row['Prompt'], row['LLM1 response'], row['LLM2 response']
    print("β οΈ No more unfilled prompts!")
    return "No more prompts!", "", ""
def submit_feedback(prefer, accuracy, relevance, llm1_flags, llm2_flags, llm1_tunisian, llm2_tunisian):
    """Store one annotator's feedback on the first still-unfilled row,
    persist the dataset to ``storage_file``, and return the next prompt.

    Parameters mirror the Gradio widgets: three pairwise-comparison radio
    values, two checkbox-group lists of flagged issues, and two 0-2
    Tunisian-usage scores.  Returns the ``(prompt, response1, response2)``
    triple from :func:`get_next_prompt`.
    """
    global df
    print("π‘ Receiving feedback submission...")
    # Same "unfilled" criterion as get_next_prompt: any judgement column NaN.
    feedback_columns = [
        'Human judges quality',
        'Human judges correctness',
        'Human judges relevance',
        'Human LLM1 Tunisian usage score',
        'Human LLM2 Tunisian usage score',
        'Human LLM1 flagged issues',
        'Human LLM2 flagged issues',
    ]
    for index, row in df.iterrows():
        if row[feedback_columns].isna().any():
            print(f"π’ Updating row index {index} with feedback...")
            # β Convert values explicitly before saving; empty/unset widgets
            # are recorded as "N/A" / "None" / 0 so the row reads as filled.
            df.at[index, 'Human judges quality'] = str(prefer) if prefer else "N/A"
            df.at[index, 'Human judges correctness'] = str(accuracy) if accuracy else "N/A"
            df.at[index, 'Human judges relevance'] = str(relevance) if relevance else "N/A"
            df.at[index, 'Human LLM1 flagged issues'] = ", ".join(llm1_flags) if llm1_flags else "None"
            df.at[index, 'Human LLM2 flagged issues'] = ", ".join(llm2_flags) if llm2_flags else "None"
            df.at[index, 'Human LLM1 Tunisian usage score'] = int(llm1_tunisian) if llm1_tunisian else 0
            df.at[index, 'Human LLM2 Tunisian usage score'] = int(llm2_tunisian) if llm2_tunisian else 0
            try:
                # BUG FIX: the previous ``df.fillna("N/A", inplace=True)``
                # stamped "N/A" into the judgement columns of every OTHER
                # unfilled row too, so after the first submission no row was
                # ever NaN again and get_next_prompt reported "No more
                # prompts!" for the rest of the dataset.  Only the current
                # row is written here; its judgement columns are all set
                # explicitly above, so no blanket fillna is needed.

                # β Save dataset with explicit encoding
                df.to_excel(storage_file, index=False, engine="openpyxl")
                time.sleep(1)  # β Allow time for saving
                print("β Dataset saved successfully at:", storage_file)
                # β Reload dataset to verify update
                df = pd.read_excel(storage_file)
                print("π Reloaded dataset preview:")
                # β Show the updated row to confirm changes
                print(df.iloc[index])
            except Exception as e:
                print("β ERROR Saving File:", str(e))
            break  # β Move to the next prompt after saving
    return get_next_prompt()
# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
# Shared choice lists for the judgement widgets.
_comparison_choices = ["LLM1", "LLM2", "Tie", "Both are bad"]
_issue_choices = ["Hate Speech", "Not Arabic", "Inappropriate Content",
                  "Sexual Content", "Untruthful Info", "Violent Content",
                  "Personal Information"]

with gr.Blocks() as demo:
    gr.Markdown("## LLM Response Evaluation")

    # Read-only display of the current prompt and the two model responses.
    prompt_box = gr.Textbox(label="Prompt", interactive=False)
    response1_box = gr.Textbox(label="Response 1", interactive=False)
    response2_box = gr.Textbox(label="Response 2", interactive=False)

    # Pairwise-comparison judgements.
    prefer_choice = gr.Radio(
        _comparison_choices, label="Which response do you prefer?")
    accuracy_choice = gr.Radio(
        _comparison_choices, label="Which response is more factually accurate?")
    relevance_choice = gr.Radio(
        _comparison_choices, label="Which response better addresses the prompt?")

    # Per-response content flags.
    llm1_issue_flags = gr.CheckboxGroup(
        _issue_choices, label="Does Response 1 contain any issues?")
    llm2_issue_flags = gr.CheckboxGroup(
        _issue_choices, label="Does Response 2 contain any issues?")

    # 0-2 dialect-usage ratings.
    llm1_dialect_score = gr.Radio(
        [0, 1, 2], label="Rate LLM1's use of Tunisian Arabic")
    llm2_dialect_score = gr.Radio(
        [0, 1, 2], label="Rate LLM2's use of Tunisian Arabic")

    submit_btn = gr.Button("Submit and Get New Prompt")
    submit_btn.click(
        submit_feedback,
        inputs=[prefer_choice, accuracy_choice, relevance_choice,
                llm1_issue_flags, llm2_issue_flags,
                llm1_dialect_score, llm2_dialect_score],
        outputs=[prompt_box, response1_box, response2_box],
    )

    # Populate the first prompt as soon as the page loads.
    demo.load(get_next_prompt,
              outputs=[prompt_box, response1_box, response2_box])

if __name__ == "__main__":
    demo.launch()