Spaces:
Build error
Build error
| import gradio as gr | |
| import pandas as pd | |
| import gspread | |
| from oauth2client.service_account import ServiceAccountCredentials | |
# Where the evaluation data lives in Google Sheets.
SPREADSHEET_NAME = "dataset"  # The name of your Google Sheet
WORKSHEET_NAME = "sheet1"  # The tab/worksheet name

# Shared module state; everything starts unset and is assigned by init_gsheets():
#   df            - DataFrame built from the worksheet values
#   current_index - DataFrame index of the row being reviewed (None if none)
#   ws            - worksheet handle used to write feedback back to the sheet
df = current_index = ws = None
def init_gsheets():
    """
    Authenticate with Google Sheets and load the worksheet into a DataFrame.

    Reads the service-account key from "creds.json", opens worksheet
    WORKSHEET_NAME of spreadsheet SPREADSHEET_NAME and assigns three module
    globals:

    * ws            - the opened worksheet
    * df            - DataFrame of every row below the header row
    * current_index - index of the first row whose "Human judges quality"
                      cell is empty, or None when no such row exists

    An empty worksheet (no header row at all) yields an empty DataFrame and
    current_index = None instead of crashing.

    Raises:
        KeyError: the header row has no "Human judges quality" column.
    """
    global df, current_index, ws

    # Scopes for Google Sheets
    scope = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive",
    ]
    creds = ServiceAccountCredentials.from_json_keyfile_name("creds.json", scope)
    gc = gspread.authorize(creds)
    ws = gc.open(SPREADSHEET_NAME).worksheet(WORKSHEET_NAME)

    # Read all values from the sheet
    data = ws.get_all_values()
    if not data or not data[0]:
        # No header row: there is nothing to review, and indexing data[0] or
        # the review column below would fail.
        df = pd.DataFrame()
        current_index = None
        return

    header, *rows = data  # row 1 = headers
    # dtype=object keeps cells as plain Python objects, so storing non-string
    # feedback later (e.g. integer scores) cannot hit a dtype mismatch.
    df = pd.DataFrame(rows, columns=header, dtype=object)

    review_column = "Human judges quality"
    if review_column not in df.columns:
        raise KeyError(
            "Worksheet %r has no %r column" % (WORKSHEET_NAME, review_column)
        )

    # A row counts as unreviewed while its review column is still empty.
    unreviewed = df.index[df[review_column] == ""]
    current_index = unreviewed[0] if len(unreviewed) > 0 else None
def get_prompt_data():
    """
    Look up the texts of the row currently under review.

    Returns a 4-tuple (prompt, LLM1 response, LLM2 response, all_done).
    While current_index points at a row, the three texts come from that row
    of df and all_done is False. When current_index is None, the prompt slot
    carries a "done" message, both responses are empty and all_done is True.
    """
    # Only reads module state, so no `global` declaration is required.
    if current_index is not None:
        record = df.loc[current_index]
        prompt = record["Prompt"]
        first_response = record["LLM1 response"]
        second_response = record["LLM2 response"]
        return prompt, first_response, second_response, False

    return "All rows have been reviewed.", "", "", True  # all_done = True
def _row_texts(frame, index):
    """Return (prompt, LLM1 response, LLM2 response) for row *index* of *frame*."""
    row = frame.loc[index]
    return row["Prompt"], row["LLM1 response"], row["LLM2 response"]


def save_and_load_next(
    preference,
    factual_accuracy,
    relevance,
    llm1_issues,
    llm2_issues,
    llm1_tunisian_score,
    llm2_tunisian_score
):
    """
    Save feedback for the current row and advance to the next unreviewed row.

    1) Writes the feedback for the current row to Google Sheets
    2) Mirrors it into the in-memory DataFrame
    3) Moves current_index to the next row whose "Human judges quality" is
       still empty and returns that row's texts

    Args:
        preference: answer stored in "Human judges quality"; required,
            because that column is what marks a row as reviewed.
        factual_accuracy: answer stored in "Human judges correctness".
        relevance: answer stored in "Human judges relevance".
        llm1_issues / llm2_issues: iterables of issue labels (or None);
            stored as one ", "-joined string each.
        llm1_tunisian_score / llm2_tunisian_score: scores stored as given.

    Returns:
        A 4-tuple (prompt, LLM1 response, LLM2 response, status message).
        If no row is left, the prompt slot carries a "done" message. If
        preference is missing, nothing is saved and the current row is
        returned again with a status message asking for it.
    """
    global df, current_index, ws

    # If we're out of rows, just return "all done"
    if current_index is None:
        return (
            "All rows have been reviewed.",  # prompt
            "",  # LLM1 resp
            "",  # LLM2 resp
            "No more rows to review!"  # status message
        )

    # A missing preference must not be saved: stored in memory it would no
    # longer equal "", so the row would be skipped as "reviewed" without a
    # usable answer having been recorded.
    if not preference:
        return (
            *_row_texts(df, current_index),
            "Please choose which response you prefer before submitting."
        )

    # "Human judges quality" goes last on purpose: it is the reviewed marker,
    # so an interrupted save leaves the row unreviewed rather than half-saved.
    feedback = {
        "Human judges correctness": factual_accuracy,
        "Human judges relevance": relevance,
        "Human LLM1 flagged issues": ", ".join(llm1_issues or []),
        "Human LLM2 flagged issues": ", ".join(llm2_issues or []),
        "Human LLM1 Tunisian usage score": llm1_tunisian_score,
        "Human LLM2 Tunisian usage score": llm2_tunisian_score,
        "Human judges quality": preference,
    }

    # 1. Write updates to Google Sheets. Resolve every column position before
    #    the first write so a missing column cannot cause a partial write.
    sheet_row = int(current_index) + 2  # row 1 is headers
    headers = list(df.columns)
    cells = [
        (headers.index(column_name) + 1, value)  # 1-based indexing
        for column_name, value in feedback.items()
    ]
    for col_index, value in cells:
        ws.update_cell(sheet_row, col_index, value)

    # 2. Only after the sheet accepted the values, mirror them in memory, so
    #    a failed write does not leave the row marked as reviewed locally.
    for column_name, value in feedback.items():
        df.at[current_index, column_name] = value

    # 3. Move to the next unreviewed row
    unreviewed = df.index[df["Human judges quality"] == ""]
    if len(unreviewed) == 0:
        current_index = None
        return (
            "All rows have been reviewed.",  # prompt
            "",  # LLM1 resp
            "",  # LLM2 resp
            "All rows have been reviewed. Thank you!"  # status message
        )

    current_index = unreviewed[0]
    return (
        *_row_texts(df, current_index),
        "Feedback saved! Moving to the next prompt..."
    )
def on_load():
    """
    Produce the initial contents of the interface.

    Returns the 4-tuple (prompt, LLM1 response, LLM2 response, status) built
    from get_prompt_data(): the status is "No next prompt. All done." when
    nothing is left to review and an empty string otherwise.
    """
    prompt, llm1resp, llm2resp, all_done = get_prompt_data()
    status = "No next prompt. All done." if all_done else ""
    return prompt, llm1resp, llm2resp, status
# ---- Initialize Google Sheets data (runs before the UI is built)
init_gsheets()

# ---- Choice lists shared by several widgets below
COMPARISON_CHOICES = ("LLM1", "LLM2", "Tie", "Both are bad")
ISSUE_CHOICES = (
    "Hate Speech",
    "Not Arabic",
    "Inappropriate Content",
    "Sexual Content",
    "Untruthful Info",
    "Violent Content",
    "Personal Information",
)
TUNISIAN_SCORES = (0, 1, 2)

# ---- Build Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# LLM Responses Evaluation (Google Sheets)")

    # Read-only display of the row under review, plus a status line.
    prompt_text = gr.Textbox(label="Prompt", interactive=False)
    llm1_text = gr.Textbox(label="LLM1 Response", interactive=False)
    llm2_text = gr.Textbox(label="LLM2 Response", interactive=False)
    status_msg = gr.Markdown()

    # Pairwise comparison questions; each widget gets its own list copy.
    preference = gr.Radio(
        list(COMPARISON_CHOICES),
        label="Which response do you prefer?",
    )
    factual_accuracy = gr.Radio(
        list(COMPARISON_CHOICES),
        label="Which response is more factually accurate?",
    )
    relevance = gr.Radio(
        list(COMPARISON_CHOICES),
        label="Which response better addresses the prompt?",
    )

    # Per-response issue flags.
    llm1_issues = gr.CheckboxGroup(
        list(ISSUE_CHOICES),
        label="Does Response 1 contain any issues?",
    )
    llm2_issues = gr.CheckboxGroup(
        list(ISSUE_CHOICES),
        label="Does Response 2 contain any issues?",
    )

    # Per-response Tunisian Arabic usage scores, preselected to 0.
    llm1_tunisian_score = gr.Radio(
        list(TUNISIAN_SCORES),
        label="Rate LLM1's use of Tunisian Arabic",
        value=0,
    )
    llm2_tunisian_score = gr.Radio(
        list(TUNISIAN_SCORES),
        label="Rate LLM2's use of Tunisian Arabic",
        value=0,
    )

    submit_btn = gr.Button("Submit Feedback")

    # Both callbacks refresh the same four display components.
    display_components = (prompt_text, llm1_text, llm2_text, status_msg)

    # Single callback: save feedback and immediately load next prompt
    feedback_inputs = [
        preference,
        factual_accuracy,
        relevance,
        llm1_issues,
        llm2_issues,
        llm1_tunisian_score,
        llm2_tunisian_score,
    ]
    submit_btn.click(
        fn=save_and_load_next,
        inputs=feedback_inputs,
        outputs=list(display_components),
    )

    # On initial load: display the first unreviewed prompt
    demo.load(
        fn=on_load,
        inputs=[],
        outputs=list(display_components),
    )

demo.launch()