"""Gradio app for collecting human judgments on paired LLM responses.

Rows are loaded from a Google Sheet. Each submitted judgment is written back
to the row it belongs to, and the next row whose "Human judges quality" cell
is empty is displayed.
"""

import gradio as gr
import gspread
import pandas as pd
from oauth2client.service_account import ServiceAccountCredentials

SPREADSHEET_NAME = "dataset"  # The name of your Google Sheet
WORKSHEET_NAME = "sheet1"  # The tab/worksheet name

# A row counts as "unreviewed" while its cell in this column is empty.
REVIEW_MARKER_COLUMN = "Human judges quality"

# Options shared by the three comparison questions.
COMPARISON_CHOICES = ["LLM1", "LLM2", "Tie", "Both are bad"]

# Options shared by the two per-response issue checklists.
ISSUE_CHOICES = [
    "Hate Speech",
    "Not Arabic",
    "Inappropriate Content",
    "Sexual Content",
    "Untruthful Info",
    "Violent Content",
    "Personal Information",
]

# Module-level state shared by every callback.
# NOTE(review): this state is process-wide, so simultaneous reviewers would be
# shown (and would overwrite) the same row -- confirm single-reviewer usage.
df = None  # worksheet contents as a DataFrame (header row excluded)
current_index = None  # DataFrame label of the row being reviewed; None = done
ws = None  # gspread worksheet handle used for write-back


def _first_unreviewed_index():
    """Return the label of the first row with an empty marker cell, or None."""
    pending = df.index[df[REVIEW_MARKER_COLUMN] == ""]
    if pending.empty:
        return None
    return pending[0]


def init_gsheets():
    """Authenticate with Google Sheets and load the worksheet into memory.

    Populates the module-level ``ws``, ``df`` and ``current_index``. The first
    sheet row is used as the column headers; ``current_index`` is set to the
    first unreviewed row, or ``None`` when every row has been reviewed.

    Raises:
        ValueError: If the worksheet is empty or lacks the
            "Human judges quality" column.
    """
    global df, current_index, ws

    # Scopes for Google Sheets
    scope = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive",
    ]
    creds = ServiceAccountCredentials.from_json_keyfile_name("creds.json", scope)
    gc = gspread.authorize(creds)
    ws = gc.open(SPREADSHEET_NAME).worksheet(WORKSHEET_NAME)

    # Read all values from the sheet; row 1 holds the headers.
    data = ws.get_all_values()
    if not data:
        raise ValueError(
            f"Worksheet {WORKSHEET_NAME!r} in {SPREADSHEET_NAME!r} is empty; "
            "a header row is required."
        )
    header, *rows = data
    if REVIEW_MARKER_COLUMN not in header:
        raise ValueError(
            f"Worksheet {WORKSHEET_NAME!r} has no {REVIEW_MARKER_COLUMN!r} column."
        )

    df = pd.DataFrame(rows, columns=header)
    current_index = _first_unreviewed_index()


def get_prompt_data():
    """Return the row currently under review.

    Returns:
        A 4-tuple ``(prompt, llm1_response, llm2_response, all_done)``. When
        no unreviewed row is left, the prompt is a "done" message, both
        responses are empty strings and ``all_done`` is ``True``.
    """
    if current_index is None:
        return "All rows have been reviewed.", "", "", True  # all_done = True

    row = df.loc[current_index]
    return row["Prompt"], row["LLM1 response"], row["LLM2 response"], False


def save_and_load_next(
    preference,
    factual_accuracy,
    relevance,
    llm1_issues,
    llm2_issues,
    llm1_tunisian_score,
    llm2_tunisian_score
):
    """Save feedback for the current row and advance to the next one.

    1) Writes the feedback for the current row to Google Sheets
    2) Mirrors it into the in-memory DataFrame
    3) Moves to the next unreviewed row

    Args:
        preference: Selected answer to "which response do you prefer"; must
            be non-empty because it doubles as the "reviewed" marker.
        factual_accuracy: Selected answer for factual accuracy.
        relevance: Selected answer for relevance.
        llm1_issues: Issue labels ticked for response 1 (may be empty/None).
        llm2_issues: Issue labels ticked for response 2 (may be empty/None).
        llm1_tunisian_score: Tunisian Arabic usage score for response 1.
        llm2_tunisian_score: Tunisian Arabic usage score for response 2.

    Returns:
        A 4-tuple ``(prompt, llm1_response, llm2_response, status_message)``
        describing what the UI should show next.
    """
    global current_index

    # If we're out of rows, just return "all done"
    if current_index is None:
        return (
            "All rows have been reviewed.",  # prompt
            "",  # LLM1 resp
            "",  # LLM2 resp
            "No more rows to review!",  # status message
        )

    # The preference column is what marks a row as reviewed. Saving an empty
    # value would skip the row in memory while leaving it unreviewed in the
    # sheet, so keep the reviewer on the same row instead.
    if not preference:
        prompt, llm1resp, llm2resp, _ = get_prompt_data()
        return (
            prompt,
            llm1resp,
            llm2resp,
            "Please choose which response you prefer before submitting.",
        )

    feedback = {
        "Human judges quality": preference,
        "Human judges correctness": factual_accuracy,
        "Human judges relevance": relevance,
        "Human LLM1 flagged issues": ", ".join(llm1_issues) if llm1_issues else "",
        "Human LLM2 flagged issues": ", ".join(llm2_issues) if llm2_issues else "",
        "Human LLM1 Tunisian usage score": llm1_tunisian_score,
        "Human LLM2 Tunisian usage score": llm2_tunisian_score,
    }

    # 1. Write to Google Sheets first, in a single request. If the write
    #    fails, the exception propagates before any in-memory state changes,
    #    so the row stays pending and can be resubmitted.
    sheet_row = int(current_index) + 2  # row 1 is headers; sheet rows are 1-based
    headers = list(df.columns)
    cells = [
        gspread.Cell(sheet_row, headers.index(column) + 1, value)  # 1-based column
        for column, value in feedback.items()
    ]
    # USER_ENTERED keeps the value interpretation that update_cell applied.
    ws.update_cells(cells, value_input_option="USER_ENTERED")

    # 2. Mirror the saved values into the in-memory DataFrame.
    for column, value in feedback.items():
        df.at[current_index, column] = value

    # 3. Move to the next unreviewed row
    current_index = _first_unreviewed_index()
    if current_index is None:
        return (
            "All rows have been reviewed.",  # prompt
            "",  # LLM1 resp
            "",  # LLM2 resp
            "All rows have been reviewed. Thank you!",  # status message
        )

    prompt, llm1resp, llm2resp, _ = get_prompt_data()
    return (
        prompt,
        llm1resp,
        llm2resp,
        "Feedback saved! Moving to the next prompt...",
    )


def on_load():
    """Return the values to display when the interface first loads.

    Returns:
        A 4-tuple ``(prompt, llm1_response, llm2_response, status_message)``;
        the status is empty unless every row has already been reviewed.
    """
    prompt, llm1resp, llm2resp, all_done = get_prompt_data()
    status = "No next prompt. All done." if all_done else ""
    return prompt, llm1resp, llm2resp, status


# ---- Initialize Google Sheets data
init_gsheets()

# ---- Build Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# LLM Responses Evaluation (Google Sheets)")

    prompt_text = gr.Textbox(label="Prompt", interactive=False)
    llm1_text = gr.Textbox(label="LLM1 Response", interactive=False)
    llm2_text = gr.Textbox(label="LLM2 Response", interactive=False)
    status_msg = gr.Markdown()

    preference = gr.Radio(
        COMPARISON_CHOICES,
        label="Which response do you prefer?"
    )
    factual_accuracy = gr.Radio(
        COMPARISON_CHOICES,
        label="Which response is more factually accurate?"
    )
    relevance = gr.Radio(
        COMPARISON_CHOICES,
        label="Which response better addresses the prompt?"
    )

    llm1_issues = gr.CheckboxGroup(
        ISSUE_CHOICES,
        label="Does Response 1 contain any issues?"
    )
    llm2_issues = gr.CheckboxGroup(
        ISSUE_CHOICES,
        label="Does Response 2 contain any issues?"
    )

    llm1_tunisian_score = gr.Radio(
        [0, 1, 2],
        label="Rate LLM1's use of Tunisian Arabic",
        value=0
    )
    llm2_tunisian_score = gr.Radio(
        [0, 1, 2],
        label="Rate LLM2's use of Tunisian Arabic",
        value=0
    )

    submit_btn = gr.Button("Submit Feedback")

    # Single callback: save feedback and immediately load next prompt
    submit_btn.click(
        fn=save_and_load_next,
        inputs=[
            preference,
            factual_accuracy,
            relevance,
            llm1_issues,
            llm2_issues,
            llm1_tunisian_score,
            llm2_tunisian_score,
        ],
        outputs=[prompt_text, llm1_text, llm2_text, status_msg]
    )

    # On initial load: display the first unreviewed prompt
    demo.load(
        fn=on_load,
        inputs=[],
        outputs=[prompt_text, llm1_text, llm2_text, status_msg]
    )

demo.launch()