# Repository file: llm-evaluation / app.py
# Last change: "Update app.py" by MaroueneA (commit a982ad4, verified)
import gradio as gr
import pandas as pd
import gspread
from oauth2client.service_account import ServiceAccountCredentials
# Where the evaluation data lives in Google Sheets.
SPREADSHEET_NAME = "dataset"  # title of the Google Sheet document
WORKSHEET_NAME = "sheet1"  # tab (worksheet) inside that document

# Module-level state shared by the functions below; all start out unset and
# are filled in by init_gsheets():
#   df            -- the worksheet contents as a DataFrame
#   current_index -- index label of the row being reviewed (None = nothing left)
#   ws            -- worksheet handle used to write feedback back
df = current_index = ws = None
def init_gsheets():
    """
    Authenticate with Google Sheets and load the entire worksheet into a DataFrame.

    Opens the spreadsheet ``SPREADSHEET_NAME`` using the service-account key in
    ``creds.json``, reads every cell of the ``WORKSHEET_NAME`` tab and stores
    the result in the module-level ``df`` (first sheet row = column headers).
    The worksheet handle is kept in ``ws`` for later write-backs, and
    ``current_index`` is set to the first row whose "Human judges quality"
    cell is empty, or to ``None`` when no such row exists.

    Raises:
        ValueError: if the worksheet contains no header row.
        KeyError: if the header row has no "Human judges quality" column.
    """
    global df, current_index, ws

    # Scopes for Google Sheets
    scope = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive",
    ]
    creds = ServiceAccountCredentials.from_json_keyfile_name("creds.json", scope)
    gc = gspread.authorize(creds)
    sh = gc.open(SPREADSHEET_NAME)
    ws = sh.worksheet(WORKSHEET_NAME)

    # Read all values from the sheet
    data = ws.get_all_values()

    # An empty worksheet would otherwise fail with a bare IndexError on
    # data[0] (or an opaque KeyError below); fail with an actionable message.
    if not data or not data[0]:
        raise ValueError(
            f"Worksheet '{WORKSHEET_NAME}' in spreadsheet '{SPREADSHEET_NAME}' "
            "is empty: a header row is required."
        )

    df = pd.DataFrame(data[1:], columns=data[0])  # row 1 = headers

    review_column = "Human judges quality"
    if review_column not in df.columns:
        raise KeyError(
            f"Worksheet '{WORKSHEET_NAME}' has no '{review_column}' column."
        )

    # A row counts as unreviewed while its "Human judges quality" cell is empty.
    pending = df.index[df[review_column] == ""]
    current_index = pending[0] if len(pending) > 0 else None
def get_prompt_data():
    """
    Fetch what should be shown for the row currently under review.

    Returns a 4-tuple ``(prompt, llm1_response, llm2_response, all_done)``.
    While ``current_index`` points at a row, the first three items come from
    that row's "Prompt", "LLM1 response" and "LLM2 response" columns and
    ``all_done`` is False. When ``current_index`` is None, a "done" message
    and two empty strings are returned with ``all_done`` set to True.
    """
    if current_index is not None:
        record = df.loc[current_index]
        return (
            record["Prompt"],
            record["LLM1 response"],
            record["LLM2 response"],
            False,
        )

    # Nothing left to review.
    return "All rows have been reviewed.", "", "", True
def save_and_load_next(
    preference,
    factual_accuracy,
    relevance,
    llm1_issues,
    llm2_issues,
    llm1_tunisian_score,
    llm2_tunisian_score
):
    """
    Save the feedback for the current row and advance to the next unreviewed row.

    1) Writes the feedback for the current row to Google Sheets (one batched
       request covering all seven feedback columns)
    2) Mirrors the same values into the in-memory DataFrame
    3) Moves to the next row whose "Human judges quality" cell is still empty
    4) Returns the next prompt + a status message in one step

    Args:
        preference: preferred response; stored in "Human judges quality".
            Required, because that column is also the "row reviewed" marker.
        factual_accuracy: stored in "Human judges correctness".
        relevance: stored in "Human judges relevance".
        llm1_issues: issue labels for response 1; stored comma-separated.
        llm2_issues: issue labels for response 2; stored comma-separated.
        llm1_tunisian_score: stored in "Human LLM1 Tunisian usage score".
        llm2_tunisian_score: stored in "Human LLM2 Tunisian usage score".

    Returns:
        A 4-tuple ``(prompt, llm1_response, llm2_response, status_message)``.

    Raises:
        KeyError: if the worksheet lacks one of the feedback columns.
    """
    global df, current_index, ws

    # If we're out of rows, just return "all done"
    if current_index is None:
        return (
            "All rows have been reviewed.",  # prompt
            "",  # LLM1 resp
            "",  # LLM2 resp
            "No more rows to review!",  # status message
        )

    # "Human judges quality" is what marks a row as reviewed. Accepting an
    # empty selection would store None: the row would be skipped in memory
    # while its sheet cell stays blank (the Sheets API skips null values), so
    # it would silently reappear after a restart. Keep the reviewer on the
    # same row instead.
    if not preference:
        prompt, llm1resp, llm2resp, _ = get_prompt_data()
        return (
            prompt,
            llm1resp,
            llm2resp,
            "Please select which response you prefer before submitting.",
        )

    def blank_if_none(value):
        # Unanswered optional fields are written as an explicit empty cell.
        return "" if value is None else value

    feedback = {
        "Human judges quality": preference,
        "Human judges correctness": blank_if_none(factual_accuracy),
        "Human judges relevance": blank_if_none(relevance),
        "Human LLM1 flagged issues": ", ".join(llm1_issues) if llm1_issues else "",
        "Human LLM2 flagged issues": ", ".join(llm2_issues) if llm2_issues else "",
        "Human LLM1 Tunisian usage score": blank_if_none(llm1_tunisian_score),
        "Human LLM2 Tunisian usage score": blank_if_none(llm2_tunisian_score),
    }

    headers = list(df.columns)
    missing = [name for name in feedback if name not in headers]
    if missing:
        raise KeyError(
            "Worksheet is missing feedback column(s): " + ", ".join(missing)
        )

    # 1. Write to Google Sheets first, so a failed request leaves the
    #    in-memory copy untouched and the same row can simply be resubmitted.
    sheet_row = int(current_index) + 2  # row 1 is headers, DataFrame is 0-based
    cells = [
        gspread.Cell(sheet_row, headers.index(name) + 1, value)  # 1-based column
        for name, value in feedback.items()
    ]
    # One request instead of seven; USER_ENTERED matches what update_cell uses.
    ws.update_cells(cells, value_input_option="USER_ENTERED")

    # 2. Mirror the saved values in the DataFrame. They are kept as text,
    #    like the cell values originally loaded from the sheet.
    for name, value in feedback.items():
        df.at[current_index, name] = str(value)

    # 3. Move to the next unreviewed row
    unreviewed_rows = df[df["Human judges quality"] == ""]
    if len(unreviewed_rows) == 0:
        current_index = None
        return (
            "All rows have been reviewed.",  # prompt
            "",  # LLM1 resp
            "",  # LLM2 resp
            "All rows have been reviewed. Thank you!",  # status message
        )

    current_index = unreviewed_rows.index[0]
    prompt, llm1resp, llm2resp, _ = get_prompt_data()
    return (
        prompt,
        llm1resp,
        llm2resp,
        "Feedback saved! Moving to the next prompt...",
    )
def on_load():
    """
    Produce the initial values for the interface when it loads.

    Returns ``(prompt, llm1_response, llm2_response, status_message)`` taken
    from get_prompt_data(); the status message is a "done" notice when there
    is nothing to review and an empty string otherwise.
    """
    prompt, first_reply, second_reply, finished = get_prompt_data()
    status = "No next prompt. All done." if finished else ""
    return prompt, first_reply, second_reply, status
# ---- Load the worksheet once at start-up so the UI has data to display
init_gsheets()

# Option lists shared by several widgets below.
_COMPARISON_CHOICES = ["LLM1", "LLM2", "Tie", "Both are bad"]
_ISSUE_CHOICES = [
    "Hate Speech",
    "Not Arabic",
    "Inappropriate Content",
    "Sexual Content",
    "Untruthful Info",
    "Violent Content",
    "Personal Information",
]
_TUNISIAN_SCORES = [0, 1, 2]

# ---- Build Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# LLM Responses Evaluation (Google Sheets)")

    # Read-only display of the prompt and the two responses being compared.
    prompt_text = gr.Textbox(label="Prompt", interactive=False)
    llm1_text = gr.Textbox(label="LLM1 Response", interactive=False)
    llm2_text = gr.Textbox(label="LLM2 Response", interactive=False)
    status_msg = gr.Markdown()

    # Pairwise comparison questions.
    preference = gr.Radio(
        list(_COMPARISON_CHOICES),
        label="Which response do you prefer?",
    )
    factual_accuracy = gr.Radio(
        list(_COMPARISON_CHOICES),
        label="Which response is more factually accurate?",
    )
    relevance = gr.Radio(
        list(_COMPARISON_CHOICES),
        label="Which response better addresses the prompt?",
    )

    # Per-response issue flags.
    llm1_issues = gr.CheckboxGroup(
        list(_ISSUE_CHOICES),
        label="Does Response 1 contain any issues?",
    )
    llm2_issues = gr.CheckboxGroup(
        list(_ISSUE_CHOICES),
        label="Does Response 2 contain any issues?",
    )

    # Per-response Tunisian Arabic usage scores (default 0).
    llm1_tunisian_score = gr.Radio(
        list(_TUNISIAN_SCORES),
        label="Rate LLM1's use of Tunisian Arabic",
        value=0,
    )
    llm2_tunisian_score = gr.Radio(
        list(_TUNISIAN_SCORES),
        label="Rate LLM2's use of Tunisian Arabic",
        value=0,
    )

    submit_btn = gr.Button("Submit Feedback")

    # Single callback: save feedback and immediately load next prompt.
    # The input order must match the parameter order of save_and_load_next.
    submit_btn.click(
        fn=save_and_load_next,
        inputs=[
            preference,
            factual_accuracy,
            relevance,
            llm1_issues,
            llm2_issues,
            llm1_tunisian_score,
            llm2_tunisian_score,
        ],
        outputs=[prompt_text, llm1_text, llm2_text, status_msg],
    )

    # On initial load: display the first unreviewed prompt
    demo.load(
        fn=on_load,
        inputs=[],
        outputs=[prompt_text, llm1_text, llm2_text, status_msg],
    )

demo.launch()