# Captured from a hosted Space file view (status: Build error; file size: 7,342 bytes).
import gradio as gr
import pandas as pd
import gspread
from oauth2client.service_account import ServiceAccountCredentials
# Where the review data lives.
SPREADSHEET_NAME: str = "dataset"  # title of the Google Sheet to open
WORKSHEET_NAME: str = "sheet1"  # tab/worksheet inside that spreadsheet

# Module-level state shared by the callbacks; populated by init_gsheets().
df: "pd.DataFrame | None" = None  # in-memory copy of the worksheet
current_index: "int | None" = None  # DataFrame index of the row under review; None when done
ws: "gspread.Worksheet | None" = None  # worksheet handle used for writes
def init_gsheets():
    """
    Authenticate with Google Sheets and load the entire worksheet into a DataFrame.

    Populates the module globals:
      * ``ws``            - the opened worksheet (used later for writes),
      * ``df``            - every data row, with the sheet's first row as column names,
      * ``current_index`` - DataFrame index of the first row whose
                            "Human judges quality" cell is empty, or None if
                            every row already has a value there.

    Raises:
        ValueError: if the worksheet is empty or its header row has no
            "Human judges quality" column, since the app cannot tell which
            rows are still unreviewed without it.
    """
    global df, current_index, ws

    # Scopes for Google Sheets
    scope = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive"
    ]
    creds = ServiceAccountCredentials.from_json_keyfile_name("creds.json", scope)
    gc = gspread.authorize(creds)
    sh = gc.open(SPREADSHEET_NAME)
    ws = sh.worksheet(WORKSHEET_NAME)

    # Read all values from the sheet
    data = ws.get_all_values()

    # Fail with an actionable message instead of a bare IndexError/KeyError
    # when the sheet is empty or is missing the column used as the
    # "already reviewed" marker.
    if not data or "Human judges quality" not in data[0]:
        raise ValueError(
            f"Worksheet {WORKSHEET_NAME!r} in spreadsheet {SPREADSHEET_NAME!r} "
            "must have a header row containing a 'Human judges quality' column."
        )

    df = pd.DataFrame(data[1:], columns=data[0])  # row 1 = headers

    # Identify first unreviewed row ("Human judges quality" is empty)
    unreviewed_rows = df[df["Human judges quality"] == ""]
    if len(unreviewed_rows) > 0:
        current_index = unreviewed_rows.index[0]
    else:
        current_index = None
def get_prompt_data():
    """
    Fetch what should currently be shown to the reviewer.

    Returns a 4-tuple ``(prompt, llm1_response, llm2_response, all_done)``.
    While a row is pending, the first three items come from that row's
    "Prompt", "LLM1 response" and "LLM2 response" columns and ``all_done``
    is False. Once ``current_index`` is None, a "done" message with two
    empty strings is returned and ``all_done`` is True.
    """
    if current_index is not None:
        record = df.loc[current_index]
        return (
            record["Prompt"],
            record["LLM1 response"],
            record["LLM2 response"],
            False,
        )

    # Nothing left to review.
    return "All rows have been reviewed.", "", "", True
def save_and_load_next(
    preference,
    factual_accuracy,
    relevance,
    llm1_issues,
    llm2_issues,
    llm1_tunisian_score,
    llm2_tunisian_score
):
    """
    Save the reviewer's feedback for the current row, then advance.

    1) Validates that a preference was chosen
    2) Saves feedback for the current row to the DataFrame and Google Sheets
    3) Moves to the next unreviewed row
    4) Returns the next prompt + updated status message in one step

    Args:
        preference: chosen option for overall quality; required, because the
            "Human judges quality" column is what marks a row as reviewed.
        factual_accuracy: chosen option for factual accuracy.
        relevance: chosen option for relevance to the prompt.
        llm1_issues: list of issue labels flagged for response 1 (may be empty).
        llm2_issues: list of issue labels flagged for response 2 (may be empty).
        llm1_tunisian_score: Tunisian-usage score for response 1.
        llm2_tunisian_score: Tunisian-usage score for response 2.

    Returns:
        A 4-tuple ``(prompt, llm1_response, llm2_response, status_message)``
        matching the Gradio outputs wired to the submit button.
    """
    global df, current_index, ws

    # If we're out of rows, just return "all done"
    if current_index is None:
        return (
            "All rows have been reviewed.",  # prompt
            "",                              # LLM1 resp
            "",                              # LLM2 resp
            "No more rows to review!"        # status message
        )

    # An empty/None preference would be stored in the column that marks a row
    # as reviewed: None makes the row look reviewed in memory while the sheet
    # stays blank, and "" makes the same row come back with a misleading
    # "Feedback saved!" message. Refuse the submission and keep the row.
    if not preference:
        prompt, llm1resp, llm2resp, _ = get_prompt_data()
        return (
            prompt,
            llm1resp,
            llm2resp,
            "Please choose which response you prefer before submitting."
        )

    feedback = {
        "Human judges quality": preference,
        "Human judges correctness": factual_accuracy,
        "Human judges relevance": relevance,
        "Human LLM1 flagged issues": ", ".join(llm1_issues) if llm1_issues else "",
        "Human LLM2 flagged issues": ", ".join(llm2_issues) if llm2_issues else "",
        "Human LLM1 Tunisian usage score": llm1_tunisian_score,
        "Human LLM2 Tunisian usage score": llm2_tunisian_score,
    }

    # 1. Update the in-memory DataFrame
    for column_name, value in feedback.items():
        df.at[current_index, column_name] = value

    # 2. Write updates back to Google Sheets in a single batched request
    #    (one API call instead of seven, so a submission is not left
    #    half-written and counts once against the write quota).
    sheet_row = int(current_index) + 2  # row 1 is headers
    headers = list(df.columns)
    ws.batch_update(
        [
            {
                # +1 because sheet columns are 1-based
                "range": gspread.utils.rowcol_to_a1(
                    sheet_row, headers.index(column_name) + 1
                ),
                "values": [[value]],
            }
            for column_name, value in feedback.items()
        ],
        # Same input mode update_cell uses, so values are parsed identically.
        value_input_option="USER_ENTERED",
    )

    # 3. Move to the next unreviewed row
    unreviewed_rows = df[df["Human judges quality"] == ""]
    if len(unreviewed_rows) == 0:
        current_index = None
        return (
            "All rows have been reviewed.",            # prompt
            "",                                        # LLM1 resp
            "",                                        # LLM2 resp
            "All rows have been reviewed. Thank you!"  # status message
        )

    current_index = unreviewed_rows.index[0]
    prompt, llm1resp, llm2resp, _ = get_prompt_data()
    return (
        prompt,
        llm1resp,
        llm2resp,
        "Feedback saved! Moving to the next prompt..."
    )
def on_load():
    """
    Populate the interface when it first loads.

    Returns the current prompt and both responses plus a status string:
    empty while a row is pending, or a "done" notice when get_prompt_data()
    reports that nothing is left.
    """
    prompt, first_reply, second_reply, finished = get_prompt_data()
    status = "No next prompt. All done." if finished else ""
    return prompt, first_reply, second_reply, status
# ---- Option lists shared by several widgets (each widget gets its own copy)
_COMPARISON_CHOICES = ("LLM1", "LLM2", "Tie", "Both are bad")
_ISSUE_CHOICES = (
    "Hate Speech",
    "Not Arabic",
    "Inappropriate Content",
    "Sexual Content",
    "Untruthful Info",
    "Violent Content",
    "Personal Information",
)
_TUNISIAN_SCORE_CHOICES = (0, 1, 2)

# ---- Load the worksheet before the UI is built
init_gsheets()

# ---- Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# LLM Responses Evaluation (Google Sheets)")

    # Read-only display of the row under review.
    prompt_text = gr.Textbox(label="Prompt", interactive=False)
    llm1_text = gr.Textbox(label="LLM1 Response", interactive=False)
    llm2_text = gr.Textbox(label="LLM2 Response", interactive=False)
    status_msg = gr.Markdown()

    # Side-by-side comparison questions.
    preference = gr.Radio(
        list(_COMPARISON_CHOICES),
        label="Which response do you prefer?"
    )
    factual_accuracy = gr.Radio(
        list(_COMPARISON_CHOICES),
        label="Which response is more factually accurate?"
    )
    relevance = gr.Radio(
        list(_COMPARISON_CHOICES),
        label="Which response better addresses the prompt?"
    )

    # Per-response issue flags.
    llm1_issues = gr.CheckboxGroup(
        list(_ISSUE_CHOICES),
        label="Does Response 1 contain any issues?"
    )
    llm2_issues = gr.CheckboxGroup(
        list(_ISSUE_CHOICES),
        label="Does Response 2 contain any issues?"
    )

    # Per-response Tunisian Arabic usage scores (default 0).
    llm1_tunisian_score = gr.Radio(
        list(_TUNISIAN_SCORE_CHOICES),
        label="Rate LLM1's use of Tunisian Arabic",
        value=0
    )
    llm2_tunisian_score = gr.Radio(
        list(_TUNISIAN_SCORE_CHOICES),
        label="Rate LLM2's use of Tunisian Arabic",
        value=0
    )

    submit_btn = gr.Button("Submit Feedback")

    # One callback both stores the feedback and shows the next prompt.
    submit_btn.click(
        fn=save_and_load_next,
        inputs=[
            preference,
            factual_accuracy,
            relevance,
            llm1_issues,
            llm2_issues,
            llm1_tunisian_score,
            llm2_tunisian_score
        ],
        outputs=[prompt_text, llm1_text, llm2_text, status_msg]
    )

    # When the page first loads, show the first unreviewed prompt.
    demo.load(
        fn=on_load,
        inputs=[],
        outputs=[prompt_text, llm1_text, llm2_text, status_msg]
    )

demo.launch()
|