File size: 7,342 Bytes
90ad0a1
 
 
 
 
a982ad4
 
90ad0a1
 
 
 
 
 
 
 
 
 
a982ad4
 
90ad0a1
 
 
 
 
 
a982ad4
90ad0a1
 
 
a982ad4
90ad0a1
 
a982ad4
90ad0a1
a982ad4
90ad0a1
 
 
 
 
 
a982ad4
 
 
 
 
90ad0a1
 
a982ad4
 
90ad0a1
a982ad4
90ad0a1
a982ad4
90ad0a1
 
 
 
 
 
 
 
a982ad4
 
 
 
 
90ad0a1
 
a982ad4
90ad0a1
a982ad4
 
 
 
 
 
 
 
90ad0a1
 
 
a982ad4
 
90ad0a1
 
a982ad4
 
 
90ad0a1
a982ad4
90ad0a1
a982ad4
90ad0a1
a982ad4
90ad0a1
 
 
a982ad4
 
90ad0a1
 
a982ad4
 
90ad0a1
 
 
a982ad4
 
 
 
 
 
90ad0a1
 
a982ad4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90ad0a1
a982ad4
90ad0a1
a982ad4
90ad0a1
 
a982ad4
90ad0a1
a982ad4
90ad0a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a982ad4
90ad0a1
a982ad4
90ad0a1
 
 
 
 
 
 
 
 
 
 
 
a982ad4
90ad0a1
a982ad4
90ad0a1
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import gradio as gr
import pandas as pd
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# ---- Google Sheets target
# Title of the spreadsheet to open.
SPREADSHEET_NAME = "dataset"
# Title of the tab (worksheet) inside that spreadsheet.
WORKSHEET_NAME = "sheet1"

# ---- Shared module state, filled in by init_gsheets()
# ws            -> worksheet handle used for writes
# df            -> DataFrame copy of the worksheet contents
# current_index -> label of the row under review (None = nothing to review)
ws = df = current_index = None

def init_gsheets():
    """
    Authenticate with Google Sheets and load the worksheet into module state.

    Reads service-account credentials from "creds.json", opens worksheet
    WORKSHEET_NAME of spreadsheet SPREADSHEET_NAME and sets three globals:

      * ws            - the opened worksheet handle
      * df            - DataFrame built from ws.get_all_values(); the first
                        sheet row supplies the column names
      * current_index - label of the first row whose "Human judges quality"
                        cell is empty, or None when no such row exists

    A worksheet with no rows at all is treated like a header-only sheet:
    df becomes an empty DataFrame and current_index is None.

    Raises:
        KeyError: the header row has no "Human judges quality" column.
    """
    global df, current_index, ws

    # Scopes for Google Sheets
    scope = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive"
    ]

    creds = ServiceAccountCredentials.from_json_keyfile_name("creds.json", scope)
    gc = gspread.authorize(creds)
    sh = gc.open(SPREADSHEET_NAME)
    ws = sh.worksheet(WORKSHEET_NAME)

    # Read all values from the sheet
    data = ws.get_all_values()
    if not data:
        # No header row either: indexing data[0] would raise IndexError, so
        # fall back to "nothing to review".
        df = pd.DataFrame()
        current_index = None
        return

    header, *rows = data  # row 1 = headers
    df = pd.DataFrame(rows, columns=header)

    marker_column = "Human judges quality"
    if marker_column not in df.columns:
        raise KeyError(
            f"Worksheet {WORKSHEET_NAME!r} has no {marker_column!r} column; "
            "it is required to find unreviewed rows."
        )

    # A row counts as unreviewed while its marker cell is still empty.
    unreviewed_labels = df.index[df[marker_column] == ""]
    if len(unreviewed_labels) > 0:
        # Store a plain int so later row arithmetic yields a normal Python int.
        current_index = int(unreviewed_labels[0])
    else:
        current_index = None

def get_prompt_data():
    """
    Fetch the texts for the row currently under review.

    Returns a 4-tuple (prompt, llm1_response, llm2_response, all_done).
    When current_index is None the tuple carries the "all reviewed" message,
    two empty strings and all_done=True; otherwise it carries the row's
    "Prompt", "LLM1 response" and "LLM2 response" cells and all_done=False.
    """
    global df, current_index

    if current_index is not None:
        record = df.loc[current_index]
        texts = tuple(
            record[column]
            for column in ("Prompt", "LLM1 response", "LLM2 response")
        )
        return (*texts, False)

    # Nothing left to show.
    return "All rows have been reviewed.", "", "", True  # all_done = True

def save_and_load_next(
    preference,
    factual_accuracy,
    relevance,
    llm1_issues,
    llm2_issues,
    llm1_tunisian_score,
    llm2_tunisian_score
):
    """
    Save the reviewer's feedback for the current row and advance.

    1) Validates that a preference was chosen
    2) Writes the feedback for the current row to Google Sheets (one request)
    3) Mirrors the feedback into the in-memory DataFrame
    4) Moves to the next unreviewed row

    Args:
        preference: chosen value for "Human judges quality". This column is
            also the "reviewed" marker, so it must not be empty.
        factual_accuracy: value for "Human judges correctness".
        relevance: value for "Human judges relevance".
        llm1_issues: list of flagged issue labels for LLM1 (may be empty/None).
        llm2_issues: list of flagged issue labels for LLM2 (may be empty/None).
        llm1_tunisian_score: value for "Human LLM1 Tunisian usage score".
        llm2_tunisian_score: value for "Human LLM2 Tunisian usage score".

    Returns:
        A 4-tuple (prompt, llm1_response, llm2_response, status_message)
        describing what the UI should display next.

    Raises:
        ValueError: a feedback column is missing from the sheet headers.
        Any exception raised by the worksheet write propagates; in that case
        the in-memory state is left untouched so the row can be resubmitted.
    """
    global df, current_index, ws

    # If we're out of rows, just return "all done"
    if current_index is None:
        return (
            "All rows have been reviewed.",  # prompt
            "",                              # LLM1 resp
            "",                              # LLM2 resp
            "No more rows to review!"        # status message
        )

    # Without a preference the marker cell would stay empty in the sheet while
    # the in-memory value (None) no longer equals "", so the row would be
    # skipped for the rest of the session. Keep the reviewer on the same row.
    if not preference:
        prompt, llm1resp, llm2resp, _ = get_prompt_data()
        return (
            prompt,
            llm1resp,
            llm2resp,
            "Please choose which response you prefer before submitting."
        )

    # Column name -> value to store, in sheet-independent order.
    feedback = {
        "Human judges quality": preference,
        "Human judges correctness": factual_accuracy,
        "Human judges relevance": relevance,
        "Human LLM1 flagged issues": ", ".join(llm1_issues or []),
        "Human LLM2 flagged issues": ", ".join(llm2_issues or []),
        "Human LLM1 Tunisian usage score": llm1_tunisian_score,
        "Human LLM2 Tunisian usage score": llm2_tunisian_score,
    }

    # 1. Write updates back to Google Sheets first, in a single request.
    sheet_row = int(current_index) + 2  # row 1 is headers
    headers = list(df.columns)
    cells = [
        gspread.Cell(sheet_row, headers.index(column_name) + 1, value)  # 1-based columns
        for column_name, value in feedback.items()
    ]
    # USER_ENTERED keeps the value interpretation of the former per-cell
    # update_cell() writes (update_cells() defaults to RAW).
    ws.update_cells(cells, value_input_option="USER_ENTERED")

    # 2. Only after the sheet accepted the write, update the in-memory DataFrame
    for column_name, value in feedback.items():
        df.at[current_index, column_name] = value

    # 3. Move to the next unreviewed row
    remaining = df.index[df["Human judges quality"] == ""]
    if len(remaining) == 0:
        current_index = None
        return (
            "All rows have been reviewed.",  # prompt
            "",                              # LLM1 resp
            "",                              # LLM2 resp
            "All rows have been reviewed. Thank you!"  # status message
        )

    current_index = remaining[0]
    prompt, llm1resp, llm2resp, _ = get_prompt_data()
    return (
        prompt,
        llm1resp,
        llm2resp,
        "Feedback saved! Moving to the next prompt..."
    )

def on_load():
    """
    Populate the interface when the page loads.

    Returns (prompt, llm1_response, llm2_response, status_message) taken from
    get_prompt_data(); the status is a "done" notice when no row is left and
    an empty string otherwise.
    """
    *texts, finished = get_prompt_data()
    status = "No next prompt. All done." if finished else ""
    return (*texts, status)

# ---- Load the sheet and locate the first unreviewed row before building the UI
init_gsheets()

# Option sets shared by several widgets. Kept as tuples; every widget receives
# its own list copy.
_COMPARISON_CHOICES = ("LLM1", "LLM2", "Tie", "Both are bad")
_ISSUE_CHOICES = (
    "Hate Speech",
    "Not Arabic",
    "Inappropriate Content",
    "Sexual Content",
    "Untruthful Info",
    "Violent Content",
    "Personal Information",
)
_TUNISIAN_SCORE_CHOICES = (0, 1, 2)

# ---- Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# LLM Responses Evaluation (Google Sheets)")

    # Read-only view of the row under review, plus a status line.
    prompt_text = gr.Textbox(interactive=False, label="Prompt")
    llm1_text = gr.Textbox(interactive=False, label="LLM1 Response")
    llm2_text = gr.Textbox(interactive=False, label="LLM2 Response")
    status_msg = gr.Markdown()

    # Pairwise comparison questions.
    preference = gr.Radio(
        choices=list(_COMPARISON_CHOICES),
        label="Which response do you prefer?",
    )
    factual_accuracy = gr.Radio(
        choices=list(_COMPARISON_CHOICES),
        label="Which response is more factually accurate?",
    )
    relevance = gr.Radio(
        choices=list(_COMPARISON_CHOICES),
        label="Which response better addresses the prompt?",
    )

    # Per-response issue flags.
    llm1_issues = gr.CheckboxGroup(
        choices=list(_ISSUE_CHOICES),
        label="Does Response 1 contain any issues?",
    )
    llm2_issues = gr.CheckboxGroup(
        choices=list(_ISSUE_CHOICES),
        label="Does Response 2 contain any issues?",
    )

    # Per-response Tunisian Arabic usage scores (default 0).
    llm1_tunisian_score = gr.Radio(
        choices=list(_TUNISIAN_SCORE_CHOICES),
        label="Rate LLM1's use of Tunisian Arabic",
        value=0,
    )
    llm2_tunisian_score = gr.Radio(
        choices=list(_TUNISIAN_SCORE_CHOICES),
        label="Rate LLM2's use of Tunisian Arabic",
        value=0,
    )

    submit_btn = gr.Button("Submit Feedback")

    # Both callbacks refresh the same four display components.
    display_outputs = [prompt_text, llm1_text, llm2_text, status_msg]
    feedback_inputs = [
        preference,
        factual_accuracy,
        relevance,
        llm1_issues,
        llm2_issues,
        llm1_tunisian_score,
        llm2_tunisian_score,
    ]

    # Submit: store the feedback and show the next prompt in one callback.
    submit_btn.click(
        fn=save_and_load_next,
        inputs=feedback_inputs,
        outputs=display_outputs,
    )

    # Page load: show the first unreviewed prompt.
    demo.load(
        fn=on_load,
        inputs=[],
        outputs=display_outputs,
    )

demo.launch()