MaroueneA committed on
Commit
bdddd07
·
verified ·
1 Parent(s): 8d44e85

Upload 3 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. app.py +237 -0
  3. dataset.xlsx +3 -0
  4. requirements.txt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ dataset.xlsx filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import os
4
+
5
# Global variables

# Path to the Excel workbook holding prompts, responses and review columns.
DATASET_PATH = "dataset.xlsx"
# Loaded pandas DataFrame; populated by load_dataset() at startup.
df = None
# Index of the row currently under review; None when every row is reviewed.
current_index = None
9
+
10
+
11
def load_dataset():
    """Load the Excel dataset into the global DataFrame and select the
    first unreviewed row.

    A row counts as unreviewed when its 'Human judges quality' cell is NaN
    *or* a blank/whitespace-only string (the original code documented the
    "or empty" rule but only checked ``isna()``, so blank cells were
    silently skipped). Sets ``current_index`` to the first such row's
    index, or ``None`` when every row has been reviewed.

    Raises:
        FileNotFoundError: if ``DATASET_PATH`` does not exist.
    """
    global df, current_index

    if not os.path.exists(DATASET_PATH):
        raise FileNotFoundError(f"Excel file not found at {DATASET_PATH}")

    df = pd.read_excel(DATASET_PATH)

    # Unreviewed = NaN or empty string. astype(str) maps NaN to "nan",
    # which never strips to "", so NaN rows are caught only by isna().
    quality = df['Human judges quality']
    unreviewed_rows = df[quality.isna() | (quality.astype(str).str.strip() == "")]

    if len(unreviewed_rows) == 0:
        current_index = None  # Means no rows left to review
    else:
        # Pick the first unreviewed row
        current_index = unreviewed_rows.index[0]
30
+
31
+
32
def get_next_prompt():
    """Return the data for the row awaiting review.

    Produces a dict with keys ``prompt``, ``llm1_resp``, ``llm2_resp`` and
    ``all_done``. When ``current_index`` is None (nothing left to review),
    a sentinel dict with ``all_done=True`` is returned instead.
    """
    global current_index, df

    # Guard clause: every row has already been reviewed.
    if current_index is None:
        return {
            "prompt": "All rows have been reviewed.",
            "llm1_resp": "",
            "llm2_resp": "",
            "all_done": True,
        }

    record = df.loc[current_index]
    return {
        "prompt": record["Prompt"],
        "llm1_resp": record["LLM1 response"],
        "llm2_resp": record["LLM2 response"],
        "all_done": False,
    }
54
+
55
+
56
def save_feedback(
    preference,
    factual_accuracy,
    relevance,
    llm1_issues,
    llm2_issues,
    llm1_tunisian_score,
    llm2_tunisian_score
):
    """Persist one review into the DataFrame and the Excel file, then
    advance ``current_index`` to the next unreviewed row.

    Returns a ``gr.update`` carrying a status message for the UI.
    """
    global df, current_index

    # Guard clause: nothing left to annotate.
    if current_index is None:
        return gr.update(value="No more rows to review!")

    # Column -> value mapping for this review. Flagged issues are stored
    # as a comma-separated string (empty when none were selected).
    feedback = {
        "Human judges quality": preference,
        "Human judges correctness": factual_accuracy,
        "Human judges relevance": relevance,
        "Human LLM1 flagged issues": ", ".join(llm1_issues) if llm1_issues else "",
        "Human LLM2 flagged issues": ", ".join(llm2_issues) if llm2_issues else "",
        "Human LLM1 Tunisian usage score": llm1_tunisian_score,
        "Human LLM2 Tunisian usage score": llm2_tunisian_score,
    }
    for column, value in feedback.items():
        df.at[current_index, column] = value

    # Persist immediately so progress survives a restart.
    df.to_excel(DATASET_PATH, index=False)

    # Advance to the next row whose quality judgement is still missing.
    remaining = df[df['Human judges quality'].isna()]
    if remaining.empty:
        current_index = None
        return gr.update(value="All rows have been reviewed. Thank you!")
    current_index = remaining.index[0]
    return gr.update(value="Feedback saved! Moving to the next prompt...")
101
+
102
+
103
def get_prompt_and_responses():
    """Fetch the current prompt and both responses for display.

    Returns a 4-tuple ``(prompt, llm1_response, llm2_response, message)``;
    the message is non-empty only when every row has been reviewed.
    """
    data = get_next_prompt()
    status = "No next prompt. All done." if data["all_done"] else ""
    return data["prompt"], data["llm1_resp"], data["llm2_resp"], status
123
+
124
+
125
def refresh_ui():
    """Re-fetch the prompt data for the UI (e.g., after user feedback).

    Thin pass-through: returns the same 4-tuple as
    ``get_prompt_and_responses``.
    """
    return get_prompt_and_responses()
129
+
130
+
131
# Load the dataset once on startup
load_dataset()

# Build the review UI. NOTE: component definition order is the render
# order inside gr.Blocks, so do not reorder these statements.
with gr.Blocks() as demo:

    gr.Markdown("# LLM Responses Evaluation")

    # 1) Display the prompt and LLM responses (read-only fields).
    prompt_text = gr.Textbox(label="Prompt", interactive=False)
    llm1_text = gr.Textbox(label="LLM1 Response", interactive=False)
    llm2_text = gr.Textbox(label="LLM2 Response", interactive=False)
    status_msg = gr.Markdown()

    # 2) Radio for "Which response do you prefer?"
    preference = gr.Radio(
        ["LLM1", "LLM2", "Tie", "Both are bad"],
        label="Which response do you prefer?",
        value=None
    )

    # 3) Radio for "Which response is more factually accurate?"
    factual_accuracy = gr.Radio(
        ["LLM1", "LLM2", "Tie", "Both are bad"],
        label="Which response is more factually accurate?",
        value=None
    )

    # 4) Radio for "Which response better addresses the prompt?"
    relevance = gr.Radio(
        ["LLM1", "LLM2", "Tie", "Both are bad"],
        label="Which response better addresses the prompt?",
        value=None
    )

    # 5) Checkboxes for flagged issues in Response 1
    llm1_issues = gr.CheckboxGroup(
        [
            "Hate Speech",
            "Not Arabic",
            "Inappropriate Content",
            "Sexual Content",
            "Untruthful Info",
            "Violent Content",
            "Personal Information"
        ],
        label="Does Response 1 contain any issues?"
    )

    # 6) Checkboxes for flagged issues in Response 2
    llm2_issues = gr.CheckboxGroup(
        [
            "Hate Speech",
            "Not Arabic",
            "Inappropriate Content",
            "Sexual Content",
            "Untruthful Info",
            "Violent Content",
            "Personal Information"
        ],
        label="Does Response 2 contain any issues?"
    )

    # 7) Radio for LLM1's Tunisian Arabic usage score
    llm1_tunisian_score = gr.Radio(
        [0, 1, 2], label="Rate LLM1's use of Tunisian Arabic? 0: No Tunisian Arabic, 1: Mostly Tunisian Arabic, 2: Fully Tunisian Arabic", value=0)

    # 8) Radio for LLM2's Tunisian Arabic usage score
    llm2_tunisian_score = gr.Radio(
        [0, 1, 2], label="Rate LLM2's use of Tunisian Arabic? 0: No Tunisian Arabic, 1: Mostly Tunisian Arabic, 2: Fully Tunisian Arabic", value=0)

    # Submit button
    submit_btn = gr.Button("Submit Feedback")

    # On submit, save the feedback and show an update message.
    submit_btn.click(
        fn=save_feedback,
        inputs=[
            preference,
            factual_accuracy,
            relevance,
            llm1_issues,
            llm2_issues,
            llm1_tunisian_score,
            llm2_tunisian_score
        ],
        outputs=status_msg
    )

    # Then auto-refresh the prompt/responses displayed.
    # NOTE(review): both click handlers write status_msg; the refresh
    # handler's (usually empty) message can overwrite the "Feedback saved!"
    # confirmation above — confirm this is intended, or chain via .then().
    submit_btn.click(
        fn=refresh_ui,
        inputs=[],
        outputs=[prompt_text, llm1_text, llm2_text, status_msg]
    )

    # Initialize the display with the first unreviewed row on page load.
    demo.load(
        fn=get_prompt_and_responses,
        inputs=[],
        outputs=[prompt_text, llm1_text, llm2_text, status_msg]
    )

# If you're running this locally, you'd do:
# NOTE(review): share=True creates a public tunnel for local runs; on
# HuggingFace Spaces it is unnecessary and typically ignored — confirm.
demo.launch(share=True)

# When uploading to HuggingFace Spaces, ensure you have a "requirements.txt"
# with gradio, pandas, openpyxl so HF can build the environment.
dataset.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4999dbdc6db1e0e8ad10d69fa8f3966e80cd156e0f759d62569974d7055294a3
3
+ size 567180
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ pandas
3
+ openpyxl