jmcinern committed on
Commit
74efe2a
·
verified ·
1 Parent(s): 7fff0c1

for annotation

Browse files
Files changed (1) hide show
  1. app.py +302 -0
app.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import json
4
+ import random
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ import os
8
+
9
+ from huggingface_hub import HfApi, hf_hub_download, create_repo
10
+ try:
11
+ from huggingface_hub.utils import HfHubHTTPError
12
+ except ImportError:
13
+ # For older versions of huggingface_hub
14
+ class HfHubHTTPError(Exception):
15
+ pass
16
+
17
+ # --- Configuration ---
18
+ # Source JSONL file (one JSON record per line) holding each instruction and its two candidate responses
19
+ TRANSLATED_FILE = "translated_IRT_ga.jsonl"
20
+ # Filename of the annotation CSV, used both for the local copy and for the copy in the Hub repo
21
+ ANNOTATION_FILE = "DPO_annotations.csv"
22
+ # Hugging Face Hub dataset repo that stores the annotation CSV
23
+ HF_REPO_ID = "jmcinern/DPO_ga" # Your HF repo ID
24
+
25
+ HF_TOKEN = os.getenv("HF_TOKEN")
26
+
27
+ # Deterministic sampling settings: how many shuffled samples are kept, and the seed used for the shuffle
28
+ NUM_SAMPLES = 100
29
+ RANDOM_SEED = 42
30
+
31
+ # --- UI Content (Markdown shown on the consent page) ---
32
+ CONSENT_MD = """
33
+ ### Irish QA Pair Comparison (Master’s Thesis)
34
+
35
+ You are invited to take part in a study on Large Language Model Irish-language QA quality.
36
+ By continuing, you consent to the following:
37
+
38
+ - Your annotations are anonymised.
39
+ - The dataset (reference text + model outputs + your choices) will be released **open-source** for both research and commercial purposes.
40
+ - No personal data is collected. You may stop at any time.
41
+
42
+ - You will answer the following question:
43
+
44
+ #### Which answer, A or B, is better in terms of grammar, naturalness, and coherence?
45
+
46
+ - Only base your decision on this question and not other factors.
47
+
48
+ Please confirm consent, select your role, then press **Begin**.
49
+ """
50
+
51
+ # --- Helper Functions ---
52
+
53
def load_master_samples() -> list:
    """Load the source JSONL file and return a deterministic subset of it.

    Every non-blank line of ``TRANSLATED_FILE`` is parsed as one JSON record.
    The records are shuffled with a ``random.Random`` seeded by
    ``RANDOM_SEED`` so that every session sees the same order, and the first
    ``NUM_SAMPLES`` records are returned.

    Returns:
        A list of at most ``NUM_SAMPLES`` parsed records.

    Raises:
        FileNotFoundError: if ``TRANSLATED_FILE`` does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    source = Path(TRANSLATED_FILE)
    if not source.exists():
        raise FileNotFoundError(f"Source file not found: {TRANSLATED_FILE}")

    with source.open("r", encoding="utf-8") as f:
        # Skip blank lines (e.g. stray empty lines at the end of the file);
        # json.loads would raise on them and abort the whole load.
        data = [json.loads(line) for line in f if line.strip()]

    # Shuffle with a fixed seed to get a deterministic "random" subset.
    rng = random.Random(RANDOM_SEED)
    rng.shuffle(data)
    return data[:NUM_SAMPLES]
64
+
65
def download_annotations() -> pd.DataFrame:
    """Fetch the annotation CSV from the Hugging Face Hub as a DataFrame.

    Returns:
        The parsed contents of ``ANNOTATION_FILE`` from ``HF_REPO_ID``. When
        the Hub answers 404 (treated as the first run, with no annotations
        yet), an empty DataFrame with the annotation columns is returned
        instead.

    Raises:
        HfHubHTTPError: for any Hub HTTP error other than a 404.
    """
    try:
        # Keep the try body to the one call that can raise HfHubHTTPError.
        local_path = hf_hub_download(
            repo_id=HF_REPO_ID,
            filename=ANNOTATION_FILE,
            repo_type="dataset",
            token=HF_TOKEN,
        )
    except HfHubHTTPError as e:
        # The exception may carry no usable response (the fallback exception
        # class has no ``response`` attribute, and the attribute can be None),
        # so read the status defensively instead of raising AttributeError
        # and hiding the real error.
        response = getattr(e, "response", None)
        status_code = getattr(response, "status_code", None)
        if status_code == 404:
            print("No remote annotation file found. Creating a new one.")
            # Schema of a fresh annotation file, including annotator_type.
            return pd.DataFrame(
                columns=["hash", "annotator_type", "choice", "preferred_response", "timestamp"]
            )
        raise  # Re-raise other HTTP errors

    print(f"Downloaded existing annotations from {HF_REPO_ID}")
    return pd.read_csv(local_path)
84
+
85
def upload_annotations(df: pd.DataFrame):
    """Save the annotations locally and push them to the Hugging Face Hub.

    The DataFrame is always written to ``ANNOTATION_FILE`` in the working
    directory. If ``HF_TOKEN`` is not set, a warning is printed and only the
    upload step is skipped.

    Args:
        df: the full annotation table to persist.
    """
    # Save locally first, before the token check, so an annotation is not
    # silently discarded when the upload has to be skipped.
    df.to_csv(ANNOTATION_FILE, index=False)

    if not HF_TOKEN:
        print("WARNING: No HF_TOKEN found. Skipping upload.")
        return

    # Upload to Hub; create_repo with exist_ok=True is a no-op when the
    # dataset repo already exists.
    api = HfApi()
    create_repo(HF_REPO_ID, repo_type="dataset", exist_ok=True, token=HF_TOKEN)
    api.upload_file(
        path_or_fileobj=ANNOTATION_FILE,
        path_in_repo=ANNOTATION_FILE,
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        commit_message="Append new DPO annotation",
    )
    print(f"Successfully uploaded updated annotations to {HF_REPO_ID}")
106
+
107
+
108
+ # --- Gradio Core Logic ---
109
+
110
def prepare_tasks():
    """Build the list of tasks that still need an annotation.

    Loads the master samples, downloads the existing annotations, and drops
    every sample whose ``hash`` already appears in them. For each remaining
    sample the two responses are shuffled so that "A" and "B" are assigned at
    random; ``shuffle_map`` records which original response each letter
    stands for.

    Returns:
        A list of task dicts with keys ``hash``, ``instruction``,
        ``response_A``, ``response_B`` and ``shuffle_map``.
    """
    samples = load_master_samples()
    existing = download_annotations()
    done = set(existing['hash'].unique())

    pending = []
    for record in samples:
        if record['hash'] in done:
            continue

        # Random presentation order avoids a position bias between responses.
        pair = [('response1', record['response1']), ('response2', record['response2'])]
        random.shuffle(pair)
        (key_a, text_a), (key_b, text_b) = pair

        pending.append({
            "hash": record['hash'],
            "instruction": record['instruction'],
            "response_A": text_a,
            "response_B": text_b,
            # Which original response is shown as A and which as B.
            "shuffle_map": {'A': key_a, 'B': key_b},
        })
    return pending
136
+
137
def start_session(annotator_type):
    """Handle the 'Begin' button: build the task list and show the first task.

    Args:
        annotator_type: the role chosen in the dropdown on the consent page.

    Returns:
        A dict mapping Gradio components to their new values. When tasks
        remain, the task page is shown with the first task loaded; otherwise
        the completion page is shown and the session state is reset.
    """
    tasks = prepare_tasks()

    if tasks:
        opening_task = tasks[0]
        progress_str = f"Progress: 1 / {len(tasks)}"
        return {
            consent_group: gr.update(visible=False),
            task_group: gr.update(visible=True),
            done_group: gr.update(visible=False),
            state_tasks: tasks,
            state_task_index: 0,
            state_annotator_type: annotator_type,
            progress_counter: gr.update(value=progress_str),
            instruction_box: gr.update(value=opening_task['instruction']),
            response_a_box: gr.update(value=opening_task['response_A']),
            response_b_box: gr.update(value=opening_task['response_B']),
        }

    # Nothing left to annotate: go straight to the completion page.
    return {
        consent_group: gr.update(visible=False),
        task_group: gr.update(visible=False),
        done_group: gr.update(visible=True),
        state_tasks: [],
        state_task_index: 0,
        state_annotator_type: "",
    }
168
+
169
def record_choice(tasks, current_index, annotator_type, choice):
    """Record the annotator's choice, persist it, and load the next task.

    Args:
        tasks: the task dicts prepared for this session.
        current_index: index into ``tasks`` of the task being answered.
        annotator_type: the role selected on the consent page.
        choice: ``'A'`` or ``'B'``, the answer the annotator preferred.

    Returns:
        A dict mapping Gradio components to their new values: either the next
        task's content and the advanced index, or the switch to the
        completion page once every task has been answered.
    """
    session_done = {
        task_group: gr.update(visible=False),
        done_group: gr.update(visible=True),
    }

    # Guard: a repeated click that arrives after the last task was recorded
    # (or an empty task list) must neither raise IndexError nor record the
    # same task a second time.
    if current_index >= len(tasks):
        return session_done

    # 1. Get current task and determine which original response was preferred
    current_task = tasks[current_index]
    preferred_response_key = current_task['shuffle_map'][choice]  # 'response1' or 'response2'

    # 2. Create a new annotation row, including the annotator_type
    new_annotation = {
        "hash": current_task['hash'],
        "annotator_type": annotator_type,
        "choice": choice,  # 'A' or 'B'
        "preferred_response": preferred_response_key,
        "timestamp": datetime.utcnow().isoformat(),
    }

    # 3. Load existing annotations, append, and upload
    annotations_df = download_annotations()
    new_df = pd.concat([annotations_df, pd.DataFrame([new_annotation])], ignore_index=True)
    upload_annotations(new_df)

    # 4. Move to the next task
    next_index = current_index + 1
    if next_index >= len(tasks):
        # All tasks for this session are done. Store the advanced index as
        # well, so the guard above catches any further click.
        return {state_task_index: next_index, **session_done}

    next_task = tasks[next_index]
    progress_str = f"Progress: {next_index + 1} / {len(tasks)}"

    return {
        state_task_index: next_index,
        progress_counter: gr.update(value=progress_str),
        instruction_box: gr.update(value=next_task['instruction']),
        response_a_box: gr.update(value=next_task['response_A']),
        response_b_box: gr.update(value=next_task['response_B']),
    }
210
+
211
def update_begin_button_status(consent_given, role_selected):
    """Return an update that makes the Begin button clickable only when the
    consent box is checked and a role has been selected.

    Args:
        consent_given: current value of the consent checkbox.
        role_selected: current value of the role dropdown (``None`` when
            nothing is selected).
    """
    can_begin = consent_given and role_selected is not None
    return gr.update(interactive=can_begin)
214
+
215
+
216
+ # --- Gradio UI Layout ---
217
+
218
+ with gr.Blocks(theme=gr.themes.Soft(), title="DPO Annotation") as demo:
219
+ # Per-session state: the task list, the index of the task on screen, and the annotator's role
220
+ state_tasks = gr.State([])
221
+ state_task_index = gr.State(0)
222
+ state_annotator_type = gr.State("")
223
+
224
+ # Page 1: Consent text, consent checkbox, role dropdown and the (initially disabled) Begin button
225
+ with gr.Group(visible=True) as consent_group:
226
+ gr.Markdown(CONSENT_MD)
227
+ with gr.Row():
228
+ consent_checkbox = gr.Checkbox(label="I consent to the terms above")
229
+ annotator_type_dropdown = gr.Dropdown(["Tester", "Native"], label="Select Your Role")
230
+ begin_btn = gr.Button("Begin", interactive=False)
231
+
232
+ # Page 2: Annotation Task (hidden at start): progress text, instruction, both answers and the two choice buttons
233
+ with gr.Group(visible=False) as task_group:
234
+ progress_counter = gr.Markdown("Progress: 0 / 0", elem_id="progress_counter")
235
+ with gr.Column():
236
+ instruction_box = gr.Textbox(label="Instruction", interactive=False, lines=3)
237
+ with gr.Row():
238
+ response_a_box = gr.Textbox(label="Answer A", interactive=False, lines=8)
239
+ response_b_box = gr.Textbox(label="Answer B", interactive=False, lines=8)
240
+ with gr.Row():
241
+ choose_a_btn = gr.Button("A is Better", variant="primary")
242
+ choose_b_btn = gr.Button("B is Better", variant="primary")
243
+
244
+ # Page 3: Completion Message (hidden at start; shown once no tasks remain)
245
+ with gr.Group(visible=False) as done_group:
246
+ gr.Markdown("## ✅ Thank You!\n\nAll available samples have been annotated. Your contribution is greatly appreciated.")
247
+
248
+
249
+ # --- Event Handlers ---
250
+
251
+ # Enable 'Begin' button only when consent is checked AND a role is selected; both widgets re-run the same check on change
252
+ consent_checkbox.change(
253
+ fn=update_begin_button_status,
254
+ inputs=[consent_checkbox, annotator_type_dropdown],
255
+ outputs=begin_btn
256
+ )
257
+ annotator_type_dropdown.change(
258
+ fn=update_begin_button_status,
259
+ inputs=[consent_checkbox, annotator_type_dropdown],
260
+ outputs=begin_btn
261
+ )
262
+
263
+ # Start the session when 'Begin' is clicked: start_session sets the session state, page visibility and first task
264
+ begin_btn.click(
265
+ fn=start_session,
266
+ inputs=[annotator_type_dropdown],
267
+ outputs=[
268
+ consent_group, task_group, done_group,
269
+ state_tasks, state_task_index, state_annotator_type,
270
+ progress_counter, instruction_box, response_a_box, response_b_box
271
+ ]
272
+ )
273
+
274
+ # Handle choice A: the constant gr.State('A') supplies record_choice's `choice` argument
275
+ choose_a_btn.click(
276
+ fn=record_choice,
277
+ inputs=[state_tasks, state_task_index, state_annotator_type, gr.State('A')],
278
+ outputs=[
279
+ state_task_index, progress_counter,
280
+ instruction_box, response_a_box, response_b_box,
281
+ task_group, done_group
282
+ ]
283
+ )
284
+
285
+ # Handle choice B: same wiring as choice A, with gr.State('B') as the `choice` argument
286
+ choose_b_btn.click(
287
+ fn=record_choice,
288
+ inputs=[state_tasks, state_task_index, state_annotator_type, gr.State('B')],
289
+ outputs=[
290
+ state_task_index, progress_counter,
291
+ instruction_box, response_a_box, response_b_box,
292
+ task_group, done_group
293
+ ]
294
+ )
295
+
296
if __name__ == "__main__":
    # Launch only when the source data is present; otherwise explain what is
    # missing instead of starting the app.
    if Path(TRANSLATED_FILE).exists():
        demo.launch()
    else:
        print(f"FATAL: Source data file '{TRANSLATED_FILE}' not found.")
        print("Please ensure the file is in the correct directory before running.")