File size: 10,838 Bytes
74efe2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fac4afa
74efe2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
import gradio as gr
import pandas as pd
import json
import random
from datetime import datetime
from pathlib import Path
import os

from huggingface_hub import HfApi, hf_hub_download, create_repo
try:
    from huggingface_hub.utils import HfHubHTTPError
except ImportError:
    # For older versions of huggingface_hub
    class HfHubHTTPError(Exception):
        pass

# --- Configuration ---
# JSON Lines source file; each record is read for the keys "hash",
# "instruction", "response1" and "response2".
TRANSLATED_FILE = "translated_IRT_ga.jsonl"
# CSV of collected annotations; the same name is used for the local copy and
# for the path inside the Hub repo.
ANNOTATION_FILE = "DPO_annotations.csv"
# Hugging Face Hub dataset repo that stores the annotation CSV.
HF_REPO_ID = "jmcinern/DPO_ga" # Your HF repo ID

# Hub access token read from the environment; None when HF_TOKEN is unset,
# in which case uploads are skipped.
HF_TOKEN = os.getenv("HF_TOKEN")

# Deterministic sampling settings: the source records are shuffled with
# RANDOM_SEED and the first NUM_SAMPLES of them form the annotation pool.
NUM_SAMPLES = 200
RANDOM_SEED = 42

# --- UI Content ---
# Markdown rendered on the consent page (first page of the app). It explains
# the study, the data-release terms, and the single question annotators answer.
CONSENT_MD = """
### Irish QA Pair Comparison (Master’s Thesis)

You are invited to take part in a study on Large Language Model Irish-language QA quality.
By continuing, you consent to the following:

- Your annotations are anonymised.
- The dataset (reference text + model outputs + your choices) will be released **open-source** for both research and commercial purposes.
- No personal data is collected. You may stop at any time.

- You will answer the following question:

#### Which answer, A or B, is better in terms of grammar, naturalness, and coherence?

- Only base your decision on this question and not other factors.

Please confirm consent, select your role, then press **Begin**.
"""

# --- Helper Functions ---

def load_master_samples() -> list:
    """Load the source samples and return a deterministic random subset.

    Reads ``TRANSLATED_FILE`` as JSON Lines (one JSON object per line),
    shuffles the records with a ``random.Random`` seeded by ``RANDOM_SEED``
    and returns the first ``NUM_SAMPLES`` of them, so every call yields the
    same subset in the same order for an unchanged source file.

    Returns:
        A list of at most ``NUM_SAMPLES`` decoded records.

    Raises:
        FileNotFoundError: If ``TRANSLATED_FILE`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    source_path = Path(TRANSLATED_FILE)
    if not source_path.exists():
        raise FileNotFoundError(f"Source file not found: {TRANSLATED_FILE}")

    with open(source_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. an extra empty line at the end of the file) are
        # not records; skipping them avoids a JSONDecodeError on "".
        data = [json.loads(line) for line in f if line.strip()]

    # Shuffle with a private, fixed-seed generator so the subset is
    # reproducible and the global `random` state is left untouched.
    rng = random.Random(RANDOM_SEED)
    rng.shuffle(data)
    return data[:NUM_SAMPLES]

def download_annotations() -> pd.DataFrame:
    """Fetch the shared annotation CSV from the Hugging Face Hub.

    Returns:
        The contents of ``ANNOTATION_FILE`` from the dataset repo
        ``HF_REPO_ID``, or an empty DataFrame with the annotation columns
        when the Hub answers 404 (no annotation file uploaded yet).

    Raises:
        HfHubHTTPError: For any Hub HTTP error other than a 404.
    """
    try:
        local_path = hf_hub_download(
            repo_id=HF_REPO_ID,
            filename=ANNOTATION_FILE,
            repo_type="dataset",
            token=HF_TOKEN,
        )
    except HfHubHTTPError as e:
        # The exception may carry no HTTP response (``response`` can be None,
        # or absent on the fallback exception class). Read the status
        # defensively so the real error is re-raised instead of being masked
        # by an AttributeError.
        response = getattr(e, "response", None)
        status_code = getattr(response, "status_code", None)
        if status_code != 404:
            raise  # Re-raise other HTTP errors

        # A 404 means the file doesn't exist on the Hub yet: first run.
        print("No remote annotation file found. Creating a new one.")
        # Schema of a fresh annotation table, including annotator_type.
        return pd.DataFrame(columns=["hash", "annotator_type", "choice", "preferred_response", "timestamp"])

    print(f"Downloaded existing annotations from {HF_REPO_ID}")
    return pd.read_csv(local_path)

def upload_annotations(df: pd.DataFrame):
    """Write ``df`` to ``ANNOTATION_FILE`` and push that file to the Hub.

    When ``HF_TOKEN`` is not set, only a warning is printed: the CSV is
    neither written locally nor uploaded.

    Args:
        df: The complete annotation table to store.
    """
    if not HF_TOKEN:
        print("WARNING: No HF_TOKEN found. Skipping upload.")
        return

    # Write the table to disk (without the index column); this local file is
    # what gets uploaded below.
    df.to_csv(ANNOTATION_FILE, index=False)

    hub = HfApi()
    # Settings shared by the repo-creation and upload calls.
    hub_target = {
        "repo_id": HF_REPO_ID,
        "repo_type": "dataset",
        "token": HF_TOKEN,
    }

    # Make sure the dataset repo exists, then upload the CSV under the same
    # name it has locally.
    create_repo(exist_ok=True, **hub_target)
    hub.upload_file(
        path_or_fileobj=ANNOTATION_FILE,
        path_in_repo=ANNOTATION_FILE,
        commit_message="Append new DPO annotation",
        **hub_target,
    )
    print(f"Successfully uploaded updated annotations to {HF_REPO_ID}")


# --- Gradio Core Logic ---

def prepare_tasks():
    """Build the list of tasks that still need an annotation.

    Loads the master sample pool and the annotations already stored on the
    Hub, drops every sample whose hash has been annotated, and presents the
    two responses of each remaining sample in random A/B order.

    Returns:
        A list of dicts with the keys "hash", "instruction", "response_A",
        "response_B" and "shuffle_map"; "shuffle_map" maps 'A' and 'B' back
        to the original key ('response1' or 'response2').
    """
    samples = load_master_samples()
    annotated_hashes = set(download_annotations()['hash'].unique())

    pending = []
    for sample in samples:
        if sample['hash'] in annotated_hashes:
            continue

        # Randomise which response is shown as A so position cannot bias
        # the annotator.
        pair = [('response1', sample['response1']), ('response2', sample['response2'])]
        random.shuffle(pair)
        (key_a, text_a), (key_b, text_b) = pair

        pending.append({
            "hash": sample['hash'],
            "instruction": sample['instruction'],
            "response_A": text_a,
            "response_B": text_b,
            # Remember which original response ended up as A and as B.
            "shuffle_map": {'A': key_a, 'B': key_b}
        })
    return pending

def start_session(annotator_type):
    """Handle the 'Begin' button: build the task list and show the first task.

    Args:
        annotator_type: Role chosen in the dropdown on the consent page.

    Returns:
        A dict mapping Gradio components to their new values. With tasks
        available it shows the task page filled with the first task; with
        none left it shows the completion page and resets the session state.
    """
    tasks = prepare_tasks()

    if tasks:
        opening_task = tasks[0]
        return {
            consent_group: gr.update(visible=False),
            task_group: gr.update(visible=True),
            done_group: gr.update(visible=False),
            state_tasks: tasks,
            state_task_index: 0,
            state_annotator_type: annotator_type,
            progress_counter: gr.update(value=f"Progress: 1 / {len(tasks)}"),
            instruction_box: gr.update(value=opening_task['instruction']),
            response_a_box: gr.update(value=opening_task['response_A']),
            response_b_box: gr.update(value=opening_task['response_B']),
        }

    # Every sample already has an annotation: skip straight to the
    # completion page.
    return {
        consent_group: gr.update(visible=False),
        task_group: gr.update(visible=False),
        done_group: gr.update(visible=True),
        state_tasks: [],
        state_task_index: 0,
        state_annotator_type: ""
    }

def record_choice(tasks, current_index, annotator_type, choice):
    """Record the annotator's choice, persist it, and show the next task.

    Args:
        tasks: Task dicts for this session, as built by ``prepare_tasks``.
        current_index: Index into ``tasks`` of the task being answered.
        annotator_type: Role selected on the consent page; stored with the row.
        choice: 'A' or 'B', the answer the annotator preferred.

    Returns:
        A dict mapping Gradio components to their new values: either the
        next task's content and index, or the switch to the completion page
        once the last task has been answered.
    """
    # A click can arrive with an index that no longer points at a task (an
    # empty task list, or a repeated click after the last task). There is
    # nothing to record then, so just show the completion page instead of
    # failing with an IndexError.
    if current_index >= len(tasks):
        return {
            task_group: gr.update(visible=False),
            done_group: gr.update(visible=True)
        }

    # 1. Get current task and determine which original response was preferred
    current_task = tasks[current_index]
    preferred_response_key = current_task['shuffle_map'][choice] # 'response1' or 'response2'

    # 2. Create a new annotation row, including the annotator_type
    new_annotation = {
        "hash": current_task['hash'],
        "annotator_type": annotator_type,
        "choice": choice, # 'A' or 'B'
        "preferred_response": preferred_response_key,
        # Naive UTC timestamp in ISO 8601 form (no timezone suffix).
        "timestamp": datetime.utcnow().isoformat()
    }

    # 3. Re-download the latest annotations, append the new row, and upload
    annotations_df = download_annotations()
    new_df = pd.concat([annotations_df, pd.DataFrame([new_annotation])], ignore_index=True)
    upload_annotations(new_df)

    # 4. Move to the next task
    next_index = current_index + 1
    if next_index >= len(tasks):
        # All tasks for this session are done. The index is still advanced so
        # that a repeated click hits the guard above rather than recording
        # the last task a second time.
        return {
            state_task_index: next_index,
            task_group: gr.update(visible=False),
            done_group: gr.update(visible=True)
        }

    next_task = tasks[next_index]
    progress_str = f"Progress: {next_index + 1} / {len(tasks)}"

    return {
        state_task_index: next_index,
        progress_counter: gr.update(value=progress_str),
        instruction_box: gr.update(value=next_task['instruction']),
        response_a_box: gr.update(value=next_task['response_A']),
        response_b_box: gr.update(value=next_task['response_B']),
    }

def update_begin_button_status(consent_given, role_selected):
    """Return an update that enables 'Begin' only with consent and a role.

    Args:
        consent_given: Current value of the consent checkbox.
        role_selected: Current dropdown value; None when nothing is selected.
    """
    has_role = role_selected is not None
    can_begin = consent_given and has_role
    return gr.update(interactive=can_begin)


# --- Gradio UI Layout ---

# The app is a single Blocks page made of three groups that act as pages;
# the handlers switch between them by toggling each group's `visible` flag.
with gr.Blocks(theme=gr.themes.Soft(), title="DPO Annotation") as demo:
    # Session state:
    #   state_tasks          - task dicts produced by start_session
    #   state_task_index     - index of the task currently shown
    #   state_annotator_type - role chosen on the consent page
    state_tasks = gr.State([])
    state_task_index = gr.State(0)
    state_annotator_type = gr.State("")

    # Page 1: Consent (the only page visible at start-up)
    with gr.Group(visible=True) as consent_group:
        gr.Markdown(CONSENT_MD)
        with gr.Row():
            consent_checkbox = gr.Checkbox(label="I consent to the terms above")
            annotator_type_dropdown = gr.Dropdown(["Tester", "Native"], label="Select Your Role")
        # Starts disabled; update_begin_button_status enables it.
        begin_btn = gr.Button("Begin", interactive=False)

    # Page 2: Annotation Task (read-only instruction and the two answers,
    # plus one button per choice)
    with gr.Group(visible=False) as task_group:
        progress_counter = gr.Markdown("Progress: 0 / 0", elem_id="progress_counter")
        with gr.Column():
            instruction_box = gr.Textbox(label="Instruction", interactive=False, lines=3)
            with gr.Row():
                response_a_box = gr.Textbox(label="Answer A", interactive=False, lines=8)
                response_b_box = gr.Textbox(label="Answer B", interactive=False, lines=8)
            with gr.Row():
                choose_a_btn = gr.Button("A is Better", variant="primary")
                choose_b_btn = gr.Button("B is Better", variant="primary")

    # Page 3: Completion Message
    with gr.Group(visible=False) as done_group:
        gr.Markdown("## ✅ Thank You!\n\nAll available samples have been annotated. Your contribution is greatly appreciated.")


    # --- Event Handlers ---

    # Enable 'Begin' button only when consent is checked AND a role is selected.
    # Both inputs trigger the same check so the order of interaction is free.
    consent_checkbox.change(
        fn=update_begin_button_status,
        inputs=[consent_checkbox, annotator_type_dropdown],
        outputs=begin_btn
    )
    annotator_type_dropdown.change(
        fn=update_begin_button_status,
        inputs=[consent_checkbox, annotator_type_dropdown],
        outputs=begin_btn
    )

    # Start the session when 'Begin' is clicked. The outputs list names every
    # component start_session may update in its returned dict.
    begin_btn.click(
        fn=start_session,
        inputs=[annotator_type_dropdown],
        outputs=[
            consent_group, task_group, done_group,
            state_tasks, state_task_index, state_annotator_type,
            progress_counter, instruction_box, response_a_box, response_b_box
        ]
    )

    # Handle choice A: the constant gr.State('A') supplies record_choice's
    # `choice` argument.
    choose_a_btn.click(
        fn=record_choice,
        inputs=[state_tasks, state_task_index, state_annotator_type, gr.State('A')],
        outputs=[
            state_task_index, progress_counter,
            instruction_box, response_a_box, response_b_box,
            task_group, done_group
        ]
    )

    # Handle choice B: identical wiring, with 'B' as the constant choice.
    choose_b_btn.click(
        fn=record_choice,
        inputs=[state_tasks, state_task_index, state_annotator_type, gr.State('B')],
        outputs=[
            state_task_index, progress_counter,
            instruction_box, response_a_box, response_b_box,
            task_group, done_group
        ]
    )

if __name__ == "__main__":
    # Launch only when the source data is present; otherwise explain why the
    # app was not started.
    if Path(TRANSLATED_FILE).exists():
        demo.launch()
    else:
        print(f"FATAL: Source data file '{TRANSLATED_FILE}' not found.")
        print("Please ensure the file is in the correct directory before running.")