File size: 10,838 Bytes
74efe2a fac4afa 74efe2a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 |
import gradio as gr
import pandas as pd
import json
import random
from datetime import datetime
from pathlib import Path
import os
from huggingface_hub import HfApi, hf_hub_download, create_repo
try:
from huggingface_hub.utils import HfHubHTTPError
except ImportError:
# For older versions of huggingface_hub
class HfHubHTTPError(Exception):
pass
# --- Configuration ---
# Source data file containing instructions and responses
TRANSLATED_FILE = "translated_IRT_ga.jsonl"
# Local and remote filename for annotations
ANNOTATION_FILE = "DPO_annotations.csv"
# Hugging Face Hub details
HF_REPO_ID = "jmcinern/DPO_ga" # Your HF repo ID
HF_TOKEN = os.getenv("HF_TOKEN")
# Deterministic sampling settings
NUM_SAMPLES = 200
RANDOM_SEED = 42
# --- UI Content ---
CONSENT_MD = """
### Irish QA Pair Comparison (Master’s Thesis)
You are invited to take part in a study on Large Language Model Irish-language QA quality.
By continuing, you consent to the following:
- Your annotations are anonymised.
- The dataset (reference text + model outputs + your choices) will be released **open-source** for both research and commercial purposes.
- No personal data is collected. You may stop at any time.
- You will answer the following question:
#### Which answer, A or B, is better in terms of grammar, naturalness, and coherence?
- Only base your decision on this question and not other factors.
Please confirm consent, select your role, then press **Begin**.
"""
# --- Helper Functions ---
def load_master_samples() -> list:
"""Loads, shuffles deterministically, and returns the first 100 samples."""
if not Path(TRANSLATED_FILE).exists():
raise FileNotFoundError(f"Source file not found: {TRANSLATED_FILE}")
with open(TRANSLATED_FILE, "r", encoding="utf-8") as f:
data = [json.loads(line) for line in f]
# Shuffle with a fixed seed to get a deterministic "random" subset
rng = random.Random(RANDOM_SEED)
rng.shuffle(data)
return data[:NUM_SAMPLES]
def download_annotations() -> pd.DataFrame:
"""Downloads annotations from HF. If not found, returns an empty DataFrame."""
try:
local_path = hf_hub_download(
repo_id=HF_REPO_ID,
filename=ANNOTATION_FILE,
repo_type="dataset",
token=HF_TOKEN,
)
print(f"Downloaded existing annotations from {HF_REPO_ID}")
return pd.read_csv(local_path)
except HfHubHTTPError as e:
# If the file doesn't exist on the Hub (404), it's the first run.
if e.response.status_code == 404:
print("No remote annotation file found. Creating a new one.")
# Define the schema for the new CSV file, now including annotator_type
return pd.DataFrame(columns=["hash", "annotator_type", "choice", "preferred_response", "timestamp"])
else:
raise # Re-raise other HTTP errors
def upload_annotations(df: pd.DataFrame):
"""Saves a DataFrame locally and pushes it to the Hugging Face Hub."""
if not HF_TOKEN:
print("WARNING: No HF_TOKEN found. Skipping upload.")
return
# Save locally first
df.to_csv(ANNOTATION_FILE, index=False)
# Upload to Hub
api = HfApi()
create_repo(HF_REPO_ID, repo_type="dataset", exist_ok=True, token=HF_TOKEN)
api.upload_file(
path_or_fileobj=ANNOTATION_FILE,
path_in_repo=ANNOTATION_FILE,
repo_id=HF_REPO_ID,
repo_type="dataset",
token=HF_TOKEN,
commit_message="Append new DPO annotation"
)
print(f"Successfully uploaded updated annotations to {HF_REPO_ID}")
# --- Gradio Core Logic ---
def prepare_tasks():
"""
Loads master samples, downloads existing annotations, and prepares the
list of un-annotated tasks for the current session.
"""
master_samples = load_master_samples()
annotations_df = download_annotations()
completed_hashes = set(annotations_df['hash'].unique())
to_do_samples = [s for s in master_samples if s['hash'] not in completed_hashes]
tasks = []
for sample in to_do_samples:
# Shuffle response1 and response2 for unbiased presentation
options = [('response1', sample['response1']), ('response2', sample['response2'])]
random.shuffle(options)
tasks.append({
"hash": sample['hash'],
"instruction": sample['instruction'],
"response_A": options[0][1],
"response_B": options[1][1],
# Track which original response corresponds to A and B
"shuffle_map": {'A': options[0][0], 'B': options[1][0]}
})
return tasks
def start_session(annotator_type):
"""
Triggered by the 'Begin' button. Prepares tasks and loads the first one.
"""
tasks = prepare_tasks()
if not tasks:
# All samples are already annotated
return {
consent_group: gr.update(visible=False),
task_group: gr.update(visible=False),
done_group: gr.update(visible=True),
state_tasks: [],
state_task_index: 0,
state_annotator_type: ""
}
first_task = tasks[0]
progress_str = f"Progress: 1 / {len(tasks)}"
return {
consent_group: gr.update(visible=False),
task_group: gr.update(visible=True),
done_group: gr.update(visible=False),
state_tasks: tasks,
state_task_index: 0,
state_annotator_type: annotator_type,
progress_counter: gr.update(value=progress_str),
instruction_box: gr.update(value=first_task['instruction']),
response_a_box: gr.update(value=first_task['response_A']),
response_b_box: gr.update(value=first_task['response_B']),
}
def record_choice(tasks, current_index, annotator_type, choice):
"""
Records the user's choice, saves it, and loads the next task.
"""
# 1. Get current task and determine which original response was preferred
current_task = tasks[current_index]
preferred_response_key = current_task['shuffle_map'][choice] # 'response1' or 'response2'
# 2. Create a new annotation row, now including the annotator_type
new_annotation = {
"hash": current_task['hash'],
"annotator_type": annotator_type,
"choice": choice, # 'A' or 'B'
"preferred_response": preferred_response_key,
"timestamp": datetime.utcnow().isoformat()
}
# 3. Load existing annotations, append, and upload
annotations_df = download_annotations()
new_df = pd.concat([annotations_df, pd.DataFrame([new_annotation])], ignore_index=True)
upload_annotations(new_df)
# 4. Move to the next task
next_index = current_index + 1
if next_index >= len(tasks):
# All tasks for this session are done
return {
task_group: gr.update(visible=False),
done_group: gr.update(visible=True)
}
next_task = tasks[next_index]
progress_str = f"Progress: {next_index + 1} / {len(tasks)}"
return {
state_task_index: next_index,
progress_counter: gr.update(value=progress_str),
instruction_box: gr.update(value=next_task['instruction']),
response_a_box: gr.update(value=next_task['response_A']),
response_b_box: gr.update(value=next_task['response_B']),
}
def update_begin_button_status(consent_given, role_selected):
"""Enable the begin button only if consent is checked and a role is selected."""
return gr.update(interactive=(consent_given and role_selected is not None))
# --- Gradio UI Layout ---
with gr.Blocks(theme=gr.themes.Soft(), title="DPO Annotation") as demo:
# State management
state_tasks = gr.State([])
state_task_index = gr.State(0)
state_annotator_type = gr.State("")
# Page 1: Consent
with gr.Group(visible=True) as consent_group:
gr.Markdown(CONSENT_MD)
with gr.Row():
consent_checkbox = gr.Checkbox(label="I consent to the terms above")
annotator_type_dropdown = gr.Dropdown(["Tester", "Native"], label="Select Your Role")
begin_btn = gr.Button("Begin", interactive=False)
# Page 2: Annotation Task
with gr.Group(visible=False) as task_group:
progress_counter = gr.Markdown("Progress: 0 / 0", elem_id="progress_counter")
with gr.Column():
instruction_box = gr.Textbox(label="Instruction", interactive=False, lines=3)
with gr.Row():
response_a_box = gr.Textbox(label="Answer A", interactive=False, lines=8)
response_b_box = gr.Textbox(label="Answer B", interactive=False, lines=8)
with gr.Row():
choose_a_btn = gr.Button("A is Better", variant="primary")
choose_b_btn = gr.Button("B is Better", variant="primary")
# Page 3: Completion Message
with gr.Group(visible=False) as done_group:
gr.Markdown("## ✅ Thank You!\n\nAll available samples have been annotated. Your contribution is greatly appreciated.")
# --- Event Handlers ---
# Enable 'Begin' button only when consent is checked AND a role is selected
consent_checkbox.change(
fn=update_begin_button_status,
inputs=[consent_checkbox, annotator_type_dropdown],
outputs=begin_btn
)
annotator_type_dropdown.change(
fn=update_begin_button_status,
inputs=[consent_checkbox, annotator_type_dropdown],
outputs=begin_btn
)
# Start the session when 'Begin' is clicked
begin_btn.click(
fn=start_session,
inputs=[annotator_type_dropdown],
outputs=[
consent_group, task_group, done_group,
state_tasks, state_task_index, state_annotator_type,
progress_counter, instruction_box, response_a_box, response_b_box
]
)
# Handle choice A
choose_a_btn.click(
fn=record_choice,
inputs=[state_tasks, state_task_index, state_annotator_type, gr.State('A')],
outputs=[
state_task_index, progress_counter,
instruction_box, response_a_box, response_b_box,
task_group, done_group
]
)
# Handle choice B
choose_b_btn.click(
fn=record_choice,
inputs=[state_tasks, state_task_index, state_annotator_type, gr.State('B')],
outputs=[
state_task_index, progress_counter,
instruction_box, response_a_box, response_b_box,
task_group, done_group
]
)
if __name__ == "__main__":
# Ensure the source file exists before launching
if not Path(TRANSLATED_FILE).exists():
print(f"FATAL: Source data file '{TRANSLATED_FILE}' not found.")
print("Please ensure the file is in the correct directory before running.")
else:
demo.launch() |