import gradio as gr
import json
import random
import os
from datetime import datetime
# --- PATH CONFIGURATION ---
# JSON dataset that annotation items are sampled from at start-up.
DATA_PATH = "/home/mshahidul/readctrl/data/data_annotator_data/vector_db_all-miniLM/crowdsourcing_input_en_v2.json"
# Root directory; each started session creates its own sub-folder beneath it.
SAVE_ROOT = "/home/mshahidul/readctrl/data/annotators_validate_data"
# JSON question bank that load_questions() draws the pre-task literacy check from.
QUESTIONS_FILE = "/home/mshahidul/readctrl/code/interface/sp50_questions_en.json"
# --- SESSION CONFIGURATION ---
# Number of distinct items sampled from the dataset for one session queue.
NUM_QUESTIONS = 30
# Number of sampled items that are inserted into the queue a second time.
NUM_DUPLICATES = 4
# Maximum number of literacy questions asked before annotation starts.
NUM_LITERACY_QUERIES = 10
# Queue index where the first repeated item is inserted; the remaining repeats
# occupy the consecutive positions after it.
DUPLICATE_INTERVAL = 8
# --- ANNOTATION GUIDE TEXT ---
# Rendered through gr.HTML, so the scale must be real HTML markup: pipe-delimited
# text is not interpreted there and would collapse into one run-on paragraph.
GUIDE_HTML = """
<h3>Rating Guide: Medical Text Difficulty</h3>
<p>Please rate the difficulty of the documents based on the following scale:</p>
<table>
  <thead>
    <tr><th>Score</th><th>Description</th></tr>
  </thead>
  <tbody>
    <tr><td>1 - 2</td><td><b>Very Easy:</b> Clear language, no medical jargon. Like a 5th-grade textbook.</td></tr>
    <tr><td>3 - 4</td><td><b>Easy:</b> Common medical terms (e.g., "fever", "heart") used in simple sentences.</td></tr>
    <tr><td>5 - 6</td><td><b>Moderate:</b> Some technical terms. Requires focused reading but understandable.</td></tr>
    <tr><td>7 - 8</td><td><b>Hard:</b> Heavy use of medical jargon. Read like a clinical report.</td></tr>
    <tr><td>9 - 10</td><td><b>Very Hard:</b> Specialist-level text. Extremely dense and difficult to follow.</td></tr>
  </tbody>
</table>
"""
def load_questions(path=None, count=None):
    """Return a random selection of literacy questions from a JSON file.

    Args:
        path: JSON file containing a list of question objects. When omitted,
            ``QUESTIONS_FILE`` is used.
        count: Maximum number of questions to return. When omitted,
            ``NUM_LITERACY_QUERIES`` is used.

    Returns:
        A list of questions sampled without replacement. If the file holds
        fewer than ``count`` questions, all of them are returned in random
        order.
    """
    path = QUESTIONS_FILE if path is None else path
    count = NUM_LITERACY_QUERIES if count is None else count
    # Read as UTF-8 explicitly so non-ASCII question text does not depend on
    # the locale's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        all_q = json.load(f)
    return random.sample(all_q, min(count, len(all_q)))
class AnnotationSession:
    """Queue, progress and collected results for one annotation run.

    Attributes are plain and mutated by the event handlers:
    ``queue`` (items to rate, including repeats), ``current_index`` (position
    of the item currently shown), ``results`` (one entry per submitted
    rating), ``questions`` (literacy questions shown before the task) and
    ``session_folder`` (output directory, ``None`` until a session starts).
    """

    def __init__(self, dataset, questions, num_questions=None,
                 num_duplicates=None, duplicate_interval=None):
        """Sample the items to rate and insert the repeated items.

        Args:
            dataset: Sequence of items to sample from.
            questions: Literacy questions stored on the session as-is.
            num_questions: Number of distinct items to sample; defaults to
                ``NUM_QUESTIONS``.
            num_duplicates: Number of sampled items to show twice; defaults
                to ``NUM_DUPLICATES``.
            duplicate_interval: Queue index of the first repeated item;
                defaults to ``DUPLICATE_INTERVAL``. Code elsewhere that
                identifies repeats by the module-level constants assumes the
                defaults are used.
        """
        if num_questions is None:
            num_questions = NUM_QUESTIONS
        if num_duplicates is None:
            num_duplicates = NUM_DUPLICATES
        if duplicate_interval is None:
            duplicate_interval = DUPLICATE_INTERVAL

        # random.sample raises ValueError when asked for more items than the
        # population holds, so clamp to the dataset size.
        base_samples = random.sample(dataset, min(num_questions, len(dataset)))
        self.queue = list(base_samples)

        # Repeat the first sampled items at consecutive positions starting at
        # duplicate_interval; never repeat more items than were sampled.
        # list.insert appends when the index is past the end of a short queue.
        repeat_count = min(num_duplicates, len(base_samples))
        for offset in range(repeat_count):
            self.queue.insert(duplicate_interval + offset, base_samples[offset])

        self.current_index = 0
        self.results = []
        self.questions = questions
        self.session_folder = None
# Load the annotation dataset once at import time. The file is read as UTF-8
# explicitly so the result does not depend on the locale's default encoding.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    full_dataset = json.load(f)
# NOTE(review): this is a single module-level session that every event handler
# reads and mutates; if several annotators use the app at the same time they
# would share this state. Confirm the app is meant for one annotator at a time.
session = AnnotationSession(full_dataset, load_questions())
# --- EVENT HANDLERS ---
def start_and_save_literacy(username, *answers):
    """Create the session folder, save the literacy answers and show item 1.

    Args:
        username: Free-text name/ID typed by the annotator. Characters other
            than alphanumerics, space, ``_`` and ``-`` are dropped; an empty
            result becomes ``"anonymous"``.
        *answers: The selected option for each literacy question, in the same
            order as ``session.questions``.

    Returns:
        A 5-tuple for the Gradio outputs: an update hiding the intro column,
        an update showing the task column, the first item's ``original_doc``
        text, its ``wiki_anchor`` text, and the progress label.
    """
    # Ensure username is filesystem safe (and tolerate a missing value).
    clean_username = "".join(
        c for c in (username or "") if c.isalnum() or c in (' ', '_', '-')
    ).strip() or "anonymous"
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    # Folder name format: username_date_time
    folder_name = f"{clean_username}_{timestamp}"
    session_folder = os.path.join(SAVE_ROOT, folder_name)
    os.makedirs(session_folder, exist_ok=True)
    session.session_folder = session_folder

    # The first queue item is displayed below, so progress must restart too.
    # Without this reset, starting again after earlier submissions would
    # record ratings against a different item than the one on screen and mix
    # the earlier results into the new folder.
    session.current_index = 0
    session.results = []

    literacy_data = [
        {
            "question_id": q_info['id'],
            "question_text": q_info['question'],
            "user_answer": ans,
            "is_correct": ans == q_info['correct']
        }
        for q_info, ans in zip(session.questions, answers)
    ]
    literacy_path = os.path.join(session_folder, "literacy_results.json")
    with open(literacy_path, "w", encoding="utf-8") as f:
        json.dump(literacy_data, f, indent=4)

    first_pair = session.queue[0]
    return (
        gr.update(visible=False),
        gr.update(visible=True),
        first_pair['original_doc'],
        first_pair['wiki_anchor'],
        f"Item 1 of {len(session.queue)}"
    )
def submit_rating(doc_slider, wiki_slider):
    """Record the two ratings for the current item and advance the queue.

    Args:
        doc_slider: Rating given to Text A (the item's ``original_doc``).
        wiki_slider: Rating given to Text B (the item's ``wiki_anchor``).

    Returns:
        A 5-tuple for the Gradio outputs: next Text A, next Text B, progress
        label, and the reset values for both sliders. Once the queue is
        exhausted, a completion message is returned instead.
    """
    finished_view = (
        "✅ ALL TASKS COMPLETED",
        "The data has been saved to your session folder. You may close this tab.",
        "Status: Finished",
        0, 0
    )
    # The submit button stays clickable after the last item; without this
    # guard a further click would index past the end of the queue.
    if session.current_index >= len(session.queue):
        return finished_view

    current_pair = session.queue[session.current_index]
    # Capture more metadata for easier evaluation
    result_entry = {
        "queue_position": session.current_index,
        # Falls back to a placeholder when the item carries no 'index' key.
        "doc_id": current_pair.get('index', 'no_id'),
        "health_literacy_label": current_pair.get('label', 'no_label'),
        # NOTE(review): reads the same 'index' key as doc_id — confirm whether
        # the dataset has a separate identifier for the wiki text.
        "wiki_id": current_pair.get('index', 'no_id'),
        # Saving a snippet of the text helps verify "Text A" vs "Text B"
        # during manual review of the saved JSON later.
        "doc_snippet": current_pair['original_doc'][:100] + "...",
        "wiki_snippet": current_pair['wiki_anchor'][:100] + "...",
        "doc_rating": doc_slider,
        "wiki_rating": wiki_slider,
        # True for the queue positions where repeated items are inserted.
        "is_duplicate": (
            DUPLICATE_INTERVAL
            <= session.current_index
            < DUPLICATE_INTERVAL + NUM_DUPLICATES
        ),
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }
    session.results.append(result_entry)

    # Save after every click to prevent data loss
    annotation_file = os.path.join(session.session_folder, "annotation_results.json")
    with open(annotation_file, "w", encoding="utf-8") as f:
        json.dump(session.results, f, indent=4)
    gr.Info(f"Progress Saved: Item {session.current_index + 1} recorded.")

    session.current_index += 1
    if session.current_index >= len(session.queue):
        return finished_view

    next_pair = session.queue[session.current_index]
    return (
        next_pair['original_doc'],
        next_pair['wiki_anchor'],
        f"Item {session.current_index + 1} of {len(session.queue)}",
        5, 5
    )
# --- UI ---
# Two-stage layout: the intro column (name + literacy check) is visible first;
# the start handler hides it and reveals the task column.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Medical Text Readability Annotation")
    with gr.Accordion("See Annotation Instructions & Scale Guide", open=False):
        gr.HTML(GUIDE_HTML)

    with gr.Column(visible=True) as intro_box:
        # The name/ID becomes part of the session folder name.
        username_input = gr.Textbox(label="Enter Your Name/ID", placeholder="e.g., mshahidul", max_lines=1)
        gr.Markdown(f"### Pre-Task: Health Literacy Check ({NUM_LITERACY_QUERIES} Questions)")
        # One radio group per literacy question, in session.questions order.
        literacy_inputs = [
            gr.Radio(choices=q['options'], label=q['question'])
            for q in session.questions
        ]
        btn_start = gr.Button("Start Annotation", variant="primary")

    with gr.Column(visible=False) as task_box:
        progress = gr.Label(label="Progress")
        with gr.Row():
            with gr.Column():
                doc_display = gr.Textbox(interactive=False, lines=12, label="Text A")
                # Start at 5: inside the 1-10 range and the same value the
                # submit handler resets the slider to after each item.
                doc_slider = gr.Slider(1, 10, step=1, label="Difficulty (1: Simple → 10: Technical)", value=5)
            with gr.Column():
                wiki_display = gr.Textbox(interactive=False, lines=12, label="Text B")
                wiki_slider = gr.Slider(1, 10, step=1, label="Difficulty (1: Simple → 10: Technical)", value=5)
        btn_submit = gr.Button("Submit & Next", variant="primary")

    # The handler receives the username first, then one answer per question.
    btn_start.click(
        start_and_save_literacy,
        inputs=[username_input] + literacy_inputs,
        outputs=[intro_box, task_box, doc_display, wiki_display, progress]
    )
    btn_submit.click(
        submit_rating,
        inputs=[doc_slider, wiki_slider],
        outputs=[doc_display, wiki_display, progress, doc_slider, wiki_slider]
    )

# Launch only when run as a script so importing this module does not start a
# server; share=True requests a public Gradio share link.
if __name__ == "__main__":
    demo.launch(share=True)