# readctrl/code/interface/old/annotators_v4.py
# shahidul034's picture
# Add files using upload-large-folder tool
# 9c6961c verified
import gradio as gr
import json
import random
import os
from datetime import datetime
# --- PATH CONFIGURATION ---
# NOTE(review): all three paths are absolute and live under one user's home
# directory, so the app only runs unmodified on that machine — confirm before
# deploying elsewhere.
# JSON dataset that is loaded once at import time into FULL_DATASET.
DATA_PATH = "/home/mshahidul/readctrl/data/data_annotator_data/manual_selections_en.json"
# Root output directory; each session writes into its own
# "<username>_<timestamp>" sub-folder created beneath it.
SAVE_ROOT = "/home/mshahidul/readctrl/data/annotators_validate_data"
# JSON file holding the pool of health-literacy questions.
QUESTIONS_FILE = "/home/mshahidul/readctrl/code/interface/sp50_questions_en.json"
# --- SESSION CONFIGURATION ---
# Upper bound on the number of distinct dataset items sampled per session.
NUM_QUESTIONS = 20
# How many of the sampled items are inserted into the queue a second time.
NUM_DUPLICATES = 4
# Number of literacy questions shown on the intro screen / sampled per session.
NUM_LITERACY_QUERIES = 10
# Queue index at which the first repeated item is inserted; the remaining
# repeats are inserted at the consecutive indices that follow.
DUPLICATE_INTERVAL = 8
# --- UI HTML COMPONENTS ---
# Rating rubric rendered inside the instructions accordion: a two-column table
# mapping each score (1-5) to a description of the matching text difficulty.
GUIDE_HTML = """
<div style="background-color: #f9f9f9; padding: 15px; border-left: 6px solid #4CAF50; border-radius: 4px; margin-bottom: 20px;">
<h3>Rating Guide: Medical Text Difficulty</h3>
<table style="width:100%; border-collapse: collapse; text-align: left;">
<tr style="background-color: #e8f5e9;">
<th style="padding: 8px; border: 1px solid #ddd;">Score</th>
<th style="padding: 8px; border: 1px solid #ddd;">Description</th>
</tr>
<tr><td><b>1</b></td><td><b>Very Easy:</b> Simple words, no medical jargon. Clear to a child.</td></tr>
<tr><td><b>2</b></td><td><b>Easy:</b> Conversational medical terms (e.g., "flu", "broken bone").</td></tr>
<tr><td><b>3</b></td><td><b>Moderate:</b> Standard patient education material. Requires some focus.</td></tr>
<tr><td><b>4</b></td><td><b>Hard:</b> Significant technical jargon. Likely a clinical summary.</td></tr>
<tr><td><b>5</b></td><td><b>Very Hard:</b> Specialist-level / Academic. Extremely dense.</td></tr>
</table>
</div>
"""
# Calibration panel rendered below the rubric: three side-by-side cards that
# show the same medical case written at easy, medium and hard difficulty,
# each followed by a short explanation of why it sits at that level.
EXAMPLES_HTML = """
<div style="background-color: #ffffff; padding: 15px; border: 1px solid #ddd; border-radius: 4px;">
<h3 style="color: #2e7d32;">Reference Examples (Calibration)</h3>
<p>Use these examples of the same medical case to calibrate your ratings:</p>
<div style="display: flex; gap: 15px;">
<div style="flex: 1; background-color: #f1f8e9; padding: 10px; border-radius: 4px;">
<h4 style="margin-top:0;">Level 1-2 (Easy)</h4>
<p style="font-size: 0.9em; line-height: 1.4;">"This is about a 20-year-old woman. She had a kidney problem... The problem first showed up when a big blood clot blocked veins in her brain... She took blood thinners and steroid pills."</p>
<small><i>Reasoning: Uses "kidney problem" instead of "nephrotic syndrome" and "blood thinners" instead of "anticoagulants".</i></small>
</div>
<div style="flex: 1; background-color: #fff3e0; padding: 10px; border-radius: 4px;">
<h4 style="margin-top:0;">Level 3 (Medium)</h4>
<p style="font-size: 0.9em; line-height: 1.4;">"A 20-year-old woman had a 12-year history of idiopathic nephrotic syndrome... treated with anticoagulation and oral corticosteroids... CT showed acute superior mesenteric artery thrombosis."</p>
<small><i>Reasoning: Uses standard medical terminology but keeps sentences relatively concise and structured.</i></small>
</div>
<div style="flex: 1; background-color: #ffebee; padding: 10px; border-radius: 4px;">
<h4 style="margin-top:0;">Level 4-5 (Hard)</h4>
<p style="font-size: 0.9em; line-height: 1.4;">"20-year-old woman... idiopathic NS inaugurated by cerebral venous thrombosis extended to the right jugular vein... Hemogasanalysis results showed metabolic acidosis with respiratory compensation."</p>
<small><i>Reasoning: Highly technical, academic language, specific lab values, and complex physiological processes.</i></small>
</div>
</div>
</div>
"""
# --- DATA LOADING ---
def load_all_literacy_questions():
    """Load the pool of health-literacy questions from QUESTIONS_FILE.

    Returns:
        The parsed list of question records. An empty list is returned (and
        the problem is printed) when the file cannot be opened, cannot be
        decoded, is not valid JSON, or does not contain a JSON array, so
        callers never have to deal with an exception or a non-list value.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform locale.
        with open(QUESTIONS_FILE, "r", encoding="utf-8") as f:
            questions = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error loading questions: {e}")
        return []

    # Callers slice and sample the result, which only works on a sequence.
    if not isinstance(questions, list):
        print(
            "Error loading questions: expected a JSON list, "
            f"got {type(questions).__name__}"
        )
        return []
    return questions
# The annotation dataset is read once at import time and handed to every new
# AnnotationSession. JSON is UTF-8 by specification, so decode it explicitly
# instead of relying on the platform's default text encoding.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    FULL_DATASET = json.load(f)
# --- SESSION CLASS ---
class AnnotationSession:
    """State of one annotator's run: item queue, progress and literacy quiz."""

    def __init__(self, dataset, all_questions):
        """Draw the items and literacy questions for a fresh session.

        Args:
            dataset: Sequence of annotation records to sample from.
            all_questions: Sequence of literacy questions to sample from.
        """
        sample_size = min(len(dataset), NUM_QUESTIONS)
        base_samples = random.sample(dataset, sample_size)

        # Start from the sampled items, then re-insert the first few of them
        # at consecutive indices beginning at DUPLICATE_INTERVAL so they are
        # shown twice. list.insert appends when the index is past the end.
        queue = base_samples.copy()
        repeat_count = min(NUM_DUPLICATES, sample_size)
        for offset, repeated_item in enumerate(base_samples[:repeat_count]):
            queue.insert(DUPLICATE_INTERVAL + offset, repeated_item)

        quiz_size = min(NUM_LITERACY_QUERIES, len(all_questions))

        self.queue = queue
        self.current_index = 0  # position of the item currently on screen
        self.results = []  # one entry per submitted rating
        self.session_questions = random.sample(all_questions, quiz_size)
        self.session_folder = None  # assigned once the output folder exists
# --- LOGIC FUNCTIONS ---
def start_and_save_literacy(username, *args):
    """Create a new annotation session and persist the literacy-check answers.

    Args:
        username: Name/ID typed by the annotator. Characters other than
            alphanumerics, space, ``_`` and ``-`` are dropped; an empty
            result falls back to ``"anonymous"``.
        *args: The selected value of every literacy radio button, in the
            order the radios were created on the intro screen.

    Returns:
        A 6-tuple matching the start button's outputs: an update hiding the
        intro column, an update showing the task column, the first item's
        ``original_text``, its ``selected_wiki_anchor``, the progress label
        text, and the new session object.

    Raises:
        gr.Error: If the dataset yields no items to annotate.
    """
    all_q = load_all_literacy_questions()
    new_session = AnnotationSession(FULL_DATASET, all_q)

    # Fail with a readable message (and before creating any folder) instead
    # of an IndexError on queue[0] when there is nothing to annotate.
    if not new_session.queue:
        raise gr.Error("No annotation items available.")

    # The intro screen renders the first len(args) questions of the file, in
    # file order, and passes the answers positionally. They therefore have to
    # be scored against exactly those questions; the random sample drawn
    # inside AnnotationSession would pair each answer with an unrelated one.
    shown_questions = all_q[:len(args)]
    new_session.session_questions = shown_questions

    clean_username = "".join(
        c for c in (username or "") if c.isalnum() or c in (' ', '_', '-')
    ).strip() or "anonymous"
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    folder_name = f"{clean_username}_{timestamp}"
    session_folder = os.path.join(SAVE_ROOT, folder_name)
    os.makedirs(session_folder, exist_ok=True)
    new_session.session_folder = session_folder

    literacy_data = [
        {
            "question_id": q_info['id'],
            "question_text": q_info['question'],
            "user_answer": ans,
            "is_correct": ans == q_info['correct'],
        }
        for q_info, ans in zip(shown_questions, args)
    ]
    results_path = os.path.join(session_folder, "literacy_results.json")
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(literacy_data, f, indent=4)

    first_item = new_session.queue[0]
    return (
        gr.update(visible=False),
        gr.update(visible=True),
        first_item['original_text'],
        first_item['selected_wiki_anchor'],
        f"Item 1 of {len(new_session.queue)}",
        new_session
    )
def submit_rating(doc_slider, wiki_slider, current_session):
    """Record the ratings for the item on screen and advance to the next one.

    Every call rewrites ``annotation_results.json`` in the session folder
    with all results collected so far, so progress survives an aborted run.

    Args:
        doc_slider: Difficulty rating chosen for Text A.
        wiki_slider: Difficulty rating chosen for Text B.
        current_session: The AnnotationSession held in the UI state, or
            ``None`` when the state was lost.

    Returns:
        A 6-tuple matching the submit button's outputs: Text A, Text B, the
        progress label text, the reset values for both sliders, and the
        session object to store back into the UI state.
    """
    if current_session is None:
        gr.Warning("Session lost! Please refresh.")  # Pop-up for errors
        return "", "", "Error: Session lost", 3, 3, None

    queue = current_session.queue
    position = current_session.current_index
    finished_outputs = (
        "✅ ALL TASKS COMPLETED",
        "The data has been saved. You may close this tab.",
        "Status: Finished",
        1, 1,
        current_session
    )

    # The submit button stays clickable after the last item; without this
    # guard an extra click would index past the end of the queue and raise
    # IndexError. Nothing is recorded for such a click.
    if position >= len(queue):
        gr.Warning("All items have already been rated.")
        return finished_outputs

    current_pair = queue[position]
    result_entry = {
        "queue_position": position,
        "doc_id": current_pair.get('index', 'no_id'),
        "health_literacy_label": current_pair.get('label', 'no_label'),
        "doc_rating": doc_slider,
        "wiki_rating": wiki_slider,
        # A repeat is an item that already appeared earlier in the queue.
        # Checking the queue itself (rather than a fixed index window) stays
        # correct when the queue is shorter than DUPLICATE_INTERVAL and the
        # repeats were therefore appended at the end.
        "is_duplicate": current_pair in queue[:position],
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }
    current_session.results.append(result_entry)

    annotation_file = os.path.join(current_session.session_folder, "annotation_results.json")
    with open(annotation_file, "w", encoding="utf-8") as f:
        json.dump(current_session.results, f, indent=4)

    current_session.current_index += 1

    if current_session.current_index >= len(queue):
        # Trigger the "Finished" pop-up
        gr.Info("Final rating saved. Task complete!")
        return finished_outputs

    # Trigger the "Success" pop-up
    gr.Info(f"Rating {current_session.current_index} saved successfully!")
    print(f"Progress Saved: Item {current_session.current_index}")
    next_pair = queue[current_session.current_index]
    return (
        next_pair['original_text'],
        next_pair['selected_wiki_anchor'],
        f"Item {current_session.current_index + 1} of {len(queue)}",
        3, 3,
        current_session
    )
# --- UI INTERFACE ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Holds the AnnotationSession so each user's data stays separate.
    session_state = gr.State()

    gr.Markdown("# Medical Text Readability Annotation")
    with gr.Accordion("See Annotation Instructions & Calibration Examples", open=True):
        gr.HTML(GUIDE_HTML)
        gr.HTML(EXAMPLES_HTML)

    # Intro screen: annotator name plus the health-literacy check.
    with gr.Column(visible=True) as intro_box:
        username_input = gr.Textbox(label="Enter Your Name/ID", placeholder="e.g., user_1", max_lines=1)
        gr.Markdown(f"### Pre-Task: Health Literacy Check ({NUM_LITERACY_QUERIES} Questions)")
        all_possible_q = load_all_literacy_questions()
        # One radio group per question, taken from the start of the file in
        # file order. The start handler receives the answers positionally in
        # this same order, so they must be scored against these questions.
        literacy_inputs = [
            gr.Radio(choices=question['options'], label=question['question'])
            for question in all_possible_q[:NUM_LITERACY_QUERIES]
        ]
        btn_start = gr.Button("Start Annotation", variant="primary")

    # Task screen: hidden until the start handler reveals it.
    with gr.Column(visible=False) as task_box:
        progress_label = gr.Label(label="Progress")
        with gr.Row():
            with gr.Column():
                doc_display = gr.Textbox(interactive=False, lines=15, label="Text A")
                doc_slider = gr.Slider(1, 5, step=1, label="Difficulty (1-5)", value=3)
            with gr.Column():
                wiki_display = gr.Textbox(interactive=False, lines=15, label="Text B")
                wiki_slider = gr.Slider(1, 5, step=1, label="Difficulty (1-5)", value=3)
        btn_submit = gr.Button("Submit & Next", variant="primary")

    # --- EVENT HANDLERS ---
    # Start: the username comes first, followed by every literacy radio.
    start_inputs = [username_input] + literacy_inputs
    start_outputs = [intro_box, task_box, doc_display, wiki_display, progress_label, session_state]
    btn_start.click(
        fn=start_and_save_literacy,
        inputs=start_inputs,
        outputs=start_outputs
    )

    # Submit: the session state is passed in and written back on every click.
    submit_inputs = [doc_slider, wiki_slider, session_state]
    submit_outputs = [doc_display, wiki_display, progress_label, doc_slider, wiki_slider, session_state]
    btn_submit.click(
        fn=submit_rating,
        inputs=submit_inputs,
        outputs=submit_outputs
    )

if __name__ == "__main__":
    demo.launch(share=True)