readctrl / code /interface /old /annotators_v4.py

Add files using upload-large-folder tool

9c6961c verified 28 days ago

10.4 kB

	import gradio as gr
	import json
	import random
	import os
	from datetime import datetime

	# --- PATH CONFIGURATION ---
	# Absolute paths under /home/mshahidul; they must exist on the host running the app.
	# DATA_PATH: JSON dataset that is loaded into FULL_DATASET at import time.
	DATA_PATH = "/home/mshahidul/readctrl/data/data_annotator_data/manual_selections_en.json"
	# SAVE_ROOT: parent directory under which one result folder per session is created.
	SAVE_ROOT = "/home/mshahidul/readctrl/data/annotators_validate_data"
	# QUESTIONS_FILE: JSON file holding the health-literacy questions.
	QUESTIONS_FILE = "/home/mshahidul/readctrl/code/interface/sp50_questions_en.json"

	# --- SESSION CONFIGURATION ---
	# Upper bound on the number of dataset items sampled for one session.
	NUM_QUESTIONS = 20
	# How many of the first sampled items are inserted into the queue a second time.
	NUM_DUPLICATES = 4
	# Number of literacy questions shown on the intro form / sampled per session.
	NUM_LITERACY_QUERIES = 10
	# Queue index at which the repeated items start (they fill consecutive slots).
	DUPLICATE_INTERVAL = 8

	# --- UI HTML COMPONENTS ---
	# GUIDE_HTML: table describing the 1-5 difficulty scale; rendered with gr.HTML
	# inside the instructions accordion of the interface.
	GUIDE_HTML = """
	<div style="background-color: #f9f9f9; padding: 15px; border-left: 6px solid #4CAF50; border-radius: 4px; margin-bottom: 20px;">
	<h3>Rating Guide: Medical Text Difficulty</h3>
	<table style="width:100%; border-collapse: collapse; text-align: left;">
	<tr style="background-color: #e8f5e9;">
	<th style="padding: 8px; border: 1px solid #ddd;">Score</th>
	<th style="padding: 8px; border: 1px solid #ddd;">Description</th>
	</tr>
	<tr><td><b>1</b></td><td><b>Very Easy:</b> Simple words, no medical jargon. Clear to a child.</td></tr>
	<tr><td><b>2</b></td><td><b>Easy:</b> Conversational medical terms (e.g., "flu", "broken bone").</td></tr>
	<tr><td><b>3</b></td><td><b>Moderate:</b> Standard patient education material. Requires some focus.</td></tr>
	<tr><td><b>4</b></td><td><b>Hard:</b> Significant technical jargon. Likely a clinical summary.</td></tr>
	<tr><td><b>5</b></td><td><b>Very Hard:</b> Specialist-level / Academic. Extremely dense.</td></tr>
	</table>
	</div>
	"""

	# EXAMPLES_HTML: three side-by-side excerpts of one medical case (easy, medium,
	# hard) with a short rationale each; rendered with gr.HTML right after GUIDE_HTML.
	EXAMPLES_HTML = """
	<div style="background-color: #ffffff; padding: 15px; border: 1px solid #ddd; border-radius: 4px;">
	<h3 style="color: #2e7d32;">Reference Examples (Calibration)</h3>
	<p>Use these examples of the same medical case to calibrate your ratings:</p>
	<div style="display: flex; gap: 15px;">
	<div style="flex: 1; background-color: #f1f8e9; padding: 10px; border-radius: 4px;">
	<h4 style="margin-top:0;">Level 1-2 (Easy)</h4>
	<p style="font-size: 0.9em; line-height: 1.4;">"This is about a 20-year-old woman. She had a kidney problem... The problem first showed up when a big blood clot blocked veins in her brain... She took blood thinners and steroid pills."</p>
	<small><i>Reasoning: Uses "kidney problem" instead of "nephrotic syndrome" and "blood thinners" instead of "anticoagulants".</i></small>
	</div>
	<div style="flex: 1; background-color: #fff3e0; padding: 10px; border-radius: 4px;">
	<h4 style="margin-top:0;">Level 3 (Medium)</h4>
	<p style="font-size: 0.9em; line-height: 1.4;">"A 20-year-old woman had a 12-year history of idiopathic nephrotic syndrome... treated with anticoagulation and oral corticosteroids... CT showed acute superior mesenteric artery thrombosis."</p>
	<small><i>Reasoning: Uses standard medical terminology but keeps sentences relatively concise and structured.</i></small>
	</div>
	<div style="flex: 1; background-color: #ffebee; padding: 10px; border-radius: 4px;">
	<h4 style="margin-top:0;">Level 4-5 (Hard)</h4>
	<p style="font-size: 0.9em; line-height: 1.4;">"20-year-old woman... idiopathic NS inaugurated by cerebral venous thrombosis extended to the right jugular vein... Hemogasanalysis results showed metabolic acidosis with respiratory compensation."</p>
	<small><i>Reasoning: Highly technical, academic language, specific lab values, and complex physiological processes.</i></small>
	</div>
	</div>
	</div>
	"""

	# --- DATA LOADING ---
	def load_all_literacy_questions(path=None):
	    """Load the health-literacy question bank from a JSON file.

	    Loading is best-effort: any read or parse problem is reported on stdout
	    and an empty list is returned so the interface can still start.

	    Args:
	        path: Optional path of the question file. When omitted, the
	            module-level ``QUESTIONS_FILE`` is used.

	    Returns:
	        The parsed list of question records, or ``[]`` when the file cannot
	        be read, is not valid JSON, or its top-level value is not a list.
	    """
	    if path is None:
	        path = QUESTIONS_FILE
	    try:
	        # Explicit UTF-8 keeps decoding independent of the host locale.
	        with open(path, "r", encoding="utf-8") as f:
	            questions = json.load(f)
	    except (OSError, ValueError) as e:
	        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
	        print(f"Error loading questions: {e}")
	        return []
	    if not isinstance(questions, list):
	        # Callers sample from and slice the result, so anything else is unusable.
	        print(f"Error loading questions: expected a JSON list, got {type(questions).__name__}")
	        return []
	    return questions

	# Load the annotation dataset once at import time; every session samples from it.
	# The encoding is given explicitly so decoding does not depend on the host locale.
	with open(DATA_PATH, "r", encoding="utf-8") as f:
	    FULL_DATASET = json.load(f)

	# --- SESSION CLASS ---
	class AnnotationSession:
	    """State of one annotator's run: item queue, progress and collected results.

	    Attributes:
	        queue: Items to rate, in presentation order. It holds a random sample
	            of the dataset plus a second copy of the first few sampled items.
	        duplicate_positions: Queue indices occupied by the repeated items.
	        current_index: Index in ``queue`` of the item currently shown.
	        results: Result entries appended as ratings are submitted.
	        session_questions: Random sample drawn from the literacy questions.
	        session_folder: Output folder for this session; ``None`` until set.
	    """

	    def __init__(self, dataset, all_questions, *, num_questions=None,
	                 num_duplicates=None, num_literacy_queries=None,
	                 duplicate_interval=None):
	        """Sample the items and literacy questions for a new session.

	        Args:
	            dataset: Sequence of records to sample the queue from.
	            all_questions: Sequence of literacy questions to sample from.
	            num_questions: Max number of items to sample; defaults to
	                ``NUM_QUESTIONS``.
	            num_duplicates: Number of leading sampled items to repeat;
	                defaults to ``NUM_DUPLICATES``.
	            num_literacy_queries: Max number of literacy questions to
	                sample; defaults to ``NUM_LITERACY_QUERIES``.
	            duplicate_interval: Queue index where the repeats start;
	                defaults to ``DUPLICATE_INTERVAL``.
	        """
	        if num_questions is None:
	            num_questions = NUM_QUESTIONS
	        if num_duplicates is None:
	            num_duplicates = NUM_DUPLICATES
	        if num_literacy_queries is None:
	            num_literacy_queries = NUM_LITERACY_QUERIES
	        if duplicate_interval is None:
	            duplicate_interval = DUPLICATE_INTERVAL

	        k = min(len(dataset), num_questions)
	        base_samples = random.sample(dataset, k)
	        self.queue = list(base_samples)

	        # Repeat the first sampled items in consecutive slots starting at
	        # duplicate_interval. list.insert silently clamps an index past the
	        # end, so clamp explicitly and remember where each repeat landed.
	        self.duplicate_positions = []
	        for i in range(min(num_duplicates, k)):
	            position = min(duplicate_interval + i, len(self.queue))
	            self.queue.insert(position, base_samples[i])
	            self.duplicate_positions.append(position)

	        self.current_index = 0
	        self.results = []
	        self.session_questions = random.sample(
	            all_questions, min(num_literacy_queries, len(all_questions))
	        )
	        self.session_folder = None

	# --- LOGIC FUNCTIONS ---
	def _score_literacy_answers(questions, answers):
	    """Pair each answer with the question at the same index and grade it."""
	    return [
	        {
	            "question_id": q_info['id'],
	            "question_text": q_info['question'],
	            "user_answer": ans,
	            "is_correct": ans == q_info['correct'],
	        }
	        for q_info, ans in zip(questions, answers)
	    ]


	def start_and_save_literacy(username, *args):
	    """Start a session, save the graded literacy answers and show the first item.

	    Args:
	        username: Free-text name/ID typed by the annotator; it is sanitised
	            and used as the prefix of the session folder name.
	        *args: The selected answer of each literacy radio button, in the
	            order the radios appear on the intro form.

	    Returns:
	        A 6-tuple for (intro box, task box, Text A, Text B, progress label,
	        session state): two visibility updates, the first item's
	        ``original_text`` and ``selected_wiki_anchor``, the progress string
	        and the new ``AnnotationSession``.
	    """
	    all_q = load_all_literacy_questions()
	    new_session = AnnotationSession(FULL_DATASET, all_q)

	    # The intro form builds its radios from the first questions of the bank in
	    # file order, so the answers must be graded against that same slice. The
	    # session's own random sample was never shown to the user; grading against
	    # it would record the wrong question ids and correctness flags.
	    new_session.session_questions = all_q[:len(args)]

	    if not new_session.queue:
	        # Nothing to annotate: stay on the intro form instead of raising IndexError.
	        gr.Warning("No items available to annotate.")
	        return (gr.update(), gr.update(), "", "", "Error: no items to annotate", None)

	    # Keep only characters that are safe in a directory name.
	    clean_username = "".join(
	        c for c in (username or "") if c.isalnum() or c in (' ', '_', '-')
	    ).strip() or "anonymous"
	    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
	    folder_name = f"{clean_username}_{timestamp}"
	    session_folder = os.path.join(SAVE_ROOT, folder_name)
	    os.makedirs(session_folder, exist_ok=True)
	    new_session.session_folder = session_folder

	    literacy_data = _score_literacy_answers(new_session.session_questions, args)
	    with open(os.path.join(session_folder, "literacy_results.json"), "w", encoding="utf-8") as f:
	        json.dump(literacy_data, f, indent=4)

	    first_item = new_session.queue[0]
	    return (
	        gr.update(visible=False),
	        gr.update(visible=True),
	        first_item['original_text'],
	        first_item['selected_wiki_anchor'],
	        f"Item 1 of {len(new_session.queue)}",
	        new_session,
	    )
	def submit_rating(doc_slider, wiki_slider, current_session):
	    """Record the ratings for the current pair and advance to the next one.

	    Every submission rewrites ``annotation_results.json`` in the session
	    folder with all results collected so far.

	    Args:
	        doc_slider: Difficulty rating given to Text A.
	        wiki_slider: Difficulty rating given to Text B.
	        current_session: The annotator's ``AnnotationSession``, or ``None``
	            when no session is available.

	    Returns:
	        A 6-tuple for (Text A, Text B, progress label, Text A slider,
	        Text B slider, session state).
	    """
	    if current_session is None:
	        gr.Warning("Session lost! Please refresh.")  # Pop-up for errors
	        return "", "", "Error: Session lost", 3, 3, None

	    queue = current_session.queue
	    position = current_session.current_index
	    finished_view = (
	        "✅ ALL TASKS COMPLETED",
	        "The data has been saved. You may close this tab.",
	        "Status: Finished",
	        1, 1,
	        current_session,
	    )

	    # The submit button stays clickable after the last item. Ignore extra
	    # clicks instead of indexing past the end of the queue.
	    if position >= len(queue):
	        return finished_view

	    current_pair = queue[position]
	    result_entry = {
	        "queue_position": position,
	        "doc_id": current_pair.get('index', 'no_id'),
	        "health_literacy_label": current_pair.get('label', 'no_label'),
	        "doc_rating": doc_slider,
	        "wiki_rating": wiki_slider,
	        # A repeat is an item that already appeared earlier in the queue.
	        # This stays correct when the queue is too short for the repeats to
	        # sit at their nominal positions.
	        "is_duplicate": current_pair in queue[:position],
	        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	    }

	    current_session.results.append(result_entry)
	    annotation_file = os.path.join(current_session.session_folder, "annotation_results.json")
	    with open(annotation_file, "w", encoding="utf-8") as f:
	        json.dump(current_session.results, f, indent=4)

	    current_session.current_index = position + 1

	    if current_session.current_index >= len(queue):
	        # Trigger the "Finished" pop-up
	        gr.Info("Final rating saved. Task complete!")
	        return finished_view

	    # Trigger the "Success" pop-up
	    gr.Info(f"Rating {current_session.current_index} saved successfully!")
	    print(f"Progress Saved: Item {current_session.current_index}")

	    next_pair = queue[current_session.current_index]
	    return (
	        next_pair['original_text'],
	        next_pair['selected_wiki_anchor'],
	        f"Item {current_session.current_index + 1} of {len(queue)}",
	        3, 3,
	        current_session,
	    )

	# --- UI INTERFACE ---
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	    # Holds the AnnotationSession returned by the start handler and passes
	    # it back into every submit.
	    session_state = gr.State()

	    gr.Markdown("# Medical Text Readability Annotation")

	    with gr.Accordion("See Annotation Instructions & Calibration Examples", open=True):
	        gr.HTML(GUIDE_HTML)
	        gr.HTML(EXAMPLES_HTML)

	    # Intro form: annotator name plus the literacy pre-task.
	    with gr.Column(visible=True) as intro_box:
	        username_input = gr.Textbox(label="Enter Your Name/ID", placeholder="e.g., user_1", max_lines=1)
	        gr.Markdown(f"### Pre-Task: Health Literacy Check ({NUM_LITERACY_QUERIES} Questions)")

	        # The radios are built once, from the first NUM_LITERACY_QUERIES
	        # questions in file order. Whatever grades the answers has to use
	        # this same ordering.
	        all_possible_q = load_all_literacy_questions()
	        literacy_inputs = []
	        for q in all_possible_q[:NUM_LITERACY_QUERIES]:
	            literacy_inputs.append(gr.Radio(choices=q['options'], label=q['question']))

	        btn_start = gr.Button("Start Annotation", variant="primary")

	    # Rating view: hidden until the start handler makes it visible.
	    with gr.Column(visible=False) as task_box:
	        progress_label = gr.Label(label="Progress")
	        with gr.Row():
	            with gr.Column():
	                doc_display = gr.Textbox(interactive=False, lines=15, label="Text A")
	                doc_slider = gr.Slider(1, 5, step=1, label="Difficulty (1-5)", value=3)
	            with gr.Column():
	                wiki_display = gr.Textbox(interactive=False, lines=15, label="Text B")
	                wiki_slider = gr.Slider(1, 5, step=1, label="Difficulty (1-5)", value=3)
	        btn_submit = gr.Button("Submit & Next", variant="primary")

	    # --- EVENT HANDLERS ---

	    # Start: the username followed by every literacy radio, in display order.
	    btn_start.click(
	        start_and_save_literacy,
	        [username_input, *literacy_inputs],
	        [intro_box, task_box, doc_display, wiki_display, progress_label, session_state],
	    )

	    # Submit: both sliders plus the session, which the handler returns updated.
	    btn_submit.click(
	        submit_rating,
	        [doc_slider, wiki_slider, session_state],
	        [doc_display, wiki_display, progress_label, doc_slider, wiki_slider, session_state],
	    )

	if __name__ == "__main__":
	    demo.launch(share=True)