| | import gradio as gr |
| | import json |
| | import random |
| | import os |
| | from datetime import datetime |
| |
|
| | |
# --- File locations -------------------------------------------------------
# JSON file loaded at import time into FULL_DATASET (the items to annotate).
DATA_PATH = "/home/mshahidul/readctrl/data/data_annotator_data/manual_selections_en.json"
# Root directory under which one folder per annotation session is created.
SAVE_ROOT = "/home/mshahidul/readctrl/data/annotators_validate_data"
# JSON file read by load_all_literacy_questions().
QUESTIONS_FILE = "/home/mshahidul/readctrl/code/interface/sp50_questions_en.json"

# --- Session sizing -------------------------------------------------------
# Upper bound on the number of distinct items sampled for one session.
NUM_QUESTIONS = 20
# Number of already-sampled items that are inserted a second time in the queue.
NUM_DUPLICATES = 4
# Number of health-literacy questions shown before the annotation task.
NUM_LITERACY_QUERIES = 10
# Queue index at which the repeated items start being inserted.
DUPLICATE_INTERVAL = 8
| |
|
| | |
# Static HTML for the 1-5 difficulty rating guide; rendered with gr.HTML in the
# instructions accordion of the UI.
GUIDE_HTML = """
<div style="background-color: #f9f9f9; padding: 15px; border-left: 6px solid #4CAF50; border-radius: 4px; margin-bottom: 20px;">
<h3>Rating Guide: Medical Text Difficulty</h3>
<table style="width:100%; border-collapse: collapse; text-align: left;">
<tr style="background-color: #e8f5e9;">
<th style="padding: 8px; border: 1px solid #ddd;">Score</th>
<th style="padding: 8px; border: 1px solid #ddd;">Description</th>
</tr>
<tr><td><b>1</b></td><td><b>Very Easy:</b> Simple words, no medical jargon. Clear to a child.</td></tr>
<tr><td><b>2</b></td><td><b>Easy:</b> Conversational medical terms (e.g., "flu", "broken bone").</td></tr>
<tr><td><b>3</b></td><td><b>Moderate:</b> Standard patient education material. Requires some focus.</td></tr>
<tr><td><b>4</b></td><td><b>Hard:</b> Significant technical jargon. Likely a clinical summary.</td></tr>
<tr><td><b>5</b></td><td><b>Very Hard:</b> Specialist-level / Academic. Extremely dense.</td></tr>
</table>
</div>
"""
| |
|
# Static HTML with three calibration examples (easy / medium / hard wording of
# the same case); rendered with gr.HTML below the rating guide.
EXAMPLES_HTML = """
<div style="background-color: #ffffff; padding: 15px; border: 1px solid #ddd; border-radius: 4px;">
<h3 style="color: #2e7d32;">Reference Examples (Calibration)</h3>
<p>Use these examples of the same medical case to calibrate your ratings:</p>
<div style="display: flex; gap: 15px;">
<div style="flex: 1; background-color: #f1f8e9; padding: 10px; border-radius: 4px;">
<h4 style="margin-top:0;">Level 1-2 (Easy)</h4>
<p style="font-size: 0.9em; line-height: 1.4;">"This is about a 20-year-old woman. She had a kidney problem... The problem first showed up when a big blood clot blocked veins in her brain... She took blood thinners and steroid pills."</p>
<small><i>Reasoning: Uses "kidney problem" instead of "nephrotic syndrome" and "blood thinners" instead of "anticoagulants".</i></small>
</div>
<div style="flex: 1; background-color: #fff3e0; padding: 10px; border-radius: 4px;">
<h4 style="margin-top:0;">Level 3 (Medium)</h4>
<p style="font-size: 0.9em; line-height: 1.4;">"A 20-year-old woman had a 12-year history of idiopathic nephrotic syndrome... treated with anticoagulation and oral corticosteroids... CT showed acute superior mesenteric artery thrombosis."</p>
<small><i>Reasoning: Uses standard medical terminology but keeps sentences relatively concise and structured.</i></small>
</div>
<div style="flex: 1; background-color: #ffebee; padding: 10px; border-radius: 4px;">
<h4 style="margin-top:0;">Level 4-5 (Hard)</h4>
<p style="font-size: 0.9em; line-height: 1.4;">"20-year-old woman... idiopathic NS inaugurated by cerebral venous thrombosis extended to the right jugular vein... Hemogasanalysis results showed metabolic acidosis with respiratory compensation."</p>
<small><i>Reasoning: Highly technical, academic language, specific lab values, and complex physiological processes.</i></small>
</div>
</div>
</div>
"""
| |
|
| | |
def load_all_literacy_questions():
    """Load the pool of health-literacy questions from ``QUESTIONS_FILE``.

    Returns:
        The parsed JSON content of ``QUESTIONS_FILE``. If the file cannot be
        read or parsed, the error is printed and an empty list is returned so
        callers can continue without literacy questions.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform locale.
        with open(QUESTIONS_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError) as e:
        # OSError: missing or unreadable file.
        # ValueError: malformed JSON (json.JSONDecodeError) or undecodable
        # bytes (UnicodeDecodeError). Other exceptions indicate programming
        # errors and are intentionally not swallowed.
        print(f"Error loading questions: {e}")
        return []
| |
|
# Load the annotation dataset once at import time; sessions sample from it.
# The encoding is explicit because JSON is UTF-8 and the platform default
# encoding is not guaranteed to be.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    FULL_DATASET = json.load(f)
| |
|
| | |
class AnnotationSession:
    """State of a single annotator's run through the task.

    The constructor sets:
        queue: items to rate, in presentation order, including repeats.
        current_index: position in ``queue`` of the item being rated.
        results: rating records collected so far.
        session_questions: literacy questions sampled for this session.
        session_folder: output directory; ``None`` until a caller assigns it.
    """

    def __init__(self, dataset, all_questions):
        # Draw at most NUM_QUESTIONS distinct items from the dataset.
        sample_size = min(len(dataset), NUM_QUESTIONS)
        picked = random.sample(dataset, sample_size)

        # Copy the draw, then re-insert its first items starting at
        # DUPLICATE_INTERVAL. list.insert appends when the index is past the
        # end, so short queues get their repeats at the tail.
        presentation_order = picked[:]
        for offset, repeated_item in zip(range(NUM_DUPLICATES), picked):
            presentation_order.insert(DUPLICATE_INTERVAL + offset, repeated_item)
        self.queue = presentation_order

        self.current_index = 0
        self.results = []

        question_count = min(NUM_LITERACY_QUERIES, len(all_questions))
        self.session_questions = random.sample(all_questions, question_count)
        self.session_folder = None
| |
|
| | |
def start_and_save_literacy(username, *args):
    """Create a new annotation session and save the literacy-check answers.

    Args:
        username: Name/ID typed by the annotator. Characters other than
            alphanumerics, space, ``_`` and ``-`` are dropped; an empty result
            falls back to ``"anonymous"``.
        *args: The selected answer of each literacy radio group, in the order
            the radio groups were created.

    Returns:
        A 6-tuple: an update hiding the intro column, an update showing the
        task column, the first item's ``original_text``, the first item's
        ``selected_wiki_anchor``, the progress text, and the new session.

    Raises:
        gr.Error: If the dataset yielded no items to annotate.
    """
    all_q = load_all_literacy_questions()
    new_session = AnnotationSession(FULL_DATASET, all_q)

    # Build a per-session output folder named "<user>_<timestamp>".
    clean_username = "".join(
        c for c in (username or "") if c.isalnum() or c in (" ", "_", "-")
    ).strip() or "anonymous"
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    session_folder = os.path.join(SAVE_ROOT, f"{clean_username}_{timestamp}")
    os.makedirs(session_folder, exist_ok=True)
    new_session.session_folder = session_folder

    # The radio groups on the intro page are built from the first questions of
    # the questions file, in file order. The i-th answer therefore belongs to
    # the i-th question of the file, NOT to the session's random sample;
    # grading against the random sample would record wrong ids and wrong
    # correctness flags.
    displayed_questions = all_q[:len(args)]
    new_session.session_questions = displayed_questions

    literacy_data = [
        {
            "question_id": q_info["id"],
            "question_text": q_info["question"],
            "user_answer": ans,
            "is_correct": ans == q_info["correct"],
        }
        for q_info, ans in zip(displayed_questions, args)
    ]

    with open(
        os.path.join(session_folder, "literacy_results.json"), "w", encoding="utf-8"
    ) as f:
        json.dump(literacy_data, f, indent=4)

    # Surface an empty dataset as a readable UI error instead of an IndexError.
    if not new_session.queue:
        raise gr.Error("No items are available to annotate.")

    first_item = new_session.queue[0]
    return (
        gr.update(visible=False),
        gr.update(visible=True),
        first_item["original_text"],
        first_item["selected_wiki_anchor"],
        f"Item 1 of {len(new_session.queue)}",
        new_session,
    )
def submit_rating(doc_slider, wiki_slider, current_session):
    """Record the ratings for the current item and advance to the next one.

    Args:
        doc_slider: Difficulty rating given to Text A.
        wiki_slider: Difficulty rating given to Text B.
        current_session: The annotator's ``AnnotationSession``, or ``None`` if
            the session state was lost.

    Returns:
        A 6-tuple: Text A content, Text B content, progress text, the reset
        value for each of the two sliders, and the session object.
    """
    if current_session is None:
        gr.Warning("Session lost! Please refresh.")
        return "", "", "Error: Session lost", 3, 3, None

    queue = current_session.queue
    position = current_session.current_index

    finished_view = (
        "✅ ALL TASKS COMPLETED",
        "The data has been saved. You may close this tab.",
        "Status: Finished",
        1, 1,
        current_session,
    )

    # The submit button stays clickable after the last item; without this
    # guard a further click would index past the end of the queue.
    if position >= len(queue):
        gr.Info("All items are already rated. You may close this tab.")
        return finished_view

    current_pair = queue[position]

    # An item is a repeat when the very same object already appeared earlier
    # in the queue. This stays correct even when the queue is too short for
    # the repeats to land at their nominal positions.
    is_duplicate = any(earlier is current_pair for earlier in queue[:position])

    result_entry = {
        "queue_position": position,
        "doc_id": current_pair.get("index", "no_id"),
        "health_literacy_label": current_pair.get("label", "no_label"),
        "doc_rating": doc_slider,
        "wiki_rating": wiki_slider,
        "is_duplicate": is_duplicate,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
    current_session.results.append(result_entry)

    # Rewrite the whole results file after every rating so progress survives
    # an interrupted session.
    annotation_file = os.path.join(
        current_session.session_folder, "annotation_results.json"
    )
    with open(annotation_file, "w", encoding="utf-8") as f:
        json.dump(current_session.results, f, indent=4)

    current_session.current_index += 1

    if current_session.current_index >= len(queue):
        gr.Info("Final rating saved. Task complete!")
        return finished_view

    gr.Info(f"Rating {current_session.current_index} saved successfully!")
    print(f"Progress Saved: Item {current_session.current_index}")

    next_pair = queue[current_session.current_index]
    return (
        next_pair["original_text"],
        next_pair["selected_wiki_anchor"],
        f"Item {current_session.current_index + 1} of {len(queue)}",
        3, 3,
        current_session,
    )
| |
|
| | |
# Build the UI: an instructions accordion, an intro page (name + literacy
# check) and the rating page, which starts hidden.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Holds the annotator's AnnotationSession between callbacks.
    session_state = gr.State()

    gr.Markdown("# Medical Text Readability Annotation")

    with gr.Accordion("See Annotation Instructions & Calibration Examples", open=True):
        gr.HTML(GUIDE_HTML)
        gr.HTML(EXAMPLES_HTML)

    # Intro page: annotator id plus the literacy questions.
    with gr.Column(visible=True) as intro_box:
        username_input = gr.Textbox(label="Enter Your Name/ID", placeholder="e.g., user_1", max_lines=1)
        gr.Markdown(f"### Pre-Task: Health Literacy Check ({NUM_LITERACY_QUERIES} Questions)")

        all_possible_q = load_all_literacy_questions()
        # One radio group per question, taken from the start of the file.
        literacy_inputs = [
            gr.Radio(choices=question['options'], label=question['question'])
            for question in all_possible_q[:NUM_LITERACY_QUERIES]
        ]

        btn_start = gr.Button("Start Annotation", variant="primary")

    # Rating page: two read-only texts side by side, each with its own slider.
    with gr.Column(visible=False) as task_box:
        progress_label = gr.Label(label="Progress")
        with gr.Row():
            with gr.Column():
                doc_display = gr.Textbox(interactive=False, lines=15, label="Text A")
                doc_slider = gr.Slider(1, 5, step=1, label="Difficulty (1-5)", value=3)
            with gr.Column():
                wiki_display = gr.Textbox(interactive=False, lines=15, label="Text B")
                wiki_slider = gr.Slider(1, 5, step=1, label="Difficulty (1-5)", value=3)
        btn_submit = gr.Button("Submit & Next", variant="primary")

    # Start: save literacy answers, create the session, switch pages.
    btn_start.click(
        fn=start_and_save_literacy,
        inputs=[username_input, *literacy_inputs],
        outputs=[intro_box, task_box, doc_display, wiki_display, progress_label, session_state],
    )

    # Submit: store the ratings and show the next item.
    btn_submit.click(
        fn=submit_rating,
        inputs=[doc_slider, wiki_slider, session_state],
        outputs=[doc_display, wiki_display, progress_label, doc_slider, wiki_slider, session_state],
    )
| |
|
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    # NOTE(review): share=True requests a public Gradio share link in addition
    # to the local server -- confirm this exposure is intended.
    demo.launch(share=True)