| | import gradio as gr |
| | import json |
| | import random |
| | import os |
| | from datetime import datetime |
| |
|
| | |
# --- File locations ---------------------------------------------------------
# JSON dataset that is loaded once at start-up and sampled into the queue.
DATA_PATH: str = "/home/mshahidul/readctrl/data/data_annotator_data/vector_db_all-miniLM/crowdsourcing_input_en_v2.json"
# Parent directory; one "<username>_<timestamp>" folder is created per start.
SAVE_ROOT: str = "/home/mshahidul/readctrl/data/annotators_validate_data"
# JSON pool of health-literacy questions read by load_questions().
QUESTIONS_FILE: str = "/home/mshahidul/readctrl/code/interface/sp50_questions_en.json"

# --- Session sizing ---------------------------------------------------------
# Number of distinct items sampled from the dataset for one session.
NUM_QUESTIONS: int = 30
# Number of leading items that are shown a second time.
NUM_DUPLICATES: int = 4
# Upper bound on the literacy questions drawn from QUESTIONS_FILE.
NUM_LITERACY_QUERIES: int = 10
# Queue index at which the first repeated item is inserted.
DUPLICATE_INTERVAL: int = 8
| |
|
| | |
# Static HTML for the collapsible "Annotation Instructions" panel: a five-row
# table mapping the 1-10 difficulty scores to their verbal descriptions.
GUIDE_HTML: str = """
<div style="background-color: #f9f9f9; padding: 15px; border-left: 6px solid #2196F3; border-radius: 4px;">
    <h3>Rating Guide: Medical Text Difficulty</h3>
    <p>Please rate the difficulty of the documents based on the following scale:</p>
    <table style="width:100%; border-collapse: collapse; text-align: left;">
        <tr style="background-color: #e3f2fd;">
            <th style="padding: 8px; border: 1px solid #ddd;">Score</th>
            <th style="padding: 8px; border: 1px solid #ddd;">Description</th>
        </tr>
        <tr>
            <td style="padding: 8px; border: 1px solid #ddd;"><b>1 - 2</b></td>
            <td style="padding: 8px; border: 1px solid #ddd;"><b>Very Easy:</b> Clear language, no medical jargon. Like a 5th-grade textbook.</td>
        </tr>
        <tr>
            <td style="padding: 8px; border: 1px solid #ddd;"><b>3 - 4</b></td>
            <td style="padding: 8px; border: 1px solid #ddd;"><b>Easy:</b> Common medical terms (e.g., "fever", "heart") used in simple sentences.</td>
        </tr>
        <tr>
            <td style="padding: 8px; border: 1px solid #ddd;"><b>5 - 6</b></td>
            <td style="padding: 8px; border: 1px solid #ddd;"><b>Moderate:</b> Some technical terms. Requires focused reading but understandable.</td>
        </tr>
        <tr>
            <td style="padding: 8px; border: 1px solid #ddd;"><b>7 - 8</b></td>
            <td style="padding: 8px; border: 1px solid #ddd;"><b>Hard:</b> Heavy use of medical jargon. Read like a clinical report.</td>
        </tr>
        <tr>
            <td style="padding: 8px; border: 1px solid #ddd;"><b>9 - 10</b></td>
            <td style="padding: 8px; border: 1px solid #ddd;"><b>Very Hard:</b> Specialist-level text. Extremely dense and difficult to follow.</td>
        </tr>
    </table>
</div>
"""
| |
|
def load_questions():
    """Return a random subset of the health-literacy question pool.

    Reads the JSON document at ``QUESTIONS_FILE`` and draws at most
    ``NUM_LITERACY_QUERIES`` entries from it without replacement. When the
    pool holds fewer entries, all of them are returned (in random order).

    Returns:
        list: The sampled question entries, exactly as stored in the file.

    Raises:
        OSError: If ``QUESTIONS_FILE`` cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not rely on the platform locale.
    with open(QUESTIONS_FILE, "r", encoding="utf-8") as f:
        all_q = json.load(f)
    sample_size = min(NUM_LITERACY_QUERIES, len(all_q))
    return random.sample(all_q, sample_size)
| |
|
class AnnotationSession:
    """Mutable state of one annotation run: item queue, progress and results.

    The queue holds up to ``NUM_QUESTIONS`` randomly sampled dataset items.
    The first ``NUM_DUPLICATES`` of them are repeated as one contiguous run
    starting at queue index ``DUPLICATE_INTERVAL``, so the same item is rated
    twice.
    """

    def __init__(self, dataset, questions):
        """Sample the item queue and initialise empty progress.

        Args:
            dataset: Sequence of items to sample from.
            questions: Literacy questions associated with this session.

        Raises:
            ValueError: If ``dataset`` is empty.
        """
        if not dataset:
            raise ValueError("dataset is empty; there is nothing to annotate")

        # Clamp like load_questions() does, so a dataset smaller than
        # NUM_QUESTIONS yields a shorter session instead of a ValueError.
        sample_size = min(NUM_QUESTIONS, len(dataset))
        base_samples = random.sample(dataset, sample_size)

        # Repeats must occupy indices DUPLICATE_INTERVAL.. so that they line
        # up with the index-based duplicate flag written with each rating.
        # When the sample is too short for that, no repeats are inserted.
        if len(base_samples) >= DUPLICATE_INTERVAL:
            repeats = base_samples[:NUM_DUPLICATES]
        else:
            repeats = []

        self.queue = (
            base_samples[:DUPLICATE_INTERVAL]
            + repeats
            + base_samples[DUPLICATE_INTERVAL:]
        )

        self.current_index = 0  # queue position of the item being rated
        self.results = []  # one entry per submitted rating
        self.questions = questions
        self.session_folder = None  # set once the annotator starts
| |
|
# Load the full dataset once at import time.
# JSON is UTF-8 by specification; do not rely on the platform locale.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    full_dataset = json.load(f)

# NOTE(review): this is a single module-level session that the event handlers
# read and mutate, so every browser tab served by this process shares the
# same queue position and result list. Confirm that only one annotator uses
# a running instance at a time, or move this state into per-user gr.State.
session = AnnotationSession(full_dataset, load_questions())
| |
|
| | |
def start_and_save_literacy(username, *answers):
    """Save the literacy pre-test and switch the UI to the annotation task.

    Creates ``<SAVE_ROOT>/<sanitised username>_<timestamp>/``, writes the
    graded answers to ``literacy_results.json`` in that folder, resets the
    shared session's progress and returns the first queue item for display.

    Args:
        username: Annotator name/ID as typed into the intro textbox.
        *answers: The selected option for each literacy question, in the
            same order as ``session.questions``.

    Returns:
        tuple: ``(intro_box update, task_box update, first document text,
        first wiki text, progress label)``.
    """
    # Keep only characters that are safe in a folder name.
    allowed_extra = (' ', '_', '-')
    clean_username = "".join(
        c for c in (username or "") if c.isalnum() or c in allowed_extra
    ).strip() or "anonymous"

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    folder_name = f"{clean_username}_{timestamp}"

    session_folder = os.path.join(SAVE_ROOT, folder_name)
    os.makedirs(session_folder, exist_ok=True)
    session.session_folder = session_folder

    # Item 1 is displayed below, so the rating cursor must point at it too.
    # Without this reset, starting again after a page reload would store
    # ratings against a different item than the one shown (or run past the
    # end of the queue) and copy earlier results into the new folder.
    session.current_index = 0
    session.results = []

    literacy_data = [
        {
            "question_id": q_info['id'],
            "question_text": q_info['question'],
            "user_answer": ans,
            "is_correct": ans == q_info['correct'],
        }
        for q_info, ans in zip(session.questions, answers)
    ]

    literacy_path = os.path.join(session_folder, "literacy_results.json")
    with open(literacy_path, "w", encoding="utf-8") as f:
        json.dump(literacy_data, f, indent=4)

    first_pair = session.queue[0]
    return (
        gr.update(visible=False),  # hide the intro / literacy panel
        gr.update(visible=True),   # reveal the annotation panel
        first_pair['original_doc'],
        first_pair['wiki_anchor'],
        f"Item 1 of {len(session.queue)}",
    )
| |
|
def _finished_outputs():
    """Return the UI values shown once every queued item has been rated."""
    return (
        "✅ ALL TASKS COMPLETED",
        "The data has been saved to your session folder. You may close this tab.",
        "Status: Finished",
        # Lock the sliders at their last value instead of pushing 0, which
        # lies outside their 1-10 range.
        gr.update(interactive=False),
        gr.update(interactive=False),
    )


def submit_rating(doc_slider, wiki_slider):
    """Record the ratings for the current item and advance to the next one.

    Appends one result entry to ``session.results``, rewrites
    ``annotation_results.json`` in the session folder with all results so
    far, and moves the queue cursor forward.

    Args:
        doc_slider: Difficulty rating given to the document text.
        wiki_slider: Difficulty rating given to the wiki text.

    Returns:
        tuple: ``(document text, wiki text, progress label, doc slider value,
        wiki slider value)`` for the next item, or the completion messages
        when the queue is exhausted.
    """
    position = session.current_index

    # A click after the last item must not index past the end of the queue.
    if position >= len(session.queue):
        return _finished_outputs()

    current_pair = session.queue[position]

    result_entry = {
        "queue_position": position,
        "doc_id": current_pair.get('index', 'no_id'),
        "health_literacy_label": current_pair.get('label', 'no_label'),
        # NOTE(review): reads the same 'index' key as doc_id - confirm
        # whether a separate wiki identifier was intended.
        "wiki_id": current_pair.get('index', 'no_id'),
        "doc_snippet": current_pair['original_doc'][:100] + "...",
        "wiki_snippet": current_pair['wiki_anchor'][:100] + "...",
        "doc_rating": doc_slider,
        "wiki_rating": wiki_slider,
        # Repeated items occupy one contiguous run of queue positions.
        "is_duplicate": (
            DUPLICATE_INTERVAL <= position < DUPLICATE_INTERVAL + NUM_DUPLICATES
        ),
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
    session.results.append(result_entry)

    # Rewrite the whole file after every item so progress survives a crash.
    annotation_file = os.path.join(session.session_folder, "annotation_results.json")
    with open(annotation_file, "w", encoding="utf-8") as f:
        json.dump(session.results, f, indent=4)

    gr.Info(f"Progress Saved: Item {position + 1} recorded.")

    session.current_index = position + 1

    if session.current_index >= len(session.queue):
        return _finished_outputs()

    next_pair = session.queue[session.current_index]
    return (
        next_pair['original_doc'],
        next_pair['wiki_anchor'],
        f"Item {session.current_index + 1} of {len(session.queue)}",
        5, 5,  # reset both sliders to the scale midpoint
    )
| |
|
| | |
# UI layout: an intro panel (name + literacy pre-test) that is swapped for the
# side-by-side rating panel once the annotator presses "Start Annotation".
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Medical Text Readability Annotation")

    with gr.Accordion("See Annotation Instructions & Scale Guide", open=False):
        gr.HTML(GUIDE_HTML)

    # Panel 1: annotator identification and literacy questions.
    with gr.Column(visible=True) as intro_box:
        username_input = gr.Textbox(label="Enter Your Name/ID", placeholder="e.g., mshahidul", max_lines=1)

        gr.Markdown(f"### Pre-Task: Health Literacy Check ({NUM_LITERACY_QUERIES} Questions)")
        # One radio group per sampled question; list order is the order in
        # which the answers reach start_and_save_literacy.
        literacy_inputs = []
        for q in session.questions:
            radio = gr.Radio(choices=q['options'], label=q['question'])
            literacy_inputs.append(radio)
        btn_start = gr.Button("Start Annotation", variant="primary")

    # Panel 2: the two texts to rate, hidden until the pre-test is submitted.
    with gr.Column(visible=False) as task_box:
        progress = gr.Label(label="Progress")
        with gr.Row():
            with gr.Column():
                doc_display = gr.Textbox(interactive=False, lines=12, label="Text A")
                # Start at 5: the previous default of 0 is below the slider
                # minimum, and 5 is what submit_rating resets to per item.
                doc_slider = gr.Slider(1, 10, step=1, label="Difficulty (1: Simple → 10: Technical)", value=5)
            with gr.Column():
                wiki_display = gr.Textbox(interactive=False, lines=12, label="Text B")
                wiki_slider = gr.Slider(1, 10, step=1, label="Difficulty (1: Simple → 10: Technical)", value=5)
        btn_submit = gr.Button("Submit & Next", variant="primary")

    # Event wiring: output order must match each handler's returned tuple.
    btn_start.click(
        start_and_save_literacy,
        inputs=[username_input] + literacy_inputs,
        outputs=[intro_box, task_box, doc_display, wiki_display, progress]
    )

    btn_submit.click(
        submit_rating,
        inputs=[doc_slider, wiki_slider],
        outputs=[doc_display, wiki_display, progress, doc_slider, wiki_slider]
    )

demo.launch(share=True)