# File size: 10,422 Bytes

# 9c6961c

import gradio as gr
import json
import random
import os
from datetime import datetime

# --- PATH CONFIGURATION ---
# JSON dataset of items to annotate; read once at import into FULL_DATASET.
DATA_PATH = "/home/mshahidul/readctrl/data/data_annotator_data/manual_selections_en.json"
# Root directory under which each session gets its own "<user>_<timestamp>" folder.
SAVE_ROOT = "/home/mshahidul/readctrl/data/annotators_validate_data"
# JSON bank of health-literacy questions used for the pre-task check.
QUESTIONS_FILE = "/home/mshahidul/readctrl/code/interface/sp50_questions_en.json"

# --- SESSION CONFIGURATION ---
NUM_QUESTIONS = 20  # distinct items sampled per session (fewer if the dataset is smaller)
NUM_DUPLICATES = 4  # how many of the sampled items are queued a second time
NUM_LITERACY_QUERIES = 10  # literacy questions per session / radio groups in the intro form
DUPLICATE_INTERVAL = 8  # queue index where the repeated items start; they sit consecutively from here

# --- UI HTML COMPONENTS ---
# Rating rubric rendered at the top of the page: a two-column table that
# describes each score from 1 (very easy) to 5 (very hard).
GUIDE_HTML = """
<div style="background-color: #f9f9f9; padding: 15px; border-left: 6px solid #4CAF50; border-radius: 4px; margin-bottom: 20px;">
    <h3>Rating Guide: Medical Text Difficulty</h3>
    <table style="width:100%; border-collapse: collapse; text-align: left;">
        <tr style="background-color: #e8f5e9;">
            <th style="padding: 8px; border: 1px solid #ddd;">Score</th>
            <th style="padding: 8px; border: 1px solid #ddd;">Description</th>
        </tr>
        <tr><td><b>1</b></td><td><b>Very Easy:</b> Simple words, no medical jargon. Clear to a child.</td></tr>
        <tr><td><b>2</b></td><td><b>Easy:</b> Conversational medical terms (e.g., "flu", "broken bone").</td></tr>
        <tr><td><b>3</b></td><td><b>Moderate:</b> Standard patient education material. Requires some focus.</td></tr>
        <tr><td><b>4</b></td><td><b>Hard:</b> Significant technical jargon. Likely a clinical summary.</td></tr>
        <tr><td><b>5</b></td><td><b>Very Hard:</b> Specialist-level / Academic. Extremely dense.</td></tr>
    </table>
</div>
"""

# Calibration panel shown under the rubric: three side-by-side excerpts of one
# medical case written at easy, medium and hard levels, each with a short
# rationale for its level.
EXAMPLES_HTML = """
<div style="background-color: #ffffff; padding: 15px; border: 1px solid #ddd; border-radius: 4px;">
    <h3 style="color: #2e7d32;">Reference Examples (Calibration)</h3>
    <p>Use these examples of the same medical case to calibrate your ratings:</p>
    <div style="display: flex; gap: 15px;">
        <div style="flex: 1; background-color: #f1f8e9; padding: 10px; border-radius: 4px;">
            <h4 style="margin-top:0;">Level 1-2 (Easy)</h4>
            <p style="font-size: 0.9em; line-height: 1.4;">"This is about a 20-year-old woman. She had a kidney problem... The problem first showed up when a big blood clot blocked veins in her brain... She took blood thinners and steroid pills."</p>
            <small><i>Reasoning: Uses "kidney problem" instead of "nephrotic syndrome" and "blood thinners" instead of "anticoagulants".</i></small>
        </div>
        <div style="flex: 1; background-color: #fff3e0; padding: 10px; border-radius: 4px;">
            <h4 style="margin-top:0;">Level 3 (Medium)</h4>
            <p style="font-size: 0.9em; line-height: 1.4;">"A 20-year-old woman had a 12-year history of idiopathic nephrotic syndrome... treated with anticoagulation and oral corticosteroids... CT showed acute superior mesenteric artery thrombosis."</p>
            <small><i>Reasoning: Uses standard medical terminology but keeps sentences relatively concise and structured.</i></small>
        </div>
        <div style="flex: 1; background-color: #ffebee; padding: 10px; border-radius: 4px;">
            <h4 style="margin-top:0;">Level 4-5 (Hard)</h4>
            <p style="font-size: 0.9em; line-height: 1.4;">"20-year-old woman... idiopathic NS inaugurated by cerebral venous thrombosis extended to the right jugular vein... Hemogasanalysis results showed metabolic acidosis with respiratory compensation."</p>
            <small><i>Reasoning: Highly technical, academic language, specific lab values, and complex physiological processes.</i></small>
        </div>
    </div>
</div>
"""

# --- DATA LOADING ---
def load_all_literacy_questions(path=None):
    """Load the health-literacy question bank from a JSON file.

    Args:
        path: Optional path of the JSON file to read. When omitted, the
            module-level ``QUESTIONS_FILE`` is used, so existing
            zero-argument callers behave as before.

    Returns:
        The parsed JSON content (callers in this file treat it as a list of
        question dicts), or an empty list when the file cannot be read or
        does not hold valid JSON.
    """
    source = QUESTIONS_FILE if path is None else path
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # locale encoding.
        with open(source, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError) as e:
        # OSError: missing or unreadable file. ValueError: malformed JSON
        # (json.JSONDecodeError) or undecodable bytes (UnicodeDecodeError).
        # Anything else is a programming error and should surface.
        print(f"Error loading questions: {e}")
        return []

# Load the annotation dataset once at import time; every session samples from
# this shared object. There is deliberately no fallback: a missing or malformed
# file stops the app at startup instead of serving an empty task.
# JSON is UTF-8 by specification, so the encoding is stated explicitly rather
# than left to the platform's locale default.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    FULL_DATASET = json.load(f)

# --- SESSION CLASS ---
class AnnotationSession:
    """Per-user annotation state: the item queue, progress and collected results.

    The queue is a random sample of the dataset in which the first few sampled
    items are inserted a second time, so that two ratings of the same item can
    be compared later.
    """

    def __init__(self, dataset, all_questions, *, num_questions=None,
                 num_duplicates=None, num_literacy_queries=None,
                 duplicate_interval=None):
        """Build the item queue and pick the literacy questions.

        Args:
            dataset: Sequence of items to sample from.
            all_questions: Sequence of literacy questions to sample from.
            num_questions: Number of distinct items to sample; defaults to
                the module-level ``NUM_QUESTIONS``. Capped at ``len(dataset)``.
            num_duplicates: How many sampled items are queued twice; defaults
                to ``NUM_DUPLICATES``. Capped at the number of sampled items.
            num_literacy_queries: Number of literacy questions to sample;
                defaults to ``NUM_LITERACY_QUERIES``. Capped at
                ``len(all_questions)``.
            duplicate_interval: Queue index at which the first repeated item
                is inserted; defaults to ``DUPLICATE_INTERVAL``. Repeats
                occupy consecutive positions from that index.
        """
        # Resolve defaults at call time so the module constants remain the
        # single source of truth for callers that pass nothing.
        if num_questions is None:
            num_questions = NUM_QUESTIONS
        if num_duplicates is None:
            num_duplicates = NUM_DUPLICATES
        if num_literacy_queries is None:
            num_literacy_queries = NUM_LITERACY_QUERIES
        if duplicate_interval is None:
            duplicate_interval = DUPLICATE_INTERVAL

        sample_size = max(0, min(len(dataset), num_questions))
        base_samples = random.sample(dataset, sample_size)

        # Items to rate, in presentation order.
        self.queue = list(base_samples)
        # Re-insert the first few sampled items (the same objects, not
        # copies). list.insert clamps an index beyond the end, so with a
        # short queue the repeats are simply appended.
        for offset in range(min(num_duplicates, sample_size)):
            self.queue.insert(duplicate_interval + offset, base_samples[offset])

        # Index into ``queue`` of the item currently being rated.
        self.current_index = 0
        # One dict per submitted rating, in submission order.
        self.results = []
        question_count = max(0, min(num_literacy_queries, len(all_questions)))
        self.session_questions = random.sample(all_questions, question_count)
        # Output directory for this session; assigned by the caller.
        self.session_folder = None

# --- LOGIC FUNCTIONS ---
def start_and_save_literacy(username, *args):
    """Start a new annotation session and persist the literacy answers.

    Creates the session, creates its output folder under ``SAVE_ROOT``,
    writes ``literacy_results.json`` there and switches the UI from the intro
    form to the rating task.

    Args:
        username: Name/ID typed by the annotator. Characters other than
            alphanumerics, space, ``_`` and ``-`` are dropped; an empty result
            becomes ``"anonymous"``.
        *args: The selected option of each literacy radio group, in the order
            the groups were created in the UI.

    Returns:
        A 6-tuple matching the start button's outputs: an update hiding the
        intro box, an update showing the task box, the first item's
        ``original_text``, its ``selected_wiki_anchor``, the progress label
        text, and the new session object.

    Raises:
        gr.Error: If the dataset provides no items to annotate.
    """
    all_q = load_all_literacy_questions()
    new_session = AnnotationSession(FULL_DATASET, all_q)
    if not new_session.queue:
        # Fail with a readable message before creating an empty session
        # folder; indexing queue[0] below would otherwise raise IndexError.
        raise gr.Error("No items are available to annotate.")

    # The intro form is built from the leading questions of the same file, in
    # file order, so answer i belongs to all_q[i] -- not to the random sample
    # drawn by the session. Score against what was actually shown and record
    # those questions on the session so its state matches the saved results.
    asked_questions = all_q[:len(args)]
    new_session.session_questions = asked_questions

    clean_username = "".join(
        c for c in (username or "") if c.isalnum() or c in (' ', '_', '-')
    ).strip() or "anonymous"
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    folder_name = f"{clean_username}_{timestamp}"
    session_folder = os.path.join(SAVE_ROOT, folder_name)
    os.makedirs(session_folder, exist_ok=True)
    new_session.session_folder = session_folder

    # zip stops at the shorter sequence, so surplus answers are ignored.
    literacy_data = [
        {
            "question_id": q_info['id'],
            "question_text": q_info['question'],
            "user_answer": ans,
            "is_correct": ans == q_info['correct'],
        }
        for q_info, ans in zip(asked_questions, args)
    ]

    results_path = os.path.join(session_folder, "literacy_results.json")
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(literacy_data, f, indent=4)

    first_item = new_session.queue[0]
    return (
        gr.update(visible=False),
        gr.update(visible=True),
        first_item['original_text'],
        first_item['selected_wiki_anchor'],
        f"Item 1 of {len(new_session.queue)}",
        new_session
    )
def submit_rating(doc_slider, wiki_slider, current_session):
    """Record the ratings for the current item and advance to the next one.

    After every submission the full result list is rewritten to
    ``annotation_results.json`` in the session folder, so progress survives
    an interrupted session.

    Args:
        doc_slider: Difficulty rating given to Text A.
        wiki_slider: Difficulty rating given to Text B.
        current_session: The session object held in the UI state, or ``None``
            when no session exists.

    Returns:
        A 6-tuple matching the submit button's outputs: Text A, Text B, the
        progress label text, the two slider values to display next, and the
        session object.
    """
    if current_session is None:
        gr.Warning("Session lost! Please refresh.")  # Pop-up for errors
        return "", "", "Error: Session lost", 3, 3, None

    queue = current_session.queue
    position = current_session.current_index
    finished_view = (
        "✅ ALL TASKS COMPLETED",
        "The data has been saved. You may close this tab.",
        "Status: Finished",
        1, 1,
        current_session
    )

    # The submit button stays clickable after the last item; without this
    # guard another click would index past the end of the queue.
    if position >= len(queue):
        gr.Info("All items have already been rated.")
        return finished_view

    current_pair = queue[position]

    # Repeated items are queued as the very same object as their first
    # occurrence, so an identity match against earlier entries finds them
    # wherever they were inserted, including when a short queue caused the
    # repeats to be appended before the configured position.
    is_duplicate = any(earlier is current_pair for earlier in queue[:position])

    result_entry = {
        "queue_position": position,
        "doc_id": current_pair.get('index', 'no_id'),
        "health_literacy_label": current_pair.get('label', 'no_label'),
        "doc_rating": doc_slider,
        "wiki_rating": wiki_slider,
        "is_duplicate": is_duplicate,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    current_session.results.append(result_entry)
    annotation_file = os.path.join(current_session.session_folder, "annotation_results.json")
    with open(annotation_file, "w", encoding="utf-8") as f:
        json.dump(current_session.results, f, indent=4)

    current_session.current_index += 1

    if current_session.current_index >= len(queue):
        # Trigger the "Finished" pop-up
        gr.Info("Final rating saved. Task complete!")
        return finished_view

    # Trigger the "Success" pop-up
    gr.Info(f"Rating {current_session.current_index} saved successfully!")
    print(f"Progress Saved: Item {current_session.current_index}")

    next_pair = queue[current_session.current_index]
    return (
        next_pair['original_text'],
        next_pair['selected_wiki_anchor'],
        f"Item {current_session.current_index + 1} of {len(queue)}",
        3, 3,  # reset both sliders to the midpoint for the next item
        current_session
    )

# --- UI INTERFACE ---
# Build the page: instructions, an intro form with the literacy check, and the
# initially hidden rating task. Everything inside this `with` runs once, when
# the module is loaded and the layout is constructed.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # State object to keep data separate for each user. It receives the
    # session returned by start_and_save_literacy; submit_rating treats a
    # missing (None) value as a lost session.
    session_state = gr.State()

    gr.Markdown("# Medical Text Readability Annotation")

    with gr.Accordion("See Annotation Instructions & Calibration Examples", open=True):
        gr.HTML(GUIDE_HTML)
        gr.HTML(EXAMPLES_HTML)

    # Intro screen: visible first, hidden once the start handler returns.
    with gr.Column(visible=True) as intro_box:
        username_input = gr.Textbox(label="Enter Your Name/ID", placeholder="e.g., user_1", max_lines=1)
        gr.Markdown(f"### Pre-Task: Health Literacy Check ({NUM_LITERACY_QUERIES} Questions)")

        all_possible_q = load_all_literacy_questions()
        literacy_inputs = []
        # The radio groups are created once, here, from the first
        # NUM_LITERACY_QUERIES questions in file order. Their selected values
        # reach start_and_save_literacy as *args in this same order.
        # NOTE(review): the widgets are fixed at build time, so they cannot
        # reflect a per-session random choice of questions; answers need to be
        # scored against this same leading slice -- confirm the handler does so.
        for i in range(min(NUM_LITERACY_QUERIES, len(all_possible_q))):
            q = all_possible_q[i]
            radio = gr.Radio(choices=q['options'], label=q['question'])
            literacy_inputs.append(radio)

        btn_start = gr.Button("Start Annotation", variant="primary")

    # Rating screen: hidden until the start handler makes it visible.
    with gr.Column(visible=False) as task_box:
        progress_label = gr.Label(label="Progress")
        with gr.Row():
            with gr.Column():
                doc_display = gr.Textbox(interactive=False, lines=15, label="Text A")
                doc_slider = gr.Slider(1, 5, step=1, label="Difficulty (1-5)", value=3)
            with gr.Column():
                wiki_display = gr.Textbox(interactive=False, lines=15, label="Text B")
                wiki_slider = gr.Slider(1, 5, step=1, label="Difficulty (1-5)", value=3)
        btn_submit = gr.Button("Submit & Next", variant="primary")

    # --- EVENT HANDLERS ---

    # Start button: the inputs are the username followed by every radio
    # group; the outputs line up one-to-one with the handler's 6-tuple.
    btn_start.click(
        fn=start_and_save_literacy,
        inputs=[username_input] + literacy_inputs,
        outputs=[intro_box, task_box, doc_display, wiki_display, progress_label, session_state]
    )

    # Submit button: session_state is both an input and an output so the
    # handler can read the session and hand it back for the next click.
    btn_submit.click(
        fn=submit_rating,
        inputs=[doc_slider, wiki_slider, session_state],
        outputs=[doc_display, wiki_display, progress_label, doc_slider, wiki_slider, session_state]
    )

# Start the web server only when this file is executed as a script.
# NOTE(review): per the Gradio documentation, share=True also requests a
# temporary public share link -- confirm that exposing the annotation app
# outside the local network is intended.
if __name__ == "__main__":
    demo.launch(share=True)