# File size: 9,994 Bytes

# 9c6961c

import gradio as gr
import json
import os
import random
from datetime import datetime

# --- Configuration & Folder Setup ---
# Input datasets and the root directory under which annotation files are saved.
DATA_PATH = '/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json'
KEY_DATA_PATH = '/home/mshahidul/readctrl/data/key_subclaims_testing/key_subclaims.json'
BASE_SAVE_DIR = '/home/mshahidul/readctrl/data/thresold_finding/'

# 1. One output folder per app start, named after the start date and hour.
session_folder_name = f"{datetime.now():%Y-%m-%d_%Hh}"
SESSION_PATH = os.path.join(BASE_SAVE_DIR, session_folder_name)
os.makedirs(SESSION_PATH, exist_ok=True)

# --- Data Loading ---
# JSON is UTF-8 by specification; decode explicitly instead of relying on the
# platform's locale-dependent default encoding.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

NUM_SAMPLES = 10
# Fixed seed so every app start draws the same records in the same order.
random.seed(42)
all_possible_indices = list(range(len(data)))
# Sample without replacement; capped at len(data) so a small dataset does not
# make random.sample() raise.
shuffled_indices = random.sample(all_possible_indices, min(NUM_SAMPLES, len(data)))

with open(KEY_DATA_PATH, 'r', encoding='utf-8') as f:
    key_data = json.load(f)

# Map each key-subclaim entry's 'index' to its 'llm_output' for direct lookup.
key_lookup = {item['index']: item['llm_output'] for item in key_data}

# --- Logic Functions ---
def get_key_indices(index, source_type):
    """Return the integer subclaim ids flagged as key for one record.

    Args:
        index: Record index used as the key into the module-level
            ``key_lookup`` mapping.
        source_type: ``"Full Original Text"`` selects the source-text key
            subclaims; any other value selects the gold-summary ones.

    Returns:
        A list of ints, one per key item whose id could be parsed. The int is
        the part of the id after its last ``-``. Items with a missing or
        unparseable id are skipped. An empty list is returned when the record
        has no entry in ``key_lookup`` or its entry is not a dict.
    """
    entry = key_lookup.get(index)
    # The looked-up value may not be a dict (e.g. unparsed text); treat that
    # as "no key subclaims" instead of crashing on .get().
    if not isinstance(entry, dict):
        return []

    if source_type == "Full Original Text":
        key_field, id_key = 'key_source_text_subclaims', "source_subclaim_id"
    else:
        key_field, id_key = 'key_gold_summary_subclaims', "gold_subclaim_id"

    # A null/absent field is treated as an empty list.
    key_items = entry.get(key_field) or []

    indices = []
    for item in key_items:
        if not isinstance(item, dict):
            continue
        raw_id = item.get(id_key, "")
        try:
            # str() lets bare integer ids through; None/garbage fails int()
            # below and is skipped.
            indices.append(int(str(raw_id).split('-')[-1]))
        except ValueError:
            continue
    return indices

def load_example(progress_index):
    """Build the UI values for the record at position ``progress_index``.

    Args:
        progress_index: Position in the module-level ``shuffled_indices``
            sample (0-based).

    Returns:
        A list of 10 values, in the order the event handlers bind them:
        source header markdown, reference text, the full subclaim list, three
        coverage labels (reset to ``"0%"``), three checkbox-group updates
        (choices plus pre-selected key subclaims) and an empty warning
        string. Once ``progress_index`` is past the end of the sample, a
        "session complete" payload with empty choices is returned instead.
    """
    total = len(shuffled_indices)

    # Check if we've reached the end of our fixed sample size
    if progress_index >= total:
        return [
            gr.update(value="### 🎉 Session Complete!"),
            # Report the number of records actually sampled, which is smaller
            # than NUM_SAMPLES when the dataset has fewer records.
            gr.update(value=f"You have finished your set of {total} records."),
            [], "0%", "0%", "0%", gr.update(choices=[], value=[]),
            gr.update(choices=[], value=[]), gr.update(choices=[], value=[]), ""
        ]

    # Get the actual index from our sample pool
    actual_data_index = shuffled_indices[progress_index]
    record = data[actual_data_index]

    # Pick the source deterministically per record. A dedicated Random
    # instance yields the same choice as seeding the global RNG with this
    # index, but does not touch the process-wide RNG state, so concurrent
    # sessions cannot interleave between "seed" and "choice".
    source_type = random.Random(actual_data_index).choice(["Full Original Text", "Gold Summary"])

    if source_type == "Full Original Text":
        text_content, subclaims = record['fulltext'], record['fulltext_subclaims']
    else:
        text_content, subclaims = record['summary'], record['summary_subclaims']

    source_info = f"### Instance: {progress_index + 1}/{total} | Source: **{source_type}**"
    key_indices = get_key_indices(actual_data_index, source_type)

    # Ignore key ids that fall outside this record's subclaim list.
    pre_selected = [subclaims[idx] for idx in key_indices if 0 <= idx < len(subclaims)]

    return [
        source_info, text_content, subclaims, "0%", "0%", "0%",
        gr.update(choices=subclaims, value=pre_selected),
        gr.update(choices=subclaims, value=pre_selected),
        gr.update(choices=subclaims, value=pre_selected),
        ""
    ]

def calc_pct_and_validate(low, inter, prof, total_list):
    """Compute coverage labels for the three levels plus a hierarchy warning.

    Args:
        low, inter, prof: Selected subclaims for each literacy level.
        total_list: All subclaims available for the current record.

    Returns:
        A 4-tuple of strings: low, intermediate and proficient coverage
        formatted to one decimal place with a trailing ``%``, followed by a
        warning message that is empty when
        low <= intermediate <= proficient holds. With an empty
        ``total_list`` the result is ``("0%", "0%", "0%", "")``.
    """
    if not total_list:
        return "0%", "0%", "0%", ""

    total = len(total_list)
    low_share = len(low) / total * 100
    inter_share = len(inter) / total * 100
    prof_share = len(prof) / total * 100

    if low_share <= inter_share <= prof_share:
        warning = ""
    else:
        warning = "⚠️ **Hierarchy Warning:** Information density should be: Low ≤ Intermediate ≤ Proficient."

    return f"{low_share:.1f}%", f"{inter_share:.1f}%", f"{prof_share:.1f}%", warning

def save_and_next(username, progress_index, source_info, low_sel, int_sel, prof_sel, subclaims):
    """
    Validate the current annotation, write it to disk and move to the next record.

    Args:
        username: Annotator name typed into the UI; must not be blank.
        progress_index: The sequence number (0, 1, 2...) into the global
            ``shuffled_indices`` list generated at the top of the script.
        source_info: Header markdown of the current record; the source type
            is recovered from it.
        low_sel, int_sel, prof_sel: Subclaims selected for each level.
        subclaims: Full subclaim list of the current record.

    Returns:
        A list of 11 values: the (possibly advanced) progress index followed
        by the 10 display values. On a validation failure the index is
        unchanged, untouched components are skipped, the current selections
        are kept and an error message fills the warning slot.

    Raises:
        gr.Error: If the session directory cannot be created or the
            annotation file cannot be written. Raising (rather than
            returning) leaves the UI, including the annotator's selections,
            as it was.
    """

    # 1. Validation: Ensure we haven't exceeded the sample size
    if progress_index >= len(shuffled_indices):
        return [progress_index] + load_example(progress_index)

    # 2. Validation: Annotator Name
    if not username or not username.strip():
        gr.Warning("Action Required: Please enter your name before submitting!")
        # Return current state to avoid losing work
        return [progress_index, source_info, gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(),
                gr.update(value=low_sel), gr.update(value=int_sel), gr.update(value=prof_sel),
                "⚠️ **Error:** Please enter your name."]

    # 3. Validation: Hierarchy Check (Low <= Intermediate <= Proficient)
    if not (len(low_sel) <= len(int_sel) <= len(prof_sel)):
        gr.Warning("DATA NOT SAVED! The selection does not follow the hierarchy: Low ≤ Intermediate ≤ Proficient.")
        return [progress_index, source_info, gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(),
                gr.update(value=low_sel), gr.update(value=int_sel), gr.update(value=prof_sel),
                "⚠️ **Error:** Selection sequence is invalid. Please adjust before saving."]

    # 4. Map progress to the actual data index from the source JSON
    actual_data_index = shuffled_indices[progress_index]

    # 5. File System Management
    # gr.Error is an exception: it only reaches the user when raised.
    try:
        os.makedirs(SESSION_PATH, exist_ok=True)
    except OSError as e:
        raise gr.Error(f"Critical Error: Could not create directory {SESSION_PATH}. Error: {e}") from e

    # 6. Prepare Metadata and Filename
    now = datetime.now()
    timestamp_str = now.strftime("%Y%m%d_%H%M%S")
    # Keep only alphanumeric characters so the name is safe inside a filename.
    safe_username = "".join(x for x in username if x.isalnum())

    # Use actual_data_index so the file can be matched back to the master JSON
    filename = f"recordID{actual_data_index}_seq{progress_index}_{safe_username}_{timestamp_str}.json"
    file_path = os.path.join(SESSION_PATH, filename)

    stype = "Full Original Text" if "Full Original Text" in source_info else "Gold Summary"

    total = len(subclaims) if subclaims else 0

    def _level_summary(selected):
        # Count, raw selection and fraction (0..1) of all subclaims selected.
        return {
            "count": len(selected),
            "subclaims": selected,
            "pct": len(selected) / total if total else 0,
        }

    # 7. Construct Result Object
    result = {
        "annotator": username,
        "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"),
        "progress_sequence": progress_index,  # The order it was shown
        "original_data_index": actual_data_index,  # The real ID in the source JSON
        "source_type": stype,
        "annotations": {
            "low": _level_summary(low_sel),
            "intermediate": _level_summary(int_sel),
            "proficient": _level_summary(prof_sel),
        },
    }

    # 8. Write to Disk
    # A failed write must not advance to the next record as if it had saved.
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=4)
    except OSError as e:
        raise gr.Error(f"DATA NOT SAVED! Could not write {file_path}. Error: {e}") from e

    gr.Info(f"Success! Record {actual_data_index} saved (Item {progress_index + 1} of {len(shuffled_indices)}).")

    # 9. Return the NEXT progress index and its data
    return [progress_index + 1] + load_example(progress_index + 1)

# --- UI Definition ---
with gr.Blocks(theme=gr.themes.Soft(), title="Medical Literacy Annotation Tool") as demo:
    # Per-session state: position in the sample and the current record's subclaims.
    index_state = gr.State(0)
    subclaim_list_state = gr.State([])

    # The instructions file is optional: fall back to a plain heading when it
    # is missing or unreadable. Only I/O and decoding failures are caught, so
    # KeyboardInterrupt and programming errors still propagate.
    try:
        with open("/home/mshahidul/readctrl/code/interface/instructions", "r", encoding="utf-8") as f:
            instructions_text = f.read()
    except (OSError, UnicodeDecodeError):
        instructions_text = "# Medical Annotation Task"

    gr.Markdown(instructions_text)

    with gr.Row():
        # Left panel: annotator name and the reference text.
        with gr.Column(scale=1, variant="panel"):
            user_input = gr.Textbox(label="Annotator Name", placeholder="e.g., mshahidul", interactive=True)
            gr.HTML("<hr>")
            source_display = gr.Markdown("### Initializing...")
            text_viewer = gr.Textbox(label="Reference Text", interactive=False, lines=15)

        # Right panel: one checkbox column per literacy level.
        with gr.Column(scale=2):
            hierarchy_warning = gr.Markdown(value="", visible=True)

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 🟢 Low")
                    low_pct = gr.Label(label="Coverage", value="0%")
                    low_check = gr.CheckboxGroup(label="Subclaims", choices=[])

                with gr.Column():
                    gr.Markdown("### 🟡 Intermediate")
                    int_pct = gr.Label(label="Coverage", value="0%")
                    int_check = gr.CheckboxGroup(label="Subclaims", choices=[])

                with gr.Column():
                    gr.Markdown("### 🔴 Proficient")
                    prof_pct = gr.Label(label="Coverage", value="0%")
                    prof_check = gr.CheckboxGroup(label="Subclaims", choices=[])

            submit_btn = gr.Button("Submit & Next Record", variant="primary", size="lg")

    # Event Handlers
    # Populate the first record when the page loads.
    demo.load(load_example, [index_state], [source_display, text_viewer, subclaim_list_state, low_pct, int_pct, prof_pct, low_check, int_check, prof_check, hierarchy_warning])

    # Any selection change recomputes all three coverage labels and the warning.
    for check_sys in [low_check, int_check, prof_check]:
        check_sys.change(calc_pct_and_validate, [low_check, int_check, prof_check, subclaim_list_state], [low_pct, int_pct, prof_pct, hierarchy_warning])

    # Submit validates, saves and advances; the first output is the new progress index.
    submit_btn.click(save_and_next, [user_input, index_state, source_display, low_check, int_check, prof_check, subclaim_list_state], [index_state, source_display, text_viewer, subclaim_list_state, low_pct, int_pct, prof_pct, low_check, int_check, prof_check, hierarchy_warning])

if __name__ == "__main__":
    # Start the app only when this file is run as a script. share=True is
    # passed to launch() to request a shareable link (see Gradio's launch()
    # documentation for what that exposes).
    demo.launch(share=True)