readctrl / code /interface /old /label_thresold_v3.py
shahidul034's picture
Add files using upload-large-folder tool
9c6961c verified
import gradio as gr
import json
import os
import random
from datetime import datetime
# --- Configuration & Folder Setup ---
# Absolute locations of the two input JSON files and the root folder that
# receives one sub-folder of saved annotations per app start.
DATA_PATH = '/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json'
KEY_DATA_PATH = '/home/mshahidul/readctrl/data/key_subclaims_testing/key_subclaims.json'
BASE_SAVE_DIR = '/home/mshahidul/readctrl/data/thresold_finding/'

# 1. Create folder based on date+hour of app start
session_folder_name = datetime.now().strftime("%Y-%m-%d_%Hh")
SESSION_PATH = os.path.join(BASE_SAVE_DIR, session_folder_name)
os.makedirs(SESSION_PATH, exist_ok=True)

# --- Data Loading ---
# JSON is read explicitly as UTF-8 so the result does not depend on the
# machine's locale encoding.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Fixed seed: every app start draws the same NUM_SAMPLES records, in the same
# order, so all annotators see an identical sequence.
NUM_SAMPLES = 10
random.seed(42)
all_possible_indices = list(range(len(data)))
shuffled_indices = random.sample(all_possible_indices, min(NUM_SAMPLES, len(data)))

with open(KEY_DATA_PATH, 'r', encoding='utf-8') as f:
    key_data = json.load(f)

# Maps each entry's 'index' value to its 'llm_output' payload for O(1) lookup.
key_lookup = {item['index']: item['llm_output'] for item in key_data}
# --- Logic Functions ---
def get_key_indices(index, source_type):
    """Return the numeric suffixes of the key-subclaim IDs stored for a record.

    Args:
        index: Key into the module-level ``key_lookup`` mapping.
        source_type: ``"Full Original Text"`` selects the source-text key
            subclaims; any other value selects the gold-summary ones.

    Returns:
        A list of ints, one per entry whose ID ends in an integer after the
        last ``-`` (e.g. ``"x-3"`` -> ``3``). Entries with a missing,
        null, or non-numeric ID are skipped. An unknown ``index`` or a
        malformed lookup entry yields ``[]``.
    """
    entry = key_lookup.get(index)
    # The lookup payload comes from stored LLM output, so tolerate anything
    # that is not the expected mapping instead of crashing the UI callback.
    if not isinstance(entry, dict):
        return []

    if source_type == "Full Original Text":
        key_field, id_key = 'key_source_text_subclaims', "source_subclaim_id"
    else:
        key_field, id_key = 'key_gold_summary_subclaims', "gold_subclaim_id"

    indices = []
    # `or []` covers a key that is present but null.
    for item in entry.get(key_field) or []:
        if not isinstance(item, dict):
            continue
        raw_id = item.get(id_key, "")
        try:
            # str() lets integer IDs through; None / "" / text fail int()
            # with ValueError and are skipped.
            indices.append(int(str(raw_id).split('-')[-1]))
        except ValueError:
            continue
    return indices
def load_example(progress_index):
    """Build the UI values for the record at position ``progress_index``.

    Args:
        progress_index: Zero-based position in the module-level
            ``shuffled_indices`` sample.

    Returns:
        A 10-item list: header markdown, reference text, full subclaim list,
        three coverage labels (reset to ``"0%"``), three checkbox-group
        updates (choices plus pre-selected key subclaims), and an empty
        warning string. When ``progress_index`` is past the end of the
        sample, the same 10 slots carry a "session complete" view.
    """
    total = len(shuffled_indices)

    # Check if we've reached the end of our fixed sample size
    if progress_index >= total:
        return [
            gr.update(value="### 🎉 Session Complete!"),
            # Report the number of records actually sampled, which is smaller
            # than NUM_SAMPLES when the dataset itself is smaller.
            gr.update(value=f"You have finished your set of {total} records."),
            [], "0%", "0%", "0%",
            gr.update(choices=[], value=[]),
            gr.update(choices=[], value=[]),
            gr.update(choices=[], value=[]),
            "",
        ]

    # Get the actual index from our sample pool
    actual_data_index = shuffled_indices[progress_index]
    record = data[actual_data_index]

    # A private generator seeded by the record index gives the same choice as
    # seeding the global RNG would, without mutating process-wide random
    # state that other requests may be using at the same time.
    rng = random.Random(actual_data_index)
    source_type = rng.choice(["Full Original Text", "Gold Summary"])

    if source_type == "Full Original Text":
        text_content, subclaims = record['fulltext'], record['fulltext_subclaims']
    else:
        text_content, subclaims = record['summary'], record['summary_subclaims']

    source_info = f"### Instance: {progress_index + 1}/{total} | Source: **{source_type}**"

    # Pre-tick the subclaims flagged as "key"; out-of-range positions are ignored.
    key_indices = get_key_indices(actual_data_index, source_type)
    pre_selected = [subclaims[idx] for idx in key_indices if 0 <= idx < len(subclaims)]

    return [
        source_info, text_content, subclaims, "0%", "0%", "0%",
        gr.update(choices=subclaims, value=pre_selected),
        gr.update(choices=subclaims, value=pre_selected),
        gr.update(choices=subclaims, value=pre_selected),
        "",
    ]
def calc_pct_and_validate(low, inter, prof, total_list):
    """Compute per-tier coverage percentages and a hierarchy warning.

    Args:
        low, inter, prof: Selected subclaims for each literacy tier.
        total_list: Every subclaim available for the current record.

    Returns:
        A 4-tuple ``(low_pct, inter_pct, prof_pct, warning)``. The
        percentages are strings with one decimal place (``"0%"`` for all
        three when ``total_list`` is empty). ``warning`` is empty unless
        the percentages fail ``low <= inter <= prof``.
    """
    if not total_list:
        return "0%", "0%", "0%", ""

    denominator = len(total_list)
    shares = [len(selection) / denominator * 100 for selection in (low, inter, prof)]
    share_low, share_inter, share_prof = shares

    if share_low <= share_inter <= share_prof:
        message = ""
    else:
        message = "⚠️ **Hierarchy Warning:** Information density should be: Low ≤ Intermediate ≤ Proficient."

    formatted = tuple(f"{share:.1f}%" for share in shares)
    return formatted + (message,)
def _tier_summary(selected, subclaims):
    """Summarise one tier: count, chosen subclaims, and fraction of all subclaims."""
    return {
        "count": len(selected),
        "subclaims": selected,
        "pct": len(selected) / len(subclaims) if subclaims else 0,
    }


def _keep_current_view(progress_index, source_info, low_sel, int_sel, prof_sel, message):
    """Return outputs that leave the current record and selections on screen."""
    return [
        progress_index, source_info,
        gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(),
        gr.update(value=low_sel), gr.update(value=int_sel), gr.update(value=prof_sel),
        message,
    ]


def save_and_next(username, progress_index, source_info, low_sel, int_sel, prof_sel, subclaims):
    """Save the current annotation to disk and advance to the next record.

    Args:
        username: Annotator name typed into the UI; must be non-blank.
        progress_index: Sequence number (0, 1, 2...) into the module-level
            ``shuffled_indices`` sample.
        source_info: Header markdown currently shown; the source type is
            recovered from it.
        low_sel, int_sel, prof_sel: Subclaims ticked for each tier.
        subclaims: Every subclaim offered for the current record.

    Returns:
        An 11-item list: the (possibly advanced) progress index followed by
        the 10 values produced by ``load_example``. On a validation failure
        the current view and selections are kept and only the warning text
        changes.

    Raises:
        gr.Error: If the session folder cannot be created or the result file
            cannot be written. Nothing is advanced in that case.
    """
    # 1. Validation: Ensure we haven't exceeded the sample size
    if progress_index >= len(shuffled_indices):
        return [progress_index] + load_example(progress_index)

    # 2. Validation: Annotator Name
    if not username or not username.strip():
        gr.Warning("Action Required: Please enter your name before submitting!")
        return _keep_current_view(
            progress_index, source_info, low_sel, int_sel, prof_sel,
            "⚠️ **Error:** Please enter your name.",
        )

    # 3. Validation: Hierarchy Check (Low <= Intermediate <= Proficient)
    if not (len(low_sel) <= len(int_sel) <= len(prof_sel)):
        gr.Warning("DATA NOT SAVED! The selection does not follow the hierarchy: Low ≤ Intermediate ≤ Proficient.")
        return _keep_current_view(
            progress_index, source_info, low_sel, int_sel, prof_sel,
            "⚠️ **Error:** Selection sequence is invalid. Please adjust before saving.",
        )

    # 4. Map progress to the actual data index from the source JSON
    actual_data_index = shuffled_indices[progress_index]

    # 5. File System Management
    # gr.Error is an exception: it only reaches the user when raised. Raising
    # also leaves every output untouched, so the annotator's work stays put.
    try:
        os.makedirs(SESSION_PATH, exist_ok=True)
    except OSError as e:
        raise gr.Error(f"Critical Error: Could not create directory {SESSION_PATH}. Error: {e}") from e

    # 6. Prepare Metadata and Filename
    now = datetime.now()
    timestamp_str = now.strftime("%Y%m%d_%H%M%S")
    # Keep only alphanumerics so the name is safe to embed in a filename.
    safe_username = "".join(ch for ch in username if ch.isalnum())
    # Use actual_data_index so the file can be matched back to the master JSON
    filename = f"recordID{actual_data_index}_seq{progress_index}_{safe_username}_{timestamp_str}.json"
    file_path = os.path.join(SESSION_PATH, filename)
    stype = "Full Original Text" if "Full Original Text" in source_info else "Gold Summary"

    # 7. Construct Result Object
    result = {
        "annotator": username,
        "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"),
        "progress_sequence": progress_index,        # The order it was shown
        "original_data_index": actual_data_index,   # The real ID in the source JSON
        "source_type": stype,
        "annotations": {
            "low": _tier_summary(low_sel, subclaims),
            "intermediate": _tier_summary(int_sel, subclaims),
            "proficient": _tier_summary(prof_sel, subclaims),
        },
    }

    # 8. Write to Disk
    try:
        with open(file_path, 'w') as f:
            json.dump(result, f, indent=4)
    except OSError as e:
        raise gr.Error(f"Critical Error: Could not write {file_path}. Error: {e}") from e

    gr.Info(f"Success! Record {actual_data_index} saved (Item {progress_index + 1} of {len(shuffled_indices)}).")

    # 9. Return the NEXT progress index and its data
    return [progress_index + 1] + load_example(progress_index + 1)
# --- UI Definition ---
# NOTE(review): the widget nesting below is reconstructed from the statement
# order (left panel = name + reference text, right panel = three tier columns
# + submit button) - confirm against the intended layout.
with gr.Blocks(theme=gr.themes.Soft(), title="Medical Literacy Annotation Tool") as demo:
    # Per-session state: position in the sample and the current subclaim list.
    index_state = gr.State(0)
    subclaim_list_state = gr.State([])

    # The instructions file is optional; fall back to a bare title when it is
    # missing or unreadable. Only I/O and decoding failures are tolerated, so
    # interrupts and programming errors still surface.
    try:
        with open("/home/mshahidul/readctrl/code/interface/instructions", "r", encoding="utf-8") as f:
            instructions_text = f.read()
    except (OSError, UnicodeError):
        instructions_text = "# Medical Annotation Task"
    gr.Markdown(instructions_text)

    with gr.Row():
        with gr.Column(scale=1, variant="panel"):
            user_input = gr.Textbox(label="Annotator Name", placeholder="e.g., mshahidul", interactive=True)
            gr.HTML("<hr>")
            source_display = gr.Markdown("### Initializing...")
            text_viewer = gr.Textbox(label="Reference Text", interactive=False, lines=15)

        with gr.Column(scale=2):
            hierarchy_warning = gr.Markdown(value="", visible=True)
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 🟢 Low")
                    low_pct = gr.Label(label="Coverage", value="0%")
                    low_check = gr.CheckboxGroup(label="Subclaims", choices=[])
                with gr.Column():
                    gr.Markdown("### 🟡 Intermediate")
                    int_pct = gr.Label(label="Coverage", value="0%")
                    int_check = gr.CheckboxGroup(label="Subclaims", choices=[])
                with gr.Column():
                    gr.Markdown("### 🔴 Proficient")
                    prof_pct = gr.Label(label="Coverage", value="0%")
                    prof_check = gr.CheckboxGroup(label="Subclaims", choices=[])
            submit_btn = gr.Button("Submit & Next Record", variant="primary", size="lg")

    # Event Handlers
    # Populate the first record when the page loads.
    demo.load(
        load_example,
        [index_state],
        [source_display, text_viewer, subclaim_list_state, low_pct, int_pct, prof_pct,
         low_check, int_check, prof_check, hierarchy_warning],
    )

    # Any change to a tier's selection recomputes all three coverage labels
    # and the hierarchy warning.
    for check_sys in [low_check, int_check, prof_check]:
        check_sys.change(
            calc_pct_and_validate,
            [low_check, int_check, prof_check, subclaim_list_state],
            [low_pct, int_pct, prof_pct, hierarchy_warning],
        )

    # Output order must match the 11-item list returned by save_and_next.
    submit_btn.click(
        save_and_next,
        [user_input, index_state, source_display, low_check, int_check, prof_check, subclaim_list_state],
        [index_state, source_display, text_viewer, subclaim_list_state, low_pct, int_pct, prof_pct,
         low_check, int_check, prof_check, hierarchy_warning],
    )

if __name__ == "__main__":
    demo.launch(share=True)