readctrl / code /interface /old /label_thresold_v3.py

Add files using upload-large-folder tool

9c6961c verified 28 days ago

9.99 kB

	import gradio as gr
	import json
	import os
	import random
	from datetime import datetime

	# --- Configuration & Folder Setup ---
	# Absolute locations of the input JSON files and of the root folder that
	# receives the saved annotations.
	DATA_PATH = '/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json'
	KEY_DATA_PATH = '/home/mshahidul/readctrl/data/key_subclaims_testing/key_subclaims.json'
	BASE_SAVE_DIR = '/home/mshahidul/readctrl/data/thresold_finding/'

	# Every app start gets its own output folder named after the current date
	# and hour; the folder is created right away if it does not exist yet.
	session_folder_name = f"{datetime.now():%Y-%m-%d_%Hh}"
	SESSION_PATH = os.path.join(BASE_SAVE_DIR, session_folder_name)
	os.makedirs(SESSION_PATH, exist_ok=True)

	# --- Data Loading ---
	# The JSON inputs are decoded as UTF-8 explicitly so that reading them does
	# not depend on the locale of the machine running the app.
	with open(DATA_PATH, 'r', encoding='utf-8') as f:
	    data = json.load(f)

	# Number of records shown to an annotator in one session.
	NUM_SAMPLES = 10

	# A fixed seed makes the drawn sample reproducible for a given record count.
	random.seed(42)
	all_possible_indices = list(range(len(data)))
	shuffled_indices = random.sample(all_possible_indices, min(NUM_SAMPLES, len(data)))

	with open(KEY_DATA_PATH, 'r', encoding='utf-8') as f:
	    key_data = json.load(f)

	# Maps each entry's 'index' field to its 'llm_output' payload.
	key_lookup = {item['index']: item['llm_output'] for item in key_data}

	# --- Logic Functions ---
	def get_key_indices(index, source_type):
	    """Return the positional indices of the key subclaims of one record.

	    The record is looked up in the module-level ``key_lookup`` mapping. Each
	    listed item carries an id whose text after the last ``-`` is parsed as
	    an integer.

	    Args:
	        index: Key of the record inside ``key_lookup``.
	        source_type: ``"Full Original Text"`` selects the source-text key
	            subclaims; any other value selects the gold-summary ones.

	    Returns:
	        A list of ints in item order. An unknown ``index``, a payload that
	        is not a mapping, or a missing/null list all yield ``[]``. Items
	        that are not mappings or whose id does not end in an integer are
	        skipped.
	    """
	    if index not in key_lookup:
	        return []

	    if source_type == "Full Original Text":
	        key_field, id_key = 'key_source_text_subclaims', "source_subclaim_id"
	    else:
	        key_field, id_key = 'key_gold_summary_subclaims', "gold_subclaim_id"

	    entry = key_lookup[index]
	    if not isinstance(entry, dict):
	        # The payload is not the expected mapping; nothing to pre-select.
	        return []

	    # ``or []`` also covers a field that is present but null.
	    key_items = entry.get(key_field) or []

	    indices = []
	    for item in key_items:
	        if not isinstance(item, dict):
	            continue
	        raw_id = item.get(id_key, "")
	        try:
	            # str() lets a bare integer id through instead of failing on
	            # the missing ``split`` attribute.
	            indices.append(int(str(raw_id).rsplit('-', 1)[-1]))
	        except ValueError:
	            continue
	    return indices

	def load_example(progress_index):
	    """Build the UI values for the record at position ``progress_index``.

	    Args:
	        progress_index: Position inside the module-level ``shuffled_indices``
	            sample (0-based).

	    Returns:
	        A list of ten values, in this order: header markdown, reference
	        text, list of subclaims, three coverage labels, three checkbox-group
	        updates and the warning text. Once ``progress_index`` is past the
	        end of the sample, a "session complete" variant with empty choices
	        is returned instead.
	    """
	    # Past the end of the fixed sample: show the completion message.
	    if progress_index >= len(shuffled_indices):
	        return [
	            gr.update(value="### 🎉 Session Complete!"),
	            gr.update(value=f"You have finished your set of {NUM_SAMPLES} records."),
	            [], "0%", "0%", "0%", gr.update(choices=[], value=[]),
	            gr.update(choices=[], value=[]), gr.update(choices=[], value=[]), ""
	        ]

	    # Translate the position in the sample into the real record index.
	    actual_data_index = shuffled_indices[progress_index]
	    record = data[actual_data_index]

	    # A private generator seeded with the record index gives the same
	    # per-record choice as reseeding the module-level generator would, but
	    # leaves the shared random state untouched.
	    source_type = random.Random(actual_data_index).choice(
	        ["Full Original Text", "Gold Summary"]
	    )

	    if source_type == "Full Original Text":
	        text_content, subclaims = record['fulltext'], record['fulltext_subclaims']
	    else:
	        text_content, subclaims = record['summary'], record['summary_subclaims']

	    # The doubled backslash emits a backslash followed by a pipe, i.e. the
	    # same characters as before, without using an unrecognized escape.
	    source_info = (
	        f"### Instance: {progress_index + 1}/{len(shuffled_indices)} "
	        f"\\| Source: {source_type}"
	    )
	    key_indices = get_key_indices(actual_data_index, source_type)

	    # Out-of-range key indices are ignored rather than raising.
	    pre_selected = [subclaims[idx] for idx in key_indices if 0 <= idx < len(subclaims)]

	    return [
	        source_info, text_content, subclaims, "0%", "0%", "0%",
	        gr.update(choices=subclaims, value=pre_selected),
	        gr.update(choices=subclaims, value=pre_selected),
	        gr.update(choices=subclaims, value=pre_selected),
	        ""
	    ]

	def calc_pct_and_validate(low, inter, prof, total_list):
	    """Compute the coverage of each tier and check their ordering.

	    Args:
	        low: Items selected for the low tier.
	        inter: Items selected for the intermediate tier.
	        prof: Items selected for the proficient tier.
	        total_list: All available subclaims.

	    Returns:
	        A 4-tuple of three percentage labels (one decimal place) and a
	        warning string. The warning is empty unless the percentages break
	        the order low <= intermediate <= proficient. With an empty or
	        missing ``total_list`` the result is ``("0%", "0%", "0%", "")``.
	    """
	    if not total_list:
	        return "0%", "0%", "0%", ""

	    denominator = len(total_list)
	    low_share = len(low) / denominator * 100
	    mid_share = len(inter) / denominator * 100
	    high_share = len(prof) / denominator * 100

	    in_order = low_share <= mid_share and mid_share <= high_share
	    if in_order:
	        message = ""
	    else:
	        message = "⚠️ Hierarchy Warning: Information density should be: Low ≤ Intermediate ≤ Proficient."

	    labels = [f"{share:.1f}%" for share in (low_share, mid_share, high_share)]
	    return (*labels, message)

	def _tier_summary(selected, subclaims):
	    """Return count, selected items and coverage ratio for one tier."""
	    return {
	        "count": len(selected),
	        "subclaims": selected,
	        "pct": len(selected) / len(subclaims) if subclaims else 0,
	    }


	def _stay_on_record(progress_index, source_info, low_sel, int_sel, prof_sel, message):
	    """Return outputs that keep the current record and selections on screen."""
	    return [
	        progress_index, source_info,
	        gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(),
	        gr.update(value=low_sel), gr.update(value=int_sel), gr.update(value=prof_sel),
	        message,
	    ]


	def save_and_next(username, progress_index, source_info, low_sel, int_sel, prof_sel, subclaims):
	    """Save the current annotation and advance to the next sampled record.

	    Args:
	        username: Annotator name typed into the UI; must not be blank.
	        progress_index: Position (0, 1, 2, ...) inside the module-level
	            ``shuffled_indices`` sample.
	        source_info: Header markdown of the current record; it is searched
	            for "Full Original Text" to recover the source type.
	        low_sel: Subclaims selected for the low tier.
	        int_sel: Subclaims selected for the intermediate tier.
	        prof_sel: Subclaims selected for the proficient tier.
	        subclaims: All subclaims of the current record.

	    Returns:
	        A list of eleven values: the progress index to store, followed by
	        the ten values described for ``load_example``. On a validation
	        failure nothing is saved and the current record stays on screen.

	    Raises:
	        gr.Error: If the session folder cannot be created or the result
	            file cannot be written.
	    """
	    # 1. Nothing left to annotate: just show the completion screen again.
	    if progress_index >= len(shuffled_indices):
	        return [progress_index] + load_example(progress_index)

	    # 2. An annotator name is mandatory.
	    if not username or username.strip() == "":
	        gr.Warning("Action Required: Please enter your name before submitting!")
	        return _stay_on_record(
	            progress_index, source_info, low_sel, int_sel, prof_sel,
	            "⚠️ Error: Please enter your name.",
	        )

	    # 3. Selection sizes must follow Low <= Intermediate <= Proficient.
	    if not (len(low_sel) <= len(int_sel) <= len(prof_sel)):
	        gr.Warning("DATA NOT SAVED! The selection does not follow the hierarchy: Low ≤ Intermediate ≤ Proficient.")
	        return _stay_on_record(
	            progress_index, source_info, low_sel, int_sel, prof_sel,
	            "⚠️ Error: Selection sequence is invalid. Please adjust before saving.",
	        )

	    # 4. Translate the position in the sample into the real record index.
	    actual_data_index = shuffled_indices[progress_index]

	    # 5. Make sure the session folder exists. gr.Error only reaches the
	    #    user when it is raised, so it is raised rather than just built.
	    try:
	        os.makedirs(SESSION_PATH, exist_ok=True)
	    except OSError as e:
	        raise gr.Error(
	            f"Critical Error: Could not create directory {SESSION_PATH}. Error: {e}"
	        ) from e

	    # 6. File name: record id, sequence number, annotator and timestamp.
	    now = datetime.now()
	    timestamp_str = now.strftime("%Y%m%d_%H%M%S")
	    # Keep only alphanumerics; fall back to a placeholder so the name part
	    # is never empty (e.g. for a username made only of punctuation).
	    safe_username = "".join(x for x in username if x.isalnum()) or "anonymous"

	    filename = f"recordID{actual_data_index}_seq{progress_index}_{safe_username}_{timestamp_str}.json"
	    file_path = os.path.join(SESSION_PATH, filename)

	    stype = "Full Original Text" if "Full Original Text" in source_info else "Gold Summary"

	    # 7. Result object written to disk.
	    result = {
	        "annotator": username,
	        "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"),
	        "progress_sequence": progress_index,  # order in which it was shown
	        "original_data_index": actual_data_index,  # index in the source JSON
	        "source_type": stype,
	        "annotations": {
	            "low": _tier_summary(low_sel, subclaims),
	            "intermediate": _tier_summary(int_sel, subclaims),
	            "proficient": _tier_summary(prof_sel, subclaims),
	        },
	    }

	    # 8. Write to disk; a failure is reported instead of advancing.
	    try:
	        with open(file_path, 'w', encoding='utf-8') as f:
	            json.dump(result, f, indent=4)
	    except OSError as e:
	        raise gr.Error(
	            f"Critical Error: Could not write {file_path}. Error: {e}"
	        ) from e

	    gr.Info(f"Success! Record {actual_data_index} saved (Item {progress_index + 1} of {len(shuffled_indices)}).")

	    # 9. Hand back the next position together with its data.
	    return [progress_index + 1] + load_example(progress_index + 1)

	# --- UI Definition ---
	# NOTE(review): the nesting below is reconstructed from the order of the
	# calls; in particular the submit button is assumed to sit in the right-hand
	# column underneath the three tiers - confirm against the running app.
	with gr.Blocks(theme=gr.themes.Soft(), title="Medical Literacy Annotation Tool") as demo:
	    # Per-session state: position in the sample and the current subclaims.
	    index_state = gr.State(0)
	    subclaim_list_state = gr.State([])

	    # The instructions are optional: fall back to a plain title when the
	    # file is missing or unreadable, but let unrelated errors surface.
	    try:
	        with open("/home/mshahidul/readctrl/code/interface/instructions", "r", encoding="utf-8") as f:
	            instructions_text = f.read()
	    except (OSError, UnicodeDecodeError):
	        instructions_text = "# Medical Annotation Task"

	    gr.Markdown(instructions_text)

	    with gr.Row():
	        with gr.Column(scale=1, variant="panel"):
	            user_input = gr.Textbox(label="Annotator Name", placeholder="e.g., mshahidul", interactive=True)
	            gr.HTML("<hr>")
	            source_display = gr.Markdown("### Initializing...")
	            text_viewer = gr.Textbox(label="Reference Text", interactive=False, lines=15)

	        with gr.Column(scale=2):
	            hierarchy_warning = gr.Markdown(value="", visible=True)

	            with gr.Row():
	                with gr.Column():
	                    gr.Markdown("### 🟢 Low")
	                    low_pct = gr.Label(label="Coverage", value="0%")
	                    low_check = gr.CheckboxGroup(label="Subclaims", choices=[])

	                with gr.Column():
	                    gr.Markdown("### 🟡 Intermediate")
	                    int_pct = gr.Label(label="Coverage", value="0%")
	                    int_check = gr.CheckboxGroup(label="Subclaims", choices=[])

	                with gr.Column():
	                    gr.Markdown("### 🔴 Proficient")
	                    prof_pct = gr.Label(label="Coverage", value="0%")
	                    prof_check = gr.CheckboxGroup(label="Subclaims", choices=[])

	            submit_btn = gr.Button("Submit & Next Record", variant="primary", size="lg")

	    # Event Handlers
	    # Fill the page with the first record when the app is opened.
	    demo.load(load_example, [index_state], [source_display, text_viewer, subclaim_list_state, low_pct, int_pct, prof_pct, low_check, int_check, prof_check, hierarchy_warning])

	    # Any change to a tier refreshes all three percentages and the warning.
	    for check_sys in [low_check, int_check, prof_check]:
	        check_sys.change(calc_pct_and_validate, [low_check, int_check, prof_check, subclaim_list_state], [low_pct, int_pct, prof_pct, hierarchy_warning])

	    submit_btn.click(save_and_next, [user_input, index_state, source_display, low_check, int_check, prof_check, subclaim_list_state], [index_state, source_display, text_viewer, subclaim_list_state, low_pct, int_pct, prof_pct, low_check, int_check, prof_check, hierarchy_warning])

	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch(share=True)