import gradio as gr
import json
import os
from datetime import datetime
# --- PATH CONFIGURATION ---
# Alternative input file, kept for reference and currently disabled:
# DATA_PATH = "/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_with_gs_summary_en_0_20.json"
# JSON dataset that is read once at import time to build the annotation queue.
DATA_PATH = "/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_bn_0_80.json"
# Root output directory; get_user_dir() places each annotator's folder under it.
SAVE_ROOT = "/home/mshahidul/readctrl/data/annotators_validate_data_Bangla_(0_80)"
# Create the output root up front; exist_ok=True makes repeated launches harmless.
os.makedirs(SAVE_ROOT, exist_ok=True)
# --- UI HTML COMPONENTS ---
# Both strings are rendered through gr.HTML, where plain newlines and pipe
# characters carry no layout meaning, so the content is expressed as real HTML
# markup. The wording of every heading, row and example is unchanged.
GUIDE_HTML = """
<div>
  <h3>Rating Guide: Medical Text Difficulty</h3>
  <table>
    <thead>
      <tr><th>Score</th><th>Description</th></tr>
    </thead>
    <tbody>
      <tr><td>1</td><td>Very Easy: Simple words, no medical jargon.</td></tr>
      <tr><td>2</td><td>Easy: Conversational medical terms.</td></tr>
      <tr><td>3</td><td>Moderate: Standard patient education material.</td></tr>
      <tr><td>4</td><td>Hard: Significant technical jargon.</td></tr>
      <tr><td>5</td><td>Very Hard: Specialist-level / Academic.</td></tr>
    </tbody>
  </table>
</div>
"""
EXAMPLES_HTML = """
<div>
  <h3>Reference Examples</h3>
  <p><b>Level 1-2</b><br>
  "She had a kidney problem... a big blood clot blocked veins in her brain."</p>
  <p><b>Level 4-5</b><br>
  "Idiopathic NS inaugurated by cerebral venous thrombosis extended to the right jugular vein."</p>
</div>
"""
def parse_diff_label_texts(raw_value):
    """Coerce a ``diff_label_texts`` value into a dict.

    Accepted inputs:
    - dict: returned unchanged.
    - JSON string encoding an object.
    - Python-dict-like string (e.g. single-quoted keys and values).

    Anything else (non-string values, blank strings, strings that do not
    parse, or strings that parse to a non-dict) yields an empty dict, so the
    caller can always iterate ``.items()`` on the result.
    """
    if isinstance(raw_value, dict):
        return raw_value
    if not isinstance(raw_value, str):
        return {}

    text = raw_value.strip()
    if not text:
        return {}

    # Prefer strict JSON first; fall back to Python literal parsing.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # literal_eval raises more than ValueError/SyntaxError on
            # malformed input (e.g. TypeError for an unhashable dict key);
            # all of them mean "not a usable dict" here.
            return {}

    return parsed if isinstance(parsed, dict) else {}
import ast
# --- DATA LOADING ---
def normalize_dataset(raw_dataset):
    """Flatten the raw dataset into the annotation queue.

    Every entry's ``diff_label_texts`` mapping is expanded into one record per
    label. A value that is not already a dict is first passed through
    ``parse_diff_label_texts``. Each record carries the keys ``index``, ``id``,
    ``label`` and ``generated_summary``; ``index`` and ``id`` are copied from
    the entry (``None`` when absent).
    """
    flat_queue = []
    for entry in raw_dataset:
        label_texts = entry.get("diff_label_texts")
        if not isinstance(label_texts, dict):
            # String (or otherwise non-dict) payloads go through the parser.
            label_texts = parse_diff_label_texts(label_texts)
        flat_queue.extend(
            {
                "index": entry.get("index"),
                "id": entry.get("id"),
                "label": label,
                "generated_summary": summary,
            }
            for label, summary in label_texts.items()
        )
    return flat_queue
# Load and flatten the dataset once at import time. Validation uses explicit
# exceptions rather than assert statements, because asserts are removed when
# Python runs with -O and the failure would then surface later as a NameError.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Data file not found at {DATA_PATH}")
with open(DATA_PATH, "r", encoding="utf-8") as f:
    RAW_DATASET = json.load(f)
FULL_DATASET = normalize_dataset(RAW_DATASET)
print(len(FULL_DATASET))
if not FULL_DATASET:
    raise ValueError(f"No valid items found in dataset: {DATA_PATH}")
# --- PERSISTENCE HELPERS ---
def get_user_dir(username):
    """Return the directory under SAVE_ROOT that belongs to *username*.

    Only alphanumeric characters, spaces, underscores and hyphens are kept;
    the result is stripped of surrounding whitespace, and "anonymous" is used
    when nothing remains.
    """
    allowed_extras = (' ', '_', '-')
    kept_chars = [ch for ch in username if ch.isalnum() or ch in allowed_extras]
    folder_name = "".join(kept_chars).strip()
    if not folder_name:
        folder_name = "anonymous"
    return os.path.join(SAVE_ROOT, folder_name)
def save_state(user_dir, state_dict):
    """Write *state_dict* as indented JSON to ``state.json`` inside *user_dir*.

    Any existing file is overwritten.
    """
    state_file = os.path.join(user_dir, "state.json")
    with open(state_file, "w") as handle:
        json.dump(state_dict, handle, indent=4)
def load_state(user_dir):
    """Return the parsed contents of ``state.json`` in *user_dir*.

    Returns ``None`` when the file does not exist.
    """
    state_file = os.path.join(user_dir, "state.json")
    if not os.path.exists(state_file):
        return None
    with open(state_file, "r") as handle:
        return json.load(handle)
# --- LOGIC FUNCTIONS ---
def get_current_ui_values(state):
    """Return ``(text, progress, rating)`` for the item at the current index.

    ``rating`` is the value already stored in ``state['results']`` for this
    queue position, or 3 when the item has not been rated yet.
    """
    position = state['current_index']
    queue = state['queue']
    item = queue[position]

    # First stored rating for this position wins; fall back to the default 3.
    rating = next(
        (entry['rating'] for entry in state['results']
         if entry['queue_position'] == position),
        3,
    )

    progress_text = f"Item {position + 1} of {len(queue)}"
    return item['generated_summary'], progress_text, rating
def start_session(username):
    """Start a new annotation session for *username*, or resume a saved one.

    If a non-empty saved state exists in the user's directory it is reused;
    otherwise a fresh state covering the full dataset is created and saved.

    Returns six values, one per output wired to the start button: an update
    hiding the intro box, an update showing the task box, the text to display,
    the progress string, the slider value, and the session state.
    """
    if not username:
        gr.Warning("Please enter a username!")
        # Six outputs are wired to this handler, so six values must come back:
        # the five visible components are left untouched, and the session
        # state stays None because no session has been created yet.
        return [gr.update()] * 5 + [None]

    user_dir = get_user_dir(username)
    os.makedirs(user_dir, exist_ok=True)

    state = load_state(user_dir)
    if state:
        gr.Info(f"Welcome back! Resuming from item {state['current_index'] + 1}.")
    else:
        state = {
            "username": username,
            "current_index": 0,
            "queue": list(FULL_DATASET),
            "results": [],
            "completed": False
        }
        save_state(user_dir, state)

    text, progress, rating = get_current_ui_values(state)
    return (gr.update(visible=False), gr.update(visible=True), text, progress, rating, state)
def submit_rating(doc_slider, state):
    """Record the rating for the current item and move to the next one.

    The rating replaces any earlier rating stored for the same queue position.
    After every submission, including the final one, both ``state.json`` and
    ``annotation_results.json`` in the user's directory are rewritten.

    Returns four values, one per output wired to the submit button: the text
    to display, the progress string, the slider value, and the session state.
    """
    if state is None:
        # Four outputs are wired to this handler, so return exactly four values.
        return "", "Error", 3, None

    user_dir = get_user_dir(state['username'])
    idx = state['current_index']
    current_item = state['queue'][idx]

    new_result = {
        "queue_position": idx,
        "index": current_item.get('index', idx),
        "doc_id": current_item.get('id', current_item.get('index', 'no_id')),
        "label": current_item.get('label', 'no_label'),
        "rating": doc_slider,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    # Overwrite any existing rating for this position and keep results ordered.
    results = [r for r in state['results'] if r['queue_position'] != idx]
    results.append(new_result)
    results.sort(key=lambda r: r['queue_position'])
    state['results'] = results

    has_next = idx + 1 < len(state['queue'])
    if has_next:
        state['current_index'] += 1
    else:
        state['completed'] = True

    save_state(user_dir, state)
    # Written on every submission so the final item's rating also reaches the
    # results file, not only state.json.
    with open(os.path.join(user_dir, "annotation_results.json"), "w") as f:
        json.dump(state['results'], f, indent=4)

    if has_next:
        text, progress, rating = get_current_ui_values(state)
        return text, progress, rating, state

    # Keep the slider on the rating just submitted: current_index still points
    # at the last item, so a repeated click re-submits the same value instead
    # of overwriting it with a reset slider value.
    return "✅ ALL TASKS COMPLETED", "Status: Finished", doc_slider, state
def go_back(state):
    """Step back to the previous item in the queue.

    With no session, or when already on the first item, a warning is shown and
    the three display components are left unchanged. Otherwise the current
    index is decremented and the display values for that item are returned,
    followed by the state.
    """
    cannot_go_back = state is None or state['current_index'] <= 0
    if cannot_go_back:
        gr.Warning("Already at the first item.")
        no_change = gr.update()
        return [no_change, no_change, no_change, state]

    state['current_index'] -= 1
    return (*get_current_ui_values(state), state)
# --- UI INTERFACE ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Per-browser-session state object handed to and returned by the handlers.
    session_state = gr.State()
    gr.Markdown("# Medical Text Readability Annotation")

    # Collapsed reference material for annotators.
    with gr.Accordion("Instructions & Calibration", open=False):
        gr.HTML(GUIDE_HTML)
        gr.HTML(EXAMPLES_HTML)

    # Login panel: shown first, hidden by start_session on success.
    with gr.Column(visible=True) as intro_box:
        username_input = gr.Textbox(
            label="Enter Your Name/ID",
            placeholder="e.g., user_101",
        )
        btn_start = gr.Button("Start / Resume Annotation", variant="primary")

    # Annotation panel: hidden until a session has been started.
    with gr.Column(visible=False) as task_box:
        progress_label = gr.Label(label="Overall Progress")
        doc_display = gr.Textbox(interactive=False, lines=12, label="Medical Text")
        doc_slider = gr.Slider(
            1,
            5,
            step=1,
            label="Difficulty (1=Easy, 5=Hard)",
            value=3,
        )
        with gr.Row():
            btn_prev = gr.Button("⬅️ Previous", variant="secondary")
            btn_submit = gr.Button("Submit & Next ➡️", variant="primary")

    # --- EVENT HANDLERS ---
    # Components refreshed by both navigation handlers.
    item_outputs = [doc_display, progress_label, doc_slider, session_state]

    btn_start.click(
        fn=start_session,
        inputs=[username_input],
        outputs=[intro_box, task_box] + item_outputs,
    )
    btn_submit.click(
        fn=submit_rating,
        inputs=[doc_slider, session_state],
        outputs=item_outputs,
    )
    btn_prev.click(
        fn=go_back,
        inputs=[session_state],
        outputs=item_outputs,
    )

if __name__ == "__main__":
    demo.launch(share=True)