File size: 9,693 Bytes
1db7196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
import gradio as gr
import json
import os
from datetime import datetime

# --- PATH CONFIGURATION ---
# Alternate dataset path, kept commented out for reference:
# DATA_PATH = "/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_with_gs_summary_en_0_20.json"
# JSON dataset that is loaded and flattened into FULL_DATASET at import time.
DATA_PATH = "/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_bn_0_80.json"
# Root directory for annotator output; get_user_dir() places one
# sub-directory per username underneath it.
SAVE_ROOT = "/home/mshahidul/readctrl/data/annotators_validate_data_Bangla_(0_80)"
# Create the output root up front (no error if it already exists).
os.makedirs(SAVE_ROOT, exist_ok=True)

# --- UI HTML COMPONENTS (Kept same as original) ---
# Rating rubric (scores 1-5) rendered with gr.HTML inside the
# "Instructions & Calibration" accordion.
GUIDE_HTML = """
<div style="background-color: #f9f9f9; padding: 15px; border-left: 6px solid #4CAF50; border-radius: 4px; margin-bottom: 20px;">
    <h3>Rating Guide: Medical Text Difficulty</h3>
    <table style="width:100%; border-collapse: collapse; text-align: left;">
        <tr style="background-color: #e8f5e9;">
            <th style="padding: 8px; border: 1px solid #ddd;">Score</th>
            <th style="padding: 8px; border: 1px solid #ddd;">Description</th>
        </tr>
        <tr><td><b>1</b></td><td><b>Very Easy:</b> Simple words, no medical jargon.</td></tr>
        <tr><td><b>2</b></td><td><b>Easy:</b> Conversational medical terms.</td></tr>
        <tr><td><b>3</b></td><td><b>Moderate:</b> Standard patient education material.</td></tr>
        <tr><td><b>4</b></td><td><b>Hard:</b> Significant technical jargon.</td></tr>
        <tr><td><b>5</b></td><td><b>Very Hard:</b> Specialist-level / Academic.</td></tr>
    </table>
</div>
"""

# Two side-by-side reference snippets (a "Level 1-2" and a "Level 4-5"
# example) rendered with gr.HTML below the rating guide.
EXAMPLES_HTML = """
<div style="background-color: #ffffff; padding: 15px; border: 1px solid #ddd; border-radius: 4px;">
    <h3 style="color: #2e7d32;">Reference Examples</h3>
    <div style="display: flex; gap: 15px;">
        <div style="flex: 1; background-color: #f1f8e9; padding: 10px; border-radius: 4px;">
            <h4>Level 1-2</h4>
            <p>"She had a kidney problem... a big blood clot blocked veins in her brain."</p>
        </div>
        <div style="flex: 1; background-color: #ffebee; padding: 10px; border-radius: 4px;">
            <h4>Level 4-5</h4>
            <p>"Idiopathic NS inaugurated by cerebral venous thrombosis extended to the right jugular vein."</p>
        </div>
    </div>
</div>
"""
def parse_diff_label_texts(raw_value):
    """
    Coerce a ``diff_label_texts`` value into a dict.

    Accepted inputs:
    - dict (returned unchanged, same object)
    - JSON object string
    - Python-dict-literal string (e.g. single-quoted keys/values)

    Every other input -- non-string values, blank strings, strings that parse
    to something other than a dict, and strings that cannot be parsed at
    all -- yields an empty dict. The function never raises on malformed text.
    """
    if isinstance(raw_value, dict):
        return raw_value

    if not isinstance(raw_value, str):
        return {}

    text = raw_value.strip()
    if not text:
        return {}

    # Prefer strict JSON first; fall back to Python literal parsing.
    try:
        parsed = json.loads(text)
        return parsed if isinstance(parsed, dict) else {}
    except json.JSONDecodeError:
        pass

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed
        # input; e.g. "{[1]: 2}" raises TypeError (unhashable dict key).
        return {}
    return parsed if isinstance(parsed, dict) else {}
import ast
# --- DATA LOADING ---
def normalize_dataset(raw_dataset):
    """
    Flatten the raw dataset into a queue of single-text records.

    Each input item contributes one record per entry of its
    ``diff_label_texts`` mapping. A record has the keys ``index``, ``id``,
    ``label`` and ``generated_summary``; ``index`` and ``id`` are copied from
    the item (``None`` when absent). When ``diff_label_texts`` is not already
    a dict it is passed through ``parse_diff_label_texts`` first.
    """
    flat_queue = []

    for entry in raw_dataset:
        label_texts = entry.get("diff_label_texts")
        if not isinstance(label_texts, dict):
            # String-encoded (or missing) mapping: parse it into a dict.
            label_texts = parse_diff_label_texts(label_texts)

        flat_queue.extend(
            {
                "index": entry.get("index"),
                "id": entry.get("id"),
                "label": level,
                "generated_summary": summary,
            }
            for level, summary in label_texts.items()
        )

    return flat_queue


# Load the dataset once at import time; the app cannot run without it.
# Explicit raises are used instead of `assert` so the checks still run when
# Python is started with -O (which strips assert statements).
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Data file not found at {DATA_PATH}")

with open(DATA_PATH, "r", encoding="utf-8") as f:
    RAW_DATASET = json.load(f)

# Flat queue of {index, id, label, generated_summary} records.
FULL_DATASET = normalize_dataset(RAW_DATASET)
print(len(FULL_DATASET))
if not FULL_DATASET:
    raise ValueError(f"No valid items found in dataset: {DATA_PATH}")

# --- PERSISTENCE HELPERS ---
def get_user_dir(username):
    """Return the per-user directory path under SAVE_ROOT.

    Only alphanumeric characters, spaces, underscores and hyphens from
    ``username`` are kept; the result is stripped of surrounding whitespace
    and falls back to "anonymous" when nothing remains.
    """
    extra_allowed = (' ', '_', '-')
    kept_chars = [ch for ch in username if ch.isalnum() or ch in extra_allowed]
    folder_name = "".join(kept_chars).strip()
    if not folder_name:
        folder_name = "anonymous"
    return os.path.join(SAVE_ROOT, folder_name)

def save_state(user_dir, state_dict):
    """Serialize ``state_dict`` to ``<user_dir>/state.json`` (4-space indented JSON)."""
    state_file = os.path.join(user_dir, "state.json")
    with open(state_file, "w") as handle:
        json.dump(state_dict, handle, indent=4)

def load_state(user_dir):
    """Return the dict stored in ``<user_dir>/state.json``, or None if the file does not exist."""
    state_file = os.path.join(user_dir, "state.json")
    if not os.path.exists(state_file):
        return None
    with open(state_file, "r") as handle:
        return json.load(handle)

# --- LOGIC FUNCTIONS ---
def get_current_ui_values(state):
    """Return ``(text, progress, rating)`` for the item at ``state['current_index']``.

    ``rating`` is the rating already stored in ``state['results']`` for this
    queue position when one exists, otherwise the default of 3.
    """
    position = state['current_index']
    queue = state['queue']
    item = queue[position]

    # The first stored result for this position wins; default to 3 otherwise.
    saved_rating = next(
        (
            entry['rating']
            for entry in state['results']
            if entry['queue_position'] == position
        ),
        3,
    )

    progress_text = f"Item {position + 1} of {len(queue)}"
    return item['generated_summary'], progress_text, saved_rating

def start_session(username):
    """Start a new annotation session for ``username`` or resume a saved one.

    Returns six values, one per output wired to ``btn_start.click``:
    (intro_box update, task_box update, text, progress, rating, state).

    When ``username`` is empty a warning is shown and nothing changes. If a
    ``state.json`` already exists in the user's directory that state is
    reused; otherwise a fresh state covering all of FULL_DATASET is created
    and saved.
    """
    if not username:
        gr.Warning("Please enter a username!")
        # The handler has six outputs: leave the five visible components
        # untouched and keep the session state empty. (Returning only five
        # values here would not match the number of outputs.)
        return [gr.update()] * 5 + [None]

    user_dir = get_user_dir(username)
    os.makedirs(user_dir, exist_ok=True)
    existing_state = load_state(user_dir)

    if existing_state:
        gr.Info(f"Welcome back! Resuming from item {existing_state['current_index'] + 1}.")
        state = existing_state
    else:
        state = {
            "username": username,
            "current_index": 0,
            "queue": list(FULL_DATASET),  # copy so the module-level list is not shared
            "results": [],
            "completed": False
        }
        save_state(user_dir, state)

    text, progress, rating = get_current_ui_values(state)
    # Hide the intro box, reveal the task box, and populate the task widgets.
    return (gr.update(visible=False), gr.update(visible=True), text, progress, rating, state)

def submit_rating(doc_slider, state):
    """Record the rating for the current item and advance to the next one.

    Args:
        doc_slider: Rating value taken from the difficulty slider.
        state: Session state dict, or None if no session was started.

    Returns:
        Four values, one per output wired to ``btn_submit.click``:
        (text, progress, rating, state).

    A rating for a queue position that was already rated replaces the old
    entry. After every submission, including the final one, both
    ``state.json`` and ``annotation_results.json`` are rewritten in the
    user's directory.
    """
    if state is None:
        # Exactly four values: one per wired output.
        return "", "Error", 3, None

    user_dir = get_user_dir(state['username'])
    idx = state['current_index']
    current_item = state['queue'][idx]

    new_result = {
        "queue_position": idx,
        "index": current_item.get('index', idx),
        "doc_id": current_item.get('id', current_item.get('index', 'no_id')),
        "label": current_item.get('label', 'no_label'),
        "rating": doc_slider,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    # Overwrite any existing rating for this position, then keep results sorted.
    state['results'] = [r for r in state['results'] if r['queue_position'] != idx]
    state['results'].append(new_result)
    state['results'].sort(key=lambda x: x['queue_position'])

    is_last_item = idx + 1 >= len(state['queue'])
    if is_last_item:
        state['completed'] = True
    else:
        state['current_index'] += 1

    # Persist on every submission so the final rating also reaches the
    # results file, not only state.json.
    save_state(user_dir, state)
    with open(os.path.join(user_dir, "annotation_results.json"), "w") as f:
        json.dump(state['results'], f, indent=4)

    if is_last_item:
        return "✅ ALL TASKS COMPLETED", "Status: Finished", 1, state

    text, progress, rating = get_current_ui_values(state)
    return text, progress, rating, state

def go_back(state):
    """Step back one item and return ``(text, progress, rating, state)`` for it.

    When there is no session or the first item is already shown, a warning
    is raised in the UI and the three display components are left unchanged.
    """
    cannot_go_back = state is None or state['current_index'] <= 0
    if cannot_go_back:
        gr.Warning("Already at the first item.")
        return [gr.update()] * 3 + [state]

    state['current_index'] = state['current_index'] - 1
    return (*get_current_ui_values(state), state)

# --- UI INTERFACE ---
# Layout: a title, a collapsed instructions accordion, an intro column
# (username + start button) and a task column (progress, text, slider,
# navigation buttons) that starts hidden.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Holds the session's state dict; it is the last output of every handler below.
    session_state = gr.State()

    gr.Markdown("# Medical Text Readability Annotation")
    
    # Rating guide and reference examples, collapsed by default.
    with gr.Accordion("Instructions & Calibration", open=False):
        gr.HTML(GUIDE_HTML)
        gr.HTML(EXAMPLES_HTML)

    # Shown first; start_session hides it once a session begins.
    with gr.Column(visible=True) as intro_box:
        username_input = gr.Textbox(label="Enter Your Name/ID", placeholder="e.g., user_101")
        btn_start = gr.Button("Start / Resume Annotation", variant="primary")

    # Hidden until start_session makes it visible.
    with gr.Column(visible=False) as task_box:
        progress_label = gr.Label(label="Overall Progress")
        doc_display = gr.Textbox(interactive=False, lines=12, label="Medical Text")
        doc_slider = gr.Slider(1, 5, step=1, label="Difficulty (1=Easy, 5=Hard)", value=3)
        
        with gr.Row():
            btn_prev = gr.Button("⬅️ Previous", variant="secondary")
            btn_submit = gr.Button("Submit & Next ➡️", variant="primary")

    # --- EVENT HANDLERS ---
    # Each handler must return one value per component listed in `outputs`,
    # in the same order.
    # start_session: 6 outputs.
    btn_start.click(
        fn=start_session, 
        inputs=[username_input],
        outputs=[intro_box, task_box, doc_display, progress_label, doc_slider, session_state]
    )
    
    # submit_rating: 4 outputs.
    btn_submit.click(
        fn=submit_rating,
        inputs=[doc_slider, session_state],
        outputs=[doc_display, progress_label, doc_slider, session_state]
    )

    # go_back: 4 outputs.
    btn_prev.click(
        fn=go_back,
        inputs=[session_state],
        outputs=[doc_display, progress_label, doc_slider, session_state]
    )

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    # share=True asks Gradio to also create a public share link.
    demo.launch(share=True)