File size: 11,548 Bytes
9c6961c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
import gradio as gr
import json
import os
import random
from datetime import datetime

# --- Configuration & Folder Setup ---
# Absolute locations of the input JSON files and the root folder that receives
# the saved annotations.
DATA_PATH = '/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json'
KEY_DATA_PATH = '/home/mshahidul/readctrl/data/key_subclaims_testing/key_subclaims.json'
# NOTE(review): "thresold" looks like a typo for "threshold"; it is a runtime
# path, so it is deliberately left unchanged here.
BASE_SAVE_DIR = '/home/mshahidul/readctrl/data/thresold_finding/'

# 1. Create folder based on date+hour of app start, so every annotation saved
#    by this process lands in the same session folder.
session_folder_name = datetime.now().strftime("%Y-%m-%d_%Hh")
SESSION_PATH = os.path.join(BASE_SAVE_DIR, session_folder_name)
os.makedirs(SESSION_PATH, exist_ok=True)

# --- Data Loading ---
# JSON is UTF-8; pass the encoding explicitly so decoding does not depend on
# the locale of the machine running the app.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Draw a fixed-size sample of record indices. The seed must be set immediately
# before sampling so that every app start presents the same records in the
# same order.
NUM_SAMPLES = 10
random.seed(42)
all_possible_indices = list(range(len(data)))
shuffled_indices = random.sample(all_possible_indices, min(NUM_SAMPLES, len(data)))

with open(KEY_DATA_PATH, 'r', encoding='utf-8') as f:
    key_data = json.load(f)

# Map each key-subclaim entry's 'index' field to its 'llm_output' payload.
key_lookup = {item['index']: item['llm_output'] for item in key_data}

# --- Logic Functions ---
def get_key_indices(index, source_type):
    """Return the integer positions of the key subclaims stored for a record.

    Args:
        index: Record index used as the key into the module-level
            ``key_lookup`` mapping.
        source_type: ``"Full Original Text"`` selects the source-text key
            subclaims; any other value selects the gold-summary ones.

    Returns:
        A list of ints, one per key subclaim, parsed from the last
        ``-``-separated component of the subclaim id. Entries whose id is
        missing or not parseable are skipped. An empty list is returned when
        ``key_lookup`` holds no usable (dict) entry for ``index``.
    """
    entry = key_lookup.get(index)
    # A missing record, or a payload that is not a mapping, yields no
    # pre-selection instead of crashing the UI.
    if not isinstance(entry, dict):
        return []

    if source_type == "Full Original Text":
        key_field, id_key = 'key_source_text_subclaims', "source_subclaim_id"
    else:
        key_field, id_key = 'key_gold_summary_subclaims', "gold_subclaim_id"

    indices = []
    # ``or []`` also covers a field that is present but null.
    for item in entry.get(key_field) or []:
        if not isinstance(item, dict):
            continue
        raw_id = item.get(id_key, "")
        try:
            # str() lets ids that were serialised as bare numbers go through
            # the same "take the part after the last dash" parsing as strings.
            indices.append(int(str(raw_id).split('-')[-1]))
        except ValueError:
            continue
    return indices

def load_example(progress_index):
    """Build the UI values for the record at position ``progress_index``.

    Args:
        progress_index: Zero-based position within the module-level
            ``shuffled_indices`` sample.

    Returns:
        A 10-element list, in order: header markdown, reference text, the
        list of subclaims, three coverage labels (reset to ``"0%"``), three
        checkbox-group updates (low, intermediate, proficient) and the status
        message. When ``progress_index`` is past the end of the sample, a
        "session complete" set of values with empty choices is returned.
    """
    total = len(shuffled_indices)

    # Check if we've reached the end of our fixed sample size.
    if progress_index >= total:
        return [
            gr.update(value="### 🎉 Session Complete!"),
            # Report the real sample size: it is smaller than NUM_SAMPLES when
            # the dataset holds fewer records than that.
            gr.update(value=f"You have finished your set of {total} records."),
            [], "0%", "0%", "0%", gr.update(choices=[], value=[]),
            gr.update(choices=[], value=[]), gr.update(choices=[], value=[]), ""
        ]

    # Get the actual index from our sample pool.
    actual_data_index = shuffled_indices[progress_index]
    record = data[actual_data_index]

    # Pick the source deterministically per record. A private generator seeded
    # with the record index makes the same choice as seeding the module-level
    # generator would, without touching state shared with other handlers.
    source_type = random.Random(actual_data_index).choice(["Full Original Text", "Gold Summary"])

    if source_type == "Full Original Text":
        text_content, subclaims = record['fulltext'], record['fulltext_subclaims']
    else:
        text_content, subclaims = record['summary'], record['summary_subclaims']

    source_info = f"### Instance: {progress_index + 1}/{total} | Source: **{source_type}**"
    key_indices = get_key_indices(actual_data_index, source_type)

    # Out-of-range positions are ignored rather than raising IndexError.
    pre_selected = [subclaims[idx] for idx in key_indices if 0 <= idx < len(subclaims)]

    # All three levels start from the same pre-selection.
    return [
        source_info, text_content, subclaims, "0%", "0%", "0%",
        gr.update(choices=subclaims, value=pre_selected),
        gr.update(choices=subclaims, value=pre_selected),
        gr.update(choices=subclaims, value=pre_selected),
        ""
    ]
def _ordered_union(*groups):
    """Merge selections, dropping duplicates while keeping first-seen order."""
    merged = {}
    for group in groups:
        merged.update(dict.fromkeys(group))
    return list(merged)


def _ordered_intersection(items, allowed):
    """Return the unique entries of ``items`` that also occur in ``allowed``."""
    permitted = set(allowed)
    return [item for item in dict.fromkeys(items) if item in permitted]


def sync_from_low(low, inter, prof, total_list):
    """Propagate a change in the Low group upwards.

    Everything selected in Low is added to Intermediate, and everything in
    the resulting Intermediate is added to Proficient. Existing selections
    keep their order; newly added items are appended.

    Returns:
        The 7-tuple produced by ``update_ui_components``.
    """
    new_inter = _ordered_union(inter, low)
    new_prof = _ordered_union(prof, new_inter)
    return update_ui_components(low, new_inter, new_prof, total_list)


def sync_from_inter(low, inter, prof, total_list):
    """Propagate a change in the Intermediate group both ways.

    Proficient gains everything selected in Intermediate, and Low is reduced
    to the items that are still selected in Intermediate.

    Returns:
        The 7-tuple produced by ``update_ui_components``.
    """
    new_prof = _ordered_union(prof, inter)
    new_low = _ordered_intersection(low, inter)
    return update_ui_components(new_low, inter, new_prof, total_list)


def sync_from_prof(low, inter, prof, total_list):
    """Propagate a change in the Proficient group downwards.

    Intermediate and Low are each reduced to the items that are still
    selected in Proficient.

    Returns:
        The 7-tuple produced by ``update_ui_components``.
    """
    new_inter = _ordered_intersection(inter, prof)
    new_low = _ordered_intersection(low, prof)
    return update_ui_components(new_low, new_inter, prof, total_list)


def update_ui_components(low, inter, prof, total_list):
    """Compute coverage labels and value updates for the three groups.

    Args:
        low, inter, prof: Selected subclaims for each level.
        total_list: All subclaims of the current record.

    Returns:
        A 7-tuple: three coverage strings, a status message, then the values
        for the low, intermediate and proficient checkbox groups. With an
        empty ``total_list`` the coverage strings are ``"0%"``, the message is
        empty and the selections are passed through as plain lists.
    """
    # Guard against division by zero when no record is loaded.
    if not total_list:
        return "0%", "0%", "0%", "", low, inter, prof

    l_pct, i_pct, p_pct = (len(group) / len(total_list) * 100 for group in (low, inter, prof))

    # The sync handlers enforce the hierarchy themselves, so this is purely a
    # confirmation message.
    msg = "✅ Hierarchy Enforced: Low ⊆ Intermediate ⊆ Proficient"

    return (
        f"{l_pct:.1f}%", f"{i_pct:.1f}%", f"{p_pct:.1f}%", msg,
        gr.update(value=low), gr.update(value=inter), gr.update(value=prof)
    )

def save_and_next(username, progress_index, source_info, low_sel, int_sel, prof_sel, subclaims):
    """Validate and save the current annotation, then load the next record.

    Args:
        username: Annotator name typed into the UI; must not be blank.
        progress_index: Sequence number (0, 1, 2...) within the module-level
            ``shuffled_indices`` sample.
        source_info: Header markdown of the current record; used to recover
            which source type was shown.
        low_sel, int_sel, prof_sel: Subclaims selected for each level.
        subclaims: All subclaims of the current record.

    Returns:
        An 11-element list: the progress index followed by the ten values in
        the layout produced by ``load_example``. After a successful save the
        index is advanced by one. On a validation failure the index and the
        current selections are returned unchanged together with an error
        message, so no work is lost.

    Raises:
        gr.Error: If the session directory cannot be created or the
            annotation file cannot be written.
    """

    def keep_current(message):
        # Leave the record on screen untouched and only surface ``message``.
        return [progress_index, source_info, gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(),
                gr.update(value=low_sel), gr.update(value=int_sel), gr.update(value=prof_sel),
                message]

    def level_summary(selection):
        # 'pct' is stored as a fraction of all subclaims (0 when there are none).
        return {
            "count": len(selection),
            "subclaims": selection,
            "pct": len(selection)/len(subclaims) if subclaims else 0
        }

    # 1. Validation: Ensure we haven't exceeded the sample size
    if progress_index >= len(shuffled_indices):
        return [progress_index] + load_example(progress_index)

    # 2. Validation: Annotator Name
    if not username or username.strip() == "":
        gr.Warning("Action Required: Please enter your name before submitting!")
        return keep_current("⚠️ **Error:** Please enter your name.")

    # 3. Validation: Hierarchy Check. Compare membership rather than sizes:
    #    two selections can have compatible lengths without being nested.
    if not (set(low_sel) <= set(int_sel) <= set(prof_sel)):
        gr.Warning("DATA NOT SAVED! The selection does not follow the hierarchy: Low ≤ Intermediate ≤ Proficient.")
        return keep_current("⚠️ **Error:** Selection sequence is invalid. Please adjust before saving.")

    # 4. Map progress to the actual data index from the source JSON
    actual_data_index = shuffled_indices[progress_index]

    # 5. File System Management. gr.Error is an exception and only reaches
    #    the user when raised; raising also leaves the current selections
    #    on screen.
    try:
        os.makedirs(SESSION_PATH, exist_ok=True)
    except OSError as e:
        raise gr.Error(f"Critical Error: Could not create directory {SESSION_PATH}. Error: {e}") from e

    # 6. Prepare Metadata and Filename
    now = datetime.now()
    timestamp_str = now.strftime("%Y%m%d_%H%M%S")
    # Keep only alphanumeric characters so the name is safe inside a filename.
    safe_username = "".join(x for x in username if x.isalnum())

    # Use actual_data_index so the file can be matched back to the master JSON
    filename = f"recordID{actual_data_index}_seq{progress_index}_{safe_username}_{timestamp_str}.json"
    file_path = os.path.join(SESSION_PATH, filename)

    stype = "Full Original Text" if "Full Original Text" in source_info else "Gold Summary"

    # 7. Construct Result Object
    result = {
        "annotator": username,
        "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"),
        "progress_sequence": progress_index,  # The order it was shown
        "original_data_index": actual_data_index,  # The real ID in the source JSON
        "source_type": stype,
        "annotations": {
            "low": level_summary(low_sel),
            "intermediate": level_summary(int_sel),
            "proficient": level_summary(prof_sel)
        }
    }

    # 8. Write to Disk. A failed write must be reported instead of silently
    #    moving on to the next record.
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=4)
    except OSError as e:
        raise gr.Error(f"Critical Error: Could not save {file_path}. Error: {e}") from e

    gr.Info(f"Success! Record {actual_data_index} saved (Item {progress_index + 1} of {len(shuffled_indices)}).")

    # 9. Return the NEXT progress index and its data
    return [progress_index + 1] + load_example(progress_index + 1)

# --- UI Definition ---
with gr.Blocks(theme=gr.themes.Soft(), title="Medical Literacy Annotation Tool") as demo:
    # Per-session state: position in the sample and the current record's subclaims.
    index_state = gr.State(0)
    subclaim_list_state = gr.State([])

    # The instructions file is optional: fall back to a bare heading when it
    # is missing or unreadable, but let unrelated errors (and Ctrl-C) surface.
    try:
        with open("/home/mshahidul/readctrl/code/interface/instructions", "r") as f:
            instructions_text = f.read()
    except (OSError, UnicodeDecodeError):
        instructions_text = "# Medical Annotation Task"

    gr.Markdown(instructions_text)

    with gr.Row():
        # Left panel: annotator name and the reference text being annotated.
        with gr.Column(scale=1, variant="panel"):
            user_input = gr.Textbox(label="Annotator Name", placeholder="e.g., mshahidul", interactive=True)
            gr.HTML("<hr>")
            source_display = gr.Markdown("### Initializing...")
            text_viewer = gr.Textbox(label="Reference Text", interactive=False, lines=15)

        # Right panel: one checkbox group per literacy level.
        with gr.Column(scale=2):
            hierarchy_warning = gr.Markdown(value="", visible=True)

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 🟢 Low")
                    low_pct = gr.Label(label="Coverage", value="0%")
                    low_check = gr.CheckboxGroup(label="Subclaims", choices=[])

                with gr.Column():
                    gr.Markdown("### 🟡 Intermediate")
                    int_pct = gr.Label(label="Coverage", value="0%")
                    int_check = gr.CheckboxGroup(label="Subclaims", choices=[])

                with gr.Column():
                    gr.Markdown("### 🔴 Proficient")
                    prof_pct = gr.Label(label="Coverage", value="0%")
                    prof_check = gr.CheckboxGroup(label="Subclaims", choices=[])

            submit_btn = gr.Button("Submit & Next Record", variant="primary", size="lg")

    # Components that receive the 10 values returned by load_example, in order.
    example_outputs = [
        source_display, text_viewer, subclaim_list_state,
        low_pct, int_pct, prof_pct,
        low_check, int_check, prof_check,
        hierarchy_warning,
    ]
    # Inputs/outputs shared by the three hierarchy-sync handlers; the output
    # order matches the 7-tuple returned by update_ui_components.
    sync_inputs = [low_check, int_check, prof_check, subclaim_list_state]
    sync_outputs = [low_pct, int_pct, prof_pct, hierarchy_warning, low_check, int_check, prof_check]

    # Event Handlers
    demo.load(load_example, [index_state], example_outputs)

    # Event Handlers for Hierarchy Synchronization
    low_check.input(sync_from_low, sync_inputs, sync_outputs)
    int_check.input(sync_from_inter, sync_inputs, sync_outputs)
    prof_check.input(sync_from_prof, sync_inputs, sync_outputs)

    # save_and_next returns the progress index followed by the load_example values.
    submit_btn.click(
        save_and_next,
        [user_input, index_state, source_display, low_check, int_check, prof_check, subclaim_list_state],
        [index_state] + example_outputs,
    )

# Launch the app only when this file is run as a script, not when imported.
# NOTE(review): share=True presumably asks Gradio for a publicly reachable
# link — confirm that exposure is intended for this annotation data.
if __name__ == "__main__":
    demo.launch(share=True)