"""Gradio app for rating the quality of machine-translated full texts.

At import time the module loads a translation dataset from ``TRANSLATION_PATH``
and keeps the first 50 items. The UI shows one source/translation pair at a
time and stores each annotator's ratings in a per-user JSON file under
``SAVE_DIR``. Logging in resumes at the first sample the user has not rated.
"""

import json
import os
from datetime import datetime

import gradio as gr


# ---------------------------------------------------------------------------
# 1. Language selection and dataset loading
# ---------------------------------------------------------------------------

# Directory suffix used for each supported language.
# NOTE(review): "ch" and "be" are not the ISO 639-1 codes for Chinese/Bengali
# ("zh"/"bn"); they are kept unchanged because SAVE_DIR is derived from them.
LANGUAGE_CODES = {
    "Chinese": "ch",
    "Hindi": "hi",
    "Bengali": "be",
}

language = "Bengali"
if language not in LANGUAGE_CODES:
    # Explicit raise instead of `assert`: asserts are stripped under `python -O`.
    raise ValueError(f"Unsupported language: {language!r}")
language_code = LANGUAGE_CODES[language]

# Load translation dataset (EN -> BN fulltext/summary)
TRANSLATION_PATH = (
    "/home/mshahidul/readctrl/data/translated_data/translation_wo_judge/"
    "multiclinsum_gs_train_en2bn_gemma(0_200).json"
)
with open(TRANSLATION_PATH, "r", encoding="utf-8") as f:
    translation_dataset = json.load(f)

# Only the first 50 items are annotated.
dataset = [
    {
        "src_fulltext": item.get("fulltext", ""),
        "translated_fulltext": item.get("translated_fulltext", ""),
        "id": item.get("id"),
    }
    for item in translation_dataset
][:50]

if not dataset:
    # The UI below indexes dataset[0]; fail early with a clear message instead
    # of a bare IndexError during UI construction.
    raise ValueError(f"No samples found in {TRANSLATION_PATH}")

# ---------------------------------------------------------------------------
# 2. Configuration for saving
# ---------------------------------------------------------------------------

SAVE_DIR = f"/home/mshahidul/readctrl/data/translated_data/rating_info_v2/{language_code}"
os.makedirs(SAVE_DIR, exist_ok=True)
SESSION_FILE = None  # Will be set per user

# (label shown in the dropdown, value stored in the record)
RATING_OPTIONS = [
    ("1 - Poor (Incorrect/Nonsense)", 1),
    ("2 - Fair (Understandable but awkward)", 2),
    ("3 - Good (Accurate/Perfect)", 3),
]

custom_css = """
.small-header { font-size: 0.85rem !important; font-weight: 600; margin-bottom: -10px; color: #555; }
.nav-row { background-color: #f9f9f9; padding: 10px; border-radius: 8px; margin-bottom: 15px; }
"""


# ---------------------------------------------------------------------------
# 3. Per-user persistence
# ---------------------------------------------------------------------------

def sanitize_username(username: str) -> str:
    """Make username safe for filesystem paths.

    Strips surrounding whitespace and drops every character that is not
    alphanumeric, ``_`` or ``-``. Returns ``""`` for empty/None input or when
    no character survives.
    """
    if not username:
        return ""
    return "".join(
        ch for ch in username.strip() if ch.isalnum() or ch in ("_", "-")
    )


def get_user_session_file(username):
    """Return the path of the JSON ratings file for ``username`` in SAVE_DIR."""
    safe = sanitize_username(username)
    return os.path.join(SAVE_DIR, f"ratings_{safe}.json")


def _extract_records(data):
    """Return the record list from a loaded ratings payload.

    Supports both on-disk layouts: a bare list of records, or a dict with a
    ``"records"`` key. Anything else yields an empty list.
    """
    records = data.get("records", []) if isinstance(data, dict) else data
    return records if isinstance(records, list) else []


def _read_records(session_file):
    """Read the records stored in ``session_file``.

    A missing file or invalid JSON yields ``[]``; other I/O errors propagate
    so the caller can decide whether best-effort behaviour is acceptable.
    """
    try:
        with open(session_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return []
    except json.JSONDecodeError:
        return []
    return _extract_records(data)


def _write_json_atomic(path, payload):
    """Write ``payload`` as JSON to ``path`` without exposing a partial file.

    The data is written to a sibling temp file and moved into place with
    ``os.replace``, so an interrupted or failed write leaves the previous
    ratings file intact instead of truncated.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=4)
        os.replace(tmp_path, path)
    except BaseException:
        # Best-effort cleanup of the temp file; the original error wins.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise


def save_rating_to_json(data_item, username):
    """Insert or update one rating record in the user's session file.

    Args:
        data_item: Record dict; its ``"index"`` value identifies the sample.
        username: Annotator name (sanitized to build the file path).

    At most one record is kept per index: an existing record with the same
    index is replaced, otherwise the record is appended. The file is always
    rewritten in the dict layout with ``username``, ``updated_at`` and
    ``records`` keys.
    """
    session_file = get_user_session_file(username)
    records = _read_records(session_file)

    # Keep a single record per index (update if it already exists).
    new_index = data_item.get("index")
    for i, rec in enumerate(records):
        if isinstance(rec, dict) and rec.get("index") == new_index:
            records[i] = data_item
            break
    else:
        records.append(data_item)

    payload = {
        "username": sanitize_username(username) or username,
        "updated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "records": records,
    }
    _write_json_atomic(session_file, payload)


def load_user_records(username):
    """Return the saved records for ``username``.

    Best-effort: returns ``[]`` when the file is missing, unreadable, or does
    not contain a valid record list.
    """
    try:
        return _read_records(get_user_session_file(username))
    except (OSError, ValueError):
        # ValueError also covers UnicodeDecodeError from a mis-encoded file.
        return []


def get_last_index_for_user(username):
    """Return the index the user should resume at.

    That is the first dataset index without a saved record, ``0`` when no
    username is given, or ``len(dataset)`` when every sample has been rated.
    """
    if not username:
        return 0
    done_indices = {
        rec["index"]
        for rec in load_user_records(username)
        if isinstance(rec, dict) and isinstance(rec.get("index"), int)
    }
    # Resume means: first unannotated sample in order.
    return next(
        (i for i in range(len(dataset)) if i not in done_indices),
        len(dataset),  # Completed.
    )


# ---------------------------------------------------------------------------
# 4. View helpers and event handlers
#
# Every handler returns a 7-tuple matching the UI outputs, in this order:
# (src_display, eng_display, rating_dropdown, current_index,
#  progress_display, progress_bar, jump_input)
# ---------------------------------------------------------------------------

def load_example(index):
    """Return the UI values for the sample at ``index``.

    ``index`` is clamped into ``[0, len(dataset) - 1]``. The rating slot is
    ``None`` so the dropdown selection is cleared.
    """
    total = len(dataset)
    index = max(0, min(index, total - 1))
    item = dataset[index]
    progress_pct = (index / total) * 100
    progress_text = f"Sample {index + 1} of {total} ({progress_pct:.1f}%)"
    return (
        item["src_fulltext"],         # src_display
        item["translated_fulltext"],  # eng_display
        None,                         # rating_dropdown (clears selection)
        index,                        # current_index
        progress_text,                # progress_display
        progress_pct,                 # progress_bar
        index + 1,                    # jump_input
    )


def load_example_or_done(index):
    """Like ``load_example``, but show a completion screen past the last sample.

    When ``index >= len(dataset)`` the returned state has
    ``current_index == len(dataset)`` and placeholder "ALL DONE" texts.
    """
    total = len(dataset)
    if index >= total:
        return (
            "✅ ALL DONE",
            "✅ ALL DONE",
            None,
            total,
            f"✅ Completed all {total} samples",
            100,
            total,
        )
    return load_example(index)


def next_item(index, rating, src_txt, eng_txt, username):
    """Save the rating for the current sample and move to the next open one.

    Raises:
        gr.Error: If no rating is selected, the username is missing or
            sanitizes to nothing, or there is no sample at ``index`` (the
            completion screen is showing).
    """
    if rating is None:
        raise gr.Error("Please select a rating before proceeding!")
    if not username:
        raise gr.Error("Please enter your username!")
    safe_user = sanitize_username(username)
    if not safe_user:
        raise gr.Error("Username must contain letters/numbers (optionally _ or -).")
    if index >= len(dataset):
        # On the completion screen the text boxes hold the "ALL DONE"
        # placeholder; saving it would create a bogus record past the dataset.
        raise gr.Error("All samples are already annotated - nothing to save.")

    record = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "index": index,
        "src_text": src_txt,
        "translated_text": eng_txt,
        "rating": rating,
        "username": safe_user,
    }
    save_rating_to_json(record, safe_user)
    gr.Info(f"Saved record {index + 1} for {safe_user}.")

    # After saving, resume at first unannotated index.
    next_idx = get_last_index_for_user(safe_user)
    return load_example_or_done(next_idx)


def previous_item(index):
    """Show the sample before ``index`` (clamped at the first sample)."""
    return load_example_or_done(index - 1)


def jump_to_instance(target_index):
    """Show the sample with 1-based number ``target_index``.

    The number is clamped into ``[1, len(dataset)]`` so an out-of-range jump
    lands on the first/last sample instead of the completion screen.

    Raises:
        gr.Error: If the number box is empty (``target_index`` is ``None``).
    """
    if target_index is None:
        raise gr.Error("Please enter a sample number to jump to.")
    sample_number = max(1, min(int(target_index), len(dataset)))
    return load_example_or_done(sample_number - 1)


def login_user(username):
    """Validate the username and resume at the user's first unrated sample.

    Raises:
        gr.Error: If the username sanitizes to an empty string.
    """
    safe_user = sanitize_username(username)
    if not safe_user:
        raise gr.Error("Please enter a valid username (letters/numbers, _ or -).")
    idx = get_last_index_for_user(safe_user)
    return load_example_or_done(idx)


# ---------------------------------------------------------------------------
# 5. UI layout and wiring
# ---------------------------------------------------------------------------

with gr.Blocks(css=custom_css) as demo:
    username_box = gr.Textbox(label="Enter your username", value="", interactive=True)
    login_btn = gr.Button("Start/Resume Session", variant="primary")
    current_index = gr.State(0)
    total_count = len(dataset)

    gr.Markdown("## Translation Quality Annotation")
    gr.Markdown("Data generated by TranslateGemma.")

    with gr.Row(elem_classes="nav-row"):
        with gr.Column(scale=2):
            progress_bar = gr.Slider(
                label="Progress", minimum=0, maximum=100, value=0, interactive=False
            )
            progress_display = gr.Markdown(f"Sample 1 of {total_count} (0.0%)")
        with gr.Column(scale=1):
            jump_input = gr.Number(label="Jump to Sample #", value=1, precision=0)
            jump_btn = gr.Button("Go", size="sm")

    with gr.Row():
        with gr.Column():
            gr.Markdown("##### Source Fulltext (English)")
            src_display = gr.Textbox(
                value=dataset[0]["src_fulltext"],
                interactive=False,
                lines=12,
                show_label=False,
            )
        with gr.Column():
            gr.Markdown("##### Fulltext Translation (Bangla)")
            eng_display = gr.Textbox(
                value=dataset[0]["translated_fulltext"],
                interactive=False,
                lines=12,
                show_label=False,
            )

    rating_dropdown = gr.Dropdown(choices=RATING_OPTIONS, label="Select Rating")

    with gr.Row():
        prev_btn = gr.Button("⬅ Previous (Review)", variant="secondary")
        submit_btn = gr.Button("Save & Next ➡", variant="primary")

    # All handlers refresh the same set of components, in this order.
    view_outputs = [
        src_display,
        eng_display,
        rating_dropdown,
        current_index,
        progress_display,
        progress_bar,
        jump_input,
    ]

    login_btn.click(fn=login_user, inputs=[username_box], outputs=view_outputs)
    submit_btn.click(
        fn=next_item,
        inputs=[current_index, rating_dropdown, src_display, eng_display, username_box],
        outputs=view_outputs,
    )
    prev_btn.click(fn=previous_item, inputs=[current_index], outputs=view_outputs)
    jump_btn.click(fn=jump_to_instance, inputs=[jump_input], outputs=view_outputs)


if __name__ == "__main__":
    # NOTE: share=True requests a publicly reachable Gradio link.
    demo.launch(share=True)