# Source: readctrl/code/interface/translation_quality.py
# Uploaded by shahidul034 - "Add files using upload-large-folder tool"
# Commit 1db7196 (verified)
import gradio as gr
import json
import os
from datetime import datetime
def sanitize_username(username: str) -> str:
    """Return *username* reduced to characters that are safe in a file name.

    Surrounding whitespace is stripped, then every character that is not
    alphanumeric (as judged by ``str.isalnum``), ``_`` or ``-`` is dropped.
    A falsy input (``None`` or ``""``) yields ``""``.
    """
    if not username:
        return ""
    allowed_punctuation = ("_", "-")
    kept_chars = [
        char
        for char in username.strip()
        if char.isalnum() or char in allowed_punctuation
    ]
    return "".join(kept_chars)
def get_user_session_file(username):
    """Return the path of the JSON ratings file for *username*.

    The name is passed through ``sanitize_username`` first, and the file
    lives directly inside ``SAVE_DIR`` as ``ratings_<sanitized name>.json``.
    """
    file_name = "ratings_{}.json".format(sanitize_username(username))
    return os.path.join(SAVE_DIR, file_name)
# Target language of the translations being rated.
language = "Bengali"

# Short code for each supported language. The code becomes the last path
# component of SAVE_DIR, so it decides where ratings are stored.
# NOTE(review): "ch" and "be" differ from the ISO 639-1 codes ("zh", "bn").
# They are kept unchanged because altering them would point SAVE_DIR at a
# different directory.
_LANGUAGE_CODES = {
    "Chinese": "ch",
    "Hindi": "hi",
    "Bengali": "be",
}

try:
    language_code = _LANGUAGE_CODES[language]
except KeyError:
    # Raise explicitly rather than `assert False`: assertions are stripped
    # under `python -O`, which would leave `language_code` undefined.
    raise ValueError("Unsupported language") from None
# Number of leading samples taken from each input file for annotation.
MAX_SAMPLES = 50

# Load translation dataset
TRANSLATION_PATH = "/home/mshahidul/readctrl/data/translated_data/translation_english2bangla_v1.json"
with open(TRANSLATION_PATH, "r", encoding="utf-8") as f:
    translation_dataset = json.load(f)[:MAX_SAMPLES]

# Load source dataset for English fulltext
SRC_PATH = "/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json"
with open(SRC_PATH, "r", encoding="utf-8") as f:
    src_dataset = json.load(f)[:MAX_SAMPLES]

# Merge datasets by position (assumes both files list the samples in the same
# order). zip() stops at the shorter list, so unmatched trailing items are
# dropped.
# NOTE(review): nothing here verifies that the paired items really belong
# together; only the translation item's "id" is kept.
dataset = [
    {
        "src_fulltext": src_item["fulltext"],
        "translated_fulltext": translated_item["fulltext_translated"]["translated_medical_note"],
        "id": translated_item["id"],
    }
    for src_item, translated_item in zip(src_dataset, translation_dataset)
]
# 2. Configuration for saving
# Ratings are written below a per-language directory (named after
# `language_code`), which is created at import time if it does not exist.
SAVE_DIR = f"/home/mshahidul/readctrl/data/translated_data/rating_info/{language_code}"
os.makedirs(SAVE_DIR, exist_ok=True)
# NOTE(review): SESSION_FILE is never read or reassigned in the visible
# source; per-user paths come from get_user_session_file() instead.
SESSION_FILE = None # Will be set per user
# Choices handed to the rating dropdown. Presumably (label shown to the user,
# value passed to the handler) pairs - confirm against the Gradio Dropdown docs.
RATING_OPTIONS = [
    ("1 - Poor (Incorrect/Nonsense)", 1),
    ("2 - Fair (Understandable but awkward)", 2),
    ("3 - Good (Accurate/Perfect)", 3)
]
# Extra CSS given to gr.Blocks. ".nav-row" is applied to the progress/jump
# row; ".small-header" is not referenced by any component in the visible source.
custom_css = """
.small-header { font-size: 0.85rem !important; font-weight: 600; margin-bottom: -10px; color: #555; }
.nav-row { background-color: #f9f9f9; padding: 10px; border-radius: 8px; margin-bottom: 15px; }
"""
def save_rating_to_json(data_item, username):
    """Insert or update one rating record in the user's session file.

    The session file is a JSON object with ``username``, ``updated_at`` and
    ``records`` keys. A legacy file holding a bare list of records is also
    accepted and is rewritten in the object form. At most one record is kept
    per ``index``: an existing record with the same index is replaced,
    otherwise the new record is appended.

    Args:
        data_item: Record dict to store; its ``"index"`` value identifies
            the sample it belongs to.
        username: Name of the annotator; determines the session file via
            ``get_user_session_file``.

    An existing file that is not valid JSON is treated as empty.
    """
    session_file = get_user_session_file(username)

    stored = []
    if os.path.exists(session_file):
        with open(session_file, "r", encoding="utf-8") as f:
            try:
                stored = json.load(f)
            except json.JSONDecodeError:
                stored = []

    # Backward/forward compatibility: support either list[record] or a dict
    # with a "records" key.
    records = stored.get("records", []) if isinstance(stored, dict) else stored
    # A malformed file (e.g. "records": null) must not break the update
    # below; fall back to an empty list, as load_user_records() does.
    if not isinstance(records, list):
        records = []

    # Keep a single record per index (update if it already exists).
    new_index = data_item.get("index")
    for position, existing in enumerate(records):
        if isinstance(existing, dict) and existing.get("index") == new_index:
            records[position] = data_item
            break
    else:
        records.append(data_item)

    payload = {
        "username": sanitize_username(username) or username,
        "updated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "records": records,
    }

    # Write to a sibling temp file and swap it in with os.replace, so an
    # interrupted write cannot leave a truncated session file behind (which
    # the next save would read as empty and overwrite).
    tmp_file = f"{session_file}.tmp"
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)
    os.replace(tmp_file, session_file)
def load_user_records(username):
    """Return the list of rating records saved for *username*.

    Accepts both session-file layouts: a JSON object with a ``"records"``
    list, or a bare JSON list of records.

    Returns:
        The stored records, or ``[]`` when the file is missing, unreadable,
        not valid JSON, or does not contain a list of records.
    """
    session_file = get_user_session_file(username)
    try:
        with open(session_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers
        # invalid JSON (json.JSONDecodeError) and undecodable bytes
        # (UnicodeDecodeError). Loading stays best-effort for these only.
        return []

    records = data.get("records", []) if isinstance(data, dict) else data
    return records if isinstance(records, list) else []
def load_example(index):
    """Build the UI values that display the sample at *index*.

    The index is clamped into ``[0, len(dataset) - 1]`` before use.

    Returns:
        A 7-tuple in the order the event handlers list their outputs:
        source text, translated text, ``None`` (clears the rating),
        the clamped index, the progress label, the progress percentage,
        and the 1-based sample number for the jump field.
    """
    total = len(dataset)
    position = max(0, min(index, total - 1))
    sample = dataset[position]
    # Progress counts the samples before the current one, so the first
    # sample reports 0.0%.
    percent_done = (position / total) * 100
    return (
        sample["src_fulltext"],                                      # src_display
        sample["translated_fulltext"],                               # eng_display
        None,                                                        # rating_dropdown
        position,                                                    # current_index
        f"Sample {position + 1} of {total} ({percent_done:.1f}%)",   # progress_display
        percent_done,                                                # progress_bar
        position + 1,                                                # jump_input
    )
def get_last_index_for_user(username):
    """Return the index at which *username* should resume annotating.

    That is the first dataset index with no saved record. Returns ``0`` for
    a falsy username and ``len(dataset)`` when every sample has a record.
    """
    if not username:
        return 0

    # Only records that are dicts carrying an integer "index" count as done.
    rated = {
        entry["index"]
        for entry in load_user_records(username)
        if isinstance(entry, dict) and isinstance(entry.get("index"), int)
    }

    total = len(dataset)
    # First unannotated sample in order; `total` signals completion.
    return next((i for i in range(total) if i not in rated), total)
def load_example_or_done(index):
    """Return UI values for sample *index*, or a completion screen.

    When *index* is at or past the end of ``dataset`` the returned 7-tuple
    shows a completion marker in both text boxes, a cleared rating, the
    dataset size as current index and jump value, and 100% progress.
    Otherwise the result of ``load_example(index)`` is returned.
    """
    total = len(dataset)
    if index >= total:
        done_marker = "✅ ALL DONE"
        summary = f"✅ Completed all {total} samples"
        return (done_marker, done_marker, None, total, summary, 100, total)
    return load_example(index)
def next_item(index, rating, src_txt, eng_txt, username):
    """Save the rating for the current sample and move to the next open one.

    Args:
        index: Index of the sample currently displayed.
        rating: Selected rating value, or ``None`` when nothing is selected.
        src_txt: Source text currently displayed.
        eng_txt: Translated text currently displayed.
        username: Raw username typed by the annotator.

    Returns:
        The 7-tuple of UI values from ``load_example_or_done`` for the first
        sample this user has not rated yet.

    Raises:
        gr.Error: If no rating is selected or the username is empty or
            contains no usable characters.
    """
    # On the completion screen `index` equals len(dataset) and the text boxes
    # hold the "ALL DONE" placeholder. Saving then would store a bogus record
    # for a sample that does not exist, so just redisplay the screen.
    if index >= len(dataset):
        gr.Info("All samples are already rated; nothing to save.")
        return load_example_or_done(index)

    if rating is None:
        raise gr.Error("Please select a rating before proceeding!")
    if not username:
        raise gr.Error("Please enter your username!")
    safe_user = sanitize_username(username)
    if not safe_user:
        raise gr.Error("Username must contain letters/numbers (optionally _ or -).")

    record = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "index": index,
        "src_text": src_txt,
        "translated_text": eng_txt,
        "rating": rating,
        "username": safe_user,
    }
    save_rating_to_json(record, safe_user)
    gr.Info(f"Saved record {index + 1} for {safe_user}.")

    # After saving, resume at first unannotated index.
    next_idx = get_last_index_for_user(safe_user)
    return load_example_or_done(next_idx)
def jump_to_instance(target_index):
    """Show the sample with 1-based number *target_index*.

    Out-of-range numbers are clamped to the first or last sample by
    ``load_example``, so jumping past the end no longer shows the
    "completed" screen for a user who has not finished.

    Raises:
        gr.Error: If no number was entered.
    """
    # An emptied number field is expected to arrive as None; without this
    # check the subtraction below fails with a TypeError.
    if target_index is None:
        raise gr.Error("Please enter a sample number to jump to.")
    return load_example(int(target_index) - 1)
with gr.Blocks(css=custom_css) as demo:
    # --- Session controls -------------------------------------------------
    username_box = gr.Textbox(label="Enter your username", value="", interactive=True)
    login_btn = gr.Button("Start/Resume Session", variant="primary")
    current_index = gr.State(0)
    total_count = len(dataset)
    gr.Markdown("### Translation Quality Annotation")

    # --- Progress and navigation -------------------------------------------
    with gr.Row(elem_classes="nav-row"):
        with gr.Column(scale=2):
            progress_bar = gr.Slider(label="Progress", minimum=0, maximum=100, value=0, interactive=False)
            progress_display = gr.Markdown(f"Sample 1 of {total_count} (0.0%)")
        with gr.Column(scale=1):
            jump_input = gr.Number(label="Jump to Sample #", value=1, precision=0)
            jump_btn = gr.Button("Go", size="sm")

    # --- Side-by-side texts, pre-filled with the first sample ---------------
    first_sample = dataset[0]
    with gr.Row():
        with gr.Column():
            gr.Markdown("##### Source Fulltext (English)")
            src_display = gr.Textbox(value=first_sample["src_fulltext"], interactive=False, lines=12, show_label=False)
        with gr.Column():
            gr.Markdown("##### Fulltext Translation (Bangla)")
            # Despite its name, eng_display holds the translated text.
            eng_display = gr.Textbox(value=first_sample["translated_fulltext"], interactive=False, lines=12, show_label=False)

    # --- Rating and actions -------------------------------------------------
    rating_dropdown = gr.Dropdown(choices=RATING_OPTIONS, label="Select Rating")
    with gr.Row():
        prev_btn = gr.Button("⬅ Previous (Review)", variant="secondary")
        submit_btn = gr.Button("Save & Next ➡", variant="primary")

    def login_user(username):
        """Validate the username and load the user's first unrated sample."""
        safe_user = sanitize_username(username)
        if safe_user:
            return load_example_or_done(get_last_index_for_user(safe_user))
        raise gr.Error("Please enter a valid username (letters/numbers, _ or -).")

    def show_previous(idx):
        """Display the sample just before the current index."""
        return load_example_or_done(idx - 1)

    # Every handler returns the same 7-tuple, so all events share one
    # output list (order must match the tuples built by load_example).
    view_outputs = [
        src_display,
        eng_display,
        rating_dropdown,
        current_index,
        progress_display,
        progress_bar,
        jump_input,
    ]

    login_btn.click(fn=login_user, inputs=[username_box], outputs=view_outputs)
    submit_btn.click(
        fn=next_item,
        inputs=[current_index, rating_dropdown, src_display, eng_display, username_box],
        outputs=view_outputs,
    )
    prev_btn.click(fn=show_previous, inputs=[current_index], outputs=view_outputs)
    jump_btn.click(fn=jump_to_instance, inputs=[jump_input], outputs=view_outputs)
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    # NOTE(review): share=True presumably requests a public Gradio share
    # link - confirm that exposing the annotation tool this way is intended.
    demo.launch(share=True)