readctrl / code /interface /translation_quality.py

Add files using upload-large-folder tool

1db7196 verified 28 days ago

9.05 kB

	import gradio as gr
	import json
	import os
	from datetime import datetime


	def sanitize_username(username: str) -> str:
	    """Reduce a username to characters that are safe inside a file name.

	    Surrounding whitespace is stripped, then only alphanumeric characters,
	    underscores and hyphens are kept. A falsy input (``None`` or ``""``)
	    yields the empty string.
	    """
	    if not username:
	        return ""
	    kept = [c for c in username.strip() if c in "_-" or c.isalnum()]
	    return "".join(kept)

	def get_user_session_file(username):
	    """Return the path of the ratings JSON file for *username*.

	    The username is passed through ``sanitize_username`` first, and the
	    file lives directly inside ``SAVE_DIR``.
	    """
	    filename = "ratings_" + sanitize_username(username) + ".json"
	    return os.path.join(SAVE_DIR, filename)

	# Target language of the translations being rated. The derived code is
	# used as the last component of the ratings directory (SAVE_DIR).
	language = "Bengali"

	# The code values are kept exactly as they were so existing rating
	# directories on disk keep matching.
	_LANGUAGE_CODES = {
	    "Chinese": "ch",
	    "Hindi": "hi",
	    "Bengali": "be",
	}

	# Raise explicitly instead of `assert False`: asserts are stripped under
	# `python -O`, which would leave `language_code` undefined.
	if language not in _LANGUAGE_CODES:
	    raise ValueError(f"Unsupported language: {language!r}")
	language_code = _LANGUAGE_CODES[language]


	# Translated notes: only the first 50 entries of the JSON list are used.
	# NOTE(review): this path points at the English->Bangla file regardless of
	# the `language` setting above -- confirm before switching languages.
	TRANSLATION_PATH = "/home/mshahidul/readctrl/data/translated_data/translation_english2bangla_v1.json"
	with open(TRANSLATION_PATH, "r", encoding="utf-8") as f:
	    translation_dataset = json.load(f)[:50]

	# English source notes: likewise limited to the first 50 entries.
	SRC_PATH = "/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json"
	with open(SRC_PATH, "r", encoding="utf-8") as f:
	    src_dataset = json.load(f)[:50]

	# Pair source and translation entries positionally (both files are assumed
	# to be in the same order); pairing stops at the shorter of the two lists.
	dataset = [
	    {
	        "src_fulltext": src_item["fulltext"],
	        "translated_fulltext": tr_item["fulltext_translated"]["translated_medical_note"],
	        "id": tr_item["id"],
	    }
	    for src_item, tr_item in zip(src_dataset, translation_dataset)
	]

	# Where rating files are stored: one sub-directory per language code.
	# The directory is created at import time if it does not exist yet.
	SAVE_DIR = f"/home/mshahidul/readctrl/data/translated_data/rating_info/{language_code}"
	os.makedirs(SAVE_DIR, exist_ok=True)

	# Placeholder that is never reassigned in the visible code; per-user file
	# paths are computed on demand by get_user_session_file().
	SESSION_FILE = None # Will be set per user

	# Choices for the rating dropdown, given as (label, value) pairs.
	# Presumably the integer is what the dropdown hands to next_item() as
	# `rating` -- confirm against Gradio's Dropdown choices documentation.
	RATING_OPTIONS = [
	    ("1 - Poor (Incorrect/Nonsense)", 1),
	    ("2 - Fair (Understandable but awkward)", 2),
	    ("3 - Good (Accurate/Perfect)", 3)
	]

	# Extra CSS passed to gr.Blocks(css=...). `.nav-row` is attached to the
	# progress/jump row via elem_classes; `.small-header` is not referenced by
	# any component in the visible code.
	custom_css = """
	.small-header { font-size: 0.85rem !important; font-weight: 600; margin-bottom: -10px; color: #555; }
	.nav-row { background-color: #f9f9f9; padding: 10px; border-radius: 8px; margin-bottom: 15px; }
	"""

	def save_rating_to_json(data_item, username):
	    """Insert or update one rating record in the user's session file.

	    The session file holds a dict with ``username``, ``updated_at`` and
	    ``records``. Older files that are a bare list of records are also
	    accepted and are rewritten in the dict layout. At most one record is
	    kept per ``index``: an existing record with the same index is replaced,
	    otherwise the new record is appended.

	    Args:
	        data_item: Record dict to store; its ``"index"`` key identifies it.
	        username: User whose session file is updated.
	    """
	    session_file = get_user_session_file(username)
	    stored = []
	    if os.path.exists(session_file):
	        with open(session_file, "r", encoding="utf-8") as f:
	            try:
	                stored = json.load(f)
	            except json.JSONDecodeError:
	                # Unparseable file: start from an empty record list.
	                stored = []

	    # Support either list[record] or a dict with a "records" key.
	    records = stored.get("records", []) if isinstance(stored, dict) else stored
	    # Same guard as load_user_records(): a non-list payload (for example
	    # {"records": null}) must not crash the enumerate/append below.
	    if not isinstance(records, list):
	        records = []

	    # Keep a single record per index (update if it already exists).
	    new_index = data_item.get("index")
	    for pos, rec in enumerate(records):
	        if isinstance(rec, dict) and rec.get("index") == new_index:
	            records[pos] = data_item
	            break
	    else:
	        records.append(data_item)

	    payload = {
	        "username": sanitize_username(username) or username,
	        "updated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	        "records": records,
	    }

	    # Write to a sibling temp file and swap it in. Writing in place would
	    # leave a truncated file if interrupted, and the next load would treat
	    # it as empty and discard every earlier rating.
	    tmp_file = f"{session_file}.tmp"
	    with open(tmp_file, "w", encoding="utf-8") as f:
	        json.dump(payload, f, ensure_ascii=False, indent=4)
	    os.replace(tmp_file, session_file)


	def load_user_records(username):
	    """Return the list of rating records stored for *username*.

	    Both file layouts are understood: a dict with a ``"records"`` key and a
	    bare list. A missing file, a file that cannot be read or parsed, or a
	    payload that is not a list all yield an empty list.
	    """
	    path = get_user_session_file(username)
	    if not os.path.exists(path):
	        return []
	    try:
	        with open(path, "r", encoding="utf-8") as handle:
	            loaded = json.load(handle)
	        candidate = loaded.get("records", []) if isinstance(loaded, dict) else loaded
	        if isinstance(candidate, list):
	            return candidate
	        return []
	    except Exception:
	        # Best effort: any failure while reading counts as "no records".
	        return []

	def load_example(index):
	    """Build the UI values for the sample at *index*.

	    The index is clamped into ``[0, len(dataset) - 1]`` before use.

	    Returns:
	        A 7-tuple in the order the event handlers' outputs expect: source
	        text, translated text, ``None`` (clears the rating dropdown), the
	        clamped index, the progress label, the progress percentage and the
	        1-based sample number for the jump box.
	    """
	    n_samples = len(dataset)
	    position = max(0, min(index, n_samples - 1))
	    sample = dataset[position]
	    pct = (position / n_samples) * 100
	    label = f"Sample {position + 1} of {n_samples} ({pct:.1f}%)"
	    return (
	        sample["src_fulltext"],         # src_display
	        sample["translated_fulltext"],  # eng_display
	        None,                           # rating_dropdown (clears selection)
	        position,                       # current_index
	        label,                          # progress_display
	        pct,                            # progress_bar
	        position + 1,                   # jump_input
	    )

	def get_last_index_for_user(username):
	    """Return the index at which *username* should resume annotating.

	    That is the first dataset index with no stored record. An empty
	    username resumes at 0. When every sample already has a record,
	    ``len(dataset)`` is returned to signal completion.
	    """
	    if not username:
	        return 0
	    annotated = {
	        rec["index"]
	        for rec in load_user_records(username)
	        if isinstance(rec, dict) and isinstance(rec.get("index"), int)
	    }
	    n_samples = len(dataset)
	    # First gap in order; falls back to n_samples when nothing is left.
	    return next((i for i in range(n_samples) if i not in annotated), n_samples)


	def load_example_or_done(index):
	    """Return UI values for *index*, or a completion screen past the end.

	    For ``index < len(dataset)`` this delegates to ``load_example``.
	    Otherwise it returns a 7-tuple of the same shape with placeholder
	    texts, a cleared rating, and progress fixed at 100.
	    """
	    total = len(dataset)
	    if index < total:
	        return load_example(index)
	    done_text = "✅ ALL DONE"
	    return (
	        done_text,
	        done_text,
	        None,
	        total,
	        f"✅ Completed all {total} samples",
	        100,
	        total,
	    )

	def next_item(index, rating, src_txt, eng_txt, username):
	    """Save the rating for the current sample and advance to the next one.

	    Args:
	        index: Zero-based index of the sample being rated.
	        rating: Value selected in the rating dropdown, or None.
	        src_txt: Source text currently displayed.
	        eng_txt: Translated text currently displayed.
	        username: Raw username from the username textbox.

	    Returns:
	        The 7-tuple from ``load_example_or_done`` for the first sample this
	        user has not rated yet.

	    Raises:
	        gr.Error: If there is no sample at ``index``, no rating is
	            selected, or the username is missing or empty once sanitized.
	    """
	    # On the completion screen the index equals len(dataset) and the text
	    # boxes hold placeholders; saving there would store a bogus record.
	    if index >= len(dataset):
	        raise gr.Error("All samples are already annotated; there is nothing to save.")
	    if rating is None:
	        raise gr.Error("Please select a rating before proceeding!")
	    if not username:
	        raise gr.Error("Please enter your username!")
	    safe_user = sanitize_username(username)
	    if not safe_user:
	        raise gr.Error("Username must contain letters/numbers (optionally _ or -).")
	    record = {
	        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	        "index": index,
	        "src_text": src_txt,
	        "translated_text": eng_txt,
	        "rating": rating,
	        "username": safe_user,
	    }
	    save_rating_to_json(record, safe_user)
	    gr.Info(f"Saved record {index + 1} for {safe_user}.")

	    # After saving, resume at the first unannotated index.
	    next_idx = get_last_index_for_user(safe_user)
	    return load_example_or_done(next_idx)

	def jump_to_instance(target_index):
	    """Show the sample with the given 1-based number.

	    Args:
	        target_index: Value of the "Jump to Sample #" box.

	    Returns:
	        The 7-tuple from ``load_example`` for the requested sample.
	        Numbers outside the dataset are clamped to the first or last
	        sample, so jumping past the end no longer shows the "all done"
	        screen for a user who has not finished.

	    Raises:
	        gr.Error: If the box is empty.
	    """
	    # An empty Number field presumably arrives as None (per Gradio's
	    # Number docs); `None - 1` would otherwise raise a TypeError.
	    if target_index is None:
	        raise gr.Error("Please enter a sample number to jump to.")
	    return load_example(int(target_index) - 1)

	with gr.Blocks(css=custom_css) as demo:
	    # --- Session controls -------------------------------------------------
	    username_box = gr.Textbox(label="Enter your username", value="", interactive=True)
	    login_btn = gr.Button("Start/Resume Session", variant="primary")
	    current_index = gr.State(0)
	    total_count = len(dataset)
	    gr.Markdown("### Translation Quality Annotation")

	    # --- Progress and navigation -------------------------------------------
	    with gr.Row(elem_classes="nav-row"):
	        with gr.Column(scale=2):
	            progress_bar = gr.Slider(label="Progress", minimum=0, maximum=100, value=0, interactive=False)
	            progress_display = gr.Markdown(f"Sample 1 of {total_count} (0.0%)")
	        with gr.Column(scale=1):
	            jump_input = gr.Number(label="Jump to Sample #", value=1, precision=0)
	            jump_btn = gr.Button("Go", size="sm")

	    # --- Side-by-side texts ------------------------------------------------
	    with gr.Row():
	        with gr.Column():
	            gr.Markdown("##### Source Fulltext (English)")
	            src_display = gr.Textbox(value=dataset[0]["src_fulltext"], interactive=False, lines=12, show_label=False)
	        with gr.Column():
	            gr.Markdown("##### Fulltext Translation (Bangla)")
	            # Despite its name, this box shows the translated text.
	            eng_display = gr.Textbox(value=dataset[0]["translated_fulltext"], interactive=False, lines=12, show_label=False)

	    # --- Rating and actions ------------------------------------------------
	    rating_dropdown = gr.Dropdown(choices=RATING_OPTIONS, label="Select Rating")
	    with gr.Row():
	        prev_btn = gr.Button("⬅ Previous (Review)", variant="secondary")
	        submit_btn = gr.Button("Save & Next ➡", variant="primary")

	    def login_user(username):
	        """Validate the username and load the sample the user resumes at."""
	        safe_user = sanitize_username(username)
	        if not safe_user:
	            raise gr.Error("Please enter a valid username (letters/numbers, _ or -).")
	        return load_example_or_done(get_last_index_for_user(safe_user))

	    # Every handler returns the same 7-tuple, so all four events write to
	    # the same components in the same order.
	    view_outputs = [
	        src_display,
	        eng_display,
	        rating_dropdown,
	        current_index,
	        progress_display,
	        progress_bar,
	        jump_input,
	    ]

	    login_btn.click(fn=login_user, inputs=[username_box], outputs=view_outputs)

	    submit_btn.click(
	        fn=next_item,
	        inputs=[current_index, rating_dropdown, src_display, eng_display, username_box],
	        outputs=view_outputs,
	    )

	    # Previous: step back one sample (load_example clamps at the first one).
	    prev_btn.click(
	        fn=lambda idx: load_example_or_done(idx - 1),
	        inputs=[current_index],
	        outputs=view_outputs,
	    )

	    # Jump: go to the 1-based sample number typed into the jump box.
	    jump_btn.click(fn=jump_to_instance, inputs=[jump_input], outputs=view_outputs)

	# Launch the app only when this file is executed as a script.
	# share=True asks Gradio to also create a public share link.
	if __name__ == "__main__":
	    demo.launch(share=True)