Spaces:

ljm565-org
/

H-AdminSim_Arena

Running

App Files Files Community

H-AdminSim_Arena / app.py

ljm565

feat: Update .gitignore

d0e7aed 12 days ago

raw

history blame contribute delete

16.1 kB

	import os
	import random
	import datetime
	import gradio as gr
	from typing import Tuple, Optional

	from utils.common import upload_to_github
	from utils.postprocess import make_dialog_dict, dialog_translate



	def sample_pair(dialog_dict: dict) -> Tuple[str, str, str, str]:
	    """
	    Draw two distinct models at random, then one random dialog for each of them.

	    Args:
	        dialog_dict (dict): Maps each model name to its collection of dialogs.

	    Returns:
	        Tuple[str, str, str, str]: (first model name, its dialog, second model name, its dialog)

	    Raises:
	        ValueError: Propagated from random.sample when fewer than two models are available.
	    """
	    # Iterating a dict yields its keys, i.e. the model names.
	    first, second = random.sample(list(dialog_dict), 2)
	    picked_first = random.choice(dialog_dict[first])
	    picked_second = random.choice(dialog_dict[second])
	    return first, picked_first, second, picked_second



	def new_comparison(dialog_dict: dict) -> tuple:
	    """
	    Start an arena session: sample the first dialog pair and reveal the arena UI.

	    Args:
	        dialog_dict (dict): Dictionary of dialogs per model.

	    Returns:
	        tuple: Ten values, in the order expected by the "Start Arena" click outputs:
	            model1 name, dialog1, model2 name, dialog2,
	            dialog1 and dialog2 again (stored in the en1/en2 dialog states),
	            arena row update (visible), submit row update (visible),
	            start button update (hidden),
	            path of the result file created for this session.
	    """
	    m1, d1, m2, d2 = sample_pair(dialog_dict)
	    result_path = set_result_path()
	    return (
	        m1, d1, m2, d2,
	        d1, d2,                    # untranslated copies kept alongside the displayed text
	        gr.update(visible=True),   # arena row
	        gr.update(visible=True),   # submit row
	        gr.update(visible=False),  # start button
	        result_path,
	    )



	def update_scores(score_a: Optional[int],
	                  score_b: Optional[int],
	                  win: Optional[str],
	                  m1: str,
	                  m2: str,
	                  d1: str,
	                  d2: str,
	                  en1: str,
	                  kr1: str,
	                  en2: str,
	                  kr2: str,
	                  l_state1: str,
	                  l_state2: str,
	                  dialog_dict: dict,
	                  score_state: dict,
	                  is_dev: bool,
	                  result_file_path: str) -> tuple:
	    """
	    Merge one vote or rating into the round state; once complete, log it and deal a new pair.

	    A round is complete when the state holds a winner ("win") and a score for
	    both "A" and "B". A completed round is appended to result_file_path as one
	    tab-separated line: win, score A, score B, model1 name, model2 name.

	    Args:
	        score_a (Optional[int]): Rating for simulation A, or None if this call does not set it.
	        score_b (Optional[int]): Rating for simulation B, or None if this call does not set it.
	        win (Optional[str]): Chosen side ('A' or 'B' from the vote buttons), or None.
	        m1 (str): Name of model A for the current pair.
	        m2 (str): Name of model B for the current pair.
	        d1 (str): Dialog currently displayed for A.
	        d2 (str): Dialog currently displayed for B.
	        en1 (str): Stored original dialog for A.
	        kr1 (str): Stored translated dialog for A ("" if not translated yet).
	        en2 (str): Stored original dialog for B.
	        kr2 (str): Stored translated dialog for B ("" if not translated yet).
	        l_state1 (str): Current display language of A.
	        l_state2 (str): Current display language of B.
	        dialog_dict (dict): Dictionary of dialogs per model, used to sample the next pair.
	        score_state (dict): Votes/ratings collected so far for the current pair. Not mutated.
	        is_dev (bool): Currently unused; results are written regardless of its value.
	        result_file_path (str): File that completed rounds are appended to.

	    Returns:
	        tuple: 17 values: model1 name, dialog1, model2 name, dialog2,
	            en1, kr1, en2, kr2, language state 1, language state 2,
	            visibility updates for (arena row, score A row, score B row,
	            vote A row, vote B row), status message, new score state.
	    """
	    # Work on a copy so the caller's state object is never modified in place.
	    scores = score_state.copy()
	    if score_a is not None:
	        scores["A"] = score_a
	    if score_b is not None:
	        scores["B"] = score_b
	    if win is not None:
	        scores["win"] = win

	    # Round incomplete: keep the current pair on screen and show the rating rows.
	    if not ("A" in scores and "B" in scores and "win" in scores):
	        return (
	            m1, d1, m2, d2,
	            en1, kr1, en2, kr2,
	            l_state1, l_state2,
	            gr.update(visible=True),   # arena
	            gr.update(visible=True),   # rate A buttons
	            gr.update(visible=True),   # rate B buttons
	            gr.update(visible=False),  # vote A button
	            gr.update(visible=False),  # vote B button
	            "🕐 Waiting for the other score...", scores
	        )

	    # Round complete: persist it before moving on to the next pair.
	    with open(result_file_path, "a", encoding="utf-8") as f:
	        f.write(f"{scores['win']}\t{scores['A']}\t{scores['B']}\t{m1}\t{m2}\n")

	    new_m1, new_d1, new_m2, new_d2 = sample_pair(dialog_dict)
	    return (
	        new_m1, new_d1, new_m2, new_d2,
	        new_d1, "", new_d2, "",    # fresh originals, cached translations cleared
	        "en", "en",                # both sides back to the original language
	        gr.update(visible=True),   # arena
	        gr.update(visible=False),  # rate A buttons
	        gr.update(visible=False),  # rate B buttons
	        gr.update(visible=True),   # vote A button
	        gr.update(visible=True),   # vote B button
	        "✅ Both scores recorded!", {}
	    )



	def save_data(path: str) -> str:
	    """
	    Upload the saved human evaluation results.

	    Args:
	        path (str): Path of the result file to read and upload.

	    Returns:
	        str: Message of the result submission. Any failure (missing file,
	        invalid path, upload error) is reported in the message instead of raised,
	        so the UI always receives a string.
	    """
	    try:
	        # Read inside a context manager so the file handle is always closed,
	        # even when the upload below raises.
	        with open(path, encoding="utf-8") as f:
	            content = f.read()
	        upload_to_github(path, content)
	        return "✅ Results are successfully submitted!"
	    except Exception as e:
	        return f"❌ Upload failed: {e}"



	def set_result_path() -> str:
	    """
	    Build a fresh result file path and make sure its directory exists.

	    Returns:
	        str: Path inside "simulation_arena" named result_<timestamp>_<8 hex chars>.txt
	    """
	    out_dir = "simulation_arena"
	    stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
	    # Random suffix keeps paths distinct when two are created within the same second.
	    suffix = os.urandom(4).hex()
	    os.makedirs(out_dir, exist_ok=True)
	    return os.path.join(out_dir, f"result_{stamp}_{suffix}.txt")



	def toggle_language(en_dialog, kr_dialog, lang):
	    """
	    Flip the displayed dialog between the "en" and "kr" versions.

	    Args:
	        en_dialog: Original dialog text.
	        kr_dialog: Previously translated text; a falsy value means none is cached.
	        lang: Language code currently displayed.

	    Returns:
	        tuple: (text to display, new language code, cached translation)
	    """
	    # Anything other than "en" is showing: switch back to the original text.
	    if lang != "en":
	        return en_dialog, "en", kr_dialog

	    # Call the translator only when no cached translation exists.
	    translated = kr_dialog or dialog_translate(en_dialog)
	    return translated, "kr", translated



	# Custom stylesheet passed to gr.Blocks below: elements carrying the
	# "dialog-box" class (the two dialog Markdown panes) get a fixed height
	# and scroll vertically instead of growing with their content.
	css = """
	.dialog-box {
	height: 600px;
	overflow-y: auto;
	}
	"""



	# Arena GUI
	# Builds the Gradio page (instructions, two side-by-side dialog panes, vote /
	# rating / submit controls), wires the button callbacks, and launches the app.
	is_dev = False
	dialog_dict = make_dialog_dict()


	def _score_handler(score_a=None, score_b=None, win=None):
	    """Build a click callback that forwards one fixed vote/rating to update_scores."""
	    def handler(m1, m2, d1, d2, en1, kr1, en2, kr2, l_state1, l_state2, scores, result_file_path):
	        return update_scores(score_a, score_b, win, m1, m2, d1, d2, en1, kr1, en2, kr2,
	                             l_state1, l_state2, dialog_dict, scores, is_dev, result_file_path)
	    return handler


	with gr.Blocks(title="1:1 Outpatient Model Simulation Arena", css=css) as demo:
	    gr.Markdown("# 🤖 Model Arena Evaluation")
	    gr.Markdown("## Compare two model simulations and choose the better one!")

	    # English version
	    gr.Markdown("### Situation")
	    gr.Markdown("* This scenario assumes that the patient called the hospital's administrative office for an outpatient inquiry.")
	    gr.Markdown("* Depending on the patient, some may already know their condition because they have diagnostic records from a smaller clinic they previously visited, while others may only know their symptoms since it’s their first visit to the hospital.")
	    gr.Markdown("<br>")

	    gr.Markdown("### Procedure and Explanation")
	    gr.Markdown("* First step: Arena! Please choose which of the two simulations you think is better.")
	    gr.Markdown("* Second step: After making your choice, a scoring panel will appear. Please rate each simulation on a scale of 1 to 5.")
	    gr.Markdown("* Regardless of which one you selected in step 1, please rate each simulation independently based on the criteria below.")
	    gr.Markdown("* There’s no required number! Just do as many as you feel like, and hit the submit button before you leave. You can always come back and do a few more later if you’re bored!")
	    gr.Markdown("<br>")

	    gr.Markdown("### Evaluation Criteria")
	    gr.Markdown("If the simulation satisfies all four criteria below, please give it 5 points. Deduct 1 point for each criterion that is not met. If none of the criteria are satisfied, assign a score of 1 point.")
	    gr.Markdown("* Patient: The patient expresses symptoms naturally without using excessive medical jargon.")
	    gr.Markdown("* Staff : The staff does not diagnose like a doctor or provide treatment based on previous medical records, but instead focuses on asking appropriate questions within the scope of symptom checking, registration, and guidance. The tone of language is empathetic and polite.")
	    gr.Markdown("* Flow : The conversation proceeds naturally in the following order — greeting → patient information collection → symptom collection → department assignment — and each stage achieves its intended purpose.")
	    gr.Markdown("* Overall: The conversation overall feels realistic, resembling an actual hospital reception scenario (sentences are concise and the closing expressions sound natural).")

	    # Korean version
	    gr.Markdown("---")
	    gr.Markdown("### 상황")
	    gr.Markdown("* 환자가 병원 원무과에 외래 진료 문의를 위해 전화한 상황을 가정합니다.")
	    gr.Markdown("* 환자에 따라 이전 작은 병원에서 받은 진단 기록을 가지고 있어 자신의 질병을 이미 알고 있는 경우도 있고, 병원을 처음 방문해 증상만 알고 있는 경우도 있습니다.")
	    gr.Markdown("<br>")

	    gr.Markdown("### 과정 및 설명")
	    gr.Markdown("* First Step: Arena! 두 개의 시뮬레이션 중에서, 실제 병원 원무과 직원과의 대화가 더 현실적인 것을 선택해주세요.")
	    gr.Markdown("* Second step: Rate! 선택이 끝나면 점수판이 뜹니다. 각각의 시뮬레이션에 1~5점 사이 점수를 매겨주세요. 평가 기준은 아래 섹션을 참고 해주세요!")
	    gr.Markdown("* 1번에서 어떤 걸 골랐는지와 상관없이, 두 시뮬레이션을 보고 아래 평가 기준에 맞춰서 점수를 매겨주세요.")
	    gr.Markdown("* 개수는 상관 없습니다(그래도 최소 10개정도만 부탁드려용).. 그냥 적당히 하실만큼 하시다가 <i>다하고 나가시기 전에 꼭 submit 버튼만 눌러주시면 돼요!</i> 그리고 나중에 심심하시면 쫌쫌따리 해주셔도 좋아여..")
	    gr.Markdown("<br>")

	    gr.Markdown("### 점수 평가 기준")
	    gr.Markdown("아래의 4가지 기준을 모두 만족하면 5점, 기준을 충족하지 못한 항목이 하나 있을 때마다 1점씩 감점해 주세요. 모든 기준을 만족하지 못한 경우는 1점을 부여하면 됩니다.")
	    gr.Markdown("* Patient: 증상 호소가 자연스럽고, 과도한 의학 전문 용어가 사용되지 않았는지.")
	    gr.Markdown("* Staff : 의사처럼 진단하거나 혹은 이전 진단 기록을 바탕으로 치료를 하지 않고, 증상 확인·접수·안내 범위 내에서 질의를 잘 수행했는지, 언어 톤이 공감 있고 친절했는지.")
	    gr.Markdown("* Flow : 인사 → 환자 정보 수집 → 증상 수집 및 이전 진단 기록 여부 → 진료과 배정 순서로 자연스럽게 진행되었고, 각 단계의 목적이 모두 달성되었는지.")
	    gr.Markdown("* Overall: 전체적으로 실제 병원 접수 상황처럼 느껴지는지(문장 표현이 간결하고, 대화의 끝맺음이 자연스러운지 등).")
	    gr.Markdown("<br>")

	    gr.Markdown("### 기타")
	    gr.Markdown("편의를 위해 LLM 번역 기능을 추가했는데(Change language), 번역하는 데 시간이 걸릴 수 있습니다. GPT-5 Nano 모델을 사용하고 있어서 부정확할 수 있으니 번역본은 참고용으로만 해주시면 감사하겠습니다.")
	    gr.Markdown("* 가령 'I am sorry to hear that'이라는 문장을 '듣기에 죄송합니다' 이렇게 이상하게 해석되는 경우가 있을 수 있는데, 번역해서 읽다가 이상한 부분은 영어로 잠깐 바꿔서 원문을 봐주시면 됩니다. 이러한 경우 번역의 오류이므로 이 부분은 감안하고 점수를 매겨주시기 바랍니다.")

	    gr.Markdown("<br><br>")
	    gr.Markdown("---")

	    # New comparison button and per-session state
	    btn_new = gr.Button("🥊 Start Arena!! 🥊")
	    state_dict = gr.State(dialog_dict)
	    en1_dialog_box, en2_dialog_box = gr.State(""), gr.State("")  # original dialog texts
	    kr1_dialog_box, kr2_dialog_box = gr.State(""), gr.State("")  # cached translations
	    lang_state1, lang_state2 = gr.State("en"), gr.State("en")    # language shown per pane

	    # Showing two model simulations side by side
	    with gr.Row(visible=False) as arena_row:
	        with gr.Column():
	            # Real model names are only displayed in dev mode.
	            model1_name = gr.Textbox(label="Model A", interactive=False, visible=is_dev)
	            if not is_dev:
	                gr.Markdown("## Anonymous Model A")
	            dialog1_box = gr.Markdown(label="Simulation A", elem_classes="dialog-box")
	            with gr.Row(visible=True) as translate_A:
	                tr1 = gr.Button("Change language")
	            with gr.Row(visible=True) as vote1_row:
	                vote1 = gr.Button("👍 Choose A")
	            with gr.Row(visible=False) as scoreA_row:
	                scoreA_buttons = [gr.Button(str(i)) for i in range(1, 6)]

	        with gr.Column():
	            model2_name = gr.Textbox(label="Model B", interactive=False, visible=is_dev)
	            if not is_dev:
	                gr.Markdown("## Anonymous Model B")
	            dialog2_box = gr.Markdown(label="Simulation B", elem_classes="dialog-box")
	            with gr.Row(visible=True) as translate_B:
	                tr2 = gr.Button("Change language")
	            with gr.Row(visible=True) as vote2_row:
	                vote2 = gr.Button("👍 Choose B")
	            with gr.Row(visible=False) as scoreB_row:
	                scoreB_buttons = [gr.Button(str(i)) for i in range(1, 6)]

	    # Showing the status
	    msg = gr.Markdown("")

	    # Submit button
	    with gr.Row(visible=False) as submit_row:
	        submit_btn = gr.Button("📤 Submit All Results")
	    submit_msg = gr.Markdown("")

	    # Button actions
	    result_save_state = gr.State(None)
	    btn_new.click(
	        fn=new_comparison,
	        inputs=[state_dict],
	        outputs=[model1_name, dialog1_box, model2_name, dialog2_box, en1_dialog_box, en2_dialog_box, arena_row, submit_row, btn_new, result_save_state],
	    )
	    tr1.click(
	        fn=toggle_language,
	        inputs=[en1_dialog_box, kr1_dialog_box, lang_state1],
	        outputs=[dialog1_box, lang_state1, kr1_dialog_box],
	    )
	    tr2.click(
	        fn=toggle_language,
	        inputs=[en2_dialog_box, kr2_dialog_box, lang_state2],
	        outputs=[dialog2_box, lang_state2, kr2_dialog_box],
	    )

	    # Every vote/rating button feeds update_scores with the same inputs and
	    # outputs; only the fixed vote or score bound into the handler differs.
	    score_state = gr.State({})
	    round_inputs = [
	        model1_name, model2_name, dialog1_box, dialog2_box,
	        en1_dialog_box, kr1_dialog_box, en2_dialog_box, kr2_dialog_box,
	        lang_state1, lang_state2, score_state, result_save_state,
	    ]
	    round_outputs = [
	        model1_name, dialog1_box, model2_name, dialog2_box,
	        en1_dialog_box, kr1_dialog_box, en2_dialog_box, kr2_dialog_box,
	        lang_state1, lang_state2,
	        arena_row, scoreA_row, scoreB_row, vote1_row, vote2_row,
	        msg, score_state,
	    ]

	    # First step: Arena, Choose the only one!
	    vote1.click(fn=_score_handler(win='A'), inputs=round_inputs, outputs=round_outputs)
	    vote2.click(fn=_score_handler(win='B'), inputs=round_inputs, outputs=round_outputs)

	    # Second step: Rate, Rate the each score!
	    # The button label ("1".."5") is the score; it is bound when the handler is built.
	    for btn in scoreA_buttons:
	        btn.click(fn=_score_handler(score_a=int(btn.value)), inputs=round_inputs, outputs=round_outputs)
	    for btn in scoreB_buttons:
	        btn.click(fn=_score_handler(score_b=int(btn.value)), inputs=round_inputs, outputs=round_outputs)

	    submit_btn.click(
	        fn=save_data,
	        inputs=[result_save_state],
	        outputs=[submit_msg],
	    )

	# Launch the app
	if is_dev:
	    demo.launch(server_port=7860)
	else:
	    demo.launch()