Spaces:
Running
Running
| import os | |
| import random | |
| import datetime | |
| import gradio as gr | |
| from typing import Tuple, Optional | |
| from utils.common import upload_to_github | |
| from utils.postprocess import make_dialog_dict, dialog_translate | |
def sample_pair(dialog_dict: dict) -> Tuple[str, str, str, str]:
    """
    Draw two distinct models at random and one random dialog for each of them.

    Args:
        dialog_dict (dict): Maps each model name to a sequence of its dialogs.

    Returns:
        Tuple[str, str, str, str]: (first model, its dialog, second model, its dialog)
    """
    # random.sample never repeats an element, so the two model names differ.
    chosen_models = random.sample(list(dialog_dict), 2)
    # Dialogs are drawn in the same order as the models were picked.
    picked = [(name, random.choice(dialog_dict[name])) for name in chosen_models]
    (first_model, first_dialog), (second_model, second_dialog) = picked
    return first_model, first_dialog, second_model, second_dialog
def new_comparison(dialog_dict: dict) -> Tuple[str, str, str, str, str, str, dict, dict, dict, str]:
    """
    Start the arena: sample a first comparison pair and create a result file path.

    Args:
        dialog_dict (dict): Dictionary of dialogs per model.

    Returns:
        Tuple of ten values:
            (model1 name, dialog1, model2 name, dialog2,
             dialog1 again, dialog2 again,
             arena row visibility update (visible),
             submit row visibility update (visible),
             new comparison button visibility update (hidden),
             result file path)
        The two repeated dialogs feed the English-source state of each side,
        so the original text is kept while the displayed text may be translated.
    """
    m1, d1, m2, d2 = sample_pair(dialog_dict)
    result_path = set_result_path()
    return (
        m1, d1, m2, d2,
        d1, d2,
        gr.update(visible=True),   # arena row
        gr.update(visible=True),   # submit row
        gr.update(visible=False),  # new comparison button
        result_path,
    )
def update_scores(score_a: Optional[int],
                  score_b: Optional[int],
                  win: Optional[str],
                  m1: str,
                  m2: str,
                  d1: str,
                  d2: str,
                  en1: str,
                  kr1: str,
                  en2: str,
                  kr2: str,
                  l_state1: str,
                  l_state2: str,
                  dialog_dict: dict,
                  score_state: dict,
                  is_dev: bool,
                  result_file_path: str) -> tuple:
    """
    Merge one vote or rating into the pending scores and advance the arena.

    Each non-None value among `score_a`, `score_b` and `win` is stored under the
    key "A", "B" or "win". Once all three keys are present, one tab-separated
    line (win, score A, score B, model1, model2) is appended to
    `result_file_path` and a new dialog pair is sampled.

    Args:
        score_a (Optional[int]): Rating for simulation A, or None.
        score_b (Optional[int]): Rating for simulation B, or None.
        win (Optional[str]): Chosen side ('A' or 'B' from the vote buttons), or None.
        m1 (str): Name of model A.
        m2 (str): Name of model B.
        d1 (str): Text currently displayed for simulation A.
        d2 (str): Text currently displayed for simulation B.
        en1 (str): English text of simulation A.
        kr1 (str): Korean text of simulation A (may be empty).
        en2 (str): English text of simulation B.
        kr2 (str): Korean text of simulation B (may be empty).
        l_state1 (str): Language currently displayed for simulation A.
        l_state2 (str): Language currently displayed for simulation B.
        dialog_dict (dict): Dictionary of dialogs per model.
        score_state (dict): Scores collected so far for the current pair.
        is_dev (bool): Currently unused; results are written regardless of it.
        result_file_path (str): File the completed results are appended to.

    Returns:
        tuple: 17 values in this order:
            (model1 name, dialog1, model2 name, dialog2,
             en1, kr1, en2, kr2, language state 1, language state 2,
             arena update, rate A row update, rate B row update,
             vote A row update, vote B row update, status message, score state)
    """
    # Work on a copy so the incoming state dict is never mutated in place.
    scores = score_state.copy()
    if score_a is not None:
        scores["A"] = score_a
    if score_b is not None:
        scores["B"] = score_b
    if win is not None:
        scores["win"] = win

    # Waiting the other score
    if not ("A" in scores and "B" in scores and "win" in scores):
        return (
            m1, d1, m2, d2,
            en1, kr1, en2, kr2,
            l_state1, l_state2,
            gr.update(visible=True),   # arena
            gr.update(visible=True),   # rate A button
            gr.update(visible=True),   # rate B button
            gr.update(visible=False),  # vote A button
            gr.update(visible=False),  # vote B button
            "๐ Waiting for the other score...", scores
        )

    # Save the data
    # NOTE(review): an `if not is_dev:` guard was commented out here, so results
    # are also recorded in dev mode - confirm this is intended.
    with open(result_file_path, "a") as f:
        f.write(f"{scores['win']}\t{scores['A']}\t{scores['B']}\t{m1}\t{m2}\n")

    # Start the next round: new pair, cleared translations, English display,
    # and an empty score state.
    new_m1, new_d1, new_m2, new_d2 = sample_pair(dialog_dict)
    return (
        new_m1, new_d1, new_m2, new_d2,
        new_d1, "", new_d2, "",
        "en", "en",
        gr.update(visible=True),   # arena
        gr.update(visible=False),  # rate A button
        gr.update(visible=False),  # rate B button
        gr.update(visible=True),   # vote A button
        gr.update(visible=True),   # vote B button
        "โ Both scores recorded!", {}
    )
def save_data(path: str) -> str:
    """
    Save the human evaluation data by uploading the result file.

    Args:
        path (str): Path of the result file to read and upload.

    Returns:
        str: Message of the result submission (success, or the failure reason).
    """
    try:
        # Read through a context manager so the file handle is always closed,
        # even when the upload below fails.
        with open(path) as result_file:
            content = result_file.read()
        upload_to_github(path, content)
        return "โ Results are successfully submitted!"
    except Exception as e:
        # Deliberately broad: any failure (missing file, upload error) is
        # reported back as a message instead of being raised.
        return f"โ Upload failed: {e}"
def set_result_path() -> str:
    """
    Build a unique result file path and make sure its directory exists.

    The file name combines the current local timestamp with 4 random bytes
    rendered as hex. Only the directory is created; the file itself is not.

    Returns:
        str: Path to save the results.
    """
    output_dir = "simulation_arena"
    stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    random_suffix = os.urandom(4).hex()
    os.makedirs(output_dir, exist_ok=True)
    return os.path.join(output_dir, f"result_{stamp}_{random_suffix}.txt")
def toggle_language(en_dialog, kr_dialog, lang):
    """
    Switch the displayed dialog between English and Korean.

    Args:
        en_dialog: English dialog text.
        kr_dialog: Korean dialog text; when empty it is produced with
            dialog_translate on the first switch to Korean.
        lang: Language currently displayed ("en" switches to Korean,
            anything else switches back to English).

    Returns:
        Tuple of (text to display, new language state, Korean dialog text).
    """
    # Anything other than English on screen means we go back to English.
    if lang != "en":
        return en_dialog, "en", kr_dialog

    # Translate only when no Korean text is available yet.
    korean_text = kr_dialog or dialog_translate(en_dialog)
    return korean_text, "kr", korean_text
# Custom CSS passed to gr.Blocks: elements with the "dialog-box" class get a
# fixed 600px height and scroll vertically when their content overflows.
css = """
.dialog-box {
height: 600px;
overflow-y: auto;
}
"""
# Arena GUI
# Dev switch: controls whether the model-name textboxes are visible and
# whether the app is launched on port 7860.
is_dev = False
# Built once at import time by the project helper; sample_pair indexes it by
# model name and picks a dialog from each entry.
dialog_dict = make_dialog_dict()
# Build the whole UI: instructions (English, then Korean), the two-column
# arena with vote/rate/translate controls, the submit row, and event wiring.
# NOTE(review): the nesting below is reconstructed from statement order -
# confirm the placement of `msg` and `submit_msg` against the real layout.
with gr.Blocks(title="1:1 Outpatient Model Simulation Arena", css=css) as demo:
    gr.Markdown("# ๐ค Model Arena Evaluation")
    gr.Markdown("## Compare two model simulations and choose the better one!")
    # English version
    gr.Markdown("### Situation")
    gr.Markdown("* This scenario assumes that the patient called the hospital's administrative office for an outpatient inquiry.")
    gr.Markdown("* Depending on the patient, some may already know their condition because they have diagnostic records from a smaller clinic they previously visited, while others may only know their symptoms since itโs their first visit to the hospital.")
    gr.Markdown("<br>")
    gr.Markdown("### Procedure and Explanation")
    gr.Markdown("* First step: Arena! Please choose which of the two simulations you think is better.")
    gr.Markdown("* Second step: After making your choice, a scoring panel will appear. Please rate each simulation on a scale of 1 to 5.")
    gr.Markdown("* Regardless of which one you selected in step 1, please rate each simulation indepentently based on the criteria below.")
    gr.Markdown("* Thereโs no required number! Just do as many as you feel like, and hit the submit button before you leave. You can always come back and do a few more later if youโre bored!.")
    gr.Markdown("<br>")
    gr.Markdown("### Evaluation Criteria")
    gr.Markdown("If the simulation satisfies all four criteria below, please give it 5 points. Deduct 1 point for each criterion that is not met. If none of the criteria are satisfied, assign a score of 1 point.")
    gr.Markdown("* **Patient**: The patient expresses symptoms naturally without using excessive medical jargon.")
    gr.Markdown("* **Staff** : The staff does not diagnose like a doctor or provide treatment based on previous medical records, but instead focuses on asking appropriate questions within the scope of symptom checking, registration, and guidance. The tone of language is empathetic and polite.")
    gr.Markdown("* **Flow** : The conversation proceeds naturally in the following order โ greeting โ patient information collection โ symptom collection โ department assignment โ and each stage achieves its intended purpose.")
    gr.Markdown("* **Overall**: The conversation overall feels realistic, resembling an actual hospital reception scenario (sentences are concise and the closing expressions sound natural).")
    # Korean version
    gr.Markdown("---")
    gr.Markdown("### ์ํฉ")
    gr.Markdown("* ํ์๊ฐ ๋ณ์ ์๋ฌด๊ณผ์ ์ธ๋ ์ง๋ฃ ๋ฌธ์๋ฅผ ์ํด ์ ํํ ์ํฉ์ ๊ฐ์ ํฉ๋๋ค.")
    gr.Markdown("* ํ์์ ๋ฐ๋ผ ์ด์ ์์ ๋ณ์์์ ๋ฐ์ ์ง๋จ ๊ธฐ๋ก์ ๊ฐ์ง๊ณ ์์ด ์์ ์ ์ง๋ณ์ ์ด๋ฏธ ์๊ณ ์๋ ๊ฒฝ์ฐ๋ ์๊ณ , ๋ณ์์ ์ฒ์ ๋ฐฉ๋ฌธํด ์ฆ์๋ง ์๊ณ ์๋ ๊ฒฝ์ฐ๋ ์์ต๋๋ค.")
    gr.Markdown("<br>")
    gr.Markdown("### ๊ณผ์ ๋ฐ ์ค๋ช ")
    gr.Markdown("* **First Step: Arena!** ๋ ๊ฐ์ ์๋ฎฌ๋ ์ด์ ์ค์์, ์ค์ ๋ณ์ ์๋ฌด๊ณผ ์ง์๊ณผ์ ๋ํ๊ฐ **๋ ํ์ค์ ๊ฒ**์ ์ ํํด์ฃผ์ธ์.")
    gr.Markdown("* **Second step: Rate!** ์ ํ์ด ๋๋๋ฉด ์ ์ํ์ด ๋น๋๋ค. ๊ฐ๊ฐ์ ์๋ฎฌ๋ ์ด์ ์ 1~5์ ์ฌ์ด ์ ์๋ฅผ ๋งค๊ฒจ์ฃผ์ธ์. ํ๊ฐ ๊ธฐ์ค์ ์๋ ์น์ ์ ์ฐธ๊ณ ํด์ฃผ์ธ์!")
    gr.Markdown("* 1๋ฒ์์ ์ด๋ค ๊ฑธ ๊ณจ๋๋์ง์ ์๊ด์์ด, ๋ ์๋ฎฌ๋ ์ด์ ์ ๋ณด๊ณ ์๋ ํ๊ฐ ๊ธฐ์ค์ ๋ง์ถฐ์ ์ ์๋ฅผ ๋งค๊ฒจ์ฃผ์ธ์.")
    gr.Markdown("* ๊ฐ์๋ ์๊ด ์์ต๋๋ค(๊ทธ๋๋ ์ต์ 10๊ฐ์ ๋๋ง ๋ถํ๋๋ ค์ฉ).. ๊ทธ๋ฅ ์ ๋นํ ํ์ค๋งํผ ํ์๋ค๊ฐ <i>**๋คํ๊ณ ๋๊ฐ์๊ธฐ ์ ์ ๊ผญ submit ๋ฒํผ๋ง ๋๋ฌ์ฃผ์๋ฉด ๋ผ์!**</i> ๊ทธ๋ฆฌ๊ณ ๋์ค์ ์ฌ์ฌํ์๋ฉด ์ซ์ซ๋ฐ๋ฆฌ ํด์ฃผ์ ๋ ์ข์์ฌ..")
    gr.Markdown("<br>")
    gr.Markdown("### ์ ์ ํ๊ฐ ๊ธฐ์ค")
    gr.Markdown("**์๋์ 4๊ฐ์ง ๊ธฐ์ค์ ๋ชจ๋ ๋ง์กฑํ๋ฉด 5์ , ๊ธฐ์ค์ ์ถฉ์กฑํ์ง ๋ชปํ ํญ๋ชฉ์ด ํ๋ ์์ ๋๋ง๋ค 1์ ์ฉ ๊ฐ์ ํด ์ฃผ์ธ์. ๋ชจ๋ ๊ธฐ์ค์ ๋ง์กฑํ์ง ๋ชปํ ๊ฒฝ์ฐ๋ 1์ ์ ๋ถ์ฌํ๋ฉด ๋ฉ๋๋ค.**")
    gr.Markdown("* **Patient**: ์ฆ์ ํธ์๊ฐ ์์ฐ์ค๋ฝ๊ณ , ๊ณผ๋ํ ์ํ ์ ๋ฌธ ์ฉ์ด๊ฐ ์ฌ์ฉ๋์ง ์์๋์ง.")
    gr.Markdown("* **Staff** : ์์ฌ์ฒ๋ผ ์ง๋จํ๊ฑฐ๋ ํน์ธ ์ด์ ์ง๋จ ๊ธฐ๋ก์ ๋ฐํ์ผ๋ก ์น๋ฃ๋ฅผ ํ์ง ์๊ณ , ์ฆ์ ํ์ธยท์ ์ยท์๋ด ๋ฒ์ ๋ด์์ ์ง์๋ฅผ ์ ์ํํ๋์ง, ์ธ์ด ํค์ด ๊ณต๊ฐ ์๊ณ ์น์ ํ๋์ง.")
    gr.Markdown("* **Flow** : ์ธ์ฌ โ ํ์ ์ ๋ณด ์์ง โ ์ฆ์ ์์ง ๋ฐ ์ด์ ์ง๋จ ๊ธฐ๋ก ์ฌ๋ถ โ ์ง๋ฃ๊ณผ ๋ฐฐ์ ์์๋ก ์์ฐ์ค๋ฝ๊ฒ ์งํ๋์๊ณ , ๊ฐ ๋จ๊ณ์ ๋ชฉ์ ์ด ๋ชจ๋ ๋ฌ์ฑ๋์๋์ง.")
    gr.Markdown("* **Overall**: ์ ์ฒด์ ์ผ๋ก ์ค์ ๋ณ์ ์ ์ ์ํฉ์ฒ๋ผ ๋๊ปด์ง๋์ง(๋ฌธ์ฅ ํํ์ด ๊ฐ๊ฒฐํ๊ณ , ๋ํ์ ๋๋งบ์์ด ์์ฐ์ค๋ฌ์ด์ง ๋ฑ).")
    gr.Markdown("<br>")
    gr.Markdown("### ๊ธฐํ")
    gr.Markdown("ํธ์๋ฅผ ์ํด LLM ๋ฒ์ญ ๊ธฐ๋ฅ์ ์ถ๊ฐํ๋๋ฐ(Change language), ๋ฒ์ญํ๋ ๋ฐ ์๊ฐ์ด ๊ฑธ๋ฆด ์ ์์ต๋๋ค. GPT-5 Nano ๋ชจ๋ธ์ ์ฌ์ฉํ๊ณ ์์ด์ ๋ถ์ ํํ ์ ์์ผ๋ ๋ฒ์ญ๋ณธ์ ์ฐธ๊ณ ์ฉ์ผ๋ก๋ง ํด์ฃผ์๋ฉด ๊ฐ์ฌํ๊ฒ ์ต๋๋ค.")
    gr.Markdown("* ๊ฐ๋ น 'I am sorry to hear that'์ด๋ผ๋ ๋ฌธ์ฅ์ '๋ฃ๊ธฐ์ ์ฃ์กํฉ๋๋ค' ์ด๋ ๊ฒ ์ด์ํ๊ฒ ํด์๋๋ ๊ฒฝ์ฐ๊ฐ ์์ ์ ์๋๋ฐ, ๋ฒ์ญํด์ ์ฝ๋ค๊ฐ ์ด์ํ ๋ถ๋ถ์ ์์ด๋ก ์ ๊น ๋ฐ๊ฟ์ ์๋ฌธ์ ๋ด์ฃผ์๋ฉด ๋ฉ๋๋ค. ์ด๋ฌํ ๊ฒฝ์ฐ ๋ฒ์ญ์ ์ค๋ฅ์ด๋ฏ๋ก ์ด ๋ถ๋ถ์ ๊ฐ์ํ๊ณ ์ ์๋ฅผ ๋งค๊ฒจ์ฃผ์๊ธฐ ๋ฐ๋๋๋ค.")
    gr.Markdown("<br><br>")
    gr.Markdown("---")
    # New comparison button
    btn_new = gr.Button("๐ฅ Start Arena!! ๐ฅ")
    # State holding the dialog dictionary; it is the only input of new_comparison.
    state_dict = gr.State(dialog_dict)
    # Hidden per-side state: English source text, Korean translation (empty
    # until toggle_language fills it), and the language currently displayed.
    en1_dialog_box, en2_dialog_box = gr.State(""), gr.State("")
    kr1_dialog_box, kr2_dialog_box = gr.State(""), gr.State("")
    lang_state1, lang_state2 = gr.State("en"), gr.State("en")
    # Showing two model simulations side by side
    with gr.Row(visible=False) as arena_row:
        with gr.Column():
            # The real model name is only shown in dev mode.
            model1_name = gr.Textbox(label="Model A", interactive=False, visible=is_dev)
            if not is_dev:
                msg = gr.Markdown("## Anonymous Model A")
            dialog1_box = gr.Markdown(label="Simulation A", elem_classes="dialog-box")
            with gr.Row(visible=True) as translate_A:
                tr1 = gr.Button("Change language")
            with gr.Row(visible=True) as vote1_row:
                vote1 = gr.Button("๐ Choose A")
            # Rating buttons 1-5; hidden until update_scores shows them.
            with gr.Row(visible=False) as scoreA_row:
                scoreA_buttons = [gr.Button(str(i)) for i in range(1, 6)]
        with gr.Column():
            model2_name = gr.Textbox(label="Model B", interactive=False, visible=is_dev)
            if not is_dev:
                msg = gr.Markdown("## Anonymous Model B")
            dialog2_box = gr.Markdown(label="Simulation B", elem_classes="dialog-box")
            with gr.Row(visible=True) as translate_B:
                tr2 = gr.Button("Change language")
            with gr.Row(visible=True) as vote2_row:
                vote2 = gr.Button("๐ Choose B")
            with gr.Row(visible=False) as scoreB_row:
                scoreB_buttons = [gr.Button(str(i)) for i in range(1, 6)]
    # Showing the status
    # `msg` is rebound here; the "Anonymous Model" headers assigned to the same
    # name above are not referenced afterwards, so every handler writes to this one.
    msg = gr.Markdown("")
    # Submit button
    with gr.Row(visible=False) as submit_row:
        submit_btn = gr.Button("๐ค Submit All Results")
        submit_msg = gr.Markdown("")
    # Button actions
    # Holds the result file path returned by new_comparison (None until then).
    result_save_state = gr.State(None)
    btn_new.click(
        fn=new_comparison,
        inputs=[state_dict],
        outputs=[model1_name, dialog1_box, model2_name, dialog2_box, en1_dialog_box, en2_dialog_box, arena_row, submit_row, btn_new, result_save_state],
    )
    # Each translate button toggles only its own side.
    tr1.click(
        fn=toggle_language,
        inputs=[en1_dialog_box, kr1_dialog_box, lang_state1],
        outputs=[dialog1_box, lang_state1, kr1_dialog_box]
    )
    tr2.click(
        fn=toggle_language,
        inputs=[en2_dialog_box, kr2_dialog_box, lang_state2],
        outputs=[dialog2_box, lang_state2, kr2_dialog_box]
    )
    # First step: Arena, Choose the only one!
    # Scores collected for the current pair (keys "A", "B", "win" set by update_scores).
    score_state = gr.State({})
    # The lambdas read the module-level dialog_dict and is_dev directly instead
    # of receiving them as inputs; the winner is passed as the string 'A' or 'B'.
    vote1.click(
        fn=lambda m1, m2, d1, d2, en1, kr1, en2, kr2, l_state1, l_state2, scores, result_file_path: update_scores(None, None, 'A', m1, m2, d1, d2, en1, kr1, en2, kr2, l_state1, l_state2, dialog_dict, scores, is_dev, result_file_path),
        inputs=[model1_name, model2_name, dialog1_box, dialog2_box, en1_dialog_box, kr1_dialog_box, en2_dialog_box, kr2_dialog_box, lang_state1, lang_state2, score_state, result_save_state],
        outputs=[model1_name, dialog1_box, model2_name, dialog2_box, en1_dialog_box, kr1_dialog_box, en2_dialog_box, kr2_dialog_box, lang_state1, lang_state2, arena_row, scoreA_row, scoreB_row, vote1_row, vote2_row, msg, score_state],
    )
    vote2.click(
        fn=lambda m1, m2, d1, d2, en1, kr1, en2, kr2, l_state1, l_state2, scores, result_file_path: update_scores(None, None, 'B', m1, m2, d1, d2, en1, kr1, en2, kr2, l_state1, l_state2, dialog_dict, scores, is_dev, result_file_path),
        inputs=[model1_name, model2_name, dialog1_box, dialog2_box, en1_dialog_box, kr1_dialog_box, en2_dialog_box, kr2_dialog_box, lang_state1, lang_state2, score_state, result_save_state],
        outputs=[model1_name, dialog1_box, model2_name, dialog2_box, en1_dialog_box, kr1_dialog_box, en2_dialog_box, kr2_dialog_box, lang_state1, lang_state2, arena_row, scoreA_row, scoreB_row, vote1_row, vote2_row, msg, score_state],
    )
    # Second step: Rate, Rate the each score!
    # gr.State(btn.value) supplies the button's own label ("1".."5") as the
    # first input, so the lambda does not need to capture `btn`; it converts
    # the label to int before calling update_scores.
    for btn in scoreA_buttons:
        btn.click(
            fn=lambda score, m1, m2, d1, d2, en1, kr1, en2, kr2, l_state1, l_state2, scores, result_file_path: update_scores(int(score), None, None, m1, m2, d1, d2, en1, kr1, en2, kr2, l_state1, l_state2, dialog_dict, scores, is_dev, result_file_path),
            inputs=[gr.State(btn.value), model1_name, model2_name, dialog1_box, dialog2_box, en1_dialog_box, kr1_dialog_box, en2_dialog_box, kr2_dialog_box, lang_state1, lang_state2, score_state, result_save_state],
            outputs=[model1_name, dialog1_box, model2_name, dialog2_box, en1_dialog_box, kr1_dialog_box, en2_dialog_box, kr2_dialog_box, lang_state1, lang_state2, arena_row, scoreA_row, scoreB_row, vote1_row, vote2_row, msg, score_state],
        )
    for btn in scoreB_buttons:
        btn.click(
            fn=lambda score, m1, m2, d1, d2, en1, kr1, en2, kr2, l_state1, l_state2, scores, result_file_path: update_scores(None, int(score), None, m1, m2, d1, d2, en1, kr1, en2, kr2, l_state1, l_state2, dialog_dict, scores, is_dev, result_file_path),
            inputs=[gr.State(btn.value), model1_name, model2_name, dialog1_box, dialog2_box, en1_dialog_box, kr1_dialog_box, en2_dialog_box, kr2_dialog_box, lang_state1, lang_state2, score_state, result_save_state],
            outputs=[model1_name, dialog1_box, model2_name, dialog2_box, en1_dialog_box, kr1_dialog_box, en2_dialog_box, kr2_dialog_box, lang_state1, lang_state2, arena_row, scoreA_row, scoreB_row, vote1_row, vote2_row, msg, score_state],
        )
    # Uploads the accumulated result file and shows the outcome message.
    submit_btn.click(
        fn=save_data,
        inputs=[result_save_state],
        outputs=[submit_msg],
    )
# Launch the app
# Dev mode pins the server to port 7860; otherwise launch() is called with no arguments.
launch_options = {"server_port": 7860} if is_dev else {}
demo.launch(**launch_options)