Spaces:

ljm565-org
/

H-AdminSim_Arena

Running

File size: 16,149 Bytes

b2a5882
 
 
 
 
 
 
3703b6a
b2a5882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ee0c0f
3703b6a
b2a5882
 
 
af3a9ea
 
cfd32b7
af3a9ea
 
 
 
3703b6a
 
 
 
 
 
af3a9ea
 
e73f228
af3a9ea
b2a5882
af3a9ea
b2a5882
af3a9ea
 
 
 
 
cfd32b7
 
af3a9ea
 
cfd32b7
af3a9ea
b2a5882
cfd32b7
af3a9ea
 
 
 
3703b6a
 
af3a9ea
 
 
 
 
 
 
b2a5882
af3a9ea
 
 
 
3703b6a
 
af3a9ea
 
 
 
 
 
 
b2a5882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b76fa9b
b2a5882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3703b6a
 
 
 
 
 
 
 
 
 
b2a5882
 
 
 
 
 
 
 
 
 
 
 
 
 
f2ae91d
 
 
 
 
 
3703b6a
f2ae91d
 
 
 
3703b6a
f2ae91d
3703b6a
 
 
 
 
 
 
 
f2ae91d
 
 
 
30db1ac
f2ae91d
3703b6a
 
f2ae91d
30db1ac
dc9850f
3703b6a
30db1ac
3703b6a
 
 
962cda8
3703b6a
 
b69629d
d0e7aed
3703b6a
 
 
962cda8
 
f2ae91d
b76fa9b
 
b2a5882
 
adf8468
b2a5882
3703b6a
 
 
b2a5882
 
 
 
 
b76fa9b
 
af3a9ea
3703b6a
 
af3a9ea
 
 
 
b2a5882
 
 
b76fa9b
 
af3a9ea
3703b6a
 
af3a9ea
 
 
 
 
 
 
b2a5882
 
 
 
 
 
 
1ee0c0f
b2a5882
 
 
3703b6a
 
 
 
 
 
 
 
 
 
 
 
b2a5882
 
af3a9ea
cfd32b7
b2a5882
3703b6a
 
 
b2a5882
 
3703b6a
 
 
b2a5882
 
af3a9ea
 
 
3703b6a
 
 
af3a9ea
 
 
3703b6a
 
 
af3a9ea
1c18bb0
 
 
 
 
 
b2a5882

import os
import random
import datetime
import gradio as gr
from typing import Tuple, Optional

from utils.common import upload_to_github
from utils.postprocess import make_dialog_dict, dialog_translate



def sample_pair(dialog_dict: dict) -> Tuple[str, str, str, str]:
    """
    Draw two distinct models at random and pick one random dialog for each.

    Args:
        dialog_dict (dict): Dictionary of dialogs per model.

    Returns:
        Tuple[str, str, str, str]: (model1 name, dialog1, model2 name, dialog2)

    Raises:
        ValueError: If dialog_dict holds fewer than two models
            (raised by random.sample).
    """
    first_model, second_model = random.sample(list(dialog_dict), 2)
    # Tuple elements are evaluated left to right, so the dialog for the first
    # model is drawn before the dialog for the second one.
    return (
        first_model,
        random.choice(dialog_dict[first_model]),
        second_model,
        random.choice(dialog_dict[second_model]),
    )



def new_comparison(dialog_dict: dict) -> Tuple[str, str, str, str, str, str, dict, dict, dict, str]:
    """
    Start the arena: sample the first comparison pair, reveal the arena and
    choose the file that will collect the results.

    Args:
        dialog_dict (dict): Dictionary of dialogs per model.

    Returns:
        Tuple: Ten values, in this order:
            model1 name, dialog1, model2 name, dialog2,
            dialog1 and dialog2 once more (btn_new.click stores these copies
            as the untranslated originals),
            arena row update (visible),
            submit row update (visible),
            new comparison button update (hidden),
            path of the result file returned by set_result_path().
    """
    m1, d1, m2, d2 = sample_pair(dialog_dict)
    result_path = set_result_path()

    show_arena = gr.update(visible=True)
    show_submit = gr.update(visible=True)
    hide_start_button = gr.update(visible=False)
    return m1, d1, m2, d2, d1, d2, show_arena, show_submit, hide_start_button, result_path



def update_scores(score_a: Optional[int],
                  score_b: Optional[int],
                  win: Optional[str],
                  m1: str,
                  m2: str,
                  d1: str,
                  d2: str,
                  en1: str,
                  kr1: str,
                  en2: str,
                  kr2: str,
                  l_state1: str,
                  l_state2: str,
                  dialog_dict: dict,
                  score_state: dict,
                  is_dev: bool,
                  result_file_path: str) -> tuple:
    """
    Merge a new vote or rating into the pending scores and advance the arena.

    Every one of score_a, score_b and win that is not None overwrites the
    matching entry ("A", "B", "win") in a copy of score_state. Once all three
    entries exist, one tab-separated line (win, A, B, m1, m2) is appended to
    result_file_path and a fresh pair is sampled. Otherwise the current pair
    is kept and the rating buttons are shown instead of the vote buttons.

    Args:
        score_a (Optional[int]): Rating for simulation A, or None.
        score_b (Optional[int]): Rating for simulation B, or None.
        win (Optional[str]): Chosen side ('A' or 'B' in this file's callers), or None.
        m1 (str): Name of model A for the current pair.
        m2 (str): Name of model B for the current pair.
        d1 (str): Dialog currently displayed for A.
        d2 (str): Dialog currently displayed for B.
        en1 (str): Stored English dialog for A.
        kr1 (str): Stored translated dialog for A ("" if none yet).
        en2 (str): Stored English dialog for B.
        kr2 (str): Stored translated dialog for B ("" if none yet).
        l_state1 (str): Language state of A.
        l_state2 (str): Language state of B.
        dialog_dict (dict): Dictionary of dialogs per model.
        score_state (dict): Pending scores; never mutated, a copy is updated.
        is_dev (bool): Accepted for interface compatibility; not consulted.
        result_file_path (str): File the completed evaluation is appended to.

    Returns:
        tuple: 17 values, in this order:
            model1 name, dialog1, model2 name, dialog2,
            English dialog 1, translated dialog 1, English dialog 2, translated dialog 2,
            language state 1, language state 2,
            arena update, rate A update, rate B update, vote A update, vote B update,
            status message, pending scores dict.
    """
    scores = score_state.copy()
    for key, value in (("A", score_a), ("B", score_b), ("win", win)):
        if value is not None:
            scores[key] = value

    # Something is still missing: keep the current pair and show the rating
    # buttons in place of the vote buttons.
    if any(key not in scores for key in ("A", "B", "win")):
        return (
            m1, d1, m2, d2,
            en1, kr1, en2, kr2,
            l_state1, l_state2,
            gr.update(visible=True),    # arena
            gr.update(visible=True),    # rate A button
            gr.update(visible=True),    # rate B button
            gr.update(visible=False),   # vote A button
            gr.update(visible=False),   # vote B button
            "🕐 Waiting for the other score...", scores
        )

    # Save the data.
    # NOTE: the result is written regardless of is_dev.
    with open(result_file_path, "a") as f:
        f.write(f"{scores['win']}\t{scores['A']}\t{scores['B']}\t{m1}\t{m2}\n")

    # Start the next round: new pair, translations cleared, both sides back
    # to English, vote buttons shown again and pending scores emptied.
    new_m1, new_d1, new_m2, new_d2 = sample_pair(dialog_dict)
    return (
        new_m1, new_d1, new_m2, new_d2,
        new_d1, "", new_d2, "",
        "en", "en",
        gr.update(visible=True),    # arena
        gr.update(visible=False),   # rate A button
        gr.update(visible=False),   # rate B button
        gr.update(visible=True),    # vote A button
        gr.update(visible=True),    # vote B button
        "✅ Both scores recorded!", {}
    )



def save_data(path: str) -> str:
    """
    Save the human evaluation data by uploading the result file.

    Args:
        path (str): Path of the result file to upload.

    Returns:
        str: Message of the result submission. Failures (including a result
            file that cannot be read) are reported in the message instead of
            being raised, so the UI always gets a string to display.
    """
    try:
        # Read through a context manager so the file handle is closed even
        # when reading or uploading fails.
        with open(path) as f:
            content = f.read()
        upload_to_github(path, content)
    except Exception as e:
        # Broad on purpose: this is the UI boundary and any error must become
        # a user-visible message rather than crash the event handler.
        return f"❌ Upload failed: {e}"
    return "✅ Results are successfully submitted!"



def set_result_path() -> str:
    """
    Build a fresh result file path and make sure its directory exists.

    The file name combines the current local time with 4 random bytes in hex.
    Only the directory is created here, not the file itself.

    Returns:
        str: Path to save the results.
    """
    out_dir = "simulation_arena"
    now_tag = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    random_tag = os.urandom(4).hex()
    os.makedirs(out_dir, exist_ok=True)
    return os.path.join(out_dir, f"result_{now_tag}_{random_tag}.txt")



def toggle_language(en_dialog, kr_dialog, lang):
    """
    Switch one dialog between its English and translated text.

    Args:
        en_dialog: English dialog text.
        kr_dialog: Previously translated text; falsy if none exists yet.
        lang: Current language state; "en" means English is being shown.

    Returns:
        tuple: (text to display, new language state, translated text to keep).
            The translation is produced by dialog_translate only when
            switching away from "en" with no stored translation.
    """
    # Any state other than "en" switches back to the English text.
    if lang != "en":
        return en_dialog, "en", kr_dialog

    translated = kr_dialog if kr_dialog else dialog_translate(en_dialog)
    return translated, "kr", translated



# Custom CSS passed to gr.Blocks: gives elements with the "dialog-box" class
# (the two dialog Markdown panes) a fixed height with vertical scrolling.
css = """
.dialog-box { 
    height: 600px;        
    overflow-y: auto;     
}
"""



# Arena GUI
# is_dev=True shows the model names in the arena and pins the server port at launch.
is_dev = False
dialog_dict = make_dialog_dict()
with gr.Blocks(title="1:1 Outpatient Model Simulation Arena", css=css) as demo:
    gr.Markdown("# 🤖 Model Arena Evaluation")
    gr.Markdown("## Compare two model simulations and choose the better one!")
    
    # English version
    gr.Markdown("### Situation")
    gr.Markdown("* This scenario assumes that the patient called the hospital's administrative office for an outpatient inquiry.")
    gr.Markdown("* Depending on the patient, some may already know their condition because they have diagnostic records from a smaller clinic they previously visited, while others may only know their symptoms since it’s their first visit to the hospital.")
    gr.Markdown("<br>")
    
    gr.Markdown("### Procedure and Explanation")
    gr.Markdown("* First step: Arena! Please choose which of the two simulations you think is better.")
    gr.Markdown("* Second step: After making your choice, a scoring panel will appear. Please rate each simulation on a scale of 1 to 5.")
    gr.Markdown("* Regardless of which one you selected in step 1, please rate each simulation independently based on the criteria below.")
    gr.Markdown("* There’s no required number! Just do as many as you feel like, and hit the submit button before you leave. You can always come back and do a few more later if you’re bored!")
    gr.Markdown("<br>")

    gr.Markdown("### Evaluation Criteria")
    gr.Markdown("If the simulation satisfies all four criteria below, please give it 5 points. Deduct 1 point for each criterion that is not met. If none of the criteria are satisfied, assign a score of 1 point.")
    gr.Markdown("* **Patient**: The patient expresses symptoms naturally without using excessive medical jargon.")
    gr.Markdown("* **Staff**  : The staff does not diagnose like a doctor or provide treatment based on previous medical records, but instead focuses on asking appropriate questions within the scope of symptom checking, registration, and guidance. The tone of language is empathetic and polite.")
    gr.Markdown("* **Flow**   : The conversation proceeds naturally in the following order — greeting → patient information collection → symptom collection → department assignment — and each stage achieves its intended purpose.")
    gr.Markdown("* **Overall**: The conversation overall feels realistic, resembling an actual hospital reception scenario (sentences are concise and the closing expressions sound natural).")

    # Korean version
    gr.Markdown("---")
    gr.Markdown("### 상황")
    gr.Markdown("* 환자가 병원 원무과에 외래 진료 문의를 위해 전화한 상황을 가정합니다.")
    gr.Markdown("* 환자에 따라 이전 작은 병원에서 받은 진단 기록을 가지고 있어 자신의 질병을 이미 알고 있는 경우도 있고, 병원을 처음 방문해 증상만 알고 있는 경우도 있습니다.")
    gr.Markdown("<br>")

    gr.Markdown("### 과정 및 설명")
    gr.Markdown("* **First Step: Arena!** 두 개의 시뮬레이션 중에서, 실제 병원 원무과 직원과의 대화가 **더 현실적인 것**을 선택해주세요.")
    gr.Markdown("* **Second step: Rate!** 선택이 끝나면 점수판이 뜹니다. 각각의 시뮬레이션에 1~5점 사이 점수를 매겨주세요. 평가 기준은 아래 섹션을 참고 해주세요!")
    gr.Markdown("* 1번에서 어떤 걸 골랐는지와 상관없이, 두 시뮬레이션을 보고 아래 평가 기준에 맞춰서 점수를 매겨주세요.")
    gr.Markdown("* 개수는 상관 없습니다(그래도 최소 10개정도만 부탁드려용).. 그냥 적당히 하실만큼 하시다가 <i>**다하고 나가시기 전에 꼭 submit 버튼만 눌러주시면 돼요!**</i> 그리고 나중에 심심하시면 쫌쫌따리 해주셔도 좋아여..")
    gr.Markdown("<br>")
    
    gr.Markdown("### 점수 평가 기준")
    gr.Markdown("**아래의 4가지 기준을 모두 만족하면 5점, 기준을 충족하지 못한 항목이 하나 있을 때마다 1점씩 감점해 주세요. 모든 기준을 만족하지 못한 경우는 1점을 부여하면 됩니다.**")
    gr.Markdown("* **Patient**: 증상 호소가 자연스럽고, 과도한 의학 전문 용어가 사용되지 않았는지.")
    gr.Markdown("* **Staff**  : 의사처럼 진단하거나 혹은 이전 진단 기록을 바탕으로 치료를 하지 않고, 증상 확인·접수·안내 범위 내에서 질의를 잘 수행했는지, 언어 톤이 공감 있고 친절했는지.")
    gr.Markdown("* **Flow**   : 인사 → 환자 정보 수집 → 증상 수집 및 이전 진단 기록 여부 → 진료과 배정 순서로 자연스럽게 진행되었고, 각 단계의 목적이 모두 달성되었는지.")
    gr.Markdown("* **Overall**: 전체적으로 실제 병원 접수 상황처럼 느껴지는지(문장 표현이 간결하고, 대화의 끝맺음이 자연스러운지 등).")
    gr.Markdown("<br>")

    gr.Markdown("### 기타")
    gr.Markdown("편의를 위해 LLM 번역 기능을 추가했는데(Change language), 번역하는 데 시간이 걸릴 수 있습니다. GPT-5 Nano 모델을 사용하고 있어서 부정확할 수 있으니 번역본은 참고용으로만 해주시면 감사하겠습니다.")
    gr.Markdown("* 가령 'I am sorry to hear that'이라는 문장을 '듣기에 죄송합니다' 이렇게 이상하게 해석되는 경우가 있을 수 있는데, 번역해서 읽다가 이상한 부분은 영어로 잠깐 바꿔서 원문을 봐주시면 됩니다. 이러한 경우 번역의 오류이므로 이 부분은 감안하고 점수를 매겨주시기 바랍니다.")

    gr.Markdown("<br><br>")
    gr.Markdown("---")

    # New comparison button
    btn_new = gr.Button("🥊 Start Arena!! 🥊")

    # Per-session state: the dialog pool, the English and translated text of
    # both dialogs, and the language state of each side.
    state_dict = gr.State(dialog_dict)
    en1_dialog_box, en2_dialog_box = gr.State(""), gr.State("")
    kr1_dialog_box, kr2_dialog_box = gr.State(""), gr.State("")
    lang_state1, lang_state2 = gr.State("en"), gr.State("en")

    # Showing two model simulations side by side
    with gr.Row(visible=False) as arena_row:
        with gr.Column():
            # The real model name is only displayed in dev mode.
            model1_name = gr.Textbox(label="Model A", interactive=False, visible=is_dev)
            if not is_dev:
                gr.Markdown("## Anonymous Model A")
            dialog1_box = gr.Markdown(label="Simulation A", elem_classes="dialog-box")
            with gr.Row(visible=True) as translate_A:
                tr1 = gr.Button("Change language")
            with gr.Row(visible=True) as vote1_row:
                vote1 = gr.Button("👍 Choose A")
            with gr.Row(visible=False) as scoreA_row:
                scoreA_buttons = [gr.Button(str(i)) for i in range(1, 6)]

        with gr.Column():
            model2_name = gr.Textbox(label="Model B", interactive=False, visible=is_dev)
            if not is_dev:
                gr.Markdown("## Anonymous Model B")
            dialog2_box = gr.Markdown(label="Simulation B", elem_classes="dialog-box")
            with gr.Row(visible=True) as translate_B:
                tr2 = gr.Button("Change language")
            with gr.Row(visible=True) as vote2_row:
                vote2 = gr.Button("👍 Choose B")
            with gr.Row(visible=False) as scoreB_row:
                scoreB_buttons = [gr.Button(str(i)) for i in range(1, 6)]
    
    # Showing the status
    msg = gr.Markdown("")

    # Submit button
    with gr.Row(visible=False) as submit_row:
        submit_btn = gr.Button("📤 Submit All Results")
        submit_msg = gr.Markdown("")
        
    # Button actions
    result_save_state = gr.State(None)
    btn_new.click(
        fn=new_comparison,
        inputs=[state_dict],
        outputs=[model1_name, dialog1_box, model2_name, dialog2_box, en1_dialog_box, en2_dialog_box, arena_row, submit_row, btn_new, result_save_state],
    )
    tr1.click(
        fn=toggle_language,
        inputs=[en1_dialog_box, kr1_dialog_box, lang_state1],
        outputs=[dialog1_box, lang_state1, kr1_dialog_box]
    )

    tr2.click(
        fn=toggle_language,
        inputs=[en2_dialog_box, kr2_dialog_box, lang_state2],
        outputs=[dialog2_box, lang_state2, kr2_dialog_box]
    )

    # The vote and rating buttons all call update_scores with the same
    # components; only the vote or rating they contribute differs.
    score_state = gr.State({})
    rating_inputs = [
        model1_name, model2_name, dialog1_box, dialog2_box,
        en1_dialog_box, kr1_dialog_box, en2_dialog_box, kr2_dialog_box,
        lang_state1, lang_state2, score_state, result_save_state,
    ]
    rating_outputs = [
        model1_name, dialog1_box, model2_name, dialog2_box,
        en1_dialog_box, kr1_dialog_box, en2_dialog_box, kr2_dialog_box,
        lang_state1, lang_state2,
        arena_row, scoreA_row, scoreB_row, vote1_row, vote2_row,
        msg, score_state,
    ]

    def _score_handler(score_a=None, score_b=None, win=None):
        """Build a click handler that reports one fixed vote or rating to update_scores."""
        def handler(m1, m2, d1, d2, en1, kr1, en2, kr2, l_state1, l_state2, scores, result_file_path):
            return update_scores(
                score_a, score_b, win,
                m1, m2, d1, d2, en1, kr1, en2, kr2, l_state1, l_state2,
                dialog_dict, scores, is_dev, result_file_path,
            )
        return handler

    # First step: Arena, Choose the only one!
    vote1.click(fn=_score_handler(win='A'), inputs=rating_inputs, outputs=rating_outputs)
    vote2.click(fn=_score_handler(win='B'), inputs=rating_inputs, outputs=rating_outputs)

    # Second step: Rate, Rate the each score!
    # The rating is bound when the handler is built, so each button reports
    # its own label (1-5).
    for btn in scoreA_buttons:
        btn.click(fn=_score_handler(score_a=int(btn.value)), inputs=rating_inputs, outputs=rating_outputs)
    for btn in scoreB_buttons:
        btn.click(fn=_score_handler(score_b=int(btn.value)), inputs=rating_inputs, outputs=rating_outputs)
    
    submit_btn.click(
        fn=save_data,
        inputs=[result_save_state],
        outputs=[submit_msg],
    )

# Launch the app: dev mode pins the port to 7860, otherwise Gradio's defaults apply.
demo.launch(**({"server_port": 7860} if is_dev else {}))