| """A/B Arena ํญ UI""" |
|
|
| import gradio as gr |
| import random |
| from typing import Dict, List, Tuple, Optional, Any |
|
|
| from models.model_registry import get_all_models, get_model_info, get_models_for_dropdown |
| from characters import get_character_loader, build_system_prompt |
| from scenarios import get_scenario_loader |
| from voting import get_vote_storage, get_elo_calculator |
| from utils import parse_thinking_response, format_thinking_for_display |
|
|
|
|
def create_arena_tab(
    model_manager: Any = None,
    use_mock: bool = False,
):
    """Create the A/B Arena tab."""
    char_loader = get_character_loader()
    scenario_loader = get_scenario_loader()
    vote_storage = get_vote_storage()
    elo_calculator = get_elo_calculator()
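
    # Gradio dropdowns accept (label, value) pairs: the description is shown
    # to the user while the registry key is what the callbacks receive.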
    all_models = get_all_models()
    model_choices = [
        ((get_model_info(m) or {}).get("description", m), m) for m in all_models
    ]

    characters = char_loader.get_character_names()
    scenario_choices = scenario_loader.get_scenarios_for_dropdown()

| gr.Markdown("## A/B ํ
์คํธ ์๋ ๋") |
| gr.Markdown("๋ ๋ชจ๋ธ์ ์๋ต์ ๋น๊ตํ๊ณ ๋ ์ข์ ์๋ต์ ํฌํํ์ธ์.") |
|
    with gr.Row():
        with gr.Column(scale=1):
            character_dropdown = gr.Dropdown(
                choices=characters,
                value=characters[0] if characters else None,
                label="Character",
            )
        with gr.Column(scale=1):
            scenario_dropdown = gr.Dropdown(
                choices=scenario_choices,
                value=scenario_choices[0][1] if scenario_choices else None,
                label="Scenario Preset",
            )
        with gr.Column(scale=1):
            blind_mode = gr.Checkbox(
                value=True,
                label="Blind mode (hide model names)",
            )

    with gr.Row():
        with gr.Column(scale=2):
            model_a_dropdown = gr.Dropdown(
                choices=model_choices,
                value=all_models[0] if all_models else None,
                label="Model A",
            )
        with gr.Column(scale=2):
            model_b_dropdown = gr.Dropdown(
                choices=model_choices,
                value=all_models[1] if len(all_models) > 1 else None,
                label="Model B",
            )
        with gr.Column(scale=1):
            random_models_btn = gr.Button("Random Models", size="sm")

    with gr.Row():
        with gr.Column(scale=1):
            model_a_label = gr.Markdown("### Model A")
            with gr.Accordion("Thinking Process", open=False):
                thinking_a = gr.Markdown("*(Shown after a response is generated)*")
            response_a = gr.Textbox(
                label="Response",
                lines=8,
                interactive=False,
            )
            metadata_a = gr.Markdown("")

        with gr.Column(scale=1):
            model_b_label = gr.Markdown("### Model B")
            with gr.Accordion("Thinking Process", open=False):
                thinking_b = gr.Markdown("*(Shown after a response is generated)*")
            response_b = gr.Textbox(
                label="Response",
                lines=8,
                interactive=False,
            )
            metadata_b = gr.Markdown("")

    with gr.Row():
        user_input = gr.Textbox(
            label="Fan Message",
            placeholder="Enter the message to send to the idol...",
            lines=2,
            scale=4,
        )
        with gr.Column(scale=1):
            random_scenario_btn = gr.Button("Random Scenario")
            submit_btn = gr.Button("Send", variant="primary")

    gr.Markdown("### Vote")
    with gr.Row():
        vote_a_btn = gr.Button("A is better", variant="secondary")
        vote_tie_btn = gr.Button("Tie", variant="secondary")
        vote_b_btn = gr.Button("B is better", variant="secondary")
        vote_skip_btn = gr.Button("Skip", variant="secondary")

    vote_reason = gr.Textbox(
        label="Vote reason (optional)",
        placeholder="Why do you think this response is better?",
        lines=1,
    )

    vote_result = gr.Markdown("")
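
    # gr.State is per browser session: it pins the exact model pair and raw
    # responses the voter saw, so each vote is recorded against what was
    # actually displayed (this matters in blind mode, where labels are hidden).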
    state = gr.State({
        "model_a": None,
        "model_b": None,
        "response_a": None,
        "response_b": None,
        "character": None,
        "user_input": None,
    })

    def select_random_models():
        """Pick two distinct models at random."""
        if len(all_models) < 2:
            return all_models[0] if all_models else None, None
        selected = random.sample(all_models, 2)
        return selected[0], selected[1]

    def load_random_scenario(character: str):
        """Load a random scenario into the message box."""
        scenario = scenario_loader.get_random_scenario()
        if scenario:
            user_msg = scenario_loader.format_user_input(scenario, character)
            return user_msg, scenario["id"]
        return "", None

    def load_scenario_input(scenario_id: str, character: str):
        """Load the selected scenario preset."""
        scenario = scenario_loader.get_scenario(scenario_id)
        if scenario:
            return scenario_loader.format_user_input(scenario, character)
        return ""

    def generate_responses(
        model_a: str,
        model_b: str,
        character: str,
        user_msg: str,
        current_state: dict,
    ):
        """Generate responses from both models."""
        if not model_a or not model_b:
            return (
                "*(Please select models)*", "", "",
                "*(Please select models)*", "", "",
                current_state,
            )

        if not user_msg.strip():
            return (
                "*(Please enter a message)*", "", "",
                "*(Please enter a message)*", "", "",
                current_state,
            )

        system_prompt = build_system_prompt(character)
        messages = [{"role": "user", "content": user_msg}]
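
        # The mock path keeps the tab usable without any model backend; the
        # canned replies include a <think> block so the thinking-parse and
        # accordion display paths are exercised end to end.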
        if use_mock or model_manager is None:
            response_a_full = (
                f"<think>\nThinking from {character}'s perspective... "
                "how should I respond to this message?\n</think>\n\n"
                "Hi! Good to see you~ (Mock Response A)"
            )
            response_b_full = (
                "<think>\nHmm... in a situation like this...\n</think>\n\n"
                "Hey~ what's up? (Mock Response B)"
            )
            meta_a = {"latency_s": 0.5, "output_tokens": 50}
            meta_b = {"latency_s": 0.6, "output_tokens": 55}
        else:
            try:
                response_a_full, meta_a = model_manager.generate_response(
                    model_a, messages, system_prompt
                )
            except Exception as e:
                response_a_full = f"*Error: {str(e)}*"
                meta_a = {"latency_s": 0, "output_tokens": 0}

            try:
                response_b_full, meta_b = model_manager.generate_response(
                    model_b, messages, system_prompt
                )
            except Exception as e:
                response_b_full = f"*Error: {str(e)}*"
                meta_b = {"latency_s": 0, "output_tokens": 0}
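
        # parse_thinking_response is assumed to split a raw completion into
        # (thinking, visible_reply), e.g. by extracting a leading
        # <think>...</think> block and passing the rest through unchanged.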
        think_a, clean_a = parse_thinking_response(response_a_full)
        think_b, clean_b = parse_thinking_response(response_b_full)

        meta_str_a = f"⏱️ {meta_a.get('latency_s', 0):.2f}s | {meta_a.get('output_tokens', 0)} tokens"
        meta_str_b = f"⏱️ {meta_b.get('latency_s', 0):.2f}s | {meta_b.get('output_tokens', 0)} tokens"

        new_state = {
            "model_a": model_a,
            "model_b": model_b,
            "response_a": response_a_full,
            "response_b": response_b_full,
            "character": character,
            "user_input": user_msg,
        }

        return (
            format_thinking_for_display(think_a) if think_a else "*No thinking*",
            clean_a,
            meta_str_a,
            format_thinking_for_display(think_b) if think_b else "*No thinking*",
            clean_b,
            meta_str_b,
            new_state,
        )

    def handle_vote(vote_type: str, reason: str, current_state: dict):
        """Persist a vote and update Elo ratings."""
        if not current_state.get("model_a") or not current_state.get("model_b"):
            return "Please generate responses first."

        vote_data = {
            "model_a": current_state["model_a"],
            "model_b": current_state["model_b"],
            "response_a": current_state.get("response_a", ""),
            "response_b": current_state.get("response_b", ""),
            "character": current_state.get("character", ""),
            "user_input": current_state.get("user_input", ""),
            "vote": vote_type,
            "reason": reason,
        }

        vote_id = vote_storage.save_vote(vote_data)
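
        # Skipped votes are logged but never move ratings. For the rest,
        # assuming elo_calculator implements standard Elo: expected score
        # E_a = 1 / (1 + 10 ** ((R_b - R_a) / 400)) and update
        # R_a' = R_a + K * (S_a - E_a), where S_a is 1 / 0.5 / 0 for an
        # A-win / tie / B-win.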
        if vote_type != "skip":
            new_a, new_b = elo_calculator.update_ratings(
                current_state["model_a"],
                current_state["model_b"],
                vote_type,
            )
            return (
                f"Vote recorded! (ID: {vote_id})\n\n"
                f"ELO changes:\n"
                f"- {current_state['model_a']}: {new_a:.0f}\n"
                f"- {current_state['model_b']}: {new_b:.0f}"
            )

        return f"Skipped (ID: {vote_id})"

    def update_model_labels(blind: bool, model_a: str, model_b: str):
        """Update the panel labels according to blind mode."""
        if blind:
            return "### Model A", "### Model B"
        info_a = get_model_info(model_a)
        info_b = get_model_info(model_b)
        label_a = f"### {info_a.get('description', model_a)}" if info_a else f"### {model_a}"
        label_b = f"### {info_b.get('description', model_b)}" if info_b else f"### {model_b}"
        return label_a, label_b
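
    # Event wiring: dropdown changes refresh the prompt box, the submit
    # button fans out to both response panels, and every vote button routes
    # through handle_vote with a fixed vote_type.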
    random_models_btn.click(
        fn=select_random_models,
        outputs=[model_a_dropdown, model_b_dropdown],
    )

    random_scenario_btn.click(
        fn=load_random_scenario,
        inputs=[character_dropdown],
        outputs=[user_input, scenario_dropdown],
    )

    scenario_dropdown.change(
        fn=load_scenario_input,
        inputs=[scenario_dropdown, character_dropdown],
        outputs=[user_input],
    )

    submit_btn.click(
        fn=generate_responses,
        inputs=[model_a_dropdown, model_b_dropdown, character_dropdown, user_input, state],
        outputs=[thinking_a, response_a, metadata_a, thinking_b, response_b, metadata_b, state],
    )

    blind_mode.change(
        fn=update_model_labels,
        inputs=[blind_mode, model_a_dropdown, model_b_dropdown],
        outputs=[model_a_label, model_b_label],
    )

    vote_a_btn.click(
        fn=lambda r, s: handle_vote("a", r, s),
        inputs=[vote_reason, state],
        outputs=[vote_result],
    )
    vote_b_btn.click(
        fn=lambda r, s: handle_vote("b", r, s),
        inputs=[vote_reason, state],
        outputs=[vote_result],
    )
    vote_tie_btn.click(
        fn=lambda r, s: handle_vote("tie", r, s),
        inputs=[vote_reason, state],
        outputs=[vote_result],
    )
    vote_skip_btn.click(
        fn=lambda r, s: handle_vote("skip", r, s),
        inputs=[vote_reason, state],
        outputs=[vote_result],
    )
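

# A minimal manual-test harness (an illustrative sketch, not part of the
# original module): it mounts the tab in a standalone Blocks app with mock
# responses enabled, so the layout and vote flow can be exercised without a
# real model_manager or any model backend.
if __name__ == "__main__":
    with gr.Blocks(title="A/B Arena") as demo:
        with gr.Tab("Arena"):
            create_arena_tab(model_manager=None, use_mock=True)
    demo.launch()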