# Uploaded by developer-lunark via huggingface_hub (commit 7b7257a, verified)
"""A/B Arena ํƒญ UI"""
import gradio as gr
import random
from typing import Dict, List, Tuple, Optional, Any
from models.model_registry import get_all_models, get_model_info, get_models_for_dropdown
from characters import get_character_loader, build_system_prompt
from scenarios import get_scenario_loader
from voting import get_vote_storage, get_elo_calculator
from utils import parse_thinking_response, format_thinking_for_display
def create_arena_tab(
    model_manager: Any = None,
    use_mock: bool = False,
):
    """Build the A/B test arena tab.

    Shows two side-by-side model responses to the same character/scenario
    prompt and lets the user vote on the better one. Votes are persisted via
    the vote storage and (except for skips) update the ELO ratings.

    Args:
        model_manager: Object exposing
            ``generate_response(model_id, messages, system_prompt)`` returning
            ``(response_text, metadata_dict)``. When ``None``, mock responses
            are generated instead.
        use_mock: Force mock responses even when a model manager is available.
    """
    # Data loaders (characters, scenarios, vote persistence, ELO ratings).
    char_loader = get_character_loader()
    scenario_loader = get_scenario_loader()
    vote_storage = get_vote_storage()
    elo_calculator = get_elo_calculator()

    # Model list: dropdown shows the description, the value is the model id.
    all_models = get_all_models()
    model_choices = [(f"{get_model_info(m).get('description', m)}", m) for m in all_models]

    # Character names for the character dropdown.
    characters = char_loader.get_character_names()

    # Scenario presets as (label, id) pairs for the scenario dropdown.
    scenario_choices = scenario_loader.get_scenarios_for_dropdown()

    # ============================================================
    # UI layout
    # ============================================================
    gr.Markdown("## A/B ํ…Œ์ŠคํŠธ ์•„๋ ˆ๋‚˜")
    gr.Markdown("๋‘ ๋ชจ๋ธ์˜ ์‘๋‹ต์„ ๋น„๊ตํ•˜๊ณ  ๋” ์ข‹์€ ์‘๋‹ต์— ํˆฌํ‘œํ•˜์„ธ์š”.")

    # Settings panel: character, scenario preset, blind mode.
    with gr.Row():
        with gr.Column(scale=1):
            character_dropdown = gr.Dropdown(
                choices=characters,
                value=characters[0] if characters else None,
                label="์บ๋ฆญํ„ฐ ์„ ํƒ",
            )
        with gr.Column(scale=1):
            scenario_dropdown = gr.Dropdown(
                choices=scenario_choices,
                value=scenario_choices[0][1] if scenario_choices else None,
                label="์‹œ๋‚˜๋ฆฌ์˜ค ํ”„๋ฆฌ์…‹",
            )
        with gr.Column(scale=1):
            blind_mode = gr.Checkbox(
                value=True,
                label="๋ธ”๋ผ์ธ๋“œ ๋ชจ๋“œ (๋ชจ๋ธ๋ช… ์ˆจ๊น€)",
            )

    with gr.Row():
        with gr.Column(scale=2):
            model_a_dropdown = gr.Dropdown(
                choices=model_choices,
                value=all_models[0] if all_models else None,
                label="Model A",
            )
        with gr.Column(scale=2):
            model_b_dropdown = gr.Dropdown(
                choices=model_choices,
                value=all_models[1] if len(all_models) > 1 else None,
                label="Model B",
            )
        with gr.Column(scale=1):
            random_models_btn = gr.Button("๋žœ๋ค ๋ชจ๋ธ", size="sm")

    # Response area: one column per model, with a collapsible thinking trace.
    with gr.Row():
        # Model A response
        with gr.Column(scale=1):
            model_a_label = gr.Markdown("### Model A")
            with gr.Accordion("Thinking Process", open=False):
                thinking_a = gr.Markdown("*(์‘๋‹ต ์ƒ์„ฑ ํ›„ ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค)*")
            response_a = gr.Textbox(
                label="์‘๋‹ต",
                lines=8,
                interactive=False,
            )
            metadata_a = gr.Markdown("")
        # Model B response
        with gr.Column(scale=1):
            model_b_label = gr.Markdown("### Model B")
            with gr.Accordion("Thinking Process", open=False):
                thinking_b = gr.Markdown("*(์‘๋‹ต ์ƒ์„ฑ ํ›„ ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค)*")
            response_b = gr.Textbox(
                label="์‘๋‹ต",
                lines=8,
                interactive=False,
            )
            metadata_b = gr.Markdown("")

    # User input row.
    with gr.Row():
        user_input = gr.Textbox(
            label="ํŒฌ ๋ฉ”์‹œ์ง€",
            placeholder="์•„์ด๋Œ์—๊ฒŒ ๋ณด๋‚ผ ๋ฉ”์‹œ์ง€๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”...",
            lines=2,
            scale=4,
        )
        with gr.Column(scale=1):
            random_scenario_btn = gr.Button("๋žœ๋ค ์‹œ๋‚˜๋ฆฌ์˜ค")
            submit_btn = gr.Button("์ „์†ก", variant="primary")

    # Voting area.
    gr.Markdown("### ํˆฌํ‘œ")
    with gr.Row():
        vote_a_btn = gr.Button("A๊ฐ€ ๋” ์ข‹์Œ", variant="secondary")
        vote_tie_btn = gr.Button("๋น„์Šทํ•จ", variant="secondary")
        vote_b_btn = gr.Button("B๊ฐ€ ๋” ์ข‹์Œ", variant="secondary")
        vote_skip_btn = gr.Button("์Šคํ‚ต", variant="secondary")
    vote_reason = gr.Textbox(
        label="ํˆฌํ‘œ ์ด์œ  (์„ ํƒ์‚ฌํ•ญ)",
        placeholder="์™œ ์ด ์‘๋‹ต์ด ๋” ์ข‹๋‹ค๊ณ  ์ƒ๊ฐํ•˜์‹œ๋‚˜์š”?",
        lines=1,
    )
    vote_result = gr.Markdown("")

    # Per-session state: the last generated pair, used by vote handlers.
    state = gr.State({
        "model_a": None,
        "model_b": None,
        "response_a": None,
        "response_b": None,
        "character": None,
        "user_input": None,
    })

    # ============================================================
    # Event handlers
    # ============================================================
    def select_random_models():
        """Pick two distinct random models; degrade gracefully if < 2 exist."""
        if len(all_models) < 2:
            return all_models[0] if all_models else None, None
        selected = random.sample(all_models, 2)
        return selected[0], selected[1]

    def load_random_scenario(character: str):
        """Load a random scenario: returns (formatted user message, scenario id)."""
        scenario = scenario_loader.get_random_scenario()
        if scenario:
            user_msg = scenario_loader.format_user_input(scenario, character)
            return user_msg, scenario["id"]
        return "", None

    def load_scenario_input(scenario_id: str, character: str):
        """Format the selected scenario into a user message for the character."""
        scenario = scenario_loader.get_scenario(scenario_id)
        if scenario:
            return scenario_loader.format_user_input(scenario, character)
        return ""

    def generate_responses(
        model_a: str,
        model_b: str,
        character: str,
        user_msg: str,
        current_state: dict,
    ):
        """Generate both models' responses.

        Returns a 7-tuple matching the bound outputs:
        (thinking_a, response_a, metadata_a,
         thinking_b, response_b, metadata_b, new_state).
        On validation failure the incoming state is returned unchanged.
        """
        if not model_a or not model_b:
            return (
                "*(๋ชจ๋ธ์„ ์„ ํƒํ•ด์ฃผ์„ธ์š”)*", "", "",
                "*(๋ชจ๋ธ์„ ์„ ํƒํ•ด์ฃผ์„ธ์š”)*", "", "",
                current_state,
            )
        # Guard against None as well as whitespace-only input: Gradio may pass
        # None for an untouched textbox, and None.strip() would raise.
        if not user_msg or not user_msg.strip():
            return (
                "*(๋ฉ”์‹œ์ง€๋ฅผ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”)*", "", "",
                "*(๋ฉ”์‹œ์ง€๋ฅผ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”)*", "", "",
                current_state,
            )

        system_prompt = build_system_prompt(character)
        messages = [{"role": "user", "content": user_msg}]

        # Mock mode (testing without real models).
        if use_mock or model_manager is None:
            response_a_full = f"<think>\n{character}์˜ ์ž…์žฅ์—์„œ ์ƒ๊ฐํ•ด๋ณด๋ฉด... ์ด ๋ฉ”์‹œ์ง€์— ์–ด๋–ป๊ฒŒ ๋ฐ˜์‘ํ•ด์•ผ ํ• ๊นŒ?\n</think>\n\n์•ˆ๋…•! ๋ฐ˜๊ฐ€์›Œ~ (Mock Response A)"
            # No placeholders here, so a plain string literal (no f-prefix).
            response_b_full = "<think>\n์Œ... ์ด๋Ÿฐ ์ƒํ™ฉ์—์„œ๋Š”...\n</think>\n\nํ—ค์ด~ ๋ญํ•ด? (Mock Response B)"
            meta_a = {"latency_s": 0.5, "output_tokens": 50}
            meta_b = {"latency_s": 0.6, "output_tokens": 55}
        else:
            # Real model inference; failures in one model must not block the other.
            try:
                response_a_full, meta_a = model_manager.generate_response(
                    model_a, messages, system_prompt
                )
            except Exception as e:
                response_a_full = f"*Error: {str(e)}*"
                meta_a = {"latency_s": 0, "output_tokens": 0}
            try:
                response_b_full, meta_b = model_manager.generate_response(
                    model_b, messages, system_prompt
                )
            except Exception as e:
                response_b_full = f"*Error: {str(e)}*"
                meta_b = {"latency_s": 0, "output_tokens": 0}

        # Split <think>...</think> traces from the visible reply.
        think_a, clean_a = parse_thinking_response(response_a_full)
        think_b, clean_b = parse_thinking_response(response_b_full)

        # Metadata strings (latency / token count).
        meta_str_a = f"โฑ๏ธ {meta_a.get('latency_s', 0):.2f}s | {meta_a.get('output_tokens', 0)} tokens"
        meta_str_b = f"โฑ๏ธ {meta_b.get('latency_s', 0):.2f}s | {meta_b.get('output_tokens', 0)} tokens"

        # Update state so the vote handlers can record this exact pair.
        new_state = {
            "model_a": model_a,
            "model_b": model_b,
            "response_a": response_a_full,
            "response_b": response_b_full,
            "character": character,
            "user_input": user_msg,
        }
        return (
            format_thinking_for_display(think_a) if think_a else "*No thinking*",
            clean_a,
            meta_str_a,
            format_thinking_for_display(think_b) if think_b else "*No thinking*",
            clean_b,
            meta_str_b,
            new_state,
        )

    def handle_vote(vote_type: str, reason: str, current_state: dict):
        """Persist a vote ("a" | "b" | "tie" | "skip") and update ELO (non-skip)."""
        if not current_state.get("model_a") or not current_state.get("model_b"):
            return "๋จผ์ € ์‘๋‹ต์„ ์ƒ์„ฑํ•ด์ฃผ์„ธ์š”."
        vote_data = {
            "model_a": current_state["model_a"],
            "model_b": current_state["model_b"],
            "response_a": current_state.get("response_a", ""),
            "response_b": current_state.get("response_b", ""),
            "character": current_state.get("character", ""),
            "user_input": current_state.get("user_input", ""),
            "vote": vote_type,
            "reason": reason,
        }
        vote_id = vote_storage.save_vote(vote_data)
        # ELO only moves on a real preference; skips are recorded but unrated.
        if vote_type != "skip":
            new_a, new_b = elo_calculator.update_ratings(
                current_state["model_a"],
                current_state["model_b"],
                vote_type,
            )
            return f"ํˆฌํ‘œ ์™„๋ฃŒ! (ID: {vote_id})\n\nELO ๋ณ€๊ฒฝ:\n- {current_state['model_a']}: {new_a:.0f}\n- {current_state['model_b']}: {new_b:.0f}"
        return f"์Šคํ‚ต๋จ (ID: {vote_id})"

    def update_model_labels(blind: bool, model_a: str, model_b: str):
        """Return the two response-column headers per the blind-mode setting."""
        if blind:
            return "### Model A", "### Model B"
        else:
            info_a = get_model_info(model_a)
            info_b = get_model_info(model_b)
            label_a = f"### {info_a.get('description', model_a)}" if info_a else f"### {model_a}"
            label_b = f"### {info_b.get('description', model_b)}" if info_b else f"### {model_b}"
            return label_a, label_b

    # ============================================================
    # Event wiring
    # ============================================================
    random_models_btn.click(
        fn=select_random_models,
        outputs=[model_a_dropdown, model_b_dropdown],
    )
    random_scenario_btn.click(
        fn=load_random_scenario,
        inputs=[character_dropdown],
        outputs=[user_input, scenario_dropdown],
    )
    scenario_dropdown.change(
        fn=load_scenario_input,
        inputs=[scenario_dropdown, character_dropdown],
        outputs=[user_input],
    )
    submit_btn.click(
        fn=generate_responses,
        inputs=[model_a_dropdown, model_b_dropdown, character_dropdown, user_input, state],
        outputs=[thinking_a, response_a, metadata_a, thinking_b, response_b, metadata_b, state],
    )

    # Refresh labels when blind mode toggles.
    blind_mode.change(
        fn=update_model_labels,
        inputs=[blind_mode, model_a_dropdown, model_b_dropdown],
        outputs=[model_a_label, model_b_label],
    )
    # Bug fix: also refresh when either model selection changes, otherwise the
    # headers show a stale model name while blind mode is off.
    model_a_dropdown.change(
        fn=update_model_labels,
        inputs=[blind_mode, model_a_dropdown, model_b_dropdown],
        outputs=[model_a_label, model_b_label],
    )
    model_b_dropdown.change(
        fn=update_model_labels,
        inputs=[blind_mode, model_a_dropdown, model_b_dropdown],
        outputs=[model_a_label, model_b_label],
    )

    # Vote buttons: each binds a fixed vote type plus the optional reason.
    vote_a_btn.click(
        fn=lambda r, s: handle_vote("a", r, s),
        inputs=[vote_reason, state],
        outputs=[vote_result],
    )
    vote_b_btn.click(
        fn=lambda r, s: handle_vote("b", r, s),
        inputs=[vote_reason, state],
        outputs=[vote_result],
    )
    vote_tie_btn.click(
        fn=lambda r, s: handle_vote("tie", r, s),
        inputs=[vote_reason, state],
        outputs=[vote_result],
    )
    vote_skip_btn.click(
        fn=lambda r, s: handle_vote("skip", r, s),
        inputs=[vote_reason, state],
        outputs=[vote_result],
    )