# Arena / app.py — Hugging Face Space by Elfsong (commit 24170ac)
# refactor: Update model GPU mapping to a list format for improved readability
# and adjust GPU memory utilization settings for better performance during
# model launches.
# coding: utf-8
# Author: Du Mingzhe (dumingzhex@gmail.com)
# Date: 2025-12-21
import os
import json
import datetime
import gradio as gr
import pandas as pd
import subprocess
import time
from pathlib import Path
from huggingface_hub import CommitScheduler
from huggingface_hub import InferenceClient
HF_TOKEN = os.getenv("HF_TOKEN")

# Arena display key -> base URL of the local OpenAI-compatible endpoint.
MODELS = dict()

# Launch models via vLLM.
# Each entry is (gpu_id, training_iteration); two checkpoints share each GPU,
# which is why --gpu-memory-utilization is capped at 0.4 per server below.
model_gpu_mapping = [
    (0, 1000), (0, 1500),
    (1, 2000), (1, 2500),
    (2, 3000), (2, 3500),
    (3, 4000), (3, 4500),
    (4, 5000), (4, 5500),
    (5, 6000), (5, 6500),
    (6, 7000), (6, 7500),
]

# FIX: the ./logs directory must exist before the per-model log files are
# opened below; previously it was only created further down the file, so a
# fresh checkout crashed here with FileNotFoundError.
Path("./logs").mkdir(exist_ok=True)

for index, (gpu_id, iter_num) in enumerate(model_gpu_mapping):
    formatted_iter_num = f"{iter_num:07d}"
    model_name = f"Elfsong/VLM_stage_2_iter_{formatted_iter_num}"
    arena_key = f"Local-Model-{iter_num:05d}"
    port = 9000 + index  # one port per checkpoint, starting at 9000
    print(f"🚀 Launching {model_name} on port {port} (GPU {gpu_id}) ...")
    log_file = open(f"./logs/vllm_{formatted_iter_num}.log", "w")
    subprocess.Popen(
        [
            "python", "-m", "vllm.entrypoints.openai.api_server",
            "--model", model_name,
            "--port", str(port),
            "--quantization", "bitsandbytes",
            "--gpu-memory-utilization", "0.4",
            "--trust-remote-code",
        ],
        # Pin each server process to its assigned GPU.
        env={**os.environ, "CUDA_VISIBLE_DEVICES": str(gpu_id)},
        stdout=log_file,
        stderr=log_file,
    )
    # FIX: the child process inherits its own copy of the log fd, so close
    # the parent's handle instead of leaking one open file per model.
    log_file.close()
    time.sleep(5)  # Wait for initialization
    MODELS[arena_key] = f"http://localhost:{port}/v1"
print(f"✅ Launched {len(MODELS)} models. Check logs in ./logs/ directory.")
# Directory the feedback log lives in; note this is the same ./logs folder
# the vLLM server logs are written to above.
DATA_DIR = Path("logs")
DATA_DIR.mkdir(exist_ok=True)
# Append-only JSONL file of user votes (one JSON object per line).
FEEDBACK_FILE = DATA_DIR / "feedback.jsonl"
# Background job that periodically commits everything under DATA_DIR to a
# Hugging Face dataset repo.
# NOTE(review): because folder_path is the shared ./logs directory, the vLLM
# server logs are uploaded alongside the feedback file — confirm intended.
scheduler = CommitScheduler(
    repo_id="Elfsong/arena_feedback",
    repo_type="dataset",
    folder_path=DATA_DIR,
    every=5,  # Sync every 5 minutes
)
def save_feedback(model_name, history, feedback_data: gr.LikeData):
    """Append one like/dislike vote to the JSONL feedback log.

    Args:
        model_name: Arena key of the model the vote applies to.
        history: Full conversation as a list of {"role", "content"} dicts.
        feedback_data: Gradio like-event payload carrying the message index,
            the vote value, and the liked/disliked flag.
    """
    new_entry = {
        # FIX: timezone-aware UTC timestamp — naive local time made entries
        # from different hosts/timezones ambiguous in the shared dataset.
        "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "model_name": model_name,
        "message_index": feedback_data.index,
        "vote": feedback_data.value,
        "is_liked": feedback_data.liked,
        "conversation": history,
    }
    # Append-only JSONL: one JSON object per line; keep non-ASCII readable.
    with open(FEEDBACK_FILE, "a", encoding="utf-8") as f:
        f.write(json.dumps(new_entry, ensure_ascii=False) + "\n")
    print(f"Feedback logged for {model_name}")
def bot_response(user_message, history, model_name, system_message, thinking_mode, max_tokens, temperature, top_p):
    """Stream one assistant turn into *history*, yielding UI updates.

    Generator yielding (history, textbox_update) pairs: intermediate yields
    keep the input box disabled while tokens stream in; the final yield
    clears the box and re-enables it.

    Args:
        user_message: Raw text from the input box; blank input is a no-op.
        history: Chat history as a list of {"role", "content"} dicts
            (mutated in place).
        model_name: Arena key ("Local-...") or a Hugging Face Hub model id.
        system_message: System prompt prepended to every request.
        thinking_mode: Appends "/set think" (else "/set nothink") to the
            system prompt to toggle the model's reasoning mode.
        max_tokens: Cap on generated tokens per turn.
        temperature: Sampling temperature forwarded to the backend.
        top_p: Nucleus-sampling parameter forwarded to the backend.
    """
    if not user_message or user_message.strip() == "":
        yield history, ""
        return
    token = HF_TOKEN
    if model_name.startswith("Local-"):
        # Local vLLM servers speak the OpenAI-compatible API; the token is a
        # placeholder since no auth is configured on the local endpoints.
        local_endpoint = MODELS.get(model_name)
        client = InferenceClient(base_url=local_endpoint, token="vllm-token")
    else:
        client = InferenceClient(token=token, model=model_name)
    history.append({"role": "user", "content": user_message})
    history.append({"role": "assistant", "content": ""})
    # FIX: parenthesize the conditional. Previously
    #   system_message + "/set think" if thinking_mode else "/set nothink"
    # parsed as (system_message + "/set think") if thinking_mode else
    # "/set nothink", silently discarding the system prompt whenever
    # thinking_mode was False.
    system_content = system_message + ("/set think" if thinking_mode else "/set nothink")
    api_messages = [{"role": "system", "content": system_content}] + history[:-1]
    try:
        stream = client.chat_completion(
            api_messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
            # NOTE(review): for local endpoints this forwards the arena key,
            # not the real checkpoint path — confirm vLLM accepts it.
            model=model_name,
        )
        response_text = ""
        for chunk in stream:
            if not chunk.choices or len(chunk.choices) == 0:
                continue
            token_content = chunk.choices[0].delta.content
            if token_content is not None:
                response_text += token_content
                history[-1]["content"] = response_text
                # Continuously yield update UI, while keeping input box unavailable to prevent double clicks
                yield history, gr.update(interactive=False)
    except Exception as e:
        # If error, display error message in assistant dialog
        history[-1]["content"] = f"**Error:** {str(e)}"
    # --- Final Yield: Restore input box availability and clear content ---
    yield history, gr.update(value="", interactive=True)
# --- Gradio UI: two independent chat panels (Model A / Model B) sharing one
# set of sampling controls, for side-by-side comparison of two checkpoints. ---
with gr.Blocks() as demo:
    with gr.Sidebar():
        gr.Markdown("## Configuration")
        # gr.LoginButton()
        system_msg = gr.Textbox(value="You are a helpful assistant.", label="System Prompt")
        thinking_mode = gr.Checkbox(value=False, label="Thinking Mode")
        max_t = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens")
        temp = gr.Slider(minimum=0.0, maximum=2.0, value=0.0, step=0.05, label="Temperature")
        top_p_val = gr.Slider(minimum=0.0, maximum=1.0, value=1.0, step=0.05, label="Top-p")
    gr.Markdown("# โš”๏ธ Chatbot Arena")
    with gr.Row():
        # --- Model A ---
        with gr.Column():
            # Default A to the first launched checkpoint, B to the last.
            model_a_name = gr.Dropdown(list(MODELS.keys()), label="Model A", value=list(MODELS.keys())[0])
            chatbot_a = gr.Chatbot(label="Model A Output")
            msg_a = gr.Textbox(placeholder="Send message to Model A...", label="Model A Input")
            btn_a = gr.Button("Send to Model A")
        # --- Model B ---
        with gr.Column():
            model_b_name = gr.Dropdown(list(MODELS.keys()), label="Model B", value=list(MODELS.keys())[-1])
            chatbot_b = gr.Chatbot(label="Model B Output")
            msg_b = gr.Textbox(placeholder="Send message to Model B...", label="Model B Input")
            btn_b = gr.Button("Send to Model B")
    # --- Bind Events ---
    # Enter in the textbox and the Send button both trigger the streaming
    # handler; the like control on each chat logs a vote via save_feedback.
    a_inputs = [msg_a, chatbot_a, model_a_name, system_msg, thinking_mode, max_t, temp, top_p_val]
    msg_a.submit(bot_response, a_inputs, [chatbot_a, msg_a])
    btn_a.click(bot_response, a_inputs, [chatbot_a, msg_a])
    chatbot_a.like(save_feedback, [model_a_name, chatbot_a], None)
    b_inputs = [msg_b, chatbot_b, model_b_name, system_msg, thinking_mode, max_t, temp, top_p_val]
    msg_b.submit(bot_response, b_inputs, [chatbot_b, msg_b])
    btn_b.click(bot_response, b_inputs, [chatbot_b, msg_b])
    chatbot_b.like(save_feedback, [model_b_name, chatbot_b], None)
    def clear_chats():
        # Reset both panels to an empty conversation.
        return [], []
    gr.Button("๐Ÿ—‘๏ธ Clear Chats").click(
        fn=clear_chats,
        inputs=None,
        outputs=[chatbot_a, chatbot_b]
    )
if __name__ == "__main__":
    # Bind to all interfaces so the arena is reachable from other hosts;
    # no public Gradio share link is created.
    demo.launch(server_name="0.0.0.0", share=False)