# Arena / app.py — Hugging Face Space by Elfsong (commit 24170ac)
# refactor: Update model GPU mapping to a list format for improved readability
# and adjust GPU memory utilization settings for better performance during
# model launches.
# coding: utf-8
# Author: Du Mingzhe (dumingzhex@gmail.com)
# Date: 2025-12-21
import os
import json
import datetime
import gradio as gr
import pandas as pd
import subprocess
import time
from pathlib import Path
from huggingface_hub import CommitScheduler
from huggingface_hub import InferenceClient
HF_TOKEN = os.getenv("HF_TOKEN")

# Arena display key -> base URL of the local OpenAI-compatible endpoint.
MODELS = dict()

# Launch models via vLLM.
# Each entry is (gpu_id, training_iteration); two checkpoints share each GPU,
# which is why --gpu-memory-utilization is capped at 0.4 per server below.
model_gpu_mapping = [
    (0, 1000), (0, 1500),
    (1, 2000), (1, 2500),
    (2, 3000), (2, 3500),
    (3, 4000), (3, 4500),
    (4, 5000), (4, 5500),
    (5, 6000), (5, 6500),
    (6, 7000), (6, 7500),
]

# FIX: the ./logs directory must exist before the per-model log files are
# opened below; previously it was only created further down the file, so a
# fresh checkout crashed here with FileNotFoundError.
Path("./logs").mkdir(exist_ok=True)

for index, (gpu_id, iter_num) in enumerate(model_gpu_mapping):
    formatted_iter_num = f"{iter_num:07d}"
    model_name = f"Elfsong/VLM_stage_2_iter_{formatted_iter_num}"
    arena_key = f"Local-Model-{iter_num:05d}"
    port = 9000 + index  # one port per checkpoint, starting at 9000
    print(f"🚀 Launching {model_name} on port {port} (GPU {gpu_id}) ...")
    log_file = open(f"./logs/vllm_{formatted_iter_num}.log", "w")
    subprocess.Popen(
        [
            "python", "-m", "vllm.entrypoints.openai.api_server",
            "--model", model_name,
            "--port", str(port),
            "--quantization", "bitsandbytes",
            "--gpu-memory-utilization", "0.4",
            "--trust-remote-code",
        ],
        # Pin each server process to its assigned GPU.
        env={**os.environ, "CUDA_VISIBLE_DEVICES": str(gpu_id)},
        stdout=log_file,
        stderr=log_file,
    )
    # FIX: the child process inherits its own copy of the log fd, so close
    # the parent's handle instead of leaking one open file per model.
    log_file.close()
    time.sleep(5)  # Wait for initialization
    MODELS[arena_key] = f"http://localhost:{port}/v1"
print(f"✅ Launched {len(MODELS)} models. Check logs in ./logs/ directory.")
# Directory the feedback log lives in; note this is the same ./logs folder
# the vLLM server logs are written to above.
DATA_DIR = Path("logs")
DATA_DIR.mkdir(exist_ok=True)
# Append-only JSONL file of user votes (one JSON object per line).
FEEDBACK_FILE = DATA_DIR / "feedback.jsonl"
# Background job that periodically commits everything under DATA_DIR to a
# Hugging Face dataset repo.
# NOTE(review): because folder_path is the shared ./logs directory, the vLLM
# server logs are uploaded alongside the feedback file — confirm intended.
scheduler = CommitScheduler(
    repo_id="Elfsong/arena_feedback",
    repo_type="dataset",
    folder_path=DATA_DIR,
    every=5,  # Sync every 5 minutes
)
def save_feedback(model_name, history, feedback_data: gr.LikeData):
    """Append one like/dislike vote to the JSONL feedback log.

    Args:
        model_name: Arena key of the model the vote applies to.
        history: Full conversation as a list of {"role", "content"} dicts.
        feedback_data: Gradio like-event payload carrying the message index,
            the vote value, and the liked/disliked flag.
    """
    new_entry = {
        # FIX: timezone-aware UTC timestamp — naive local time made entries
        # from different hosts/timezones ambiguous in the shared dataset.
        "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "model_name": model_name,
        "message_index": feedback_data.index,
        "vote": feedback_data.value,
        "is_liked": feedback_data.liked,
        "conversation": history,
    }
    # Append-only JSONL: one JSON object per line; keep non-ASCII readable.
    with open(FEEDBACK_FILE, "a", encoding="utf-8") as f:
        f.write(json.dumps(new_entry, ensure_ascii=False) + "\n")
    print(f"Feedback logged for {model_name}")
def bot_response(user_message, history, model_name, system_message, thinking_mode, max_tokens, temperature, top_p):
    """Stream one assistant turn into *history*, yielding UI updates.

    Generator yielding (history, textbox_update) pairs: intermediate yields
    keep the input box disabled while tokens stream in; the final yield
    clears the box and re-enables it.

    Args:
        user_message: Raw text from the input box; blank input is a no-op.
        history: Chat history as a list of {"role", "content"} dicts
            (mutated in place).
        model_name: Arena key ("Local-...") or a Hugging Face Hub model id.
        system_message: System prompt prepended to every request.
        thinking_mode: Appends "/set think" (else "/set nothink") to the
            system prompt to toggle the model's reasoning mode.
        max_tokens: Cap on generated tokens per turn.
        temperature: Sampling temperature forwarded to the backend.
        top_p: Nucleus-sampling parameter forwarded to the backend.
    """
    if not user_message or user_message.strip() == "":
        yield history, ""
        return
    token = HF_TOKEN
    if model_name.startswith("Local-"):
        # Local vLLM servers speak the OpenAI-compatible API; the token is a
        # placeholder since no auth is configured on the local endpoints.
        local_endpoint = MODELS.get(model_name)
        client = InferenceClient(base_url=local_endpoint, token="vllm-token")
    else:
        client = InferenceClient(token=token, model=model_name)
    history.append({"role": "user", "content": user_message})
    history.append({"role": "assistant", "content": ""})
    # FIX: parenthesize the conditional. Previously
    #   system_message + "/set think" if thinking_mode else "/set nothink"
    # parsed as (system_message + "/set think") if thinking_mode else
    # "/set nothink", silently discarding the system prompt whenever
    # thinking_mode was False.
    system_content = system_message + ("/set think" if thinking_mode else "/set nothink")
    api_messages = [{"role": "system", "content": system_content}] + history[:-1]
    try:
        stream = client.chat_completion(
            api_messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
            # NOTE(review): for local endpoints this forwards the arena key,
            # not the real checkpoint path — confirm vLLM accepts it.
            model=model_name,
        )
        response_text = ""
        for chunk in stream:
            if not chunk.choices or len(chunk.choices) == 0:
                continue
            token_content = chunk.choices[0].delta.content
            if token_content is not None:
                response_text += token_content
                history[-1]["content"] = response_text
                # Continuously yield update UI, while keeping input box unavailable to prevent double clicks
                yield history, gr.update(interactive=False)
    except Exception as e:
        # If error, display error message in assistant dialog
        history[-1]["content"] = f"**Error:** {str(e)}"
    # --- Final Yield: Restore input box availability and clear content ---
    yield history, gr.update(value="", interactive=True)
# --- Gradio UI: two independent chat panels (Model A / Model B) sharing one
# set of sampling controls, for side-by-side comparison of two checkpoints. ---
with gr.Blocks() as demo:
    with gr.Sidebar():
        gr.Markdown("## Configuration")
        # gr.LoginButton()
        system_msg = gr.Textbox(value="You are a helpful assistant.", label="System Prompt")
        thinking_mode = gr.Checkbox(value=False, label="Thinking Mode")
        max_t = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens")
        temp = gr.Slider(minimum=0.0, maximum=2.0, value=0.0, step=0.05, label="Temperature")
        top_p_val = gr.Slider(minimum=0.0, maximum=1.0, value=1.0, step=0.05, label="Top-p")
    gr.Markdown("# โš”๏ธ Chatbot Arena")
    with gr.Row():
        # --- Model A ---
        with gr.Column():
            # Default A to the first launched checkpoint, B to the last.
            model_a_name = gr.Dropdown(list(MODELS.keys()), label="Model A", value=list(MODELS.keys())[0])
            chatbot_a = gr.Chatbot(label="Model A Output")
            msg_a = gr.Textbox(placeholder="Send message to Model A...", label="Model A Input")
            btn_a = gr.Button("Send to Model A")
        # --- Model B ---
        with gr.Column():
            model_b_name = gr.Dropdown(list(MODELS.keys()), label="Model B", value=list(MODELS.keys())[-1])
            chatbot_b = gr.Chatbot(label="Model B Output")
            msg_b = gr.Textbox(placeholder="Send message to Model B...", label="Model B Input")
            btn_b = gr.Button("Send to Model B")
    # --- Bind Events ---
    # Enter in the textbox and the Send button both trigger the streaming
    # handler; the like control on each chat logs a vote via save_feedback.
    a_inputs = [msg_a, chatbot_a, model_a_name, system_msg, thinking_mode, max_t, temp, top_p_val]
    msg_a.submit(bot_response, a_inputs, [chatbot_a, msg_a])
    btn_a.click(bot_response, a_inputs, [chatbot_a, msg_a])
    chatbot_a.like(save_feedback, [model_a_name, chatbot_a], None)
    b_inputs = [msg_b, chatbot_b, model_b_name, system_msg, thinking_mode, max_t, temp, top_p_val]
    msg_b.submit(bot_response, b_inputs, [chatbot_b, msg_b])
    btn_b.click(bot_response, b_inputs, [chatbot_b, msg_b])
    chatbot_b.like(save_feedback, [model_b_name, chatbot_b], None)
    def clear_chats():
        # Reset both panels to an empty conversation.
        return [], []
    gr.Button("๐Ÿ—‘๏ธ Clear Chats").click(
        fn=clear_chats,
        inputs=None,
        outputs=[chatbot_a, chatbot_b]
    )
if __name__ == "__main__":
    # Bind to all interfaces so the arena is reachable from other hosts;
    # no public Gradio share link is created.
    demo.launch(server_name="0.0.0.0", share=False)