|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
import json |
|
|
import datetime |
|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import subprocess |
|
|
import time |
|
|
from pathlib import Path |
|
|
from huggingface_hub import CommitScheduler |
|
|
from huggingface_hub import InferenceClient |
|
|
|
|
|
# Hugging Face API token read from the environment; None when unset.
HF_TOKEN = os.getenv("HF_TOKEN")

# Maps an arena display key (e.g. "Local-Model-01000") to the base URL of
# the local vLLM OpenAI-compatible endpoint serving that checkpoint.
MODELS = {}

# (gpu_id, training_iteration) pairs: two checkpoints per GPU, iterations
# stepping by 500 from 1000 through 7500 across GPUs 0-6.
model_gpu_mapping = [(idx // 2, 1000 + 500 * idx) for idx in range(14)]
|
|
|
|
|
# Launch one vLLM OpenAI-compatible server per checkpoint (two per GPU),
# logging each server's output to ./logs/vllm_<iter>.log and registering
# its endpoint in MODELS under the arena key.
# BUG FIX: the logs directory was only created further down (DATA_DIR.mkdir),
# so the open() below crashed on a fresh checkout — create it first.
os.makedirs("./logs", exist_ok=True)

for index, (gpu_id, iter_num) in enumerate(model_gpu_mapping):
    formatted_iter_num = f"{iter_num:07d}"
    model_name = f"Elfsong/VLM_stage_2_iter_{formatted_iter_num}"
    arena_key = f"Local-Model-{iter_num:05d}"

    port = 9000 + index
    print(f"🚀 Launching {model_name} on port {port} (GPU {gpu_id}) ...")

    # BUG FIX: log file handles were leaked. The child process gets its own
    # duplicate of the descriptor at Popen time, so the parent can close its
    # copy immediately without losing any log output.
    with open(f"./logs/vllm_{formatted_iter_num}.log", "w") as log_file:
        subprocess.Popen(
            [
                "python", "-m", "vllm.entrypoints.openai.api_server",
                "--model", model_name,
                "--port", str(port),
                "--quantization", "bitsandbytes",
                "--gpu-memory-utilization", "0.4",
                "--trust-remote-code",
            ],
            # Pin each server to its assigned GPU.
            env={**os.environ, "CUDA_VISIBLE_DEVICES": str(gpu_id)},
            stdout=log_file,
            stderr=log_file,
        )

    # NOTE(review): 5s staggers the launches but is far shorter than a
    # typical vLLM model-load time — confirm callers tolerate servers that
    # are still warming up when the UI comes online.
    time.sleep(5)
    MODELS[arena_key] = f"http://localhost:{port}/v1"

print(f"✅ Launched {len(MODELS)} models. Check logs in ./logs/ directory.")
|
|
|
|
|
# Folder holding both the vLLM logs and the feedback file; the whole folder
# is synced to the Hub by the CommitScheduler below.
DATA_DIR = Path("logs")
DATA_DIR.mkdir(exist_ok=True)
# Append-only JSON-Lines file of user votes (written by save_feedback).
FEEDBACK_FILE = DATA_DIR / "feedback.jsonl"

# Background scheduler that commits DATA_DIR to the Hugging Face dataset
# repo on a fixed cadence.
# NOTE(review): because folder_path is the shared logs dir, this also
# uploads the raw vLLM logs alongside the feedback — confirm intended.
scheduler = CommitScheduler(
    repo_id="Elfsong/arena_feedback",
    repo_type="dataset",
    folder_path=DATA_DIR,
    every=5,  # minutes between commits
)
|
|
|
|
|
def save_feedback(model_name, history, feedback_data: gr.LikeData):
    """Append one like/dislike vote to FEEDBACK_FILE as a JSON line.

    Args:
        model_name: Arena key of the model the vote applies to.
        history: Full conversation (list of role/content dicts) at vote time.
        feedback_data: Gradio LikeData carrying the message index, the raw
            vote value, and the liked/disliked flag.
    """
    new_entry = {
        "timestamp": datetime.datetime.now().isoformat(),
        "model_name": model_name,
        "message_index": feedback_data.index,
        "vote": feedback_data.value,
        "is_liked": feedback_data.liked,
        "conversation": history,
    }
    # BUG FIX: hold the scheduler's lock while appending so we never write
    # to the file while CommitScheduler is mid-upload of the folder
    # (recommended usage in the huggingface_hub docs).
    with scheduler.lock:
        with open(FEEDBACK_FILE, "a", encoding="utf-8") as f:
            f.write(json.dumps(new_entry, ensure_ascii=False) + "\n")

    print(f"Feedback logged for {model_name}")
|
|
|
|
|
def bot_response(user_message, history, model_name, system_message, thinking_mode, max_tokens, temperature, top_p):
    """Stream a chat completion for one arena side, updating `history` in place.

    Generator used as a Gradio event handler: yields (history, textbox_update)
    pairs — the textbox stays disabled while tokens stream, then is cleared
    and re-enabled at the end. API errors are rendered into the assistant
    message instead of raising.

    Args:
        user_message: Text the user submitted (ignored if empty/whitespace).
        history: Conversation so far as a list of {"role", "content"} dicts.
        model_name: Arena key ("Local-…" routes to a local vLLM endpoint)
            or a Hub model id for the hosted Inference API.
        system_message: System prompt prefix.
        thinking_mode: When True appends "/set think" to the system prompt,
            otherwise "/set nothink".
        max_tokens, temperature, top_p: Sampling parameters.
    """
    # Ignore empty submissions, but still yield so outputs stay consistent.
    if not user_message or user_message.strip() == "":
        yield history, ""
        return

    token = HF_TOKEN

    if model_name.startswith("Local-"):
        # Local checkpoint: talk to its vLLM OpenAI-compatible endpoint.
        local_endpoint = MODELS.get(model_name)
        client = InferenceClient(base_url=local_endpoint, token="vllm-token")
    else:
        client = InferenceClient(token=token, model=model_name)

    history.append({"role": "user", "content": user_message})
    history.append({"role": "assistant", "content": ""})

    # BUG FIX: the original wrote `system_message + "/set think" if
    # thinking_mode else "/set nothink"`, which parses as
    # `(system_message + "/set think") if thinking_mode else "/set nothink"`
    # and silently DROPPED the system prompt whenever thinking mode was off.
    # Parenthesize so only the suffix is conditional.
    system_content = system_message + ("/set think" if thinking_mode else "/set nothink")
    api_messages = [{"role": "system", "content": system_content}] + history[:-1]

    try:
        # NOTE(review): for local models this sends the arena key (e.g.
        # "Local-Model-01000") as the `model` field; vLLM servers normally
        # expect the served model name — confirm the servers accept it.
        stream = client.chat_completion(
            api_messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
            model=model_name,
        )

        response_text = ""
        for chunk in stream:
            # Some keep-alive chunks arrive with no choices; skip them.
            if not chunk.choices or len(chunk.choices) == 0:
                continue

            token_content = chunk.choices[0].delta.content
            if token_content is not None:
                response_text += token_content
                history[-1]["content"] = response_text

                # Keep the input box disabled while tokens stream.
                yield history, gr.update(interactive=False)

    except Exception as e:
        # Surface the error inside the chat panel instead of crashing the app.
        history[-1]["content"] = f"**Error:** {str(e)}"

    # Clear and re-enable the input once streaming (or the error) finishes.
    yield history, gr.update(value="", interactive=True)
|
|
|
|
|
# Two-sided arena UI: pick a model per column, chat with each side
# independently, and record like/dislike votes via save_feedback.
with gr.Blocks() as demo:
    with gr.Sidebar():
        gr.Markdown("## Configuration")

        # Generation settings shared by both arena sides.
        system_msg = gr.Textbox(value="You are a helpful assistant.", label="System Prompt")
        thinking_mode = gr.Checkbox(value=False, label="Thinking Mode")
        max_t = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens")
        temp = gr.Slider(minimum=0.0, maximum=2.0, value=0.0, step=0.05, label="Temperature")
        top_p_val = gr.Slider(minimum=0.0, maximum=1.0, value=1.0, step=0.05, label="Top-p")

    # NOTE(review): heading emoji was mojibake in the original; repaired.
    gr.Markdown("# ⚔️ Chatbot Arena")

    with gr.Row():

        with gr.Column():
            model_a_name = gr.Dropdown(list(MODELS.keys()), label="Model A", value=list(MODELS.keys())[0])
            # BUG FIX: bot_response appends openai-style {"role", "content"}
            # dicts, which requires the "messages" chatbot format (the
            # default tuple format would render them incorrectly).
            chatbot_a = gr.Chatbot(label="Model A Output", type="messages")
            msg_a = gr.Textbox(placeholder="Send message to Model A...", label="Model A Input")
            btn_a = gr.Button("Send to Model A")

        with gr.Column():
            model_b_name = gr.Dropdown(list(MODELS.keys()), label="Model B", value=list(MODELS.keys())[-1])
            chatbot_b = gr.Chatbot(label="Model B Output", type="messages")
            msg_b = gr.Textbox(placeholder="Send message to Model B...", label="Model B Input")
            btn_b = gr.Button("Send to Model B")

    # Wire both submit paths (Enter key and button) to the streaming handler.
    a_inputs = [msg_a, chatbot_a, model_a_name, system_msg, thinking_mode, max_t, temp, top_p_val]
    msg_a.submit(bot_response, a_inputs, [chatbot_a, msg_a])
    btn_a.click(bot_response, a_inputs, [chatbot_a, msg_a])
    chatbot_a.like(save_feedback, [model_a_name, chatbot_a], None)

    b_inputs = [msg_b, chatbot_b, model_b_name, system_msg, thinking_mode, max_t, temp, top_p_val]
    msg_b.submit(bot_response, b_inputs, [chatbot_b, msg_b])
    btn_b.click(bot_response, b_inputs, [chatbot_b, msg_b])
    chatbot_b.like(save_feedback, [model_b_name, chatbot_b], None)

    def clear_chats():
        # Reset both chat histories to empty lists.
        return [], []

    # NOTE(review): button emoji was mojibake in the original; repaired.
    gr.Button("🗑️ Clear Chats").click(
        fn=clear_chats,
        inputs=None,
        outputs=[chatbot_a, chatbot_b]
    )
|
|
|
|
|
# Script entry point: serve the arena on all interfaces (Gradio's default
# port, 7860) without creating a public share link.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", share=False)
|
|
|