Spaces:

Elfsong
/

Arena

Build error

App Files Files Community

Elfsong committed 30 days ago

Commit

6768a50

1 Parent(s): 329ef45

feat: Implement dynamic model launching with GPU mapping and update chatbot response handling to include thinking mode functionality.

Browse files

Files changed (1) hide show

app.py +59 -44

app.py CHANGED Viewed

@@ -5,24 +5,52 @@
 import os
 import json
-import spaces
 import datetime
 import gradio as gr
 import pandas as pd
 from pathlib import Path
 from huggingface_hub import CommitScheduler
 from huggingface_hub import InferenceClient
 HF_TOKEN = os.getenv("HF_TOKEN")
-MODELS = [
-    "Local-Model-1",
-    "Local-Model-2",
-    "Elfsong/VLM-iter_0000500",
-    "Elfsong/VLM-iter_0001000",
-    "Elfsong/VLM-iter_0001500",
-]
 DATA_DIR = Path("logs")
 DATA_DIR.mkdir(exist_ok=True)
@@ -49,25 +77,23 @@ def save_feedback(model_name, history, feedback_data: gr.LikeData):
     print(f"Feedback logged for {model_name}")
-@spaces.GPU
-def model_inference(user_message, history, model_name, system_message, max_tokens, temperature, top_p, oauth_token: gr.OAuthToken | None, local_endpoint: str):
     if not user_message or user_message.strip() == "":
         yield history, ""
         return
-    token = oauth_token.token if oauth_token else None
     if model_name.startswith("Local-"):
         client = InferenceClient(base_url=local_endpoint, token="vllm-token")
     else:
         client = InferenceClient(token=token, model=model_name)
-    # Construct message list
     history.append({"role": "user", "content": user_message})
     history.append({"role": "assistant", "content": ""})
-    # Construct API messages (including system prompt)
-    api_messages = [{"role": "system", "content": system_message}] + history[:-1]
     try:
         stream = client.chat_completion(
@@ -101,51 +127,40 @@ def model_inference(user_message, history, model_name, system_message, max_token
 with gr.Blocks() as demo:
     with gr.Sidebar():
         gr.Markdown("## Configuration")
-        gr.LoginButton()
-        # Local vLLM endpoint setting
-        local_endpoint_a = gr.Textbox(
-            value="http://localhost:8000/v1",
-            label="Local vLLM Endpoint A",
-            placeholder="http://127.0.0.1:8000/v1"
-        )
-        local_endpoint_b = gr.Textbox(
-            value="http://localhost:8001/v1",
-            label="Local vLLM Endpoint B",
-            placeholder="http://127.0.0.1:8001/v1"
-        )
         system_msg = gr.Textbox(value="You are a helpful assistant.", label="System Prompt")
         max_t = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens")
-        temp = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
-        top_p_val = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p")
     gr.Markdown("# ⚔️ Chatbot Arena")
     with gr.Row():
         # --- Model A ---
         with gr.Column():
-            model_a_name = gr.Dropdown(MODELS, label="Model A", value=MODELS[0])
-            chatbot_a = gr.Chatbot(label="Model A Output", type="messages")
             msg_a = gr.Textbox(placeholder="Send message to Model A...", label="Model A Input")
-            # btn_a = gr.Button("Send to Model A")
         # --- Model B ---
         with gr.Column():
-            model_b_name = gr.Dropdown(MODELS, label="Model B", value=MODELS[1])
-            chatbot_b = gr.Chatbot(label="Model B Output", type="messages")
             msg_b = gr.Textbox(placeholder="Send message to Model B...", label="Model B Input")
-            # btn_b = gr.Button("Send to Model B")
     # --- Bind Events ---
-    a_inputs = [msg_a, chatbot_a, model_a_name, system_msg, max_t, temp, top_p_val, local_endpoint_a]
-    msg_a.submit(model_inference, a_inputs, [chatbot_a, msg_a])
-    # btn_a.click(model_inference, a_inputs, [chatbot_a, msg_a])
     chatbot_a.like(save_feedback, [model_a_name, chatbot_a], None)
-    b_inputs = [msg_b, chatbot_b, model_b_name, system_msg, max_t, temp, top_p_val, local_endpoint_b]
-    msg_b.submit(model_inference, b_inputs, [chatbot_b, msg_b])
-    # btn_b.click(model_inference, b_inputs, [chatbot_b, msg_b])
     chatbot_b.like(save_feedback, [model_b_name, chatbot_b], None)
@@ -159,4 +174,4 @@ with gr.Blocks() as demo:
     )
 if __name__ == "__main__":
-    demo.launch(share=False)

 import os
 import json
 import datetime
 import gradio as gr
 import pandas as pd
+import subprocess
+import time
 from pathlib import Path
 from huggingface_hub import CommitScheduler
 from huggingface_hub import InferenceClient
 HF_TOKEN = os.getenv("HF_TOKEN")
+MODELS = dict()
+# Launch models via vLLM
+model_gpu_mapping = {
+    0: 1000,
+    1: 2000,
+}
+# Start one vLLM OpenAI-compatible server per (GPU, checkpoint) pair and
+# register its endpoint in MODELS under a "Local-Model-<iter>" key.
+for index, (gpu_id, iter_num) in enumerate(model_gpu_mapping.items()):
+    formatted_iter_num = f"{iter_num:07d}"
+    model_name = f"Elfsong/VLM_stage_2_iter_{formatted_iter_num}"
+    key = f"Local-Model-{iter_num:05d}"
+    port = 9000 + index
+    print(f"Launching {model_name} on port {port} (GPU {gpu_id})")
+    # The log directory is only created further down (DATA_DIR.mkdir), so make
+    # sure it exists before opening the per-model log file.
+    os.makedirs("./logs", exist_ok=True)
+    # Create a log file for each model. The child process receives its own
+    # copy of the descriptor, so the parent's handle is closed once Popen
+    # returns instead of being leaked for the lifetime of the app.
+    with open(f"./logs/vllm_{formatted_iter_num}.log", "w") as log_file:
+        subprocess.Popen(
+            [
+                "python", "-m", "vllm.entrypoints.openai.api_server",
+                "--model", model_name,
+                "--port", str(port),
+                "--quantization", "bitsandbytes",
+                "--gpu-memory-utilization", "0.9",
+                "--trust-remote-code",
+            ],
+            # Pin each server to a single GPU.
+            env={**os.environ, "CUDA_VISIBLE_DEVICES": str(gpu_id)},
+            stdout=log_file,
+            stderr=log_file,
+        )
+    time.sleep(10) # Wait for initialization
+    MODELS[key] = f"http://localhost:{port}/v1"
 DATA_DIR = Path("logs")
 DATA_DIR.mkdir(exist_ok=True)
     print(f"Feedback logged for {model_name}")
+def bot_response(user_message, history, model_name, system_message, thinking_mode, max_tokens, temperature, top_p):
     if not user_message or user_message.strip() == "":
         yield history, ""
         return
+    token = HF_TOKEN
     if model_name.startswith("Local-"):
+        local_endpoint = MODELS.get(model_name)
         client = InferenceClient(base_url=local_endpoint, token="vllm-token")
     else:
         client = InferenceClient(token=token, model=model_name)
     history.append({"role": "user", "content": user_message})
     history.append({"role": "assistant", "content": ""})
+    api_messages = [{"role": "system", "content": system_message + "/set think" if thinking_mode else "/set nothink"}] + history[:-1]
     try:
         stream = client.chat_completion(
 with gr.Blocks() as demo:
     with gr.Sidebar():
         gr.Markdown("## Configuration")
+        # gr.LoginButton()
         system_msg = gr.Textbox(value="You are a helpful assistant.", label="System Prompt")
+        thinking_mode = gr.Checkbox(value=False, label="Thinking Mode")
         max_t = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens")
+        temp = gr.Slider(minimum=0.0, maximum=2.0, value=0.0, step=0.05, label="Temperature")
+        top_p_val = gr.Slider(minimum=0.0, maximum=1.0, value=1.0, step=0.05, label="Top-p")
     gr.Markdown("# ⚔️ Chatbot Arena")
     with gr.Row():
         # --- Model A ---
         with gr.Column():
+            model_a_name = gr.Dropdown(list(MODELS.keys()), label="Model A", value=list(MODELS.keys())[0])
+            chatbot_a = gr.Chatbot(label="Model A Output")
             msg_a = gr.Textbox(placeholder="Send message to Model A...", label="Model A Input")
+            btn_a = gr.Button("Send to Model A")
         # --- Model B ---
         with gr.Column():
+            model_b_name = gr.Dropdown(list(MODELS.keys()), label="Model B", value=list(MODELS.keys())[-1])
+            chatbot_b = gr.Chatbot(label="Model B Output")
             msg_b = gr.Textbox(placeholder="Send message to Model B...", label="Model B Input")
+            btn_b = gr.Button("Send to Model B")
     # --- Bind Events ---
+    a_inputs = [msg_a, chatbot_a, model_a_name, system_msg, thinking_mode, max_t, temp, top_p_val]
+    msg_a.submit(bot_response, a_inputs, [chatbot_a, msg_a])
+    btn_a.click(bot_response, a_inputs, [chatbot_a, msg_a])
     chatbot_a.like(save_feedback, [model_a_name, chatbot_a], None)
+    b_inputs = [msg_b, chatbot_b, model_b_name, system_msg, thinking_mode, max_t, temp, top_p_val]
+    msg_b.submit(bot_response, b_inputs, [chatbot_b, msg_b])
+    btn_b.click(bot_response, b_inputs, [chatbot_b, msg_b])
     chatbot_b.like(save_feedback, [model_b_name, chatbot_b], None)
     )
 if __name__ == "__main__":
+    demo.launch(share=True)