Elfsong committed on
Commit
6768a50
·
1 Parent(s): 329ef45

feat: Implement dynamic model launching with GPU mapping and update chatbot response handling to include thinking mode functionality.

Browse files
Files changed (1) hide show
  1. app.py +59 -44
app.py CHANGED
@@ -5,24 +5,52 @@
5
 
6
  import os
7
  import json
8
- import spaces
9
  import datetime
10
  import gradio as gr
11
  import pandas as pd
 
 
12
  from pathlib import Path
13
  from huggingface_hub import CommitScheduler
14
  from huggingface_hub import InferenceClient
15
 
16
-
17
  HF_TOKEN = os.getenv("HF_TOKEN")
18
 
19
- MODELS = [
20
- "Local-Model-1",
21
- "Local-Model-2",
22
- "Elfsong/VLM-iter_0000500",
23
- "Elfsong/VLM-iter_0001000",
24
- "Elfsong/VLM-iter_0001500",
25
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  DATA_DIR = Path("logs")
28
  DATA_DIR.mkdir(exist_ok=True)
@@ -49,25 +77,23 @@ def save_feedback(model_name, history, feedback_data: gr.LikeData):
49
 
50
  print(f"Feedback logged for {model_name}")
51
 
52
- @spaces.GPU
53
- def model_inference(user_message, history, model_name, system_message, max_tokens, temperature, top_p, oauth_token: gr.OAuthToken | None, local_endpoint: str):
54
  if not user_message or user_message.strip() == "":
55
  yield history, ""
56
  return
57
 
58
- token = oauth_token.token if oauth_token else None
59
 
60
  if model_name.startswith("Local-"):
 
61
  client = InferenceClient(base_url=local_endpoint, token="vllm-token")
62
  else:
63
  client = InferenceClient(token=token, model=model_name)
64
 
65
- # Construct message list
66
  history.append({"role": "user", "content": user_message})
67
  history.append({"role": "assistant", "content": ""})
68
 
69
- # Construct API messages (including system prompt)
70
- api_messages = [{"role": "system", "content": system_message}] + history[:-1]
71
 
72
  try:
73
  stream = client.chat_completion(
@@ -101,51 +127,40 @@ def model_inference(user_message, history, model_name, system_message, max_token
101
  with gr.Blocks() as demo:
102
  with gr.Sidebar():
103
  gr.Markdown("## Configuration")
104
- gr.LoginButton()
105
-
106
- # Local vLLM endpoint setting
107
- local_endpoint_a = gr.Textbox(
108
- value="http://localhost:8000/v1",
109
- label="Local vLLM Endpoint A",
110
- placeholder="http://127.0.0.1:8000/v1"
111
- )
112
-
113
- local_endpoint_b = gr.Textbox(
114
- value="http://localhost:8001/v1",
115
- label="Local vLLM Endpoint B",
116
- placeholder="http://127.0.0.1:8001/v1"
117
- )
118
  system_msg = gr.Textbox(value="You are a helpful assistant.", label="System Prompt")
 
119
  max_t = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens")
120
- temp = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
121
- top_p_val = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p")
122
 
123
  gr.Markdown("# ⚔️ Chatbot Arena")
124
 
125
  with gr.Row():
126
  # --- Model A ---
127
  with gr.Column():
128
- model_a_name = gr.Dropdown(MODELS, label="Model A", value=MODELS[0])
129
- chatbot_a = gr.Chatbot(label="Model A Output", type="messages")
130
  msg_a = gr.Textbox(placeholder="Send message to Model A...", label="Model A Input")
131
- # btn_a = gr.Button("Send to Model A")
132
 
133
  # --- Model B ---
134
  with gr.Column():
135
- model_b_name = gr.Dropdown(MODELS, label="Model B", value=MODELS[1])
136
- chatbot_b = gr.Chatbot(label="Model B Output", type="messages")
137
  msg_b = gr.Textbox(placeholder="Send message to Model B...", label="Model B Input")
138
- # btn_b = gr.Button("Send to Model B")
139
 
140
  # --- Bind Events ---
141
- a_inputs = [msg_a, chatbot_a, model_a_name, system_msg, max_t, temp, top_p_val, local_endpoint_a]
142
- msg_a.submit(model_inference, a_inputs, [chatbot_a, msg_a])
143
- # btn_a.click(model_inference, a_inputs, [chatbot_a, msg_a])
144
  chatbot_a.like(save_feedback, [model_a_name, chatbot_a], None)
145
 
146
- b_inputs = [msg_b, chatbot_b, model_b_name, system_msg, max_t, temp, top_p_val, local_endpoint_b]
147
- msg_b.submit(model_inference, b_inputs, [chatbot_b, msg_b])
148
- # btn_b.click(model_inference, b_inputs, [chatbot_b, msg_b])
149
  chatbot_b.like(save_feedback, [model_b_name, chatbot_b], None)
150
 
151
 
@@ -159,4 +174,4 @@ with gr.Blocks() as demo:
159
  )
160
 
161
  if __name__ == "__main__":
162
- demo.launch(share=False)
 
5
 
6
  import os
7
  import json
 
8
  import datetime
9
  import gradio as gr
10
  import pandas as pd
11
+ import subprocess
12
+ import time
13
  from pathlib import Path
14
  from huggingface_hub import CommitScheduler
15
  from huggingface_hub import InferenceClient
16
 
 
17
  HF_TOKEN = os.getenv("HF_TOKEN")
18
 
19
# Registry mapping a UI-facing key ("Local-Model-<iter>") to the local
# OpenAI-compatible endpoint URL served by vLLM for that checkpoint.
MODELS = {}

# One vLLM server per checkpoint: key = GPU id to pin the server to,
# value = training iteration number of the checkpoint served on that GPU.
model_gpu_mapping = {
    0: 1000,
    1: 2000,
}

# Bug fix: the per-model log files below are opened before DATA_DIR.mkdir()
# runs later in this file, so ensure the directory exists up front.
os.makedirs("logs", exist_ok=True)

# Keep the Popen handles so the servers can be inspected or terminated
# later instead of being orphaned.
vllm_processes = []

for index, (gpu_id, iter_num) in enumerate(model_gpu_mapping.items()):
    formatted_iter_num = f"{iter_num:07d}"
    model_name = f"Elfsong/VLM_stage_2_iter_{formatted_iter_num}"
    key = f"Local-Model-{iter_num:05d}"

    # Sequential ports starting at 9000, one per launched server.
    port = 9000 + index
    print(f"Launching {model_name} on port {port} (GPU {gpu_id})")

    # One log file per model. The child process inherits the file
    # descriptor, so the parent's handle can (and should) be closed
    # immediately after Popen to avoid leaking it.
    log_file = open(f"./logs/vllm_{formatted_iter_num}.log", "w")
    try:
        proc = subprocess.Popen(
            [
                "python", "-m", "vllm.entrypoints.openai.api_server",
                "--model", model_name,
                "--port", str(port),
                "--quantization", "bitsandbytes",
                "--gpu-memory-utilization", "0.9",
                "--trust-remote-code",
            ],
            # Pin each server to its own GPU.
            env={**os.environ, "CUDA_VISIBLE_DEVICES": str(gpu_id)},
            stdout=log_file,
            stderr=log_file,
        )
    finally:
        log_file.close()
    vllm_processes.append(proc)

    # NOTE(review): a fixed 10s wait is almost certainly too short for vLLM
    # to finish loading a model; requests may fail until the server is
    # actually ready. Consider polling the server's /v1/models endpoint
    # instead -- TODO confirm acceptable startup latency.
    time.sleep(10)
    MODELS[key] = f"http://localhost:{port}/v1"
54
 
55
  DATA_DIR = Path("logs")
56
  DATA_DIR.mkdir(exist_ok=True)
 
77
 
78
  print(f"Feedback logged for {model_name}")
79
 
80
+ def bot_response(user_message, history, model_name, system_message, thinking_mode, max_tokens, temperature, top_p):
 
81
  if not user_message or user_message.strip() == "":
82
  yield history, ""
83
  return
84
 
85
+ token = HF_TOKEN
86
 
87
  if model_name.startswith("Local-"):
88
+ local_endpoint = MODELS.get(model_name)
89
  client = InferenceClient(base_url=local_endpoint, token="vllm-token")
90
  else:
91
  client = InferenceClient(token=token, model=model_name)
92
 
 
93
  history.append({"role": "user", "content": user_message})
94
  history.append({"role": "assistant", "content": ""})
95
 
96
+ api_messages = [{"role": "system", "content": system_message + "/set think" if thinking_mode else "/set nothink"}] + history[:-1]
 
97
 
98
  try:
99
  stream = client.chat_completion(
 
127
  with gr.Blocks() as demo:
128
  with gr.Sidebar():
129
  gr.Markdown("## Configuration")
130
+ # gr.LoginButton()
131
+
 
 
 
 
 
 
 
 
 
 
 
 
132
  system_msg = gr.Textbox(value="You are a helpful assistant.", label="System Prompt")
133
+ thinking_mode = gr.Checkbox(value=False, label="Thinking Mode")
134
  max_t = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens")
135
+ temp = gr.Slider(minimum=0.0, maximum=2.0, value=0.0, step=0.05, label="Temperature")
136
+ top_p_val = gr.Slider(minimum=0.0, maximum=1.0, value=1.0, step=0.05, label="Top-p")
137
 
138
  gr.Markdown("# ⚔️ Chatbot Arena")
139
 
140
  with gr.Row():
141
  # --- Model A ---
142
  with gr.Column():
143
+ model_a_name = gr.Dropdown(list(MODELS.keys()), label="Model A", value=list(MODELS.keys())[0])
144
+ chatbot_a = gr.Chatbot(label="Model A Output")
145
  msg_a = gr.Textbox(placeholder="Send message to Model A...", label="Model A Input")
146
+ btn_a = gr.Button("Send to Model A")
147
 
148
  # --- Model B ---
149
  with gr.Column():
150
+ model_b_name = gr.Dropdown(list(MODELS.keys()), label="Model B", value=list(MODELS.keys())[-1])
151
+ chatbot_b = gr.Chatbot(label="Model B Output")
152
  msg_b = gr.Textbox(placeholder="Send message to Model B...", label="Model B Input")
153
+ btn_b = gr.Button("Send to Model B")
154
 
155
  # --- Bind Events ---
156
+ a_inputs = [msg_a, chatbot_a, model_a_name, system_msg, thinking_mode, max_t, temp, top_p_val]
157
+ msg_a.submit(bot_response, a_inputs, [chatbot_a, msg_a])
158
+ btn_a.click(bot_response, a_inputs, [chatbot_a, msg_a])
159
  chatbot_a.like(save_feedback, [model_a_name, chatbot_a], None)
160
 
161
+ b_inputs = [msg_b, chatbot_b, model_b_name, system_msg, thinking_mode, max_t, temp, top_p_val]
162
+ msg_b.submit(bot_response, b_inputs, [chatbot_b, msg_b])
163
+ btn_b.click(bot_response, b_inputs, [chatbot_b, msg_b])
164
  chatbot_b.like(save_feedback, [model_b_name, chatbot_b], None)
165
 
166
 
 
174
  )
175
 
176
  if __name__ == "__main__":
177
+ demo.launch(share=True)