moevis committed
Commit 01ee73c · 1 Parent(s): 2cea1bc

Add Dockerfile, app.py, requirements.txt, and a startup script to build a Gradio multimodal chat interface

Files changed (4):
  1. Dockerfile +25 -0
  2. app.py +418 -72
  3. requirements.txt +2 -0
  4. start_gradio.sh +66 -0
Dockerfile ADDED
@@ -0,0 +1,25 @@
+ FROM stepfun2025/vllm:step-audio-2-v20250909
+
+ WORKDIR /root/app
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application files
+ COPY app.py .
+ COPY start_gradio.sh .
+
+ # Create the model directory
+ RUN mkdir -p /root/models
+
+ # Make the startup script executable
+ RUN chmod +x start_gradio.sh
+
+ # Expose the Gradio port
+ EXPOSE 7860
+
+ # Set environment variables
+ ENV VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
+
+ # Launch via the startup script
+ CMD ["./start_gradio.sh"]
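
A minimal build-and-run sketch for this image. The tag step-audio-2-gradio and the host cache path are placeholders, and --gpus all assumes the NVIDIA Container Toolkit is available; vLLM needs GPU access at model load time.

    # Build from the directory containing this Dockerfile
    docker build -t step-audio-2-gradio .

    # Publish the Gradio port and persist downloaded weights across runs
    docker run --gpus all -p 7860:7860 \
        -v /path/to/model-cache:/root/models \
        step-audio-2-gradio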
app.py CHANGED
@@ -1,81 +1,427 @@
  import gradio as gr
- from vllm import LLM, SamplingParams
-
- llm = LLM(
-     model="stepfun-ai/Step-Audio-2-mini-Think",  # change to the model you need
-     trust_remote_code=True,
-     tensor_parallel_size=4,  # with multiple GPUs, set the parallel degree
-     # gpu_memory_utilization=0.9,  # GPU memory utilization
-     served_model_name="step-audio-2-mini-think",
-     tokenizer_mode="step_audio_2",
-     max_model_len=8192,
- )
-
-
- def respond(
-     message,
-     history: list[dict[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
-     hf_token: gr.OAuthToken,
- ):
-     """
-     Run inference locally with vLLM
-     """
-     # Build the conversation messages
-     messages = [{"role": "system", "content": system_message}]
-     messages.extend(history)
-     messages.append({"role": "user", "content": message})
-
-     # Set the sampling parameters
-     sampling_params = SamplingParams(
-         temperature=temperature,
-         top_p=top_p,
-         max_tokens=max_tokens,
-     )
-
-     # Run inference through vLLM's chat interface
-     outputs = llm.chat(
-         messages=messages,
-         sampling_params=sampling_params,
-         use_tqdm=False,
-     )
-
-     # Extract the generated text
-     response = outputs[0].outputs[0].text
-
-     # Simulate streaming output (yield character by character)
-     for i in range(1, len(response) + 1):
-         yield response[:i]
-
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- chatbot = gr.ChatInterface(
-     respond,
-     type="messages",
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
-
- with gr.Blocks() as demo:
-     with gr.Sidebar():
-         gr.LoginButton()
-     chatbot.render()
 
 
  if __name__ == "__main__":
-     demo.launch()
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Gradio multimodal chat interface: runs inference by calling vllm.LLM directly inside app.py
+ """
+
+ import base64
+ import os
+ import threading
+ import time
+ from typing import Optional, Tuple
+
  import gradio as gr
+
+ # Check whether vLLM mode is enabled
+ ENABLE_VLLM = os.getenv("ENABLE_VLLM", "true").lower() in ("true", "1", "yes")
+
+ if ENABLE_VLLM:
+     from vllm import LLM, SamplingParams
+ else:
+     LLM = None
+     SamplingParams = None
+     print("[INFO] Running in UI preview mode; vLLM is not loaded")
+
+ # Default configuration; can be overridden via environment variables or CLI flags
+ DEFAULT_MODEL_ID = os.getenv("MODEL_NAME", "stepfun-ai/Step-Audio-2-mini-Think")
+ DEFAULT_MODEL_PATH = os.getenv("MODEL_PATH", DEFAULT_MODEL_ID)
+ DEFAULT_TP = int(os.getenv("TENSOR_PARALLEL_SIZE", "4"))
+ DEFAULT_MAX_MODEL_LEN = int(os.getenv("MAX_MODEL_LEN", "8192"))
+ DEFAULT_GPU_UTIL = float(os.getenv("GPU_MEMORY_UTILIZATION", "0.9"))
+ DEFAULT_TOKENIZER_MODE = os.getenv("TOKENIZER_MODE", "step_audio_2")
+ DEFAULT_SERVED_NAME = os.getenv("SERVED_MODEL_NAME", "step-audio-2-mini-think")
+
+ _llm: Optional[LLM] = None
+ _llm_lock = threading.Lock()
+ LLM_ARGS = {
+     "model": DEFAULT_MODEL_PATH,
+     "trust_remote_code": True,
+     "tensor_parallel_size": DEFAULT_TP,
+     "tokenizer_mode": DEFAULT_TOKENIZER_MODE,
+     "max_model_len": DEFAULT_MAX_MODEL_LEN,
+     "served_model_name": DEFAULT_SERVED_NAME,
+     "gpu_memory_utilization": DEFAULT_GPU_UTIL,
+ }
+
+
+ def encode_audio_to_base64(audio_path: Optional[str]) -> Optional[dict]:
+     """Encode an audio file as base64."""
+     if audio_path is None:
+         return None
+
+     try:
+         with open(audio_path, "rb") as audio_file:
+             audio_data = audio_file.read()
+         audio_base64 = base64.b64encode(audio_data).decode('utf-8')
+         # Infer the audio format from the file extension
+         ext = os.path.splitext(audio_path)[1].lower().lstrip('.')
+         if not ext:
+             ext = "wav"  # default format
+         return {
+             "data": audio_base64,
+             "format": ext
+         }
+     except Exception as e:
+         print(f"Error encoding audio: {e}")
+         return None
+
+
+ def format_messages(
+     system_prompt: str,
+     chat_history: list,
+     user_text: str,
+     audio_file: Optional[str]
+ ) -> list:
+     """Format the conversation into the OpenAI API message schema."""
+     messages = []
+
+     # Add the system prompt
+     if system_prompt and system_prompt.strip():
+         messages.append({
+             "role": "system",
+             "content": system_prompt.strip()
+         })
+
+     # Add the conversation history
+     for human, assistant in chat_history:
+         if human:
+             messages.append({"role": "user", "content": human})
+         if assistant:
+             messages.append({"role": "assistant", "content": assistant})
+
+     # Add the current user input
+     content_parts = []
+
+     # Text input
+     if user_text and user_text.strip():
+         content_parts.append({
+             "type": "text",
+             "text": user_text.strip()
+         })
+
+     # Audio input
+     if audio_file:
+         audio_data = encode_audio_to_base64(audio_file)
+         if audio_data:
+             content_parts.append({
+                 "type": "input_audio",
+                 "input_audio": audio_data
+             })
+
+     if content_parts:
+         # With a single text part, pass a plain string
+         if len(content_parts) == 1 and content_parts[0]["type"] == "text":
+             messages.append({
+                 "role": "user",
+                 "content": content_parts[0]["text"]
+             })
+         else:
+             messages.append({
+                 "role": "user",
+                 "content": content_parts
+             })
+
+     return messages
+
+
+ def chat_predict(
+     system_prompt: str,
+     user_text: str,
+     audio_file: Optional[str],
+     chat_history: list,
+     max_tokens: int,
+     temperature: float,
+     top_p: float
+ ) -> Tuple[list, str]:
+     """Run inference on the local vLLM engine."""
+     if not user_text and not audio_file:
+         return chat_history, "⚠ Please provide text or audio input"
+
+     # In preview mode, return a mock response
+     if not ENABLE_VLLM:
+         user_display = user_text if user_text else "[audio input]"
+         mock_response = f"[Preview mode] This is a mock reply. You said: {user_text[:50] if user_text else 'audio'}"
+         chat_history.append((user_display, mock_response))
+         return chat_history, "✓ Preview mode (vLLM disabled)"
+
+     messages = format_messages(system_prompt, chat_history, user_text, audio_file)
+     if not messages:
+         return chat_history, "⚠ No valid input"
+
+     try:
+         llm = _get_llm()
+         sampling_params = SamplingParams(
+             max_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+         )
+         start_time = time.time()
+         outputs = llm.chat(messages, sampling_params=sampling_params, use_tqdm=False)
+         latency = time.time() - start_time
+
+         if not outputs or not outputs[0].outputs:
+             return chat_history, "⚠ The model returned no output"
+
+         assistant_message = outputs[0].outputs[0].text
+         user_display = user_text if user_text else "[audio input]"
+         chat_history.append((user_display, assistant_message))
+         status = f"✓ Inference finished in {latency:.2f}s"
+         return chat_history, status
+     except Exception as e:
+         import traceback
+         traceback.print_exc()
+         return chat_history, f"✗ Inference failed: {e}"
+
+
+ def _get_llm() -> LLM:
+     """Initialize the LLM as a singleton."""
+     if not ENABLE_VLLM:
+         raise RuntimeError("vLLM is disabled; the model cannot be loaded")
+
+     global _llm
+     if _llm is not None:
+         return _llm
+
+     with _llm_lock:
+         if _llm is not None:
+             return _llm
+         print(f"[LLM] Initializing with args: {LLM_ARGS}")
+         _llm = LLM(**LLM_ARGS)
+         return _llm
+
+
+ def _set_llm_args(**kwargs) -> None:
+     """Update the LLM initialization arguments."""
+     global LLM_ARGS, _llm
+     LLM_ARGS = kwargs
+     _llm = None  # force a reload with the new configuration
+
+
+ def check_model_status() -> str:
+     """Report the current model loading status."""
+     if not ENABLE_VLLM:
+         return "⚙ UI preview mode (vLLM disabled)"
+
+     model_path = LLM_ARGS["model"]
+     if _llm is None:
+         return f"Waiting to load: {model_path}"
+     return f"✓ Model loaded: {model_path}"
+
+
+ def warmup_model() -> str:
+     """Eagerly load the model."""
+     if not ENABLE_VLLM:
+         return "⚙ UI preview mode (vLLM disabled)"
+
+     try:
+         _get_llm()
+         return check_model_status()
+     except Exception as exc:
+         import traceback
+         traceback.print_exc()
+         return f"✗ Model loading failed: {exc}"
+
+
+ # Build the Gradio UI
+ with gr.Blocks(title="Step Audio 2 Chat", theme=gr.themes.Soft()) as demo:
+     gr.Markdown(
+         """
+         # Step Audio 2 Chat Interface
+
+         A chat interface with text and audio input, running inference directly on a local vLLM engine.
+         """
+     )
+
+     # Model status
+     with gr.Row():
+         status_text = gr.Textbox(
+             label="Model Status",
+             value="Checking...",
+             interactive=False
+         )
+         check_btn = gr.Button("Load/Check Model", variant="secondary")
+
+     with gr.Row():
+         # Left: input area
+         with gr.Column(scale=1):
+             gr.Markdown("### Input Settings")
+
+             system_prompt = gr.Textbox(
+                 label="System Prompt",
+                 placeholder="Enter a system prompt...",
+                 lines=3,
+                 value=""
+             )
+
+             user_text = gr.Textbox(
+                 label="Text Input",
+                 placeholder="Enter your message...",
+                 lines=3
+             )
+
+             audio_file = gr.Audio(
+                 label="Audio Input",
+                 type="filepath",
+                 sources=["upload", "microphone"]
+             )
+
+             with gr.Row():
+                 max_tokens = gr.Slider(
+                     label="Max Tokens",
+                     minimum=1,
+                     maximum=8192,
+                     value=2048,
+                     step=1
+                 )
+
+             with gr.Row():
+                 temperature = gr.Slider(
+                     label="Temperature",
+                     minimum=0.0,
+                     maximum=2.0,
+                     value=0.7,
+                     step=0.1
+                 )
+
+                 top_p = gr.Slider(
+                     label="Top P",
+                     minimum=0.0,
+                     maximum=1.0,
+                     value=0.9,
+                     step=0.05
+                 )
+
+             submit_btn = gr.Button("Submit", variant="primary", size="lg")
+             clear_btn = gr.Button("Clear", variant="secondary")
+
+         # Right: chat history
+         with gr.Column(scale=1):
+             gr.Markdown("### Chat History")
+             chatbot = gr.Chatbot(
+                 label="Conversation",
+                 height=600,
+                 show_copy_button=True
+             )
+
+     # Event bindings
+     check_btn.click(fn=warmup_model, outputs=status_text)
+
+     submit_btn.click(
+         fn=chat_predict,
+         inputs=[
+             system_prompt,
+             user_text,
+             audio_file,
+             chatbot,
+             max_tokens,
+             temperature,
+             top_p
+         ],
+         outputs=[chatbot, status_text]
+     )
+
+     clear_btn.click(
+         fn=lambda: ([], "", None),
+         outputs=[chatbot, user_text, audio_file]
+     )
+
+     # Show the status when the page loads
+     demo.load(fn=check_model_status, outputs=status_text)
 
 
  if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser(description="Step Audio 2 Gradio Chat Interface")
+     parser.add_argument(
+         "--host",
+         type=str,
+         default="0.0.0.0",
+         help="server host address"
+     )
+     parser.add_argument(
+         "--port",
+         type=int,
+         default=7860,
+         help="server port"
+     )
+     parser.add_argument(
+         "--model",
+         type=str,
+         default=DEFAULT_MODEL_PATH,
+         help="model name or local path"
+     )
+     parser.add_argument(
+         "--tensor-parallel-size",
+         type=int,
+         default=DEFAULT_TP,
+         help="tensor parallel degree"
+     )
+     parser.add_argument(
+         "--max-model-len",
+         type=int,
+         default=DEFAULT_MAX_MODEL_LEN,
+         help="maximum context length"
+     )
+     parser.add_argument(
+         "--gpu-memory-utilization",
+         type=float,
+         default=DEFAULT_GPU_UTIL,
+         help="GPU memory utilization"
+     )
+     parser.add_argument(
+         "--tokenizer-mode",
+         type=str,
+         default=DEFAULT_TOKENIZER_MODE,
+         help="tokenizer mode"
+     )
+     parser.add_argument(
+         "--served-model-name",
+         type=str,
+         default=DEFAULT_SERVED_NAME,
+         help="model name exposed to clients"
+     )
+     parser.add_argument(
+         "--no-vllm",
+         action="store_true",
+         help="disable vLLM and launch the UI preview only"
+     )
+
+     args = parser.parse_args()
+
+     # --no-vllm overrides the environment variable (this block runs at module
+     # scope, so plain assignment rebinds the global)
+     if args.no_vllm:
+         ENABLE_VLLM = False
+         print("[INFO] vLLM disabled; running in UI preview mode")
+
+     _set_llm_args(
+         model=args.model,
+         trust_remote_code=True,
+         tensor_parallel_size=args.tensor_parallel_size,
+         tokenizer_mode=args.tokenizer_mode,
+         max_model_len=args.max_model_len,
+         served_model_name=args.served_model_name,
+         gpu_memory_utilization=args.gpu_memory_utilization,
+     )
+
+     print("==========================================")
+     print("Step Audio 2 Gradio Chat")
+     if ENABLE_VLLM:
+         print("Mode: vLLM inference")
+         print(f"Model: {args.model}")
+         print(f"Tensor Parallel Size: {args.tensor_parallel_size}")
+         print(f"Max Model Len: {args.max_model_len}")
+         print(f"Tokenizer Mode: {args.tokenizer_mode}")
+         print(f"Served Model Name: {args.served_model_name}")
+     else:
+         print("Mode: UI preview (no vLLM)")
+     print(f"Gradio address: http://{args.host}:{args.port}")
+     print("==========================================")
+
+     demo.queue().launch(
+         server_name=args.host,
+         server_port=args.port,
+         share=False
+     )
+
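
For reference, two ways to launch app.py directly, using only the flags defined above; the local model path is illustrative and matches the default MODEL_DIR in start_gradio.sh:

    # UI preview mode: checks the layout without a GPU or vLLM
    python app.py --no-vllm --host 0.0.0.0 --port 7860

    # Full inference mode against a local copy of the model
    python app.py --model /root/models/Step-Audio-2-mini-Think \
        --tensor-parallel-size 4 --max-model-len 8192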
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ gradio>=4.0.0
+ huggingface-hub>=0.20.0
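
These two packages install on top of the base image, which presumably already ships vllm (app.py imports it, but it is not listed here). A quick sanity check, assuming pip and python are on PATH:

    pip install -r requirements.txt
    python -c "import gradio, huggingface_hub; print(gradio.__version__)"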
start_gradio.sh ADDED
@@ -0,0 +1,66 @@
+ #!/bin/bash
+ # Startup script: optionally download the model, then launch the Gradio app (which calls vllm.LLM internally)
+
+ set -euo pipefail
+
+ MODEL_REPO="${MODEL_REPO:-stepfun-ai/Step-Audio-2-mini-Think}"
+ MODEL_DIR="${MODEL_DIR:-/root/models/Step-Audio-2-mini-Think}"
+ PRELOAD_MODEL="${PRELOAD_MODEL:-1}"
+ GRADIO_PORT=${GRADIO_PORT:-7860}
+ HOST=${HOST:-0.0.0.0}
+ TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:-4}
+ MAX_MODEL_LEN=${MAX_MODEL_LEN:-8192}
+ GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.9}
+ TOKENIZER_MODE=${TOKENIZER_MODE:-step_audio_2}
+ SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-step-audio-2-mini-think}
+
+ echo "=========================================="
+ echo "Step Audio 2 Gradio startup script"
+ echo "MODEL_REPO: $MODEL_REPO"
+ echo "MODEL_DIR : $MODEL_DIR"
+ echo "PRELOAD_MODEL: $PRELOAD_MODEL"
+ echo "HOST/PORT: $HOST:$GRADIO_PORT"
+ echo "TP: $TENSOR_PARALLEL_SIZE | MAX_LEN: $MAX_MODEL_LEN"
+ echo "=========================================="
+
+ download_model() {
+     if command -v huggingface-cli &> /dev/null; then
+         echo "[Download] Using huggingface-cli"
+         huggingface-cli download "$MODEL_REPO" --local-dir "$MODEL_DIR" --local-dir-use-symlinks False
+     else
+         echo "[Download] Using python + huggingface_hub"
+         python3 -c "
+ from huggingface_hub import snapshot_download
+ print('Starting download: $MODEL_REPO')
+ snapshot_download(repo_id='$MODEL_REPO', local_dir='$MODEL_DIR', local_dir_use_symlinks=False)
+ print('Download complete')
+ "
+     fi
+ }
+
+ if [[ "$PRELOAD_MODEL" == "1" ]]; then
+     if [[ ! -d "$MODEL_DIR" ]] || [[ ! -f "$MODEL_DIR/config.json" ]]; then
+         echo "Model not ready; starting download..."
+         mkdir -p "$MODEL_DIR"
+         download_model
+     else
+         echo "Found local model: $MODEL_DIR"
+     fi
+     export MODEL_PATH="$MODEL_DIR"
+ else
+     echo "Skipping pre-download; loading by repository name"
+     export MODEL_PATH="${MODEL_PATH:-$MODEL_REPO}"
+ fi
+
+ echo "Model path: ${MODEL_PATH}"
+ echo "Starting Gradio..."
+
+ python app.py \
+     --host "$HOST" \
+     --port "$GRADIO_PORT" \
+     --model "$MODEL_PATH" \
+     --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
+     --max-model-len "$MAX_MODEL_LEN" \
+     --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
+     --tokenizer-mode "$TOKENIZER_MODE" \
+     --served-model-name "$SERVED_MODEL_NAME"
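
Every knob in the script is an environment variable, so a single-GPU run that skips the pre-download step might look like this (values are illustrative):

    PRELOAD_MODEL=0 TENSOR_PARALLEL_SIZE=1 GRADIO_PORT=8080 ./start_gradio.sh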