TearDropAi committed on
Commit
60c7d39
·
verified ·
1 Parent(s): 1f4097b

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +13 -3
  2. app.py +1189 -0
  3. gitattributes +35 -0
  4. requirements.txt +3 -0
README.md CHANGED
@@ -1,3 +1,13 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Qwen3.5 Omni Offline Demo
3
+ emoji: 🌍
4
+ colorFrom: purple
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 5.42.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
13
+ Open in ModelScope Studio: https://modelscope.cn/studios/Qwen/Qwen3.5-Omni-Offline-Demo
app.py ADDED
@@ -0,0 +1,1189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import os
4
+ import subprocess
5
+ import time
6
+ import uuid
7
+ from argparse import ArgumentParser
8
+
9
+ import gradio as gr
10
+ import gradio.processing_utils as processing_utils
11
+ import numpy as np
12
+ import oss2
13
+ import soundfile as sf
14
+ from gradio_client import utils as client_utils
15
+ from openai import OpenAI
16
+
17
# Retry policy for OSS requests: number of attempts, and the intended pause
# (in seconds) between two attempts.
OSS_RETRY = 10
OSS_RETRY_DELAY = 3
# Sample width, in bits, assumed for raw PCM audio.
WAV_BIT_RATE = 16
# Target sample rate in Hz. Environment variables are always strings, so the
# value is converted to int; otherwise arithmetic such as
# ``WAV_SAMPLE_RATE * (WAV_BIT_RATE // 8)`` would repeat the string instead of
# multiplying.
WAV_SAMPLE_RATE = int(os.environ.get("WAV_SAMPLE_RATE", 16000))
21
+
22
# Deployment configuration, read once from the process environment.
# Optional settings resolve to None (or "" for the temp directory) when unset.
region = os.environ.get("OSS_REGION")
endpoint = os.environ.get("OSS_ENDPOINT")
bucket_name = os.environ.get("OSS_BUCKET_NAME")
OSS_TEMP_BUCKET_DIR = os.environ.get("OSS_TEMP_BUCKET_DIR", "")

# Credentials are mandatory: a missing variable raises KeyError at import time.
API_KEY = os.environ["API_KEY"]
OSS_ACCESS_KEY_ID = os.environ["OSS_ACCESS_KEY_ID"]
OSS_ACCESS_KEY_SECRET = os.environ["OSS_ACCESS_KEY_SECRET"]


# Only echoed in the OSSReader start-up log line.
OSS_CONFIG_PATH = dict()
34
+
35
+
36
class OSSReader:
    """Helper around ``oss2.Bucket`` for the demo's media files.

    Public methods take ``oss://<bucket>/<key>`` paths. Calls to the bucket
    go through :meth:`_retry_operation` so that failures are retried.
    """

    # Every object key is placed under this fixed prefix inside the bucket.
    _KEY_PREFIX = "studio-temp/Qwen3-Omni-Demo/"
    # Suffix of the companion object that this class reads as raw int16 PCM
    # at 16 kHz (see get_wav_duration_from_bin / read_wavdata_from_oss).
    _BIN_SUFFIX = ".ar16k.bin"

    def __init__(self):
        # Build the bucket handle from the module-level OSS configuration.
        self.bucket2object = {
            bucket_name:
            oss2.Bucket(oss2.Auth(OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET),
                        endpoint, bucket_name),
        }
        print(
            f"Loaded OSS config from: {OSS_CONFIG_PATH}\nSupported buckets: {list(self.bucket2object.keys())}"
        )

    def _parse_oss_path(self, oss_path):
        """Split an ``oss://bucket/key`` path into ``(bucket, object_key)``.

        The returned key is prefixed with ``_KEY_PREFIX``.

        Raises:
            ValueError: if the path does not start with ``oss://`` or has no
                ``/`` separating the bucket from the key.
        """
        # Explicit raise instead of ``assert`` so the check survives ``-O``.
        if not oss_path.startswith("oss://"):
            raise ValueError(f"Invalid oss path {oss_path}")
        bucket, sep, object_key = oss_path.split("oss://")[-1].partition("/")
        if not sep:
            raise ValueError(f"Invalid oss path {oss_path}")
        return bucket, f"{self._KEY_PREFIX}{object_key}"

    def _retry_operation(self, func, *args, retries=None, delay=None,
                         **kwargs):
        """Call ``func(*args, **kwargs)``, retrying when it raises.

        Args:
            func: callable to invoke.
            retries: maximum number of attempts; ``None`` uses ``OSS_RETRY``.
            delay: seconds to sleep between attempts; ``None`` uses
                ``OSS_RETRY_DELAY`` (the original default mistakenly reused
                the retry count as the delay).

        Returns:
            Whatever ``func`` returns on the first successful attempt.

        Raises:
            Exception: the error of the last attempt, once all attempts fail.
        """
        if retries is None:
            retries = OSS_RETRY
        if delay is None:
            delay = OSS_RETRY_DELAY
        for attempt in range(retries):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f"Retry: {attempt} Error: {str(e)}")
                if attempt == retries - 1:
                    raise
                time.sleep(delay)

    def get_public_url(self, oss_path):
        """Return a signed HTTPS GET URL for the object, valid for 600 seconds.

        Any ``-internal`` marker is stripped from the signed URL.
        """
        bucket, object_key = self._parse_oss_path(oss_path)
        url = self._retry_operation(self.bucket2object[bucket].sign_url,
                                    'GET',
                                    object_key,
                                    600,
                                    slash_safe=True).replace(
                                        'http://', 'https://')
        return url.replace("-internal", '')

    def file_exists(self, oss_path):
        """Return whether the object exists in the bucket."""
        bucket, object_key = self._parse_oss_path(oss_path)
        return self._retry_operation(
            self.bucket2object[bucket].object_exists, object_key)

    def download_file(self, oss_path, local_path):
        """Download the object to ``local_path``."""
        bucket, object_key = self._parse_oss_path(oss_path)
        self._retry_operation(
            self.bucket2object[bucket].get_object_to_file, object_key,
            local_path)

    def upload_file(self, local_path, oss_path, overwrite=True):
        """Upload a local file to OSS.

        Returns:
            True on success. False when the upload failed after all retries,
            or when ``overwrite`` is False and the object already exists.

        Raises:
            FileNotFoundError: if ``local_path`` does not exist.
        """
        bucket, object_key = self._parse_oss_path(oss_path)
        if not os.path.exists(local_path):
            raise FileNotFoundError(f"Local file {local_path} does not exist")
        # Only probe the remote object when overwriting is not allowed.
        if not overwrite and self.file_exists(oss_path):
            print(f"File {oss_path} already exists, skip upload")
            return False
        try:
            self._retry_operation(
                self.bucket2object[bucket].put_object_from_file,
                object_key, local_path)
            return True
        except Exception as e:
            # Deliberately best-effort: report the failure via the return value.
            print(f"Upload failed: {str(e)}")
            return False

    def upload_audio_from_array(self,
                                data,
                                sample_rate,
                                oss_path,
                                overwrite=True):
        """Encode samples as mono 16-bit WAV in memory and upload to OSS.

        ``data`` is clipped to [-1, 1] and scaled to int16 before writing.

        Returns:
            True on success. False when encoding or upload failed, or when
            ``overwrite`` is False and the object already exists.
        """
        bucket, object_key = self._parse_oss_path(oss_path)

        if not overwrite and self.file_exists(oss_path):
            print(f"File {oss_path} already exists, skip upload")
            return False

        try:
            import wave
            from io import BytesIO

            byte_io = BytesIO()
            with wave.open(byte_io, 'wb') as wf:
                wf.setnchannels(1)  # mono
                wf.setsampwidth(2)  # 16-bit PCM
                wf.setframerate(sample_rate)
                # Convert float samples to int16 before writing the frames.
                data_int16 = np.clip(data, -1, 1) * 32767
                data_int16 = data_int16.astype(np.int16)
                wf.writeframes(data_int16.tobytes())

            self._retry_operation(self.bucket2object[bucket].put_object,
                                  object_key, byte_io.getvalue())
            return True
        except Exception as e:
            print(f"Upload failed: {str(e)}")
            return False

    def get_object(self, oss_path):
        """Return the result of the bucket's ``get_object`` for this path."""
        bucket, object_key = self._parse_oss_path(oss_path)
        return self._retry_operation(
            self.bucket2object[bucket].get_object, object_key)

    def read_text_file(self, oss_path):
        """Read the object and decode its content as UTF-8 text."""
        bucket, object_key = self._parse_oss_path(oss_path)
        result = self._retry_operation(
            self.bucket2object[bucket].get_object, object_key)
        return result.read().decode('utf-8')

    def read_audio_file(self, oss_path):
        """Decode an audio object with ffmpeg.

        The object's bytes are piped through ``ffmpeg`` and converted to mono
        32-bit float PCM at ``WAV_SAMPLE_RATE``.

        Returns:
            Tuple ``(samples, sample_rate)`` with ``samples`` a float32 array.

        Raises:
            RuntimeError: if ffmpeg exits with a non-zero status.
        """
        bucket, object_key = self._parse_oss_path(oss_path)
        result = self._retry_operation(
            self.bucket2object[bucket].get_object, object_key)
        sample_rate = int(WAV_SAMPLE_RATE)
        command = [
            'ffmpeg',
            '-i',
            '-',  # read the input from stdin
            '-ar',
            str(sample_rate),  # output sample rate
            '-ac',
            '1',  # mono
            '-f',
            'f32le',  # raw little-endian float32 output
            '-'  # write the output to stdout
        ]
        process = subprocess.Popen(command,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        stdout_data, stderr_data = process.communicate(input=result.read())
        if process.returncode != 0:
            raise RuntimeError(f"FFmpeg error: {stderr_data.decode('utf-8')}")
        wav_data = np.frombuffer(stdout_data, dtype=np.float32)
        return wav_data, sample_rate

    def get_wav_duration_from_bin(self, oss_path):
        """Return the duration in seconds derived from the companion bin size.

        The size is divided by 16000 * 2, i.e. the bin is treated as 16 kHz,
        2 bytes per sample.
        """
        oss_bin_path = oss_path + self._BIN_SUFFIX
        bucket, object_key = self._parse_oss_path(oss_bin_path)
        metadata = self._retry_operation(
            self.bucket2object[bucket].get_object_meta, object_key)
        duration = float(metadata.headers['Content-Length']) / (16000 * 2)
        return duration

    def read_wavdata_from_oss(self,
                              oss_path,
                              start=None,
                              end=None,
                              force_bin=False):
        """Read audio samples as a float32 array.

        Args:
            oss_path: path of the audio object (without the bin suffix).
            start: start time in seconds, or None for the whole file.
            end: end time in seconds, or None for the whole file.
            force_bin: when reading the whole file, fail instead of falling
                back to ffmpeg decoding if the companion bin is missing.

        Returns:
            float32 numpy array; int16 bin data is scaled by 1 / 32768.

        Raises:
            ValueError: if ``force_bin`` is set and the bin does not exist.
        """
        bucket, object_key = self._parse_oss_path(oss_path)
        bucket_obj = self.bucket2object[bucket]
        oss_bin_key = object_key + self._BIN_SUFFIX
        if start is None or end is None:
            if self._retry_operation(bucket_obj.object_exists, oss_bin_key):
                wav_data = self._retry_operation(bucket_obj.get_object,
                                                 oss_bin_key).read()
            elif not force_bin:
                wav_data, _ = self.read_audio_file(oss_path)
            else:
                raise ValueError(f"Cannot find bin file for {oss_path}")
        else:
            bytes_per_sample = WAV_BIT_RATE // 8
            bytes_per_second = int(WAV_SAMPLE_RATE) * bytes_per_sample
            start_offset = round(start * bytes_per_second)
            end_offset = round(end * bytes_per_second)
            # Start on a sample boundary; an odd start byte would pair the
            # bytes of neighbouring samples and decode as noise.
            start_offset -= start_offset % bytes_per_sample
            # The byte range is inclusive, so trim the end until the span
            # holds a whole number of samples.
            end_offset -= (end_offset - start_offset + 1) % bytes_per_sample
            # Ranged request: fetch only the requested bytes.
            wav_data = self._retry_operation(
                bucket_obj.get_object,
                oss_bin_key,
                byte_range=(start_offset, end_offset),
                headers={
                    'x-oss-range-behavior': 'standard'
                }).read()
        # Raw bytes come from the bin; the ffmpeg path is already an ndarray.
        if not isinstance(wav_data, np.ndarray):
            wav_data = np.frombuffer(wav_data, np.int16).flatten() / 32768.0
        return wav_data.astype(np.float32)

    def _list_files(self, oss_dir, matches):
        """Recursively collect objects under ``oss_dir`` accepted by ``matches``.

        ``matches`` receives the object key and returns a truthy value to keep
        it. Results are ``oss://<bucket>/<object key>`` strings.
        """
        # NOTE(review): the returned paths embed the listed object key, which
        # presumably already contains _KEY_PREFIX; feeding them back into
        # methods of this class would then apply the prefix twice. Confirm.
        bucket, dir_key = self._parse_oss_path(oss_dir)
        found = []

        def _walk(prefix):
            for obj in oss2.ObjectIterator(self.bucket2object[bucket],
                                           prefix=prefix,
                                           delimiter='/'):
                if obj.is_prefix():  # a "directory": descend into it
                    _walk(obj.key)
                elif matches(obj.key):
                    found.append(f"oss://{bucket}/{obj.key}")

        _walk(dir_key)
        return found

    def _list_files_by_suffix(self, oss_dir, suffix):
        """Recursively list objects whose key ends with ``suffix``."""
        return self._list_files(oss_dir, lambda key: key.endswith(suffix))

    def list_files_by_suffix(self, oss_dir, suffix):
        """Retrying wrapper around :meth:`_list_files_by_suffix`."""
        return self._retry_operation(self._list_files_by_suffix, oss_dir,
                                     suffix)

    def _list_files_by_prefix(self, oss_dir, file_prefix):
        """Recursively list objects whose base name starts with ``file_prefix``."""
        return self._list_files(
            oss_dir,
            lambda key: os.path.basename(key).startswith(file_prefix))

    def list_files_by_prefix(self, oss_dir, file_prefix):
        """Retrying wrapper around :meth:`_list_files_by_prefix`."""
        return self._retry_operation(self._list_files_by_prefix, oss_dir,
                                     file_prefix)
279
+
280
+
281
def _launch_offline_demo(args, model, oss_reader, model_name):
    """Build and launch the Gradio chat demo that streams text replies.

    Args:
        args: parsed options; reads ``debug``, ``share``, ``inbrowser``,
            ``server_port`` and ``server_name``.
        model: client exposing ``chat.completions.create``.
        oss_reader: object providing ``upload_file`` and ``get_public_url``.
        model_name: model identifier sent with every completion request.
    """
    default_system_prompt = ''

    def to_mp4(path):
        """Transcode a ``.webm`` file to ``.mp4`` with ffmpeg.

        Returns the new path, or ``path`` unchanged for any other input.
        """
        if path and path.endswith(".webm"):
            # Swap only the extension; str.replace would also rewrite any
            # ".webm" that happens to appear earlier in the path.
            mp4_path = os.path.splitext(path)[0] + ".mp4"
            subprocess.run([
                "ffmpeg", "-y",
                "-i", path,
                "-c:v", "libx264",
                "-pix_fmt", "yuv420p",
                "-c:a", "aac",
                "-movflags", "+faststart",
                "-f", "mp4",
                mp4_path
            ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            return mp4_path
        return path

    def _file_to_part(file_path):
        """Turn one local file into a chat content part.

        Image, video and audio files are uploaded to OSS and referenced by a
        signed URL; anything else becomes a text part holding the path.
        """
        # get_mimetype may return None for unknown extensions.
        mime_type = client_utils.get_mimetype(file_path) or ""
        if mime_type.startswith("image"):
            media_type = "image_url"
        elif mime_type.startswith("video"):
            media_type = "video_url"
            file_path = to_mp4(file_path)
        elif mime_type.startswith("audio"):
            media_type = "input_audio"
        else:
            return {"type": "text", "text": file_path}

        request_id = str(uuid.uuid4())
        oss_path = f"oss://{bucket_name}/{OSS_TEMP_BUCKET_DIR}" + request_id
        # upload_file reports failure through its return value; stop here
        # rather than hand the model a signed URL to a missing object.
        if not oss_reader.upload_file(file_path, oss_path):
            raise gr.Error(
                f"Failed to upload {os.path.basename(file_path)}, please try again."
            )
        media_url = oss_reader.get_public_url(oss_path)

        if media_type == "input_audio":
            # NOTE(review): format is always "wav" regardless of the uploaded
            # file type -- confirm the endpoint tolerates this.
            return {
                "type": "input_audio",
                "input_audio": {
                    "data": media_url,
                    "format": "wav",
                },
            }
        if media_type == "image_url":
            return {
                "type": "image_url",
                "image_url": {
                    "url": media_url
                },
            }
        return {
            "type": "video_url",
            "video_url": {
                "url": media_url
            },
            "fps": 1,
        }

    def format_history(history: list, system_prompt: str):
        """Convert Gradio chat history into chat-completion messages.

        Consecutive user entries are merged into one user message whose media
        parts precede its text parts. Non-user entries are kept only when
        their content is a plain string.
        """
        print(history)
        messages = []
        if system_prompt != "":
            messages.append({
                "role": "system",
                "content": [{
                    "type": "text",
                    "text": system_prompt
                }]
            })

        pending_user_parts = []

        def flush_user_parts():
            # Emit the accumulated user turn. The stable sort moves media
            # parts ahead of text parts, so a turn is serialised the same way
            # whether it is the latest turn or part of the earlier history.
            if pending_user_parts:
                messages.append({
                    "role": "user",
                    "content": sorted(pending_user_parts,
                                      key=lambda part: part["type"] == "text")
                })
                pending_user_parts.clear()

        for item in history:
            role = item['role']
            content = item['content']

            if role != "user":
                flush_user_parts()
                # Non-string replies (e.g. file tuples) are not forwarded.
                if isinstance(content, str):
                    messages.append({
                        "role": role,
                        "content": [{
                            "type": "text",
                            "text": content
                        }]
                    })
                continue

            if isinstance(content, str):
                pending_user_parts.append({"type": "text", "text": content})
            elif isinstance(content, (list, tuple)):
                pending_user_parts.extend(
                    _file_to_part(file_path) for file_path in content)

        flush_user_parts()
        return messages

    def predict(messages,
                temperature=0.7,
                top_p=0.8,
                top_k=20):
        """Stream a completion, yielding the accumulated text after each delta.

        Yields dicts of the form ``{"type": "text", "data": <text so far>}``.
        With ``args.debug`` set, the text is prefixed once with the request id.
        """
        completion = model.chat.completions.create(
            model=model_name,
            messages=messages,
            modalities=["text"],
            extra_body={
                "top_k": top_k
            },
            stream_options={"include_usage": True},
            stream=True,
            temperature=temperature,
            top_p=top_p,
        )

        output_text = ""
        request_id = ""
        request_id_prefixed = False

        for chunk in completion:
            if request_id == "" and hasattr(chunk, "id") and chunk.id:
                request_id = chunk.id

            if chunk.choices:
                delta = chunk.choices[0].delta
                if hasattr(delta, "content") and delta.content:
                    if args.debug and not request_id_prefixed and request_id:
                        output_text += f"[Request ID: {request_id}]\n\n"
                        request_id_prefixed = True
                    output_text += delta.content
                    yield {"type": "text", "data": output_text}
            else:
                # Chunks without choices are logged for their usage field.
                print(chunk.usage)

    def chat_predict(text,
                     audio,
                     image,
                     video,
                     history,
                     system_prompt,
                     temperature,
                     top_p,
                     top_k):
        """Gradio handler: record the inputs, then stream the reply.

        Each yield is ``(text, audio, image, video, chatbot, submit_btn,
        stop_btn)``. The first yield clears the inputs and swaps Submit for
        Stop; the final yield swaps them back.
        """
        if audio:
            history.append({"role": "user", "content": (audio, )})

        if text:
            history.append({"role": "user", "content": text})

        if image:
            history.append({"role": "user", "content": (image, )})

        if video:
            history.append({"role": "user", "content": (video, )})

        formatted_history = format_history(history=history,
                                           system_prompt=system_prompt)

        yield None, None, None, None, history, gr.update(visible=False), gr.update(visible=True)

        # Placeholder assistant entry, overwritten as text streams in.
        history.append({"role": "assistant", "content": ""})
        for chunk in predict(formatted_history, temperature, top_p, top_k):
            print('chat_predict chunk', chunk)
            if chunk["type"] == "text":
                history[-1]["content"] = chunk["data"]
                yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), history, gr.update(visible=False), gr.update(visible=True)

        yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), history, gr.update(visible=True), gr.update(visible=False)

    with gr.Blocks(
            theme=gr.themes.Soft(font=[
                gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"
            ]),
            css=".gradio-container {max-width: none !important;}") as demo:
        gr.Markdown("# Qwen3.5 Omni Offline Demo")
        gr.Markdown(
            "**Instructions**: Interact with the model by entering task text and optionally uploading one modality input, such as audio, image, or video. Then press Submit to get the response."
        )
        gr.Markdown(
            "**使用说明**:1️⃣ 输入你希望模型执行的任务文本 2️⃣ 可选上传一种模态数据(音频、图片或视频)3️⃣ 点击提交并等待模型回答"
        )

        with gr.Row(equal_height=False):
            # Left column: generation parameters.
            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Parameters (参数)")
                system_prompt_textbox = gr.Textbox(label="System Prompt",
                                                   value=default_system_prompt,
                                                   lines=4,
                                                   max_lines=8)
                temperature = gr.Slider(label="Temperature",
                                        minimum=0.1,
                                        maximum=2.0,
                                        value=0.7,
                                        step=0.1)
                top_p = gr.Slider(label="Top P",
                                  minimum=0.05,
                                  maximum=1.0,
                                  value=0.8,
                                  step=0.05)
                top_k = gr.Slider(label="Top K",
                                  minimum=1,
                                  maximum=100,
                                  value=20,
                                  step=1)

            # Right column: chat history, media uploads and action buttons.
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(label="Chat History (对话历史)",
                                     type="messages",
                                     height=420,
                                     layout="panel",
                                     bubble_full_width=False,
                                     render=False)
                chatbot.render()

                with gr.Accordion(
                        "📎 Click to upload multimodal files (点击上传多模态文件)",
                        open=True):
                    with gr.Row():
                        audio_input = gr.Audio(
                            sources=["upload", 'microphone'],
                            type="filepath",
                            label="Audio (<1 h)",
                            elem_classes="media-upload")
                        image_input = gr.Image(
                            sources=["upload", 'webcam'],
                            type="filepath",
                            label="Image (<10 MB)",
                            elem_classes="media-upload")
                        video_input = gr.Video(
                            sources=["upload", 'webcam'],
                            label="Video (<1 h)",
                            elem_classes="media-upload")

                with gr.Row():
                    text_input = gr.Textbox(
                        show_label=False,
                        placeholder=
                        "Enter text or upload files and press Submit... (输入文本或者上传文件并点击提交)",
                        scale=7)
                    submit_btn_offline = gr.Button("Submit (提交)",
                                                   variant="primary",
                                                   scale=1)
                    stop_btn_offline = gr.Button("Stop (停止)",
                                                 visible=False,
                                                 scale=1)
                    clear_btn_offline = gr.Button("Clear (清空) ",
                                                  scale=1)

                def clear_history_offline():
                    # Reset the chatbot, then text, audio, image and video.
                    return [], None, None, None, None

                submit_event_offline = gr.on(
                    triggers=[
                        submit_btn_offline.click, text_input.submit
                    ],
                    fn=chat_predict,
                    inputs=[
                        text_input, audio_input, image_input,
                        video_input, chatbot, system_prompt_textbox,
                        temperature, top_p, top_k
                    ],
                    outputs=[
                        text_input, audio_input, image_input,
                        video_input, chatbot, submit_btn_offline, stop_btn_offline
                    ])
                # Stop cancels the running generation and restores Submit.
                stop_btn_offline.click(
                    fn=lambda: (gr.update(visible=True),
                                gr.update(visible=False)),
                    outputs=[submit_btn_offline, stop_btn_offline],
                    cancels=[submit_event_offline],
                    queue=False)
                clear_btn_offline.click(fn=clear_history_offline,
                                        outputs=[
                                            chatbot, text_input,
                                            audio_input, image_input,
                                            video_input
                                        ])

        gr.HTML("""
<style>
.media-upload { min-height: 160px; border: 2px dashed #ccc; border-radius: 8px; display: flex; align-items: center; justify-content: center; }
.media-upload:hover { border-color: #666; }
</style>
""")

    demo.queue(default_concurrency_limit=100, max_size=100).launch(
        max_threads=100,
        ssr_mode=False,
        share=args.share,
        inbrowser=args.inbrowser,
        server_port=args.server_port,
        server_name=args.server_name,
    )
600
+
601
+
602
+ def _launch_realtime_demo(args, model, oss_reader, model_name):
603
+ VOICE_OPTIONS = {
604
+ "Tina / 中文-甜甜": "Tina",
605
+ "Cindy / 中文-台湾口音-林欣宜": "Cindy",
606
+ "Liora Mira / 中文-清欢": "Liora Mira",
607
+ "Sunnybobi / 中文-知芝": "Sunnybobi",
608
+ "Raymond / 中文-林川野": "Raymond",
609
+ "Ethan / 中文-晨煦": "Ethan",
610
+ "Theo Calm / 中文-予安": "Theo Calm",
611
+ "Serena / 中文-苏瑶": "Serena",
612
+ "Harvey / 英语-厚": "Harvey",
613
+ "Maia / 中文-四月": "Maia",
614
+ "Evan / 中文-江晨": "Evan",
615
+ "Qiao / 中文-台湾口音-小乔妹": "Qiao",
616
+ "Momo / 中文-茉兔": "Momo",
617
+ "Wil / 中文-港台腔-伟伦": "Wil",
618
+ "Angel / 中文-台普-安琪": "Angel",
619
+ "Li Cassian / 中文-东厂-李公公": "Li Cassian",
620
+ "Mia / 英语-温柔生活博主-舒然": "Mia",
621
+ "Joyner / 英语-喜剧担当-阿逗": "Joyner",
622
+ "Gold / 英语-金爷": "Gold",
623
+ "Katerina / 英语-卡捷琳娜": "Katerina",
624
+ "Ryan / 英语-甜茶": "Ryan",
625
+ "Jennifer / 英语-詹妮弗": "Jennifer",
626
+ "Aiden / 英语-艾登": "Aiden",
627
+ "Mione / 英语-敏儿": "Mione",
628
+ "Sunny / 四川话-晴儿": "Sunny",
629
+ "Dylan / 北京话-晓东": "Dylan",
630
+ "Eric / 四川话-程川": "Eric",
631
+ "Peter / 天津话-李彼得": "Peter",
632
+ "Joseph Chen / 闽南话-阿樸伯": "Joseph Chen",
633
+ "Marcus / 陕西话-秦川": "Marcus",
634
+ "Li / 南京话-老李": "Li",
635
+ "Rocky / 广东话-阿强": "Rocky",
636
+ "Sohee (Korean) / 韩语-素熙": "Sohee",
637
+ "Lenn (German) / 德语-莱恩": "Lenn",
638
+ "Ono Anna (Japanese) / 日语-小野杏": "Ono Anna",
639
+ "Sonrisa (Spanish) / 西班牙语-索尼莎": "Sonrisa",
640
+ "Bodega (Spanish) / 西班牙语-博德加": "Bodega",
641
+ "Emilien (French) / 法语-埃米尔安": "Emilien",
642
+ "Andre (Portuguese) / 葡萄牙语-安德雷": "Andre",
643
+ "Radio Gol (Portuguese) / 葡萄牙语-拉迪奥·戈尔": "Radio Gol",
644
+ "Alek (Russian) / 俄语-阿列克": "Alek",
645
+ "Rizky (Indonesian) / 印尼语-阿力": "Rizky",
646
+ "Roya (Persian) / 波斯语-萝雅": "Roya",
647
+ "Arda (Turkish) / 土耳其语-阿尔达": "Arda",
648
+ "Hana (Vietnamese) / 越南语-阿幸": "Hana",
649
+ "Dolce (Italian) / 意大利语-多尔切": "Dolce",
650
+ "Jakub (Polish) / 波兰语-雅克": "Jakub",
651
+ "Griet (Dutch) / 荷兰语-海娜": "Griet",
652
+ "Eliska (Czech) / 捷克语-艾莉卡": "Eliska",
653
+ "Marina (Hebrew) / 希伯来语-玛丽娜": "Marina",
654
+ "Siiri (Finnish) / 芬兰语-西芮": "Siiri",
655
+ "Ingrid (Norwegian) / 挪威语-林恩": "Ingrid",
656
+ "Sigga (Icelandic) / 冰岛语-海娜": "Sigga",
657
+ "Bea (Filipino) / 菲律宾语-雅娜": "Bea",
658
+ "Chloe (Malay) / 马来语-思怡": "Chloe",
659
+ }
660
+ DEFAULT_VOICE = "Tina / 中文-甜甜"
661
+
662
+ VOICE_GROUPS = {
663
+ "Recommended Voices / 推荐音色": [
664
+ "Tina / 中文-甜甜",
665
+ "Cindy / 中文-台湾口音-林欣宜",
666
+ "Liora Mira / 中文-清欢",
667
+ "Sunnybobi / 中文-知芝",
668
+ "Raymond / 中文-林川野",
669
+ ],
670
+ "Chinese Dialects / 中文方言": [
671
+ "Sunny / 四川话-晴儿",
672
+ "Dylan / 北京话-晓东",
673
+ "Eric / 四川话-程川",
674
+ "Peter / 天津话-李彼得",
675
+ "Joseph Chen / 闽南话-阿樸伯",
676
+ "Marcus / 陕西话-秦川",
677
+ "Li / 南京话-老李",
678
+ "Rocky / 广东话-阿强",
679
+ ],
680
+ "Multilingual / 多语言": [
681
+ "Sohee (Korean) / 韩语-素熙",
682
+ "Lenn (German) / 德语-莱恩",
683
+ "Ono Anna (Japanese) / 日语-小野杏",
684
+ "Sonrisa (Spanish) / 西班牙语-索尼莎",
685
+ "Bodega (Spanish) / 西班牙语-博德加",
686
+ "Emilien (French) / 法语-埃米尔安",
687
+ "Andre (Portuguese) / 葡萄牙语-安德雷",
688
+ "Radio Gol (Portuguese) / 葡萄牙语-拉迪奥·戈尔",
689
+ "Alek (Russian) / 俄语-阿列克",
690
+ "Rizky (Indonesian) / 印尼语-阿力",
691
+ "Roya (Persian) / 波斯语-萝雅",
692
+ "Arda (Turkish) / 土耳其语-阿尔达",
693
+ "Hana (Vietnamese) / 越南语-阿幸",
694
+ "Dolce (Italian) / 意大利语-多尔切",
695
+ "Jakub (Polish) / 波兰语-雅克",
696
+ "Griet (Dutch) / 荷兰语-海娜",
697
+ "Eliska (Czech) / 捷克语-艾莉卡",
698
+ "Marina (Hebrew) / 希伯来语-玛丽娜",
699
+ "Siiri (Finnish) / 芬兰语-西芮",
700
+ "Ingrid (Norwegian) / 挪威语-林恩",
701
+ "Sigga (Icelandic) / 冰岛语-海娜",
702
+ "Bea (Filipino) / 菲律宾语-雅娜",
703
+ "Chloe (Malay) / 马来语-思怡",
704
+ ],
705
+ "Healing & Warmth / 情感陪伴-治愈温暖": [
706
+ "Ethan / 中文-晨煦",
707
+ "Theo Calm / 中文-予安",
708
+ "Serena / 中文-苏瑶",
709
+ "Harvey / 英语-厚",
710
+ "Maia / 中文-四月",
711
+ ],
712
+ "Energetic & Playful / 情感陪伴-活力个性": [
713
+ "Evan / 中文-江晨",
714
+ "Qiao / 中文-台湾口音-小乔妹",
715
+ "Momo / 中文-茉兔",
716
+ "Wil / 中文-港台腔-伟伦",
717
+ "Angel / 中文-台普-安琪",
718
+ ],
719
+ "Roleplay / 角色扮演": [
720
+ "Li Cassian / 中文-东厂-李公公",
721
+ "Mia / 英语-温柔生活博主-舒然",
722
+ "Joyner / 英语-喜剧担当-阿逗",
723
+ "Gold / 英语-金爷",
724
+ ],
725
+ "Game & Anime / 游戏动漫配音": [
726
+ "Katerina / 英语-卡捷琳娜",
727
+ "Ryan / 英语-甜茶",
728
+ "Jennifer / 英语-詹妮弗",
729
+ "Aiden / 英语-艾登",
730
+ "Mione / 英语-敏儿",
731
+ ],
732
+ }
733
+ DEFAULT_VOICE_GROUP = "Recommended Voices / 推荐音色"
734
+
735
+ default_system_prompt = """# Voice Style
736
+
737
+ You may output **at most one** voice style tag **only when the user explicitly asks to control speaking style** (emotion / speech rate / volume), e.g., “say it angrily”, “speak faster”, “whisper”, “in a calm tone”, etc.
738
+ You have the following voice style tags:
739
+ <tags>
740
+ brisk, rapid, leisurely, sluggish,
741
+ loud, shouting, soft-spoken, whispering,
742
+ irritated, furious, distasteful, repulsed,
743
+ nervous, terrified, cheerful, ecstatic,
744
+ gloomy, despairing, startled, shocked
745
+ </tags>
746
+
747
+ Selection rules (only when the user explicitly requests control):
748
+ - Choose the tag that best matches the user’s explicit instruction and intended delivery.
749
+ - If multiple cues are present, choose the **single most dominant** one (emotion, speed, or volume).
750
+
751
+ If you choose to output a tag ONLY reply in the following format with NO perfix:
752
+
753
+ <voice_style>
754
+ voice style tag
755
+ </voice_style>
756
+
757
+ your response
758
+
759
+ <IMPORTANT>
760
+ Reminder:
761
+ - **If the user does NOT explicitly request a speaking style**, **do NOT output any tag**. Reply normally with your current knowledge and do not tell the user about voice tags.
762
+ - If you choose to output a voice style tag, MUST follow the specified format: the tag must be nested within <voice_style></voice_style>.
763
+ - **Do not** explain your choice.
764
+ - **Never** output more than one tag.
765
+ - **Never** add any extra content before the tag.
766
+ </IMPORTANT>
767
+
768
+ Please strictly follow the following guidelines when generating responses. Avoid using any formatting markers, special symbols, or structured layouts. Do not include bold, italic, numbering, bullet points, emojis, or other visual elements. The response must be natural conversational language with smooth sentences and a human-like dialogue flow. Use standard punctuation—such as periods, commas, and question marks—to separate ideas clearly. Refrain from complex sentence structures, and above all, avoid redundant or wordy expressions. Be concise and direct. When listing information, use continuous narration instead of bullet points."""
769
+
770
+ def to_mp4(path):
771
+ if path and path.endswith(".webm"):
772
+ mp4_path = path.replace(".webm", ".mp4")
773
+ subprocess.run([
774
+ "ffmpeg", "-y",
775
+ "-i", path,
776
+ "-c:v", "libx264",
777
+ "-pix_fmt", "yuv420p",
778
+ "-c:a", "aac",
779
+ "-movflags", "+faststart",
780
+ "-f", "mp4",
781
+ mp4_path
782
+ ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
783
+ return mp4_path
784
+ return path
785
+
786
+ def format_history(history: list, system_prompt: str):
787
+ print(history)
788
+ messages = []
789
+ if system_prompt != "":
790
+ messages.append({
791
+ "role": "system",
792
+ "content": [{
793
+ "type": "text",
794
+ "text": system_prompt
795
+ }]
796
+ })
797
+
798
+ current_user_content = []
799
+
800
+ for item in history:
801
+ role = item['role']
802
+ content = item['content']
803
+
804
+ if role != "user":
805
+ if current_user_content:
806
+ messages.append({
807
+ "role": "user",
808
+ "content": current_user_content
809
+ })
810
+ current_user_content = []
811
+
812
+ if isinstance(content, str):
813
+ messages.append({
814
+ "role": role,
815
+ "content": [{
816
+ "type": "text",
817
+ "text": content
818
+ }]
819
+ })
820
+ else:
821
+ pass
822
+ continue
823
+
824
+ if isinstance(content, str):
825
+ current_user_content.append({"type": "text", "text": content})
826
+ elif isinstance(content, (list, tuple)):
827
+ for file_path in content:
828
+ mime_type = client_utils.get_mimetype(file_path)
829
+ media_type = None
830
+
831
+ if mime_type.startswith("image"):
832
+ media_type = "image_url"
833
+ elif mime_type.startswith("video"):
834
+ media_type = "video_url"
835
+ file_path = to_mp4(file_path)
836
+ elif mime_type.startswith("audio"):
837
+ media_type = "input_audio"
838
+
839
+ if media_type:
840
+ request_id = str(uuid.uuid4())
841
+ oss_path = f"oss://{bucket_name}/{OSS_TEMP_BUCKET_DIR}" + request_id
842
+ oss_reader.upload_file(file_path, oss_path)
843
+ media_url = oss_reader.get_public_url(oss_path)
844
+ if media_type == "input_audio":
845
+ current_user_content.append({
846
+ "type": "input_audio",
847
+ "input_audio": {
848
+ "data": media_url,
849
+ "format": "wav",
850
+ },
851
+ })
852
+ if media_type == "image_url":
853
+ current_user_content.append({
854
+ "type": "image_url",
855
+ "image_url": {
856
+ "url": media_url
857
+ },
858
+ })
859
+ if media_type == "video_url":
860
+ current_user_content.append({
861
+ "type": "video_url",
862
+ "video_url": {
863
+ "url": media_url
864
+ },
865
+ "fps": 1,
866
+ })
867
+ else:
868
+ current_user_content.append({
869
+ "type": "text",
870
+ "text": file_path
871
+ })
872
+
873
+ if current_user_content:
874
+ media_items = []
875
+ text_items = []
876
+
877
+ for item in current_user_content:
878
+ if item["type"] == "text":
879
+ text_items.append(item)
880
+ else:
881
+ media_items.append(item)
882
+
883
+ messages.append({
884
+ "role": "user",
885
+ "content": media_items + text_items
886
+ })
887
+
888
+ return messages
889
+
890
+ def predict(messages,
891
+ voice_choice=DEFAULT_VOICE,
892
+ temperature=0.7,
893
+ top_p=0.8,
894
+ top_k=20):
895
+ completion = model.chat.completions.create(
896
+ model=model_name,
897
+ messages=messages,
898
+ modalities=["text", "audio"],
899
+ audio={
900
+ "voice": VOICE_OPTIONS[voice_choice],
901
+ "format": "wav"
902
+ },
903
+ extra_body={
904
+ "top_k": top_k,
905
+ "enable_search": True,
906
+ },
907
+ stream_options={"include_usage": True},
908
+ stream=True,
909
+ temperature=temperature,
910
+ top_p=top_p,
911
+ )
912
+
913
+ audio_string = ""
914
+ output_text = ""
915
+ request_id = ""
916
+ request_id_prefixed = False
917
+
918
+ for chunk in completion:
919
+ if request_id == "" and hasattr(chunk, "id") and chunk.id:
920
+ request_id = chunk.id
921
+
922
+ if chunk.choices:
923
+ if hasattr(chunk.choices[0].delta, "audio"):
924
+ try:
925
+ audio_string += chunk.choices[0].delta.audio["data"]
926
+ except Exception:
927
+ transcript = chunk.choices[0].delta.audio["transcript"]
928
+ if args.debug and not request_id_prefixed and request_id:
929
+ output_text += f"[Request ID: {request_id}]\n\n"
930
+ request_id_prefixed = True
931
+ output_text += transcript
932
+ yield {"type": "text", "data": output_text}
933
+ else:
934
+ delta = chunk.choices[0].delta
935
+ if hasattr(delta, "content") and delta.content:
936
+ if args.debug and not request_id_prefixed and request_id:
937
+ output_text += f"[Request ID: {request_id}]\n\n"
938
+ request_id_prefixed = True
939
+ output_text += delta.content
940
+ yield {"type": "text", "data": output_text}
941
+ else:
942
+ print(chunk.usage)
943
+
944
+ wav_bytes = base64.b64decode(audio_string)
945
+ audio_np = np.frombuffer(wav_bytes, dtype=np.int16)
946
+
947
+ if audio_string != "":
948
+ wav_io = io.BytesIO()
949
+ sf.write(wav_io, audio_np, samplerate=24000, format="WAV")
950
+ wav_io.seek(0)
951
+ wav_bytes = wav_io.getvalue()
952
+ audio_path = processing_utils.save_bytes_to_cache(
953
+ wav_bytes, "audio.wav", cache_dir=demo.GRADIO_CACHE)
954
+ yield {"type": "audio", "data": audio_path}
955
+
956
+ def media_predict(audio,
957
+ video,
958
+ history,
959
+ voice_choice,
960
+ temperature,
961
+ top_p,
962
+ top_k):
963
+ yield (
964
+ None,
965
+ None,
966
+ history,
967
+ gr.update(visible=False),
968
+ gr.update(visible=True),
969
+ )
970
+
971
+ files = [audio, video]
972
+
973
+ for f in files:
974
+ if f:
975
+ history.append({"role": "user", "content": (f, )})
976
+
977
+ yield (
978
+ None,
979
+ None,
980
+ history,
981
+ gr.update(visible=True),
982
+ gr.update(visible=False),
983
+ )
984
+
985
+ formatted_history = format_history(
986
+ history=history,
987
+ system_prompt=default_system_prompt,
988
+ )
989
+
990
+ history.append({"role": "assistant", "content": ""})
991
+
992
+ for chunk in predict(formatted_history, voice_choice, temperature,
993
+ top_p, top_k):
994
+ print('chunk', chunk)
995
+ if chunk["type"] == "text":
996
+ history[-1]["content"] = chunk["data"]
997
+ yield (
998
+ None,
999
+ None,
1000
+ history,
1001
+ gr.update(visible=False),
1002
+ gr.update(visible=True),
1003
+ )
1004
+ if chunk["type"] == "audio":
1005
+ history.append({
1006
+ "role": "assistant",
1007
+ "content": gr.Audio(chunk["data"])
1008
+ })
1009
+
1010
+ yield (
1011
+ None,
1012
+ None,
1013
+ history,
1014
+ gr.update(visible=True),
1015
+ gr.update(visible=False),
1016
+ )
1017
+
1018
+ def update_voice_choices(voice_group):
1019
+ choices = VOICE_GROUPS[voice_group]
1020
+ value = choices[0] if choices else None
1021
+ return gr.update(choices=choices, value=value)
1022
+
1023
+ with gr.Blocks(
1024
+ theme=gr.themes.Soft(font=[
1025
+ gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"
1026
+ ]),
1027
+ css=".gradio-container {max-width: none !important;}") as demo:
1028
+ gr.Markdown("# Qwen3.5 Omni Realtime Interaction Demo")
1029
+ gr.Markdown(
1030
+ "**Instructions**: Click the audio recording button or the camera recording button, provide audio or video input, then click Submit and wait for the model's response."
1031
+ )
1032
+ gr.Markdown(
1033
+ "**使用说明**:1️⃣ 点击音频录制按钮,或摄像头-录制按钮 2️⃣ 输入音频或者视频 3️⃣ 点击提交并等待模型的回答"
1034
+ )
1035
+
1036
+ with gr.Row(equal_height=False):
1037
+ with gr.Column(scale=1):
1038
+ gr.Markdown("### ⚙️ Parameters (参数)")
1039
+ with gr.Group():
1040
+ voice_group = gr.Radio(
1041
+ label="Voice Category(音色类别)",
1042
+ choices=list(VOICE_GROUPS.keys()),
1043
+ value=DEFAULT_VOICE_GROUP)
1044
+ voice_choice = gr.Dropdown(
1045
+ label="Voice Choice(音色选择)",
1046
+ choices=VOICE_GROUPS[DEFAULT_VOICE_GROUP],
1047
+ value=DEFAULT_VOICE,
1048
+ visible=True)
1049
+ temperature = gr.Slider(label="Temperature",
1050
+ minimum=0.1,
1051
+ maximum=2.0,
1052
+ value=0.7,
1053
+ step=0.1)
1054
+ top_p = gr.Slider(label="Top P",
1055
+ minimum=0.05,
1056
+ maximum=1.0,
1057
+ value=0.8,
1058
+ step=0.05)
1059
+ top_k = gr.Slider(label="Top K",
1060
+ minimum=1,
1061
+ maximum=100,
1062
+ value=20,
1063
+ step=1)
1064
+
1065
+ with gr.Column(scale=3):
1066
+ with gr.Row():
1067
+ with gr.Column(scale=1):
1068
+ gr.Markdown("### Audio-Video Input (音视频输入)")
1069
+ microphone = gr.Audio(
1070
+ sources=["microphone", 'upload'],
1071
+ type="filepath",
1072
+ label="Record Audio (录制音频)")
1073
+ webcam = gr.Video(
1074
+ sources=['webcam', "upload"],
1075
+ label="Record/Upload Video (录制/上传视频)",
1076
+ elem_classes="media-upload")
1077
+ with gr.Row():
1078
+ submit_btn_online = gr.Button(
1079
+ "Submit (提交)",
1080
+ variant="primary",
1081
+ scale=2)
1082
+ stop_btn_online = gr.Button("Stop (停止)",
1083
+ visible=False,
1084
+ scale=1)
1085
+ clear_btn_online = gr.Button(
1086
+ "Clear History (清除历史)")
1087
+ with gr.Column(scale=2):
1088
+ media_chatbot = gr.Chatbot(
1089
+ label="Chat History (对话历史)",
1090
+ type="messages",
1091
+ height=650,
1092
+ layout="panel",
1093
+ bubble_full_width=False,
1094
+ render=False)
1095
+ media_chatbot.render()
1096
+
1097
+ def clear_history_online():
1098
+ return [], None, None
1099
+
1100
+ voice_group.change(
1101
+ fn=update_voice_choices,
1102
+ inputs=[voice_group],
1103
+ outputs=[voice_choice])
1104
+
1105
+ submit_event_online = submit_btn_online.click(
1106
+ fn=media_predict,
1107
+ inputs=[
1108
+ microphone, webcam, media_chatbot,
1109
+ voice_choice, temperature, top_p, top_k
1110
+ ],
1111
+ outputs=[
1112
+ microphone, webcam, media_chatbot,
1113
+ submit_btn_online, stop_btn_online
1114
+ ])
1115
+ stop_btn_online.click(
1116
+ fn=lambda: (gr.update(visible=True),
1117
+ gr.update(visible=False)),
1118
+ outputs=[submit_btn_online, stop_btn_online],
1119
+ cancels=[submit_event_online],
1120
+ queue=False)
1121
+ clear_btn_online.click(
1122
+ fn=clear_history_online,
1123
+ outputs=[media_chatbot, microphone, webcam])
1124
+
1125
+ gr.HTML("""
1126
+ <style>
1127
+ .media-upload { min-height: 160px; border: 2px dashed #ccc; border-radius: 8px; display: flex; align-items: center; justify-content: center; }
1128
+ .media-upload:hover { border-color: #666; }
1129
+ </style>
1130
+ """)
1131
+
1132
+ demo.queue(default_concurrency_limit=100, max_size=100).launch(
1133
+ max_threads=100,
1134
+ ssr_mode=False,
1135
+ share=args.share,
1136
+ inbrowser=args.inbrowser,
1137
+ server_port=args.server_port,
1138
+ server_name=args.server_name,
1139
+ )
1140
+
1141
+
1142
+ def _get_args():
1143
+ parser = ArgumentParser()
1144
+ parser.add_argument(
1145
+ '--share',
1146
+ action='store_true',
1147
+ default=False,
1148
+ help='Create a publicly shareable link for the interface.')
1149
+ parser.add_argument(
1150
+ '--inbrowser',
1151
+ action='store_true',
1152
+ default=False,
1153
+ help='Automatically launch the interface in a new tab on the default browser.'
1154
+ )
1155
+ parser.add_argument('--server-port',
1156
+ type=int,
1157
+ default=7860,
1158
+ help='Demo server port.')
1159
+ parser.add_argument('--server-name',
1160
+ type=str,
1161
+ default='0.0.0.0',
1162
+ help='Demo server name.')
1163
+ parser.add_argument('--demo-mode',
1164
+ type=str,
1165
+ default='offline',
1166
+ choices=['offline', 'realtime'],
1167
+ help='Choose which demo mode to launch.')
1168
+ parser.add_argument('--debug',
1169
+ action='store_true',
1170
+ default=False,
1171
+ help='Enable debug mode and show request id at the beginning of assistant replies.')
1172
+
1173
+ args = parser.parse_args()
1174
+ return args
1175
+
1176
+
1177
if __name__ == "__main__":
    # Script entry point: parse the CLI flags, create the OSS reader and the
    # OpenAI-compatible client, then start the demo selected by --demo-mode.
    args = _get_args()
    oss_reader = OSSReader()
    model_name = "qwen3.5-omni-plus"
    model = OpenAI(
        api_key=API_KEY,
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )

    launch = {
        "offline": _launch_offline_demo,
        "realtime": _launch_realtime_demo,
    }.get(args.demo_mode)
    if launch is not None:
        launch(args, model, oss_reader, model_name)
gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ openai
2
+ soundfile
3
+ oss2