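# Spaces: Runtime error
# (The line above appears to be a status banner captured together with this
# listing; it is not part of the program.)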
import base64
import io
import os
import subprocess
import time
import uuid
import wave
from argparse import ArgumentParser

import gradio as gr
import gradio.processing_utils as processing_utils
import numpy as np
import oss2
import soundfile as sf
from gradio_client import utils as client_utils
from openai import OpenAI
# Retry policy for OSS requests: number of attempts and the pause (in
# seconds, passed to time.sleep) between two attempts.
OSS_RETRY = 10
OSS_RETRY_DELAY = 3

# Bits per PCM sample; divided by 8 to obtain the byte rate of cached audio.
WAV_BIT_RATE = 16
# Environment variables are always strings, so coerce to int: the value is
# multiplied into byte offsets and returned to callers as the sample rate.
WAV_SAMPLE_RATE = int(os.environ.get("WAV_SAMPLE_RATE", 16000))

# from env import *
# OSS connection settings. The os.getenv() values are None when unset.
region = os.getenv("OSS_REGION")
endpoint = os.getenv("OSS_ENDPOINT")
bucket_name = os.getenv("OSS_BUCKET_NAME")
# Prefix inserted between the bucket and the generated object id for uploads.
OSS_TEMP_BUCKET_DIR = os.getenv("OSS_TEMP_BUCKET_DIR", "")

# Required credentials: a missing variable raises KeyError at import time.
API_KEY = os.environ['API_KEY']
OSS_ACCESS_KEY_ID = os.environ['OSS_ACCESS_KEY_ID']
OSS_ACCESS_KEY_SECRET = os.environ['OSS_ACCESS_KEY_SECRET']

# Only echoed in the OSSReader start-up message.
OSS_CONFIG_PATH = {}
class OSSReader:
    """Wrapper around ``oss2.Bucket`` that adds retries and audio helpers.

    Paths are written as ``oss://<bucket>/<key>``. Every key is placed under
    the fixed ``studio-temp/Qwen3-Omni-Demo/`` prefix by ``_parse_oss_path``.
    """

    def __init__(self):
        # Build one bucket handle per supported bucket name.
        self.bucket2object = {
            bucket_name:
            oss2.Bucket(oss2.Auth(OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET),
                        endpoint, bucket_name),
        }
        print(
            f"Loaded OSS config from: {OSS_CONFIG_PATH}\nSupported buckets: {list(self.bucket2object.keys())}"
        )

    def _parse_oss_path(self, oss_path):
        """Split an ``oss://bucket/key`` path into ``(bucket, object_key)``.

        The returned key is prefixed with ``studio-temp/Qwen3-Omni-Demo/``.

        Raises:
            ValueError: if the path does not start with ``oss://`` or has no
                ``/`` after the bucket name.
        """
        # Explicit raise instead of assert: asserts disappear under ``-O``.
        if not oss_path.startswith("oss://"):
            raise ValueError(f"Invalid oss path {oss_path}")
        bucket, object_key = oss_path.split("oss://")[-1].split("/", 1)
        object_key = f"studio-temp/Qwen3-Omni-Demo/{object_key}"
        return bucket, object_key

    def _retry_operation(self,
                         func,
                         *args,
                         retries=OSS_RETRY,
                         delay=OSS_RETRY_DELAY,
                         **kwargs):
        """Call ``func(*args, **kwargs)``, retrying on any exception.

        Args:
            func: callable to invoke.
            retries: maximum number of attempts.
            delay: seconds to sleep between attempts (previously defaulted to
                the retry *count* by mistake).

        Returns:
            Whatever ``func`` returns on the first successful attempt.

        Raises:
            The exception of the last attempt once all attempts failed.
        """
        for attempt in range(retries):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f"Retry: {attempt} Error: {str(e)}")
                if attempt == retries - 1:
                    raise
                time.sleep(delay)

    def get_public_url(self, oss_path):
        """Return a signed GET URL for ``oss_path``.

        The URL is signed with an expiry argument of 600, forced to https,
        and any ``-internal`` marker is removed from it.
        """
        bucket, object_key = self._parse_oss_path(oss_path)
        signed = self._retry_operation(self.bucket2object[bucket].sign_url,
                                       'GET',
                                       object_key,
                                       600,
                                       slash_safe=True)
        return signed.replace('http://', 'https://').replace("-internal", '')

    def file_exists(self, oss_path):
        """Return whether the object behind ``oss_path`` exists."""
        bucket, object_key = self._parse_oss_path(oss_path)
        return self._retry_operation(
            self.bucket2object[bucket].object_exists, object_key)

    def download_file(self, oss_path, local_path):
        """Download the object behind ``oss_path`` to ``local_path``."""
        bucket, object_key = self._parse_oss_path(oss_path)
        self._retry_operation(
            self.bucket2object[bucket].get_object_to_file, object_key,
            local_path)

    def upload_file(self, local_path, oss_path, overwrite=True):
        """Upload a local file to OSS.

        Returns:
            True on success; False if the upload failed after all retries or
            was skipped because the target exists and ``overwrite`` is False.

        Raises:
            FileNotFoundError: if ``local_path`` does not exist.
        """
        bucket, object_key = self._parse_oss_path(oss_path)
        if not os.path.exists(local_path):
            raise FileNotFoundError(f"Local file {local_path} does not exist")
        # Only probe the remote side when the caller refuses to overwrite.
        if not overwrite and self.file_exists(oss_path):
            print(f"File {oss_path} already exists, skip upload")
            return False
        try:
            self._retry_operation(
                self.bucket2object[bucket].put_object_from_file,
                object_key, local_path)
            return True
        except Exception as e:
            # Best effort: report the failure through the return value.
            print(f"Upload failed: {str(e)}")
            return False

    def upload_audio_from_array(self,
                                data,
                                sample_rate,
                                oss_path,
                                overwrite=True):
        """Encode ``data`` as mono 16-bit PCM WAV in memory and upload it.

        ``data`` is clipped to [-1, 1] and scaled by 32767 before being cast
        to int16 (so it is presumably float audio - confirm with callers).

        Returns:
            True on success, False when skipped or when the upload failed.
        """
        bucket, object_key = self._parse_oss_path(oss_path)
        if not overwrite and self.file_exists(oss_path):
            print(f"File {oss_path} already exists, skip upload")
            return False
        try:
            byte_io = io.BytesIO()
            with wave.open(byte_io, 'wb') as wf:
                wf.setnchannels(1)  # mono
                wf.setsampwidth(2)  # 16-bit PCM
                wf.setframerate(sample_rate)
                data_int16 = (np.clip(data, -1, 1) * 32767).astype(np.int16)
                wf.writeframes(data_int16.tobytes())
            self._retry_operation(self.bucket2object[bucket].put_object,
                                  object_key, byte_io.getvalue())
            return True
        except Exception as e:
            print(f"Upload failed: {str(e)}")
            return False

    def get_object(self, oss_path):
        """Return the result of ``Bucket.get_object`` for ``oss_path``."""
        bucket, object_key = self._parse_oss_path(oss_path)
        return self._retry_operation(
            self.bucket2object[bucket].get_object, object_key)

    def read_text_file(self, oss_path):
        """Read the object behind ``oss_path`` and decode it as UTF-8 text."""
        bucket, object_key = self._parse_oss_path(oss_path)
        result = self._retry_operation(
            self.bucket2object[bucket].get_object, object_key)
        return result.read().decode('utf-8')

    def read_audio_file(self, oss_path):
        """Decode an audio object through ffmpeg.

        The object's bytes are piped into ffmpeg, which is asked for mono
        32-bit float PCM at ``WAV_SAMPLE_RATE``.

        Returns:
            ``(samples, sample_rate)`` with ``samples`` a float32 array.

        Raises:
            RuntimeError: if ffmpeg exits with a non-zero status.
        """
        bucket, object_key = self._parse_oss_path(oss_path)
        result = self._retry_operation(
            self.bucket2object[bucket].get_object, object_key)
        command = [
            'ffmpeg',
            '-i',
            '-',  # read the input from stdin
            '-ar',
            str(WAV_SAMPLE_RATE),  # output sample rate
            '-ac',
            '1',  # mono
            '-f',
            'f32le',  # raw little-endian float32 output
            '-'  # write the output to stdout
        ]
        completed = subprocess.run(command,
                                   input=result.read(),
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        if completed.returncode != 0:
            raise RuntimeError(
                f"FFmpeg error: {completed.stderr.decode('utf-8')}")
        wav_data = np.frombuffer(completed.stdout, dtype=np.float32)
        return wav_data, int(WAV_SAMPLE_RATE)

    def get_wav_duration_from_bin(self, oss_path):
        """Return the duration in seconds of ``<oss_path>.ar16k.bin``.

        Computed from the object's Content-Length divided by 16000 * 2.
        """
        oss_bin_path = oss_path + ".ar16k.bin"
        bucket, object_key = self._parse_oss_path(oss_bin_path)
        metadata = self._retry_operation(
            self.bucket2object[bucket].get_object_meta, object_key)
        return float(metadata.headers['Content-Length']) / (16000 * 2)

    def read_wavdata_from_oss(self,
                              oss_path,
                              start=None,
                              end=None,
                              force_bin=False):
        """Load audio samples as a float32 array.

        Without both ``start`` and ``end`` the whole ``.ar16k.bin`` companion
        object is read when it exists; otherwise the original file is decoded
        with ffmpeg unless ``force_bin`` is set. With ``start`` and ``end``
        (seconds) only that byte range of the ``.bin`` object is requested.

        Raises:
            ValueError: if ``force_bin`` is set and no ``.bin`` object exists.
        """
        bucket, object_key = self._parse_oss_path(oss_path)
        oss_bin_key = object_key + ".ar16k.bin"
        handle = self.bucket2object[bucket]
        if start is None or end is None:
            if handle.object_exists(oss_bin_key):
                wav_data = self._retry_operation(handle.get_object,
                                                 oss_bin_key).read()
            elif not force_bin:
                wav_data, _ = self.read_audio_file(oss_path)
            else:
                raise ValueError(f"Cannot find bin file for {oss_path}")
        else:
            bytes_per_second = int(WAV_SAMPLE_RATE) * (WAV_BIT_RATE // 8)
            start_offset = round(start * bytes_per_second)
            end_offset = round(end * bytes_per_second)
            # The bytes are decoded as int16 below, so the range must begin on
            # an even byte; an odd start would split every sample in two.
            start_offset -= start_offset % 2
            # The byte range includes end_offset; trim one byte when that
            # would leave an odd number of bytes.
            if not (end_offset - start_offset) % 2:
                end_offset -= 1
            wav_data = self._retry_operation(
                handle.get_object,
                oss_bin_key,
                byte_range=(start_offset, end_offset),
                headers={
                    'x-oss-range-behavior': 'standard'
                }).read()
        if not isinstance(wav_data, np.ndarray):
            # Raw bytes from the .bin object: int16 PCM scaled to [-1, 1).
            wav_data = np.frombuffer(wav_data, np.int16).flatten() / 32768.0
        return wav_data.astype(np.float32)

    def _list_files(self, oss_dir, accept):
        """Recursively collect ``oss://`` paths under ``oss_dir``.

        ``accept`` receives each object key and decides whether to keep it.
        """
        bucket, dir_key = self._parse_oss_path(oss_dir)
        handle = self.bucket2object[bucket]
        file_list = []

        def _recursive_list(prefix):
            for obj in oss2.ObjectIterator(handle,
                                           prefix=prefix,
                                           delimiter='/'):
                if obj.is_prefix():  # a "directory": descend into it
                    _recursive_list(obj.key)
                elif accept(obj.key):
                    file_list.append(f"oss://{bucket}/{obj.key}")

        _recursive_list(dir_key)
        return file_list

    def _list_files_by_suffix(self, oss_dir, suffix):
        """Recursively list objects whose key ends with ``suffix``."""
        return self._list_files(oss_dir, lambda key: key.endswith(suffix))

    def list_files_by_suffix(self, oss_dir, suffix):
        """Retrying wrapper around ``_list_files_by_suffix``."""
        return self._retry_operation(self._list_files_by_suffix, oss_dir,
                                     suffix)

    def _list_files_by_prefix(self, oss_dir, file_prefix):
        """Recursively list objects whose base name starts with ``file_prefix``."""
        return self._list_files(
            oss_dir,
            lambda key: os.path.basename(key).startswith(file_prefix))

    def list_files_by_prefix(self, oss_dir, file_prefix):
        """Retrying wrapper around ``_list_files_by_prefix``."""
        return self._retry_operation(self._list_files_by_prefix, oss_dir,
                                     file_prefix)
def _launch_offline_demo(args, model, oss_reader, model_name):
    """Build and launch the text-output ("offline") Gradio demo.

    Args:
        args: parsed CLI namespace; ``debug``, ``share``, ``inbrowser``,
            ``server_port`` and ``server_name`` are read.
        model: client exposing ``chat.completions.create``.
        oss_reader: object providing ``upload_file`` and ``get_public_url``.
        model_name: model identifier forwarded to the completion call.
    """
    default_system_prompt = ''

    def to_mp4(path):
        """Transcode a ``.webm`` file to ``.mp4`` with ffmpeg.

        Returns the new path, or ``path`` unchanged for any other input.
        ``subprocess.CalledProcessError`` propagates if ffmpeg fails.
        """
        if path and path.endswith(".webm"):
            # Swap only the extension; str.replace would also rewrite a
            # ".webm" occurring earlier in the path.
            mp4_path = path[:-len(".webm")] + ".mp4"
            subprocess.run([
                "ffmpeg", "-y",
                "-i", path,
                "-c:v", "libx264",
                "-pix_fmt", "yuv420p",
                "-c:a", "aac",
                "-movflags", "+faststart",
                "-f", "mp4",
                mp4_path
            ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            return mp4_path
        return path

    def order_user_content(parts):
        """Return ``parts`` with media entries first and text entries last."""
        media_items = [part for part in parts if part["type"] != "text"]
        text_items = [part for part in parts if part["type"] == "text"]
        return media_items + text_items

    def format_history(history: list, system_prompt: str):
        """Convert the chatbot history into API ``messages``.

        Consecutive user entries are merged into one user message. Media
        files are uploaded to OSS and referenced through a signed URL.
        Non-string assistant content is dropped.
        """
        print(history)
        messages = []
        if system_prompt != "":
            messages.append({
                "role": "system",
                "content": [{
                    "type": "text",
                    "text": system_prompt
                }]
            })
        current_user_content = []
        for item in history:
            role = item['role']
            content = item['content']
            if role != "user":
                if current_user_content:
                    # Same media-before-text order as the final flush, so a
                    # past turn is replayed exactly as it was first sent.
                    messages.append({
                        "role": "user",
                        "content": order_user_content(current_user_content)
                    })
                    current_user_content = []
                if isinstance(content, str):
                    messages.append({
                        "role": role,
                        "content": [{
                            "type": "text",
                            "text": content
                        }]
                    })
                continue
            if isinstance(content, str):
                current_user_content.append({"type": "text", "text": content})
            elif isinstance(content, (list, tuple)):
                for file_path in content:
                    mime_type = client_utils.get_mimetype(file_path)
                    media_type = None
                    if mime_type.startswith("image"):
                        media_type = "image_url"
                    elif mime_type.startswith("video"):
                        media_type = "video_url"
                        file_path = to_mp4(file_path)
                    elif mime_type.startswith("audio"):
                        media_type = "input_audio"
                    if not media_type:
                        # Unknown file type: pass the path along as text.
                        current_user_content.append({
                            "type": "text",
                            "text": file_path
                        })
                        continue
                    # Upload under a fresh id and hand the model a signed URL.
                    request_id = str(uuid.uuid4())
                    oss_path = f"oss://{bucket_name}/{OSS_TEMP_BUCKET_DIR}" + request_id
                    oss_reader.upload_file(file_path, oss_path)
                    media_url = oss_reader.get_public_url(oss_path)
                    if media_type == "input_audio":
                        current_user_content.append({
                            "type": "input_audio",
                            "input_audio": {
                                "data": media_url,
                                "format": "wav",
                            },
                        })
                    elif media_type == "image_url":
                        current_user_content.append({
                            "type": "image_url",
                            "image_url": {
                                "url": media_url
                            },
                        })
                    else:
                        current_user_content.append({
                            "type": "video_url",
                            "video_url": {
                                "url": media_url
                            },
                            "fps": 1,
                        })
        if current_user_content:
            messages.append({
                "role": "user",
                "content": order_user_content(current_user_content)
            })
        return messages

    def predict(messages,
                temperature=0.7,
                top_p=0.8,
                top_k=20):
        """Stream a completion, yielding the accumulated text after each delta.

        In debug mode the request id is prepended once, before the first text.
        """
        completion = model.chat.completions.create(
            model=model_name,
            messages=messages,
            modalities=["text"],
            extra_body={
                "top_k": top_k
            },
            stream_options={"include_usage": True},
            stream=True,
            temperature=temperature,
            top_p=top_p,
        )
        output_text = ""
        request_id = ""
        request_id_prefixed = False
        for chunk in completion:
            if request_id == "" and getattr(chunk, "id", None):
                request_id = chunk.id
            if not chunk.choices:
                # Chunks without choices are only logged for their usage.
                print(chunk.usage)
                continue
            text = getattr(chunk.choices[0].delta, "content", None)
            if not text:
                continue
            if args.debug and not request_id_prefixed and request_id:
                output_text += f"[Request ID: {request_id}]\n\n"
                request_id_prefixed = True
            output_text += text
            yield {"type": "text", "data": output_text}

    def chat_predict(text,
                     audio,
                     image,
                     video,
                     history,
                     system_prompt,
                     temperature,
                     top_p,
                     top_k):
        """Gradio handler: append the new inputs, then stream the reply.

        Each yield is ``(text_input, audio_input, image_input, video_input,
        chatbot, submit_btn, stop_btn)``.
        """
        if audio:
            history.append({"role": "user", "content": (audio, )})
        if text:
            history.append({"role": "user", "content": text})
        if image:
            history.append({"role": "user", "content": (image, )})
        if video:
            history.append({"role": "user", "content": (video, )})
        formatted_history = format_history(history=history,
                                           system_prompt=system_prompt)
        # Clear the inputs and swap Submit for Stop while generating.
        yield None, None, None, None, history, gr.update(visible=False), gr.update(visible=True)
        history.append({"role": "assistant", "content": ""})
        for chunk in predict(formatted_history, temperature, top_p, top_k):
            print('chat_predict chunk', chunk)
            if chunk["type"] == "text":
                history[-1]["content"] = chunk["data"]
                yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), history, gr.update(visible=False), gr.update(visible=True)
        # Finished: restore the Submit button.
        yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), history, gr.update(visible=True), gr.update(visible=False)

    with gr.Blocks(
            theme=gr.themes.Soft(font=[
                gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"
            ]),
            css=".gradio-container {max-width: none !important;}") as demo:
        gr.Markdown("# Qwen3.5 Omni Offline Demo")
        gr.Markdown(
            "**Instructions**: Interact with the model by entering task text and optionally uploading one modality input, such as audio, image, or video. Then press Submit to get the response."
        )
        gr.Markdown(
            "**使用说明**:1️⃣ 输入你希望模型执行的任务文本 2️⃣ 可选上传一种模态数据(音频、图片或视频)3️⃣ 点击提交并等待模型回答"
        )
        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Parameters (参数)")
                system_prompt_textbox = gr.Textbox(label="System Prompt",
                                                   value=default_system_prompt,
                                                   lines=4,
                                                   max_lines=8)
                temperature = gr.Slider(label="Temperature",
                                        minimum=0.1,
                                        maximum=2.0,
                                        value=0.7,
                                        step=0.1)
                top_p = gr.Slider(label="Top P",
                                  minimum=0.05,
                                  maximum=1.0,
                                  value=0.8,
                                  step=0.05)
                top_k = gr.Slider(label="Top K",
                                  minimum=1,
                                  maximum=100,
                                  value=20,
                                  step=1)
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(label="Chat History (对话历史)",
                                     type="messages",
                                     height=420,
                                     layout="panel",
                                     bubble_full_width=False,
                                     render=False)
                chatbot.render()
                with gr.Accordion(
                        "📎 Click to upload multimodal files (点击上传多模态文件)",
                        open=True):
                    with gr.Row():
                        audio_input = gr.Audio(
                            sources=["upload", 'microphone'],
                            type="filepath",
                            label="Audio (<1 h)",
                            elem_classes="media-upload")
                        image_input = gr.Image(
                            sources=["upload", 'webcam'],
                            type="filepath",
                            label="Image (<10 MB)",
                            elem_classes="media-upload")
                        video_input = gr.Video(
                            sources=["upload", 'webcam'],
                            label="Video (<1 h)",
                            elem_classes="media-upload")
                with gr.Row():
                    text_input = gr.Textbox(
                        show_label=False,
                        placeholder=
                        "Enter text or upload files and press Submit... (输入文本或者上传文件并点击提交)",
                        scale=7)
                    submit_btn_offline = gr.Button("Submit (提交)",
                                                   variant="primary",
                                                   scale=1)
                    stop_btn_offline = gr.Button("Stop (停止)",
                                                 visible=False,
                                                 scale=1)
                    clear_btn_offline = gr.Button("Clear (清空) ",
                                                  scale=1)

        def clear_history_offline():
            """Reset the chat history and every input widget."""
            return [], None, None, None, None

        submit_event_offline = gr.on(
            triggers=[
                submit_btn_offline.click, text_input.submit
            ],
            fn=chat_predict,
            inputs=[
                text_input, audio_input, image_input,
                video_input, chatbot, system_prompt_textbox,
                temperature, top_p, top_k
            ],
            outputs=[
                text_input, audio_input, image_input,
                video_input, chatbot, submit_btn_offline, stop_btn_offline
            ])
        # Stop cancels the running generation and restores the Submit button.
        stop_btn_offline.click(
            fn=lambda: (gr.update(visible=True),
                        gr.update(visible=False)),
            outputs=[submit_btn_offline, stop_btn_offline],
            cancels=[submit_event_offline],
            queue=False)
        clear_btn_offline.click(fn=clear_history_offline,
                                outputs=[
                                    chatbot, text_input,
                                    audio_input, image_input,
                                    video_input
                                ])
        gr.HTML("""
            <style>
            .media-upload { min-height: 160px; border: 2px dashed #ccc; border-radius: 8px; display: flex; align-items: center; justify-content: center; }
            .media-upload:hover { border-color: #666; }
            </style>
        """)
    demo.queue(default_concurrency_limit=100, max_size=100).launch(
        max_threads=100,
        ssr_mode=False,
        share=args.share,
        inbrowser=args.inbrowser,
        server_port=args.server_port,
        server_name=args.server_name,
    )
def _launch_realtime_demo(args, model, oss_reader, model_name):
    """Build and launch the audio/video-in, text+speech-out Gradio demo.

    Args:
        args: parsed CLI namespace; ``debug``, ``share``, ``inbrowser``,
            ``server_port`` and ``server_name`` are read.
        model: client exposing ``chat.completions.create``.
        oss_reader: object providing ``upload_file`` and ``get_public_url``.
        model_name: model identifier forwarded to the completion call.
    """
    # UI label -> voice name sent to the API.
    VOICE_OPTIONS = {
        "Tina / 中文-甜甜": "Tina",
        "Cindy / 中文-台湾口音-林欣宜": "Cindy",
        "Liora Mira / 中文-清欢": "Liora Mira",
        "Sunnybobi / 中文-知芝": "Sunnybobi",
        "Raymond / 中文-林川野": "Raymond",
        "Ethan / 中文-晨煦": "Ethan",
        "Theo Calm / 中文-予安": "Theo Calm",
        "Serena / 中文-苏瑶": "Serena",
        "Harvey / 英语-厚": "Harvey",
        "Maia / 中文-四月": "Maia",
        "Evan / 中文-江晨": "Evan",
        "Qiao / 中文-台湾口音-小乔妹": "Qiao",
        "Momo / 中文-茉兔": "Momo",
        "Wil / 中文-港台腔-伟伦": "Wil",
        "Angel / 中文-台普-安琪": "Angel",
        "Li Cassian / 中文-东厂-李公公": "Li Cassian",
        "Mia / 英语-温柔生活博主-舒然": "Mia",
        "Joyner / 英语-喜剧担当-阿逗": "Joyner",
        "Gold / 英语-金爷": "Gold",
        "Katerina / 英语-卡捷琳娜": "Katerina",
        "Ryan / 英语-甜茶": "Ryan",
        "Jennifer / 英语-詹妮弗": "Jennifer",
        "Aiden / 英语-艾登": "Aiden",
        "Mione / 英语-敏儿": "Mione",
        "Sunny / 四川话-晴儿": "Sunny",
        "Dylan / 北京话-晓东": "Dylan",
        "Eric / 四川话-程川": "Eric",
        "Peter / 天津话-李彼得": "Peter",
        "Joseph Chen / 闽南话-阿樸伯": "Joseph Chen",
        "Marcus / 陕西话-秦川": "Marcus",
        "Li / 南京话-老李": "Li",
        "Rocky / 广东话-阿强": "Rocky",
        "Sohee (Korean) / 韩语-素熙": "Sohee",
        "Lenn (German) / 德语-莱恩": "Lenn",
        "Ono Anna (Japanese) / 日语-小野杏": "Ono Anna",
        "Sonrisa (Spanish) / 西班牙语-索尼莎": "Sonrisa",
        "Bodega (Spanish) / 西班牙语-博德加": "Bodega",
        "Emilien (French) / 法语-埃米尔安": "Emilien",
        "Andre (Portuguese) / 葡萄牙语-安德雷": "Andre",
        "Radio Gol (Portuguese) / 葡萄牙语-拉迪奥·戈尔": "Radio Gol",
        "Alek (Russian) / 俄语-阿列克": "Alek",
        "Rizky (Indonesian) / 印尼语-阿力": "Rizky",
        "Roya (Persian) / 波斯语-萝雅": "Roya",
        "Arda (Turkish) / 土耳其语-阿尔达": "Arda",
        "Hana (Vietnamese) / 越南语-阿幸": "Hana",
        "Dolce (Italian) / 意大利语-多尔切": "Dolce",
        "Jakub (Polish) / 波兰语-雅克": "Jakub",
        "Griet (Dutch) / 荷兰语-海娜": "Griet",
        "Eliska (Czech) / 捷克语-艾莉卡": "Eliska",
        "Marina (Hebrew) / 希伯来语-玛丽娜": "Marina",
        "Siiri (Finnish) / 芬兰语-西芮": "Siiri",
        "Ingrid (Norwegian) / 挪威语-林恩": "Ingrid",
        "Sigga (Icelandic) / 冰岛语-海娜": "Sigga",
        "Bea (Filipino) / 菲律宾语-雅娜": "Bea",
        "Chloe (Malay) / 马来语-思怡": "Chloe",
    }
    DEFAULT_VOICE = "Tina / 中文-甜甜"
    # Category label -> voice labels offered in the dropdown for it.
    VOICE_GROUPS = {
        "Recommended Voices / 推荐音色": [
            "Tina / 中文-甜甜",
            "Cindy / 中文-台湾口音-林欣宜",
            "Liora Mira / 中文-清欢",
            "Sunnybobi / 中文-知芝",
            "Raymond / 中文-林川野",
        ],
        "Chinese Dialects / 中文方言": [
            "Sunny / 四川话-晴儿",
            "Dylan / 北京话-晓东",
            "Eric / 四川话-程川",
            "Peter / 天津话-李彼得",
            "Joseph Chen / 闽南话-阿樸伯",
            "Marcus / 陕西话-秦川",
            "Li / 南京话-老李",
            "Rocky / 广东话-阿强",
        ],
        "Multilingual / 多语言": [
            "Sohee (Korean) / 韩语-素熙",
            "Lenn (German) / 德语-莱恩",
            "Ono Anna (Japanese) / 日语-小野杏",
            "Sonrisa (Spanish) / 西班牙语-索尼莎",
            "Bodega (Spanish) / 西班牙语-博德加",
            "Emilien (French) / 法语-埃米尔安",
            "Andre (Portuguese) / 葡萄牙语-安德雷",
            "Radio Gol (Portuguese) / 葡萄牙语-拉迪奥·戈尔",
            "Alek (Russian) / 俄语-阿列克",
            "Rizky (Indonesian) / 印尼语-阿力",
            "Roya (Persian) / 波斯语-萝雅",
            "Arda (Turkish) / 土耳其语-阿尔达",
            "Hana (Vietnamese) / 越南语-阿幸",
            "Dolce (Italian) / 意大利语-多尔切",
            "Jakub (Polish) / 波兰语-雅克",
            "Griet (Dutch) / 荷兰语-海娜",
            "Eliska (Czech) / 捷克语-艾莉卡",
            "Marina (Hebrew) / 希伯来语-玛丽娜",
            "Siiri (Finnish) / 芬兰语-西芮",
            "Ingrid (Norwegian) / 挪威语-林恩",
            "Sigga (Icelandic) / 冰岛语-海娜",
            "Bea (Filipino) / 菲律宾语-雅娜",
            "Chloe (Malay) / 马来语-思怡",
        ],
        "Healing & Warmth / 情感陪伴-治愈温暖": [
            "Ethan / 中文-晨煦",
            "Theo Calm / 中文-予安",
            "Serena / 中文-苏瑶",
            "Harvey / 英语-厚",
            "Maia / 中文-四月",
        ],
        "Energetic & Playful / 情感陪伴-活力个性": [
            "Evan / 中文-江晨",
            "Qiao / 中文-台湾口音-小乔妹",
            "Momo / 中文-茉兔",
            "Wil / 中文-港台腔-伟伦",
            "Angel / 中文-台普-安琪",
        ],
        "Roleplay / 角色扮演": [
            "Li Cassian / 中文-东厂-李公公",
            "Mia / 英语-温柔生活博主-舒然",
            "Joyner / 英语-喜剧担当-阿逗",
            "Gold / 英语-金爷",
        ],
        "Game & Anime / 游戏动漫配音": [
            "Katerina / 英语-卡捷琳娜",
            "Ryan / 英语-甜茶",
            "Jennifer / 英语-詹妮弗",
            "Aiden / 英语-艾登",
            "Mione / 英语-敏儿",
        ],
    }
    DEFAULT_VOICE_GROUP = "Recommended Voices / 推荐音色"
    # Sent verbatim as the system message of every request; kept unchanged.
    default_system_prompt = """# Voice Style
You may output **at most one** voice style tag **only when the user explicitly asks to control speaking style** (emotion / speech rate / volume), e.g., “say it angrily”, “speak faster”, “whisper”, “in a calm tone”, etc.
You have the following voice style tags:
<tags>
brisk, rapid, leisurely, sluggish,
loud, shouting, soft-spoken, whispering,
irritated, furious, distasteful, repulsed,
nervous, terrified, cheerful, ecstatic,
gloomy, despairing, startled, shocked
</tags>
Selection rules (only when the user explicitly requests control):
- Choose the tag that best matches the user’s explicit instruction and intended delivery.
- If multiple cues are present, choose the **single most dominant** one (emotion, speed, or volume).
If you choose to output a tag ONLY reply in the following format with NO perfix:
<voice_style>
voice style tag
</voice_style>
your response
<IMPORTANT>
Reminder:
- **If the user does NOT explicitly request a speaking style**, **do NOT output any tag**. Reply normally with your current knowledge and do not tell the user about voice tags.
- If you choose to output a voice style tag, MUST follow the specified format: the tag must be nested within <voice_style></voice_style>.
- **Do not** explain your choice.
- **Never** output more than one tag.
- **Never** add any extra content before the tag.
</IMPORTANT>
Please strictly follow the following guidelines when generating responses. Avoid using any formatting markers, special symbols, or structured layouts. Do not include bold, italic, numbering, bullet points, emojis, or other visual elements. The response must be natural conversational language with smooth sentences and a human-like dialogue flow. Use standard punctuation—such as periods, commas, and question marks—to separate ideas clearly. Refrain from complex sentence structures, and above all, avoid redundant or wordy expressions. Be concise and direct. When listing information, use continuous narration instead of bullet points."""

    def to_mp4(path):
        """Transcode a ``.webm`` file to ``.mp4`` with ffmpeg.

        Returns the new path, or ``path`` unchanged for any other input.
        ``subprocess.CalledProcessError`` propagates if ffmpeg fails.
        """
        if path and path.endswith(".webm"):
            # Swap only the extension; str.replace would also rewrite a
            # ".webm" occurring earlier in the path.
            mp4_path = path[:-len(".webm")] + ".mp4"
            subprocess.run([
                "ffmpeg", "-y",
                "-i", path,
                "-c:v", "libx264",
                "-pix_fmt", "yuv420p",
                "-c:a", "aac",
                "-movflags", "+faststart",
                "-f", "mp4",
                mp4_path
            ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            return mp4_path
        return path

    def order_user_content(parts):
        """Return ``parts`` with media entries first and text entries last."""
        media_items = [part for part in parts if part["type"] != "text"]
        text_items = [part for part in parts if part["type"] == "text"]
        return media_items + text_items

    def format_history(history: list, system_prompt: str):
        """Convert the chatbot history into API ``messages``.

        Consecutive user entries are merged into one user message. Media
        files are uploaded to OSS and referenced through a signed URL.
        Non-string assistant content (the audio replies) is dropped.
        """
        print(history)
        messages = []
        if system_prompt != "":
            messages.append({
                "role": "system",
                "content": [{
                    "type": "text",
                    "text": system_prompt
                }]
            })
        current_user_content = []
        for item in history:
            role = item['role']
            content = item['content']
            if role != "user":
                if current_user_content:
                    # Same media-before-text order as the final flush, so a
                    # past turn is replayed exactly as it was first sent.
                    messages.append({
                        "role": "user",
                        "content": order_user_content(current_user_content)
                    })
                    current_user_content = []
                if isinstance(content, str):
                    messages.append({
                        "role": role,
                        "content": [{
                            "type": "text",
                            "text": content
                        }]
                    })
                continue
            if isinstance(content, str):
                current_user_content.append({"type": "text", "text": content})
            elif isinstance(content, (list, tuple)):
                for file_path in content:
                    mime_type = client_utils.get_mimetype(file_path)
                    media_type = None
                    if mime_type.startswith("image"):
                        media_type = "image_url"
                    elif mime_type.startswith("video"):
                        media_type = "video_url"
                        file_path = to_mp4(file_path)
                    elif mime_type.startswith("audio"):
                        media_type = "input_audio"
                    if not media_type:
                        # Unknown file type: pass the path along as text.
                        current_user_content.append({
                            "type": "text",
                            "text": file_path
                        })
                        continue
                    # Upload under a fresh id and hand the model a signed URL.
                    request_id = str(uuid.uuid4())
                    oss_path = f"oss://{bucket_name}/{OSS_TEMP_BUCKET_DIR}" + request_id
                    oss_reader.upload_file(file_path, oss_path)
                    media_url = oss_reader.get_public_url(oss_path)
                    if media_type == "input_audio":
                        current_user_content.append({
                            "type": "input_audio",
                            "input_audio": {
                                "data": media_url,
                                "format": "wav",
                            },
                        })
                    elif media_type == "image_url":
                        current_user_content.append({
                            "type": "image_url",
                            "image_url": {
                                "url": media_url
                            },
                        })
                    else:
                        current_user_content.append({
                            "type": "video_url",
                            "video_url": {
                                "url": media_url
                            },
                            "fps": 1,
                        })
        if current_user_content:
            messages.append({
                "role": "user",
                "content": order_user_content(current_user_content)
            })
        return messages

    def predict(messages,
                voice_choice=DEFAULT_VOICE,
                temperature=0.7,
                top_p=0.8,
                top_k=20):
        """Stream a text+audio completion.

        Yields ``{"type": "text", "data": <accumulated text>}`` for every
        transcript/content delta, then one ``{"type": "audio", "data":
        <cached wav path>}`` if any audio data was received.
        """
        completion = model.chat.completions.create(
            model=model_name,
            messages=messages,
            modalities=["text", "audio"],
            audio={
                # Unknown labels fall back to the default voice.
                "voice": VOICE_OPTIONS.get(voice_choice,
                                           VOICE_OPTIONS[DEFAULT_VOICE]),
                "format": "wav"
            },
            extra_body={
                "top_k": top_k,
                "enable_search": True,
            },
            stream_options={"include_usage": True},
            stream=True,
            temperature=temperature,
            top_p=top_p,
        )
        audio_chunks = []
        output_text = ""
        request_id = ""
        request_id_prefixed = False
        for chunk in completion:
            if request_id == "" and getattr(chunk, "id", None):
                request_id = chunk.id
            if not chunk.choices:
                # Chunks without choices are only logged for their usage.
                print(chunk.usage)
                continue
            delta = chunk.choices[0].delta
            audio_delta = getattr(delta, "audio", None)
            if audio_delta:
                # An audio delta carries base64 "data" or a "transcript";
                # check the keys explicitly rather than catching exceptions.
                data = audio_delta.get("data")
                if data:
                    audio_chunks.append(data)
                    continue
                text = audio_delta.get("transcript")
            else:
                text = getattr(delta, "content", None)
            if not text:
                continue
            if args.debug and not request_id_prefixed and request_id:
                output_text += f"[Request ID: {request_id}]\n\n"
                request_id_prefixed = True
            output_text += text
            yield {"type": "text", "data": output_text}
        audio_string = "".join(audio_chunks)
        if audio_string != "":
            # Decoded payload is treated as int16 PCM and wrapped as a
            # 24 kHz WAV file.
            audio_np = np.frombuffer(base64.b64decode(audio_string),
                                     dtype=np.int16)
            wav_io = io.BytesIO()
            sf.write(wav_io, audio_np, samplerate=24000, format="WAV")
            wav_io.seek(0)
            wav_bytes = wav_io.getvalue()
            # ``demo`` is bound by the ``with gr.Blocks`` statement below,
            # which has run before any request reaches this function.
            audio_path = processing_utils.save_bytes_to_cache(
                wav_bytes, "audio.wav", cache_dir=demo.GRADIO_CACHE)
            yield {"type": "audio", "data": audio_path}

    def media_predict(audio,
                      video,
                      history,
                      voice_choice,
                      temperature,
                      top_p,
                      top_k):
        """Gradio handler: send the recorded media and stream the reply.

        Each yield is ``(microphone, webcam, media_chatbot, submit_btn,
        stop_btn)``.
        """
        # Clear the inputs and swap Submit for Stop while the request runs.
        yield (
            None,
            None,
            history,
            gr.update(visible=False),
            gr.update(visible=True),
        )
        files = [audio, video]
        for f in files:
            if f:
                history.append({"role": "user", "content": (f, )})
        # Show the user's media immediately. Stop stays visible: upload and
        # inference are still ahead, so Submit must not reappear yet.
        yield (
            None,
            None,
            history,
            gr.update(visible=False),
            gr.update(visible=True),
        )
        formatted_history = format_history(
            history=history,
            system_prompt=default_system_prompt,
        )
        history.append({"role": "assistant", "content": ""})
        for chunk in predict(formatted_history, voice_choice, temperature,
                             top_p, top_k):
            print('chunk', chunk)
            if chunk["type"] == "text":
                history[-1]["content"] = chunk["data"]
                yield (
                    None,
                    None,
                    history,
                    gr.update(visible=False),
                    gr.update(visible=True),
                )
            if chunk["type"] == "audio":
                history.append({
                    "role": "assistant",
                    "content": gr.Audio(chunk["data"])
                })
        # Finished: restore the Submit button.
        yield (
            None,
            None,
            history,
            gr.update(visible=True),
            gr.update(visible=False),
        )

    def update_voice_choices(voice_group):
        """Refresh the voice dropdown for the selected category."""
        choices = VOICE_GROUPS[voice_group]
        value = choices[0] if choices else None
        return gr.update(choices=choices, value=value)

    with gr.Blocks(
            theme=gr.themes.Soft(font=[
                gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"
            ]),
            css=".gradio-container {max-width: none !important;}") as demo:
        gr.Markdown("# Qwen3.5 Omni Realtime Interaction Demo")
        gr.Markdown(
            "**Instructions**: Click the audio recording button or the camera recording button, provide audio or video input, then click Submit and wait for the model's response."
        )
        gr.Markdown(
            "**使用说明**:1️⃣ 点击音频录制按钮,或摄像头-录制按钮 2️⃣ 输入音频或者视频 3️⃣ 点击提交并等待模型的回答"
        )
        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Parameters (参数)")
                with gr.Group():
                    voice_group = gr.Radio(
                        label="Voice Category(音色类别)",
                        choices=list(VOICE_GROUPS.keys()),
                        value=DEFAULT_VOICE_GROUP)
                    voice_choice = gr.Dropdown(
                        label="Voice Choice(音色选择)",
                        choices=VOICE_GROUPS[DEFAULT_VOICE_GROUP],
                        value=DEFAULT_VOICE,
                        visible=True)
                temperature = gr.Slider(label="Temperature",
                                        minimum=0.1,
                                        maximum=2.0,
                                        value=0.7,
                                        step=0.1)
                top_p = gr.Slider(label="Top P",
                                  minimum=0.05,
                                  maximum=1.0,
                                  value=0.8,
                                  step=0.05)
                top_k = gr.Slider(label="Top K",
                                  minimum=1,
                                  maximum=100,
                                  value=20,
                                  step=1)
            with gr.Column(scale=3):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Audio-Video Input (音视频输入)")
                        microphone = gr.Audio(
                            sources=["microphone", 'upload'],
                            type="filepath",
                            label="Record Audio (录制音频)")
                        webcam = gr.Video(
                            sources=['webcam', "upload"],
                            label="Record/Upload Video (录制/上传视频)",
                            elem_classes="media-upload")
                        with gr.Row():
                            submit_btn_online = gr.Button(
                                "Submit (提交)",
                                variant="primary",
                                scale=2)
                            stop_btn_online = gr.Button("Stop (停止)",
                                                        visible=False,
                                                        scale=1)
                        clear_btn_online = gr.Button(
                            "Clear History (清除历史)")
                    with gr.Column(scale=2):
                        media_chatbot = gr.Chatbot(
                            label="Chat History (对话历史)",
                            type="messages",
                            height=650,
                            layout="panel",
                            bubble_full_width=False,
                            render=False)
                        media_chatbot.render()

        def clear_history_online():
            """Reset the chat history and both media inputs."""
            return [], None, None

        voice_group.change(
            fn=update_voice_choices,
            inputs=[voice_group],
            outputs=[voice_choice])
        submit_event_online = submit_btn_online.click(
            fn=media_predict,
            inputs=[
                microphone, webcam, media_chatbot,
                voice_choice, temperature, top_p, top_k
            ],
            outputs=[
                microphone, webcam, media_chatbot,
                submit_btn_online, stop_btn_online
            ])
        # Stop cancels the running generation and restores the Submit button.
        stop_btn_online.click(
            fn=lambda: (gr.update(visible=True),
                        gr.update(visible=False)),
            outputs=[submit_btn_online, stop_btn_online],
            cancels=[submit_event_online],
            queue=False)
        clear_btn_online.click(
            fn=clear_history_online,
            outputs=[media_chatbot, microphone, webcam])
        gr.HTML("""
            <style>
            .media-upload { min-height: 160px; border: 2px dashed #ccc; border-radius: 8px; display: flex; align-items: center; justify-content: center; }
            .media-upload:hover { border-color: #666; }
            </style>
        """)
    demo.queue(default_concurrency_limit=100, max_size=100).launch(
        max_threads=100,
        ssr_mode=False,
        share=args.share,
        inbrowser=args.inbrowser,
        server_port=args.server_port,
        server_name=args.server_name,
    )
def _get_args(argv=None):
    """Parse the demo's command-line options.

    Args:
        argv: optional list of argument strings. When None (the default)
            argparse reads the process arguments, as before.

    Returns:
        The namespace with ``share``, ``inbrowser``, ``server_port``,
        ``server_name``, ``demo_mode`` and ``debug``.
    """
    parser = ArgumentParser()
    parser.add_argument(
        '--share',
        action='store_true',
        help='Create a publicly shareable link for the interface.')
    parser.add_argument(
        '--inbrowser',
        action='store_true',
        help='Automatically launch the interface in a new tab on the default browser.'
    )
    parser.add_argument('--server-port',
                        type=int,
                        default=7860,
                        help='Demo server port.')
    parser.add_argument('--server-name',
                        type=str,
                        default='0.0.0.0',
                        help='Demo server name.')
    parser.add_argument('--demo-mode',
                        type=str,
                        default='offline',
                        choices=['offline', 'realtime'],
                        help='Choose which demo mode to launch.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Enable debug mode and show request id at the beginning of assistant replies.')
    return parser.parse_args(argv)
if __name__ == "__main__":
    # Script entry point: parse the CLI, create the OSS helper and the
    # OpenAI-compatible client, then launch the selected demo.
    args = _get_args()
    oss_reader = OSSReader()
    model = OpenAI(
        api_key=API_KEY,
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )
    model_name = "qwen3.5-omni-plus"
    if args.demo_mode == "offline":
        _launch_offline_demo(args, model, oss_reader, model_name)
    elif args.demo_mode == "realtime":
        _launch_realtime_demo(args, model, oss_reader, model_name)