Spaces:

Qwen
/

Qwen3-ASR-Demo

Running

App Files Files Community

littlebird13 committed on Sep 8

Commit

d462144

verified ·

1 Parent(s): 6824e74

Create app.py

Browse files

Files changed (1) hide show

app.py +216 -0

app.py ADDED Viewed

	@@ -0,0 +1,216 @@

+import os
+import subprocess
+import sys
+
+import gradio as gr
+
+# Upgrade the DashScope SDK before importing it. Running pip as
+# `sys.executable -m pip` with an argument list (no shell string) guarantees
+# the package lands in the interpreter that is executing this script rather
+# than in whichever `pip` happens to be first on PATH. The step stays
+# best-effort (check=False): if the upgrade fails, an already-installed SDK
+# is still usable, and a missing one surfaces as ImportError just below.
+subprocess.run(
+    [sys.executable, "-m", "pip", "install", "-U", "dashscope"],
+    check=False,
+)
+
+import dashscope
+from dashscope import MultiModalConversation
+
+# The key is read from the environment; a missing API_KEY raises KeyError at
+# start-up instead of failing later on the first request.
+API_KEY = os.environ['API_KEY']
+dashscope.api_key = API_KEY
+dashscope.base_http_api_url = "https://dashscope.aliyuncs.com/api/v1"
+# Maps the language codes reported by the service to bilingual display names.
+_LANG_DISPLAY = {
+    "auto": "自动识别 / Auto Detect",
+    "zh": "中文 / Chinese",
+    "en": "英文 / English",
+    "ja": "日文 / Japanese",
+    "ko": "韩文 / Korean",
+    "es": "西班牙文 / Spanish",
+    "fr": "法文 / French",
+    "de": "德文 / German",
+    "ar": "阿拉伯文 / Arabic",
+    "it": "意大利文 / Italian",
+    "ru": "俄文 / Russian",
+    "pt": "葡萄牙文 / Portuguese",
+}
+
+
+def _read_field(obj, name, default=None):
+    """Return ``obj.name`` or ``obj[name]``; *default* when neither exists."""
+    try:
+        return getattr(obj, name)
+    except (AttributeError, KeyError):
+        # Plain dicts have no such attribute, and dict-backed objects may
+        # report a missing key as KeyError; fall through to item access.
+        pass
+    try:
+        return obj[name]
+    except (KeyError, IndexError, TypeError):
+        return default
+
+
+def _parse_response(response, want_language):
+    """Extract ``(text, language_code)`` from a recognition response.
+
+    On any structural problem the text slot carries a user-facing message
+    and the language code is ``None``.
+    """
+    if _read_field(response, 'status_code') != 200:
+        status_code = _read_field(response, 'status_code', '未知')
+        error_msg = _read_field(response, 'message', '未知错误')
+        return f"请求失败 (状态码: {status_code}): {error_msg}", None
+
+    choices = _read_field(_read_field(response, 'output'), 'choices')
+    if not choices:
+        return "响应中没有找到识别结果", None
+
+    message = _read_field(choices[0], 'message')
+    content = _read_field(message, 'content')
+    if not content:
+        return "响应结构不完整", None
+
+    first_part = content[0]
+    if 'text' not in first_part:
+        return "未找到文本内容", None
+
+    # The detected language is optional metadata: if it is absent the
+    # transcript must still be returned, so look it up without indexing
+    # blindly into a possibly empty annotation list.
+    language_code = None
+    if want_language:
+        annotations = _read_field(message, 'annotations')
+        if annotations:
+            language_code = _read_field(annotations[0], 'language')
+    return first_part['text'], language_code
+
+
+def asr_inference(audio_file, context, language, enable_itn):
+    """Transcribe an audio file and report the detected language.
+
+    Args:
+        audio_file: Path or URL of the audio to transcribe; falsy when the
+            user has not provided any audio.
+        context: Free-form text sent as the system message to steer the
+            recognition; ``None`` is treated as an empty string.
+        language: ``'auto'`` to let the service detect the language, or a
+            language code that is forwarded as the ``language`` option.
+        enable_itn: Forwarded as the ``enable_itn`` option.
+
+    Returns:
+        A ``(text, language_name)`` pair matching the two output
+        components. ``text`` is the transcript or a user-facing error
+        message; ``language_name`` is a display name for the detected
+        language, or ``None`` when no language was detected.
+    """
+    if not audio_file:
+        # Two outputs are wired to this handler, so the "no audio" hint has
+        # to come back as a pair as well instead of a bare string.
+        return "请上传音频文件", None
+
+    messages = [
+        {
+            "role": "system",
+            "content": [
+                {"text": context or ""},
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"audio": audio_file},
+            ]
+        }
+    ]
+    asr_options = {
+        "enable_lid": True,
+        "enable_itn": enable_itn,
+    }
+    if language != 'auto':
+        asr_options["language"] = language
+
+    try:
+        # The remote call lives inside the try block so that SDK or network
+        # failures are shown in the result box instead of crashing the
+        # handler.
+        response = MultiModalConversation.call(
+            model="qwen3-asr-flash",
+            messages=messages,
+            result_format="message",
+            asr_options=asr_options,
+        )
+        result_text, result_lang = _parse_response(response, language == 'auto')
+    except Exception as e:  # UI boundary: report the failure, never raise.
+        result_text = f"处理出错: {str(e)}"
+        result_lang = None
+
+    # Translate the raw language code into its display name.
+    if result_lang is not None:
+        result_lang = _LANG_DISPLAY.get(
+            result_lang, f"未知语种 / Unknown ({result_lang})"
+        )
+    return result_text, result_lang
+# Build the demo page. `demo` is the Blocks app launched by the entry-point
+# guard at the end of this section.
+with gr.Blocks(theme=gr.themes.Soft(), title="语音识别工具") as demo:
+    # Logo banner (centered, enlarged).
+    gr.Markdown("""
+    <div style="width: 100%; display: flex; justify-content: center; margin: 30px 0;">
+        <img src="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/00EE8C99-9C05-4236-A6D0-B58FF172D31B.png"
+             alt="Qwen-ASR Logo"
+             width="300"
+             style="border-radius: 12px; box-shadow: 0 6px 12px rgba(0,0,0,0.15);"/>
+    </div>
+    """, sanitize_html=False)
+
+    # Link to the API documentation.
+    gr.Markdown("""
+    <div style="text-align: center; margin: 10px 0; font-size: 14px; color: #555;">
+        🌐 <a href="https://help.aliyun.com/zh/dashscope/developer-reference/"
+              target="_blank"
+              style="color: #0066cc; text-decoration: none;">
+              查看 DashScope API 文档
+        </a>
+    </div>
+    """, sanitize_html=False)
+
+    gr.Markdown("上传音频文件，获取语音转文字结果。\n支持指定任意格式的上下文信息以获取定制化的识别结果。支持语言识别和逆文本标准化。")
+
+    # (label, value) pairs offered by the language selector.
+    language_choices = [
+        ("自动识别 / Auto Detect", "auto"),
+        ("中文 / Chinese", "zh"),
+        ("英文 / English", "en"),
+        ("日文 / Japanese", "ja"),
+        ("韩文 / Korean", "ko"),
+        ("西班牙文 / Spanish", "es"),
+        ("法文 / French", "fr"),
+        ("德文 / German", "de"),
+        ("阿拉伯文 / Arabic", "ar"),
+        ("意大利文 / Italian", "it"),
+        ("俄文 / Russian", "ru"),
+        ("葡萄牙文 / Portuguese", "pt"),
+    ]
+
+    with gr.Row():
+        # Left column: inputs and the submit button.
+        with gr.Column():
+            audio_input = gr.Audio(type="filepath", label="🎤 上传音频")
+            context_input = gr.Textbox(
+                value="",
+                label="📝 上下文信息（可选）",
+                interactive=True,
+            )
+            language = gr.Dropdown(
+                choices=language_choices,
+                value="auto",
+                label="🌍 语言设置",
+            )
+            # No checkbox is exposed for language identification (LID).
+            enable_itn = gr.Checkbox(value=False, label="🔄 启用逆文本标准化（ITN）")
+            submit_btn = gr.Button("🚀 开始识别", variant="primary")
+        # Right column: read-only result boxes.
+        with gr.Column():
+            text_output = gr.Textbox(
+                label="📝 识别结果",
+                lines=6,
+                max_lines=12,
+                interactive=False,
+            )
+            lang_output = gr.Textbox(
+                label="📝 语种检测结果（仅在auto模式下返回）",
+                lines=1,
+                max_lines=12,
+                interactive=False,
+            )
+
+    # Run the recognition handler with the four inputs; it fills both boxes.
+    submit_btn.click(
+        fn=asr_inference,
+        inputs=[audio_input, context_input, language, enable_itn],
+        outputs=[text_output, lang_output],
+    )
+
+    # Static separator, usage tips and the heading of the examples section,
+    # each rendered as its own Markdown component in this order.
+    for static_markdown in (
+        "---",
+        "💡 **使用提示**：",
+        "- 支持 MP3、WAV 等常见音频格式",
+        "- 启用 LID 可自动识别语音语言",
+        "- 以任意格式配置 context 信息可以获取定制化的文本结果，纠正命名实体名称等",
+        "- 启用 ITN 可将数字、日期等转换为标准文本格式",
+        "### 💡 示例",
+    ):
+        gr.Markdown(static_markdown)
+
+    # Example clips: button title -> audio URL, context text and description.
+    examples_data = {
+        "Example 1 - CSGO比赛": {
+            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/csgo.wav",
+            "context": "A csgo match between NAVI and FazeClan in Major Paris 2023. S1mple and B1t are in NAVI. Ropz, Rain, Karrigan and Twistzz are in Faze.",
+            "description": "游戏解说示例（包含专业术语）"
+        },
+        "Example 2 - 噪音环境": {
+            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise3.wav",
+            "context": "",
+            "description": "噪音环境下的语音识别"
+        },
+        "Example 3 - 复杂音频": {
+            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise1.wav",
+            "context": "",
+            "description": "复杂背景音频处理"
+        }
+    }
+
+    # One column per example: a button plus a short italic description.
+    with gr.Row():
+        for example_title, example in examples_data.items():
+            audio_url = example["audio"]
+            context_text = example["context"]
+            with gr.Column():
+                example_btn = gr.Button(f"📎 {example_title}", variant="secondary", size="sm")
+                gr.Markdown(f"*{example['description']}*", elem_classes=["example-desc"])
+                # Default arguments bind this iteration's values so every
+                # button loads its own clip and context into the inputs.
+                example_btn.click(
+                    fn=lambda audio=audio_url, context=context_text: (audio, context),
+                    outputs=[audio_input, context_input],
+                )
+
+# Start the web server only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()