littlebird13 commited on
Commit
d462144
·
verified ·
1 Parent(s): 6824e74

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +216 -0
app.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import os

# Install/upgrade the DashScope SDK at process start, then import it.
# NOTE(review): pip-at-import is fragile and slow — prefer pinning
# `dashscope` in requirements.txt; confirm the Space has no such pin.
os.system('pip install dashscope -U')
import dashscope
from dashscope import MultiModalConversation

# DashScope credential: API_KEY must be set in the environment.
# os.environ[...] (not .get) fails fast with KeyError at startup if missing.
API_KEY = os.environ['API_KEY']

# Point the SDK at the DashScope endpoint and attach the key globally.
dashscope.api_key = API_KEY
dashscope.base_http_api_url = "https://dashscope.aliyuncs.com/api/v1"
11
+
12
def asr_inference(audio_file, context, language, enable_itn):
    """Transcribe an uploaded audio file with DashScope's qwen3-asr-flash.

    Args:
        audio_file: Filepath of the uploaded audio (Gradio ``type="filepath"``).
        context: Free-form biasing text sent as the system message; may be "".
        language: Language code for the recognizer, or "auto" to let the
            model detect the language.
        enable_itn: Whether to apply inverse text normalization (ITN).

    Returns:
        A 2-tuple ``(result_text, result_lang)``: the transcript (or a
        human-readable error message) and a bilingual language label
        (populated only in "auto" mode) or ``None``.
    """
    if not audio_file:
        # BUG FIX: the original returned a single string here, but the
        # click handler binds TWO outputs — always return a 2-tuple.
        return "请上传音频文件", None

    messages = [
        {
            "role": "system",
            "content": [{"text": context}],
        },
        {
            "role": "user",
            "content": [{"audio": audio_file}],
        },
    ]

    # Build the options once instead of duplicating the whole API call for
    # the auto / fixed-language branches. enable_lid stays True in both
    # cases, matching the original behavior.
    asr_options = {"enable_lid": True, "enable_itn": enable_itn}
    if language != 'auto':
        asr_options["language"] = language

    response = MultiModalConversation.call(
        model="qwen3-asr-flash",
        messages=messages,
        result_format="message",
        asr_options=asr_options,
    )

    result_text, result_lang = _extract_asr_result(response, language)

    # Map the raw language code to a bilingual display label.
    lang_display = {
        "auto": "自动识别 / Auto Detect",
        "zh": "中文 / Chinese",
        "en": "英文 / English",
        "ja": "日文 / Japanese",
        "ko": "韩文 / Korean",
        "es": "西班牙文 / Spanish",
        "fr": "法文 / French",
        "de": "德文 / German",
        "ar": "阿拉伯文 / Arabic",
        "it": "意大利文 / Italian",
        "ru": "俄文 / Russian",
        "pt": "葡萄牙文 / Portuguese"
    }
    if result_lang in lang_display:
        result_lang = lang_display[result_lang]
    elif result_lang is not None:
        result_lang = f"未知语种 / Unknown ({result_lang})"

    return result_text, result_lang


def _extract_asr_result(response, language):
    """Defensively unpack a DashScope response into (text, language_code).

    Mirrors the original hasattr/len guard chain: every malformed-response
    shape maps to a readable message instead of raising to the UI.
    """
    try:
        if not (hasattr(response, 'status_code') and response.status_code == 200):
            status_code = getattr(response, 'status_code', '未知')
            error_msg = getattr(response, 'message', '未知错误')
            return f"请求失败 (状态码: {status_code}): {error_msg}", None

        if not (hasattr(response, 'output') and
                hasattr(response.output, 'choices') and
                len(response.output.choices) > 0):
            return "响应中没有找到识别结果", None

        choice = response.output.choices[0]
        if not (hasattr(choice, 'message') and
                hasattr(choice.message, 'content') and
                len(choice.message.content) > 0):
            return "响应结构不完整", None

        content = choice.message.content[0]
        if 'text' not in content:
            return "未找到文本内容", None

        # The detected language is only reported in auto mode, via the
        # message annotations.
        if language == 'auto' and hasattr(choice.message, "annotations"):
            return content['text'], choice.message.annotations[0]['language']
        return content['text'], None
    except Exception as e:
        # Last-resort guard: surface any unexpected failure as text.
        return f"处理出错: {str(e)}", None
111
+
112
+
113
with gr.Blocks(theme=gr.themes.Soft(), title="语音识别工具") as demo:
    # --- Centered, enlarged logo banner ---
    gr.Markdown("""
    <div style="width: 100%; display: flex; justify-content: center; margin: 30px 0;">
        <img src="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/00EE8C99-9C05-4236-A6D0-B58FF172D31B.png"
             alt="Qwen-ASR Logo"
             width="300"
             style="border-radius: 12px; box-shadow: 0 6px 12px rgba(0,0,0,0.15);"/>
    </div>
    """, sanitize_html=False)

    # --- Link to the DashScope API documentation ---
    gr.Markdown("""
    <div style="text-align: center; margin: 10px 0; font-size: 14px; color: #555;">
        🌐 <a href="https://help.aliyun.com/zh/dashscope/developer-reference/"
              target="_blank"
              style="color: #0066cc; text-decoration: none;">
            查看 DashScope API 文档
        </a>
    </div>
    """, sanitize_html=False)

    gr.Markdown("上传音频文件,获取语音转文字结果。\n支持指定任意格式的上下文信息以获取定制化的识别结果。支持语言识别和逆文本标准化。")

    # Left column: inputs; right column: recognition results.
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="🎤 上传音频", type="filepath")
            context_input = gr.Textbox(label="📝 上下文信息(可选)", value="", interactive=True)
            language = gr.Dropdown(
                label="🌍 语言设置",
                choices=[
                    ("自动识别 / Auto Detect", "auto"),
                    ("中文 / Chinese", "zh"),
                    ("英文 / English", "en"),
                    ("日文 / Japanese", "ja"),
                    ("韩文 / Korean", "ko"),
                    ("西班牙文 / Spanish", "es"),
                    ("法文 / French", "fr"),
                    ("德文 / German", "de"),
                    ("阿拉伯文 / Arabic", "ar"),
                    ("意大利文 / Italian", "it"),
                    ("俄文 / Russian", "ru"),
                    ("葡萄牙文 / Portuguese", "pt"),
                ],
                value="auto",
            )
            # Language identification (LID) is always enabled in the backend
            # call; only ITN is exposed as a user toggle.
            enable_itn = gr.Checkbox(label="🔄 启用逆文本标准化(ITN)", value=False)
            submit_btn = gr.Button("🚀 开始识别", variant="primary")

        with gr.Column():
            text_output = gr.Textbox(label="📝 识别结果", interactive=False, lines=6, max_lines=12)
            lang_output = gr.Textbox(label="📝 语种检测结果(仅在auto模式下返回)", interactive=False, lines=1, max_lines=12)

    submit_btn.click(
        fn=asr_inference,
        inputs=[audio_input, context_input, language, enable_itn],
        outputs=[text_output, lang_output],
    )

    gr.Markdown("---")
    gr.Markdown("💡 **使用提示**:")
    gr.Markdown("- 支持 MP3、WAV 等常见音频格式")
    gr.Markdown("- 启用 LID 可自动识别语音语言")
    gr.Markdown("- 以任意格式配置 context 信息可以获取定制化的文本结果,纠正命名实体名称等")
    gr.Markdown("- 启用 ITN 可将数字、日期等转换为标准文本格式")

    gr.Markdown("### 💡 示例")

    # Demo presets: title -> remote audio URL, biasing context, short blurb.
    examples_data = {
        "Example 1 - CSGO比赛": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/csgo.wav",
            "context": "A csgo match between NAVI and FazeClan in Major Paris 2023. S1mple and B1t are in NAVI. Ropz, Rain, Karrigan and Twistzz are in Faze.",
            "description": "游戏解说示例(包含专业术语)",
        },
        "Example 2 - 噪音环境": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise3.wav",
            "context": "",
            "description": "噪音环境下的语音识别",
        },
        "Example 3 - 复杂音频": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise1.wav",
            "context": "",
            "description": "复杂背景音频处理",
        },
    }

    # One button per example; clicking it fills the audio and context inputs.
    with gr.Row():
        for example_title, example in examples_data.items():
            with gr.Column():
                load_btn = gr.Button(f"📎 {example_title}", variant="secondary", size="sm")
                gr.Markdown(f"*{example['description']}*", elem_classes=["example-desc"])

                # Bind each example's values as lambda defaults so every
                # button captures its own data (avoids late-binding closures).
                load_btn.click(
                    fn=lambda audio=example['audio'], context=example['context']: (audio, context),
                    outputs=[audio_input, context_input],
                )


if __name__ == "__main__":
    demo.launch()