Files changed (4) hide show
  1. .DS_Store +0 -0
  2. README.md +6 -12
  3. app.py +189 -139
  4. requirements.txt +5 -4
.DS_Store ADDED
Binary file (8.2 kB). View file
 
README.md CHANGED
@@ -1,19 +1,13 @@
1
  ---
2
  title: SenseVoice
3
- emoji: 🎙️
4
- colorFrom: blue
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.9.1
8
  app_file: app.py
9
- pinned: true
10
  license: other
11
- suggested_hardware: zero-a10g
12
  ---
13
 
14
- # SenseVoice: Speech Recognition + Emotion + Audio Events
15
-
16
- Multi-task speech understanding model supporting 5 languages (zh/en/yue/ja/ko) with emotion detection and audio event recognition. 7x faster than Whisper-small.
17
-
18
- - **GitHub**: [SenseVoice](https://github.com/FunAudioLLM/SenseVoice) | [FunASR](https://github.com/modelscope/FunASR)
19
- - **Paper**: [arXiv:2407.04051](https://arxiv.org/abs/2407.04051)
 
1
  ---
2
  title: SenseVoice
3
+ emoji: 🐠
4
+ colorFrom: green
5
+ colorTo: pink
6
  sdk: gradio
7
+ sdk_version: 4.38.1
8
  app_file: app.py
9
+ pinned: false
10
  license: other
 
11
  ---
12
 
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
app.py CHANGED
@@ -1,133 +1,193 @@
1
  # coding=utf-8
2
 
3
  import os
 
 
 
 
 
 
4
  import numpy as np
5
  import torch
6
  import torchaudio
7
- import gradio as gr
8
  import spaces
9
 
10
  from funasr import AutoModel
11
 
12
- model = AutoModel(
13
- model="FunAudioLLM/SenseVoiceSmall",
14
- vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
15
- vad_kwargs={"max_single_segment_time": 30000},
16
- hub="hf",
17
- device="cuda",
18
- )
 
 
19
 
20
  emo_dict = {
21
- "<|HAPPY|>": "😊", "<|SAD|>": "😔", "<|ANGRY|>": "😡",
22
- "<|NEUTRAL|>": "", "<|FEARFUL|>": "😰", "<|DISGUSTED|>": "🤢", "<|SURPRISED|>": "😮",
 
 
 
 
 
23
  }
24
 
25
  event_dict = {
26
- "<|BGM|>": "🎼", "<|Speech|>": "", "<|Applause|>": "👏",
27
- "<|Laughter|>": "😀", "<|Cry|>": "😭", "<|Sneeze|>": "🤧",
28
- "<|Breath|>": "", "<|Cough|>": "😷",
 
 
 
 
 
29
  }
30
 
31
  emoji_dict = {
32
- "<|nospeech|><|Event_UNK|>": "❓",
33
- "<|zh|>": "", "<|en|>": "", "<|yue|>": "", "<|ja|>": "", "<|ko|>": "",
34
- "<|nospeech|>": "",
35
- "<|HAPPY|>": "😊", "<|SAD|>": "😔", "<|ANGRY|>": "😡", "<|NEUTRAL|>": "",
36
- "<|BGM|>": "🎼", "<|Speech|>": "", "<|Applause|>": "👏", "<|Laughter|>": "😀",
37
- "<|FEARFUL|>": "😰", "<|DISGUSTED|>": "🤢", "<|SURPRISED|>": "😮",
38
- "<|Cry|>": "😭", "<|EMO_UNKNOWN|>": "", "<|Sneeze|>": "🤧",
39
- "<|Breath|>": "", "<|Cough|>": "😷", "<|Sing|>": "",
40
- "<|Speech_Noise|>": "", "<|withitn|>": "", "<|woitn|>": "",
41
- "<|GBG|>": "", "<|Event_UNK|>": "",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  }
43
 
44
- lang_dict = {
45
- "<|zh|>": "<|lang|>", "<|en|>": "<|lang|>", "<|yue|>": "<|lang|>",
46
- "<|ja|>": "<|lang|>", "<|ko|>": "<|lang|>", "<|nospeech|>": "<|lang|>",
 
 
 
 
47
  }
48
 
49
  emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
50
- event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷"}
51
 
 
 
 
 
52
 
53
- def format_str_v2(s):
54
- sptk_dict = {}
55
- for sptk in emoji_dict:
56
- sptk_dict[sptk] = s.count(sptk)
57
- s = s.replace(sptk, "")
58
- emo = "<|NEUTRAL|>"
59
- for e in emo_dict:
60
- if sptk_dict.get(e, 0) > sptk_dict.get(emo, 0):
61
- emo = e
62
- for e in event_dict:
63
- if sptk_dict.get(e, 0) > 0:
64
- s = event_dict[e] + s
65
- s = s + emo_dict[emo]
66
- for emoji in emo_set.union(event_set):
67
- s = s.replace(" " + emoji, emoji)
68
- s = s.replace(emoji + " ", emoji)
69
- return s.strip()
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  def format_str_v3(s):
73
- def get_emo(s):
74
- return s[-1] if s and s[-1] in emo_set else None
75
- def get_event(s):
76
- return s[0] if s and s[0] in event_set else None
77
-
78
- s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
79
- for lang in lang_dict:
80
- s = s.replace(lang, "<|lang|>")
81
- s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
82
- new_s = " " + s_list[0]
83
- cur_ent_event = get_event(new_s)
84
- for i in range(1, len(s_list)):
85
- if len(s_list[i]) == 0:
86
- continue
87
- if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) is not None:
88
- s_list[i] = s_list[i][1:]
89
- cur_ent_event = get_event(s_list[i])
90
- if get_emo(s_list[i]) is not None and get_emo(s_list[i]) == get_emo(new_s):
91
- new_s = new_s[:-1]
92
- new_s += s_list[i].strip().lstrip()
93
- new_s = new_s.replace("The.", " ")
94
- return new_s.strip()
95
-
96
 
97
  @spaces.GPU
98
  def model_inference(input_wav, language, fs=16000):
99
- language = "auto" if not language else language
100
-
101
- if isinstance(input_wav, tuple):
102
- fs, input_wav = input_wav
103
- input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
104
- if len(input_wav.shape) > 1:
105
- input_wav = input_wav.mean(-1)
106
- if fs != 16000:
107
- resampler = torchaudio.transforms.Resample(fs, 16000)
108
- input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
109
- input_wav = resampler(input_wav_t[None, :])[0, :].numpy()
110
-
111
- text = model.generate(
112
- input=input_wav,
113
- cache={},
114
- language=language,
115
- use_itn=True,
116
- batch_size_s=500,
117
- merge_vad=True,
118
- )
119
-
120
- text = text[0]["text"]
121
- text = format_str_v3(text)
122
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
 
125
  audio_examples = [
126
- ["example/zh.mp3", "auto"],
127
- ["example/en.mp3", "auto"],
128
- ["example/yue.mp3", "auto"],
129
- ["example/ja.mp3", "auto"],
130
- ["example/ko.mp3", "auto"],
131
  ["example/emo_1.wav", "auto"],
132
  ["example/emo_2.wav", "auto"],
133
  ["example/emo_3.wav", "auto"],
@@ -135,58 +195,48 @@ audio_examples = [
135
  ["example/rich_2.wav", "auto"],
136
  ["example/longwav_1.wav", "auto"],
137
  ["example/longwav_2.wav", "auto"],
 
138
  ]
139
 
140
 
141
- description_html = """
142
- <div style="text-align: center; max-width: 800px; margin: 0 auto;">
143
- <h1 style="font-size: 2em; margin-bottom: 0.2em;">🎙️ SenseVoice</h1>
144
- <p style="font-size: 1.2em; color: #555; margin-bottom: 0.5em;">Speech Recognition + Emotion Detection + Audio Events — All in One Model</p>
145
- <p style="font-size: 1em; color: #777;">
146
- <strong>5 languages</strong> (zh/en/yue/ja/ko) · <strong>7x faster</strong> than Whisper-small · <strong>17x faster</strong> than Whisper-large
147
- </p>
148
- <p style="font-size: 0.9em; margin-top: 1em;">
149
- <a href="https://github.com/FunAudioLLM/SenseVoice" target="_blank">⭐ GitHub</a> ·
150
- <a href="https://github.com/modelscope/FunASR" target="_blank">🛠️ FunASR Toolkit</a> ·
151
- <a href="https://arxiv.org/abs/2407.04051" target="_blank">📄 Paper</a> ·
152
- <a href="https://github.com/FunAudioLLM/Fun-ASR" target="_blank">🚀 Fun-ASR (31 Languages)</a>
153
- </p>
154
- </div>
155
- """
156
 
157
- guide_html = """
158
- <div style="background: #f8f9fa; border-radius: 8px; padding: 12px 16px; margin: 8px 0; font-size: 0.9em;">
159
- <strong>How it works:</strong> Upload audio or record via microphone → SenseVoice transcribes speech and detects emotions (😊😡😔) and sound events (🎼👏😀😭🤧).
160
- Event labels appear at the front of text, emotions at the end.
 
 
 
 
 
 
161
  </div>
162
  """
163
 
164
 
165
  def launch():
166
- with gr.Blocks(theme=gr.themes.Soft(), title="SenseVoice - Speech Understanding") as demo:
167
- gr.HTML(description_html)
168
- gr.HTML(guide_html)
169
- with gr.Row():
170
- with gr.Column():
171
- audio_inputs = gr.Audio(label="Upload audio or use microphone")
172
- with gr.Accordion("Language (auto-detect by default)", open=False):
173
- language_inputs = gr.Dropdown(
174
- choices=["auto", "zh", "en", "yue", "ja", "ko"],
175
- value="auto",
176
- label="Language",
177
- )
178
- fn_button = gr.Button("Recognize", variant="primary", size="lg")
179
- text_outputs = gr.Textbox(label="Result", lines=5, show_copy_button=True)
180
- gr.Examples(
181
- examples=audio_examples,
182
- inputs=[audio_inputs, language_inputs],
183
- examples_per_page=12,
184
- )
185
-
186
- fn_button.click(model_inference, inputs=[audio_inputs, language_inputs], outputs=text_outputs)
187
-
188
- demo.launch()
189
 
190
 
191
  if __name__ == "__main__":
192
- launch()
 
 
 
 
1
  # coding=utf-8
2
 
3
  import os
4
+ import librosa
5
+ import base64
6
+ import io
7
+ import gradio as gr
8
+ import re
9
+
10
  import numpy as np
11
  import torch
12
  import torchaudio
13
+
14
  import spaces
15
 
16
  from funasr import AutoModel
17
 
18
# Module-level speech model shared by every inference request.
# The checkpoint id is passed directly so that the name `model` is only ever
# bound to the AutoModel instance, never to the id string.
model = AutoModel(
    model="FunAudioLLM/SenseVoiceSmall",
    vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
    vad_kwargs={"max_single_segment_time": 30000},
    hub="hf",
    device="cuda",
)
25
+
26
+ import re
27
 
# Emotion tag -> emoji appended at the END of a formatted segment
# (see format_str_v2). Tags mapped to "" produce no visible marker.
emo_dict = {
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
}

# Sound-event tag -> emoji prepended at the FRONT of a formatted segment.
# NOTE(review): "<|Cough|>" maps to "🤧" here but to "😷" in emoji_dict below;
# the values are kept exactly as they were - confirm which one is intended.
event_dict = {
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|Cry|>": "😭",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "🤧",
}

# Every special token handled by the formatters, with its inline replacement.
# Insertion order matters: the combined "<|nospeech|><|Event_UNK|>" token must
# be processed before its two halves.
emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "❓",
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
    "<|Cry|>": "😭",
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "😷",
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}

# Language tags; format_str_v3 rewrites each key to the generic "<|lang|>"
# separator and splits the transcript on it.
lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

# Emoji that can terminate a segment (emotions) or start one (events).
emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷"}


def format_str(s):
    """Replace every special token in *s* with its emoji_dict value, in place."""
    for sptk in emoji_dict:
        s = s.replace(sptk, emoji_dict[sptk])
    return s


def format_str_v2(s):
    """Format one segment as ``<event emojis><text><emotion emoji>``.

    All special tokens are counted and removed from the text. The emotion
    with the strictly highest count wins (neutral when nothing beats it), and
    one emoji is prepended for every event tag that occurred at least once.
    Spaces directly next to an emotion/event emoji are dropped.
    """
    sptk_dict = {}
    for sptk in emoji_dict:
        sptk_dict[sptk] = s.count(sptk)
        s = s.replace(sptk, "")

    emo = "<|NEUTRAL|>"
    for e in emo_dict:
        # .get(): tolerate a tag that exists in emo_dict but not in emoji_dict.
        if sptk_dict.get(e, 0) > sptk_dict.get(emo, 0):
            emo = e
    for e in event_dict:
        if sptk_dict.get(e, 0) > 0:
            s = event_dict[e] + s
    s = s + emo_dict[emo]

    for emoji in emo_set.union(event_set):
        s = s.replace(" " + emoji, emoji)
        s = s.replace(emoji + " ", emoji)
    return s.strip()


def format_str_v3(s):
    """Format a full multi-segment transcript.

    The transcript is split on language tags, each piece is formatted with
    format_str_v2, and the pieces are concatenated while collapsing an event
    emoji that repeats the previous segment's event and an emotion emoji that
    repeats the one currently ending the accumulated text.
    """

    def get_emo(text):
        # Guard against "": a segment can become empty after its leading
        # event emoji is removed below.
        return text[-1] if text and text[-1] in emo_set else None

    def get_event(text):
        return text[0] if text and text[0] in event_set else None

    s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")
    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]

    new_s = " " + s_list[0]
    cur_ent_event = get_event(new_s)
    for segment in s_list[1:]:
        if len(segment) == 0:
            continue
        event = get_event(segment)
        if event is not None and event == cur_ent_event:
            # Same event as the previous segment: show its emoji only once.
            segment = segment[1:]
        cur_ent_event = get_event(segment)
        emo = get_emo(segment)
        if emo is not None and emo == get_emo(new_s):
            # Same emotion as the text so far: keep only the trailing emoji.
            new_s = new_s[:-1]
        new_s += segment.strip()
    new_s = new_s.replace("The.", " ")
    return new_s.strip()
142
 
@spaces.GPU
def model_inference(input_wav, language, fs=16000):
    """Transcribe one audio clip and return the emoji-annotated text.

    Args:
        input_wav: Either a ``(sample_rate, samples)`` tuple, which is
            normalised, down-mixed and resampled to 16 kHz here, or any other
            value, which is handed to ``model.generate`` unchanged.
        language: Language code selected in the UI; a missing or empty value
            falls back to ``"auto"``.
        fs: Sample rate assumed for the input; replaced by the rate carried
            in the tuple when ``input_wav`` is a tuple.

    Returns:
        The transcript of the first result, formatted by ``format_str_v3``.

    Raises:
        gr.Error: If no audio was provided.
        KeyError: If ``language`` is not one of the supported codes.
    """
    language_abbr = {"auto": "auto", "zh": "zh", "en": "en", "yue": "yue", "ja": "ja", "ko": "ko",
                     "nospeech": "nospeech"}

    # `not language` also covers None, on which len() would raise TypeError.
    language = language if language else "auto"
    selected_language = language_abbr[language]

    if input_wav is None:
        # Fail with a readable UI message instead of an error deep in the model.
        raise gr.Error("Please upload or record an audio clip first.")

    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        # NOTE(review): scaling assumes int16 samples - confirm for other dtypes.
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        if len(input_wav.shape) > 1:
            # Multi-channel input: average over the last axis.
            input_wav = input_wav.mean(-1)
        if fs != 16000:
            print(f"audio_fs: {fs}")
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
            input_wav = resampler(input_wav_t[None, :])[0, :].numpy()

    merge_vad = True
    print(f"language: {language}, merge_vad: {merge_vad}")
    text = model.generate(
        input=input_wav,
        cache={},
        language=selected_language,
        use_itn=True,
        batch_size_s=500,
        merge_vad=merge_vad,
    )

    print(text)
    text = text[0]["text"]
    text = format_str_v3(text)

    print(text)

    return text
183
 
184
 
185
  audio_examples = [
186
+ ["example/zh.mp3", "zh"],
187
+ ["example/yue.mp3", "yue"],
188
+ ["example/en.mp3", "en"],
189
+ ["example/ja.mp3", "ja"],
190
+ ["example/ko.mp3", "ko"],
191
  ["example/emo_1.wav", "auto"],
192
  ["example/emo_2.wav", "auto"],
193
  ["example/emo_3.wav", "auto"],
 
195
  ["example/rich_2.wav", "auto"],
196
  ["example/longwav_1.wav", "auto"],
197
  ["example/longwav_2.wav", "auto"],
198
+ ["example/longwav_3.wav", "auto"],
199
  ]
200
 
201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
# Introductory HTML rendered at the top of the demo page by launch().
# Wording fixes only: "exicting" typo, sentence capitalisation, grammar of the
# label-placement sentence, and the reference to a task selector that the UI
# does not offer (only a language dropdown exists).
html_content = """
<div>
    <h2 style="font-size: 22px;margin-left: 0px;">Voice Understanding Model: SenseVoice-Small</h2>
    <p style="font-size: 18px;margin-left: 20px;">SenseVoice-Small is an encoder-only speech foundation model designed for rapid voice understanding. It encompasses a variety of features including automatic speech recognition (ASR), spoken language identification (LID), speech emotion recognition (SER), and acoustic event detection (AED). SenseVoice-Small supports multilingual recognition for Chinese, English, Cantonese, Japanese, and Korean. Additionally, it offers exceptionally low inference latency, performing 7 times faster than Whisper-small and 17 times faster than Whisper-large.</p>
    <h2 style="font-size: 22px;margin-left: 0px;">Usage</h2> <p style="font-size: 18px;margin-left: 20px;">Upload an audio file or input through a microphone, then select the language. The audio is transcribed into corresponding text along with associated emotions (😊 happy, 😡 angry/excited, 😔 sad) and types of sound events (😀 laughter, 🎼 music, 👏 applause, 🤧 cough&sneeze, 😭 cry). Event labels are placed at the front of the text and emotion labels at the end.</p>
    <p style="font-size: 18px;margin-left: 20px;">Recommended audio input duration is below 30 seconds. For audio longer than 30 seconds, local deployment is recommended.</p>
    <h2 style="font-size: 22px;margin-left: 0px;">Repo</h2>
    <p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/FunAudioLLM/SenseVoice" target="_blank">SenseVoice</a>: multilingual speech understanding model</p>
    <p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/modelscope/FunASR" target="_blank">FunASR</a>: fundamental speech recognition toolkit</p>
    <p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/FunAudioLLM/CosyVoice" target="_blank">CosyVoice</a>: high-quality multilingual TTS model</p>
</div>
"""
215
 
216
 
def launch():
    """Assemble the Gradio Blocks page and start serving it.

    The page shows the intro HTML, an audio input, a language dropdown inside
    a "Configuration" accordion, a start button, a results textbox and the
    example clips. Clicking the button runs ``model_inference`` on the audio
    and language inputs and writes the result into the textbox.
    """
    # NOTE(review): container nesting below is reconstructed from statement
    # order because the source view carries no indentation - confirm layout.
    language_choices = ["auto", "zh", "en", "yue", "ja", "ko", "nospeech"]

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.HTML(html_content)
        with gr.Row():
            with gr.Column():
                audio_inputs = gr.Audio(label="Upload audio or use the microphone")
                with gr.Accordion("Configuration"):
                    language_inputs = gr.Dropdown(
                        choices=language_choices,
                        value="auto",
                        label="Language",
                    )
                fn_button = gr.Button("Start", variant="primary")
                text_outputs = gr.Textbox(label="Results")
            gr.Examples(
                examples=audio_examples,
                inputs=[audio_inputs, language_inputs],
                examples_per_page=20,
            )

        fn_button.click(
            model_inference,
            inputs=[audio_inputs, language_inputs],
            outputs=text_outputs,
        )

    demo.launch()


if __name__ == "__main__":
    launch()
241
+
242
+
requirements.txt CHANGED
@@ -1,7 +1,8 @@
1
  torch>=1.13
2
  torchaudio
3
- funasr>=1.2.0
4
- huggingface_hub
5
  modelscope
6
- numpy<2.0
7
- librosa
 
 
 
 
1
  torch>=1.13
2
  torchaudio
 
 
3
  modelscope
4
+ huggingface
5
+ huggingface_hub
6
+ funasr>=1.1.3
7
+ numpy<=1.26.4
8
+ gradio