xiaoyunchong.xyc committed on
Commit
59b606f
·
1 Parent(s): d666310

fix: lazy model loading for ZeroGPU compatibility

Browse files

- Models loaded inside @spaces.GPU context (ZeroGPU requires this)
- Simplified UI, removed unused proxy/download options
- Fixed remote_code path issue
- Both Fun-ASR-Nano and SenseVoice supported

Files changed (3) hide show
  1. README.md +22 -9
  2. app.py +116 -599
  3. requirements.txt +3 -9
README.md CHANGED
@@ -1,14 +1,27 @@
1
  ---
2
- title: Fun ASR Nano
3
- emoji: 📈
4
- colorFrom: green
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 6.1.0
8
  app_file: app.py
9
- pinned: false
10
- license: unknown
11
- short_description: demo page for fun-asr-nano
 
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Fun-ASR-Nano GPU
3
+ emoji: 🚀
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 5.9.1
8
  app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ suggested_hardware: zero-a10g
12
+ short_description: "LLM-powered ASR on GPU: 31 languages, Chinese dialects"
13
  ---
14
 
15
+ # Fun-ASR-Nano: LLM-Powered Speech Recognition (GPU)
16
+
17
+ End-to-end ASR model trained on tens of millions of hours, supporting **31 languages** including Chinese dialects.
18
+
19
+ ## Key Features
20
+ - 🌍 31 languages + Chinese dialect recognition
21
+ - 🎯 Native punctuation output
22
+ - ⚡ GPU-accelerated inference via ZeroGPU
23
+ - 🔥 Trained on massive multilingual data
24
+
25
+ ## Links
26
+ - **GitHub**: [Fun-ASR](https://github.com/FunAudioLLM/Fun-ASR) | [FunASR Toolkit](https://github.com/modelscope/FunASR)
27
+ - **Model**: [Fun-ASR-Nano-2512](https://huggingface.co/FunAudioLLM/Fun-ASR-Nano-2512)
app.py CHANGED
@@ -1,672 +1,189 @@
1
  import os
2
  import spaces
3
- # only debug for hf now
4
- REPO_TYPE = "hf"
5
- if REPO_TYPE not in ["hf", "ms"]:
6
- raise ValueError("REPO_TYPE must be either 'hf' for Hugging Face or 'ms' for ModelScope.")
7
 
8
- if REPO_TYPE == "hf":
9
- from huggingface_hub import snapshot_download
10
- else:
11
- from modelscope.hub.snapshot_download import snapshot_download
12
 
 
13
 
14
-
15
- # 1. 定义本地路径和远程仓库ID
16
  MODEL_CACHE_DIR = "./models"
17
  FUN_ASR_NANO_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "Fun-ASR-Nano")
18
  SENSE_VOICE_SMALL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "SenseVoiceSmall")
19
  VAD_MODEL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "fsmn-vad")
20
 
21
- # 创建模型缓存目录
22
  os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
23
 
24
- # 设置ModelScope环境变量以使用本地缓存
25
- os.environ['MODELSCOPE_CACHE'] = MODEL_CACHE_DIR
26
- # 禁用远程下载,强制使用本地模型(可选,如果想要确保只使用本地模型)
27
- # os.environ['MODELSCOPE_DISABLE_REMOTE'] = '1'
28
-
29
- print(f"ModelScope缓存目录设置为: {MODEL_CACHE_DIR}")
30
 
31
- if REPO_TYPE == "ms":
32
- FUN_ASR_NANO_REPO_ID = "FunAudioLLM/Fun-ASR-Nano-2512"
33
- SENSE_VOICE_SMALL_REPO_ID = "iic/SenseVoiceSmall"
34
- VAD_MODEL_REPO_ID = "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
35
- else:
36
- FUN_ASR_NANO_REPO_ID = "FunAudioLLM/Fun-ASR-Nano-2512"
37
- SENSE_VOICE_SMALL_REPO_ID = "FunAudioLLM/SenseVoiceSmall"
38
- VAD_MODEL_REPO_ID = "funasr/fsmn-vad"
39
 
40
- # 2. 检查本地是否存在,不存在则下载
41
  def download_model_if_not_exists(repo_id, local_path, model_name):
42
- """如果本地模型不存在,则下载模型"""
43
  if not os.path.exists(local_path):
44
- print(f"正在下载模型 {model_name} {local_path} ...")
45
- snapshot_download(
46
- repo_id=repo_id,
47
- local_dir=local_path,
48
- ignore_patterns=["*.onnx"], # 如果你不需要onnx文件,可以过滤掉以节省时间和空间
49
- )
50
- print(f"{model_name} 模型下载完毕!")
51
  else:
52
- print(f"检测到本地 {model_name} 模型文件,跳过下载。")
 
53
 
54
- # 下载所有需要的模型
55
  download_model_if_not_exists(FUN_ASR_NANO_REPO_ID, FUN_ASR_NANO_LOCAL_PATH, "Fun-ASR-Nano")
56
  download_model_if_not_exists(SENSE_VOICE_SMALL_REPO_ID, SENSE_VOICE_SMALL_LOCAL_PATH, "SenseVoiceSmall")
57
  download_model_if_not_exists(VAD_MODEL_REPO_ID, VAD_MODEL_LOCAL_PATH, "VAD Model")
58
 
59
-
60
-
61
-
62
  import gradio as gr
63
  import time
64
- import sys
65
- import io
66
  import tempfile
67
- import subprocess
68
- import requests
69
- from urllib.parse import urlparse
70
- from pydub import AudioSegment
71
  import logging
72
  import torch
73
- import importlib
74
  from funasr import AutoModel
75
  from funasr.utils.postprocess_utils import rich_transcription_postprocess
76
 
77
- # Model configurations for local deployment
78
- FUN_ASR_NANO_MODEL_PATH_LIST = [
79
- FUN_ASR_NANO_LOCAL_PATH, # local path
80
- ]
81
-
82
- SENSEVOICE_MODEL_PATH_LIST = [
83
- SENSE_VOICE_SMALL_LOCAL_PATH, # local path
84
- ]
85
-
86
-
87
- # initial model like this, we have gpu
88
-
89
- MODEL_FUN_ASR = AutoModel(
90
- model=FUN_ASR_NANO_LOCAL_PATH,
91
- trust_remote_code=True,
92
- remote_code=f"./Fun-ASR/model.py", # 建议:如果本地models目录下没有这个文件,这行会报错。如果不需要魔改代码,去掉这行。
93
- vad_model=VAD_MODEL_LOCAL_PATH,
94
- vad_kwargs={"max_single_segment_time": 30000},
95
- device='cuda', # 直接指定 GPU
96
- disable_update=True,
97
- hub='ms',
98
- )
99
-
100
- # 2. 初始化 SenseVoice
101
- print("Loading SenseVoice...")
102
- MODEL_SENSE_VOICE = AutoModel(
103
- model=SENSE_VOICE_SMALL_LOCAL_PATH,
104
- trust_remote_code=False,
105
- vad_model=VAD_MODEL_LOCAL_PATH,
106
- vad_kwargs={"max_single_segment_time": 30000},
107
- device='cuda', # 直接指定 GPU
108
- disable_update=True,
109
- hub='ms',
110
- )
111
- print("所有模型全局初始化完成!")
112
-
113
- class LogCapture(io.StringIO):
114
- def __init__(self, callback):
115
- super().__init__()
116
- self.callback = callback
117
-
118
- def write(self, s):
119
- super().write(s)
120
- self.callback(s)
121
-
122
- # Set up logging
123
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
124
 
 
 
125
 
126
 
 
 
 
127
 
128
-
129
-
130
- # Check for CUDA availability
131
- # device = "cuda:0" if torch.cuda.is_available() else "cpu"
132
- # logging.info(f"Using device: {device}")
133
-
134
- def download_audio(url, method_choice, proxy_url, proxy_username, proxy_password):
135
- """
136
- Downloads audio from a given URL using the specified method and proxy settings.
137
-
138
- Args:
139
- url (str): The URL of the audio.
140
- method_choice (str): The method to use for downloading audio.
141
- proxy_url (str): Proxy URL if needed.
142
- proxy_username (str): Proxy username.
143
- proxy_password (str): Proxy password.
144
-
145
- Returns:
146
- tuple: (path to the downloaded audio file, is_temp_file), or (None, False) if failed.
147
- """
148
- parsed_url = urlparse(url)
149
- logging.info(f"Downloading audio from URL: {url} using method: {method_choice}")
150
- try:
151
- if 'youtube.com' in parsed_url.netloc or 'youtu.be' in parsed_url.netloc:
152
- error_msg = f"YouTube download is not supported. Please use direct audio URLs instead."
153
- logging.error(error_msg)
154
- return None, False
155
- elif parsed_url.scheme == 'rtsp':
156
- audio_file = download_rtsp_audio(url, proxy_url)
157
- if not audio_file:
158
- error_msg = f"Failed to download RTSP audio from {url}"
159
- logging.error(error_msg)
160
- return None, False
161
- else:
162
- audio_file = download_direct_audio(url, method_choice, proxy_url, proxy_username, proxy_password)
163
- if not audio_file:
164
- error_msg = f"Failed to download audio from {url} using method {method_choice}"
165
- logging.error(error_msg)
166
- return None, False
167
- return audio_file, True
168
- except Exception as e:
169
- error_msg = f"Error downloading audio from {url} using method {method_choice}: {str(e)}"
170
- logging.error(error_msg)
171
- return None, False
172
-
173
-
174
-
175
-
176
- def download_rtsp_audio(url, proxy_url):
177
- """
178
- Downloads audio from an RTSP URL using FFmpeg.
179
-
180
- Args:
181
- url (str): The RTSP URL.
182
- proxy_url (str): Proxy URL if needed.
183
-
184
- Returns:
185
- str: Path to the downloaded audio file, or None if failed.
186
- """
187
- logging.info("Using FFmpeg to download RTSP stream")
188
- output_file = tempfile.mktemp(suffix='.mp3')
189
- command = ['ffmpeg', '-i', url, '-acodec', 'libmp3lame', '-ab', '192k', '-y', output_file]
190
- env = os.environ.copy()
191
- if proxy_url and len(proxy_url.strip()) > 0:
192
- env['http_proxy'] = proxy_url
193
- env['https_proxy'] = proxy_url
194
- try:
195
- subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
196
- logging.info(f"Downloaded RTSP audio to: {output_file}")
197
- return output_file
198
- except subprocess.CalledProcessError as e:
199
- logging.error(f"FFmpeg error: {e.stderr.decode()}")
200
- return None
201
- except Exception as e:
202
- logging.error(f"Error downloading RTSP audio: {str(e)}")
203
- return None
204
-
205
- def download_direct_audio(url, method_choice, proxy_url, proxy_username, proxy_password):
206
- """
207
- Downloads audio from a direct URL using the specified method.
208
-
209
- Args:
210
- url (str): The direct URL of the audio file.
211
- method_choice (str): The method to use for downloading.
212
- proxy_url (str): Proxy URL if needed.
213
- proxy_username (str): Proxy username.
214
- proxy_password (str): Proxy password.
215
-
216
- Returns:
217
- str: Path to the downloaded audio file, or None if failed.
218
- """
219
- logging.info(f"Downloading direct audio from: {url} using method: {method_choice}")
220
- methods = {
221
- 'wget': wget_method,
222
- 'requests': requests_method,
223
- 'ffmpeg': ffmpeg_method,
224
- 'aria2': aria2_method,
225
- }
226
- method = methods.get(method_choice, requests_method)
227
- try:
228
- audio_file = method(url, proxy_url, proxy_username, proxy_password)
229
- if not audio_file or not os.path.exists(audio_file):
230
- error_msg = f"Failed to download direct audio from {url} using method {method_choice}"
231
- logging.error(error_msg)
232
- return None
233
- return audio_file
234
- except Exception as e:
235
- logging.error(f"Error downloading direct audio with {method_choice}: {str(e)}")
236
- return None
237
-
238
- def requests_method(url, proxy_url, proxy_username, proxy_password):
239
- """
240
- Downloads audio using the requests library.
241
-
242
- Args:
243
- url (str): The URL of the audio file.
244
- proxy_url (str): Proxy URL if needed.
245
- proxy_username (str): Proxy username.
246
- proxy_password (str): Proxy password.
247
-
248
- Returns:
249
- str: Path to the downloaded audio file, or None if failed.
250
- """
251
- try:
252
- proxies = None
253
- auth = None
254
- if proxy_url and len(proxy_url.strip()) > 0:
255
- proxies = {
256
- "http": proxy_url,
257
- "https": proxy_url
258
- }
259
- if proxy_username and proxy_password:
260
- auth = (proxy_username, proxy_password)
261
- response = requests.get(url, stream=True, proxies=proxies, auth=auth)
262
- if response.status_code == 200:
263
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
264
- for chunk in response.iter_content(chunk_size=8192):
265
- if chunk:
266
- temp_file.write(chunk)
267
- logging.info(f"Downloaded direct audio to: {temp_file.name}")
268
- return temp_file.name
269
- else:
270
- logging.error(f"Failed to download audio from {url} with status code {response.status_code}")
271
- return None
272
- except Exception as e:
273
- logging.error(f"Error in requests_method: {str(e)}")
274
- return None
275
-
276
- def wget_method(url, proxy_url, proxy_username, proxy_password):
277
- """
278
- Downloads audio using the wget command-line tool.
279
-
280
- Args:
281
- url (str): The URL of the audio file.
282
- proxy_url (str): Proxy URL if needed.
283
- proxy_username (str): Proxy username.
284
- proxy_password (str): Proxy password.
285
-
286
- Returns:
287
- str: Path to the downloaded audio file, or None if failed.
288
- """
289
- logging.info("Using wget method")
290
- output_file = tempfile.mktemp(suffix='.mp3')
291
- command = ['wget', '-O', output_file, url]
292
- env = os.environ.copy()
293
- if proxy_url and len(proxy_url.strip()) > 0:
294
- env['http_proxy'] = proxy_url
295
- env['https_proxy'] = proxy_url
296
- try:
297
- subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
298
- logging.info(f"Downloaded audio to: {output_file}")
299
- return output_file
300
- except subprocess.CalledProcessError as e:
301
- logging.error(f"Wget error: {e.stderr.decode()}")
302
- return None
303
- except Exception as e:
304
- logging.error(f"Error in wget_method: {str(e)}")
305
- return None
306
-
307
-
308
- def ffmpeg_method(url, proxy_url, proxy_username, proxy_password):
309
- """
310
- Downloads audio using FFmpeg.
311
-
312
- Args:
313
- url (str): The URL of the audio file.
314
- proxy_url (str): Proxy URL if needed.
315
- proxy_username (str): Proxy username.
316
- proxy_password (str): Proxy password.
317
-
318
- Returns:
319
- str: Path to the downloaded audio file, or None if failed.
320
- """
321
- logging.info("Using ffmpeg method")
322
- output_file = tempfile.mktemp(suffix='.mp3')
323
- command = ['ffmpeg', '-i', url, '-vn', '-acodec', 'libmp3lame', '-q:a', '2', output_file]
324
- env = os.environ.copy()
325
- if proxy_url and len(proxy_url.strip()) > 0:
326
- env['http_proxy'] = proxy_url
327
- env['https_proxy'] = proxy_url
328
- try:
329
- subprocess.run(command, check=True, capture_output=True, text=True, env=env)
330
- logging.info(f"Downloaded and converted audio to: {output_file}")
331
- return output_file
332
- except subprocess.CalledProcessError as e:
333
- logging.error(f"FFmpeg error: {e.stderr}")
334
- return None
335
- except Exception as e:
336
- logging.error(f"Error in ffmpeg_method: {str(e)}")
337
- return None
338
-
339
- def aria2_method(url, proxy_url, proxy_username, proxy_password):
340
- """
341
- Downloads audio using aria2.
342
-
343
- Args:
344
- url (str): The URL of the audio file.
345
- proxy_url (str): Proxy URL if needed.
346
- proxy_username (str): Proxy username.
347
- proxy_password (str): Proxy password.
348
-
349
- Returns:
350
- str: Path to the downloaded audio file, or None if failed.
351
- """
352
- logging.info("Using aria2 method")
353
- output_file = tempfile.mktemp(suffix='.mp3')
354
- command = ['aria2c', '--split=4', '--max-connection-per-server=4', '--out', output_file, url]
355
- if proxy_url and len(proxy_url.strip()) > 0:
356
- command.extend(['--all-proxy', proxy_url])
357
- try:
358
- subprocess.run(command, check=True, capture_output=True, text=True)
359
- logging.info(f"Downloaded audio to: {output_file}")
360
- return output_file
361
- except subprocess.CalledProcessError as e:
362
- logging.error(f"Aria2 error: {e.stderr}")
363
- return None
364
- except Exception as e:
365
- logging.error(f"Error in aria2_method: {str(e)}")
366
- return None
367
-
368
- def trim_audio(audio_path, start_time, end_time):
369
- """
370
- Trims an audio file to the specified start and end times.
371
-
372
- Args:
373
- audio_path (str): Path to the audio file.
374
- start_time (float): Start time in seconds.
375
- end_time (float): End time in seconds.
376
-
377
- Returns:
378
- str: Path to the trimmed audio file.
379
-
380
- Raises:
381
- gr.Error: If invalid start or end times are provided.
382
- """
383
- try:
384
- logging.info(f"Trimming audio from {start_time} to {end_time}")
385
- audio = AudioSegment.from_file(audio_path)
386
- audio_duration = len(audio) / 1000 # Duration in seconds
387
-
388
- # Default start and end times if None
389
- start_time = max(0, start_time) if start_time is not None else 0
390
- end_time = min(audio_duration, end_time) if end_time is not None else audio_duration
391
-
392
- # Validate times
393
- if start_time >= end_time:
394
- raise gr.Error("End time must be greater than start time.")
395
-
396
- trimmed_audio = audio[int(start_time * 1000):int(end_time * 1000)]
397
- with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio_file:
398
- trimmed_audio.export(temp_audio_file.name, format="wav")
399
- logging.info(f"Trimmed audio saved to: {temp_audio_file.name}")
400
- return temp_audio_file.name
401
- except Exception as e:
402
- logging.error(f"Error trimming audio: {str(e)}")
403
- raise gr.Error(f"Error trimming audio: {str(e)}")
404
-
405
- def save_transcription(transcription):
406
- """
407
- Saves the transcription text to a temporary file.
408
-
409
- Args:
410
- transcription (str): The transcription text.
411
-
412
- Returns:
413
- str: The path to the transcription file.
414
- """
415
- with tempfile.NamedTemporaryFile(delete=False, suffix='.txt', mode='w', encoding='utf-8') as temp_file:
416
- temp_file.write(transcription)
417
- logging.info(f"Transcription saved to: {temp_file.name}")
418
- return temp_file.name
419
-
420
- def get_model_options(pipeline_type):
421
- """
422
- Returns a list of model IDs based on the selected pipeline type.
423
-
424
- Args:
425
- pipeline_type (str): The type of pipeline.
426
-
427
- Returns:
428
- list: A list of model IDs.
429
- """
430
  if pipeline_type == "fun-asr-nano":
431
- return FUN_ASR_NANO_MODEL_PATH_LIST
 
 
 
 
 
 
 
 
432
  elif pipeline_type == "sensevoice":
433
- return SENSEVOICE_MODEL_PATH_LIST
 
 
 
 
 
 
 
 
434
  else:
435
- return []
436
- # if pipeline_type == "sensevoice":
437
- # return SENSEVOICE_MODEL_PATH_LIST
438
- # else:
439
- # return []
440
-
441
-
442
-
443
- @spaces.GPU(duration=40)
444
- def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_password, pipeline_type, model_id, download_method, start_time=None, end_time=None, verbose=False):
445
- """
446
- Transcribes audio from a given source using SenseVoice.
447
-
448
- Args:
449
- audio_input (str): Path to uploaded audio file or recorded audio.
450
- audio_url (str): URL of audio.
451
- proxy_url (str): Proxy URL if needed.
452
- proxy_username (str): Proxy username.
453
- proxy_password (str): Proxy password.
454
- pipeline_type (str): Type of pipeline to use ('sensevoice').
455
- model_id (str): The ID of the model to use.
456
- download_method (str): Method to use for downloading audio.
457
- start_time (float, optional): Start time in seconds for trimming audio.
458
- end_time (float, optional): End time in seconds for trimming audio.
459
- verbose (bool, optional): Whether to output verbose logging.
460
-
461
- Yields:
462
- Tuple[str, str, str or None]: Metrics and messages, transcription text, path to transcription file.
463
- """
464
- try:
465
- if verbose:
466
- logging.getLogger().setLevel(logging.INFO)
467
- else:
468
- logging.getLogger().setLevel(logging.WARNING)
469
 
470
- logging.info(f"Transcription parameters: pipeline_type={pipeline_type}, model_id={model_id}, download_method={download_method}")
471
- verbose_messages = f"Starting transcription with parameters:\nPipeline Type: {pipeline_type}\nModel ID: {model_id}\nDownload Method: {download_method}\n"
472
 
473
- if verbose:
474
- yield verbose_messages, "", None
475
 
476
- # Determine the audio source
 
 
 
477
  audio_path = None
478
  is_temp_file = False
479
 
480
  if audio_input is not None and len(audio_input) > 0:
481
- # audio_input is a filepath to uploaded or recorded audio
482
  audio_path = audio_input
483
- is_temp_file = False
484
  elif audio_url is not None and len(audio_url.strip()) > 0:
485
- # audio_url is provided
486
- audio_path, is_temp_file = download_audio(audio_url, download_method, proxy_url, proxy_username, proxy_password)
487
- if not audio_path:
488
- error_msg = f"Error downloading audio from {audio_url} using method {download_method}. Check logs for details."
489
- logging.error(error_msg)
490
- yield verbose_messages + error_msg, "", None
491
- return
 
492
  else:
493
- verbose_messages += f"Successfully downloaded audio from {audio_url}\n"
494
- if verbose:
495
- yield verbose_messages, "", None
496
  else:
497
- error_msg = "No audio source provided. Please upload an audio file, record audio, or enter a URL."
498
- logging.error(error_msg)
499
- yield verbose_messages + error_msg, "", None
500
- return
501
-
502
- # Convert start_time and end_time to float or None
503
- start_time = float(start_time) if start_time else None
504
- end_time = float(end_time) if end_time else None
505
 
 
506
  if start_time is not None or end_time is not None:
507
- audio_path = trim_audio(audio_path, start_time, end_time)
508
- is_temp_file = True # The trimmed audio is a temporary file
509
- verbose_messages += f"Audio trimmed from {start_time} to {end_time}\n"
510
- if verbose:
511
- yield verbose_messages, "", None
512
-
513
-
 
 
 
 
 
 
 
 
 
514
  if pipeline_type == "fun-asr-nano":
515
- model = MODEL_FUN_ASR
516
- logging.info("Using pre-initialized Fun-ASR-Nano model")
517
- elif pipeline_type == "sensevoice":
518
- model = MODEL_SENSE_VOICE
519
- logging.info("Using pre-initialized SenseVoice model")
520
  else:
521
- error_msg = "Invalid pipeline type. Only 'sensevoice' is supported."
522
- logging.error(error_msg)
523
- yield verbose_messages + error_msg, "", None
524
- return
525
-
526
- # Perform the transcription
527
- start_time_perf = time.time()
528
-
529
- if pipeline_type == "fun-asr-nano":
530
- res = model.generate(
531
- input=[audio_path],
532
- use_itn=True,
533
- batch_size=1,
534
- )
535
- elif pipeline_type == "sensevoice":
536
  res = model.generate(
537
- input=audio_path,
538
- cache={},
539
- language="auto", # "zh", "en", "yue", "ja", "ko", "nospeech"
540
- use_itn=True,
541
- batch_size_s=60,
542
- merge_vad=True,
543
- merge_length_s=15,
544
  )
545
 
546
  transcription = rich_transcription_postprocess(res[0]["text"])
547
- end_time_perf = time.time()
548
 
549
- # Calculate metrics
550
- transcription_time = end_time_perf - start_time_perf
551
- audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
552
 
553
- metrics_output = (
554
- f"Transcription time: {transcription_time:.2f} seconds\n"
555
- f"Audio file size: {audio_file_size:.2f} MB\n"
556
- )
557
 
558
- # Save the transcription to a file
559
- transcription_file = save_transcription(transcription)
560
-
561
- # Always yield the final result, regardless of verbose setting
562
- final_metrics = verbose_messages + metrics_output
563
- yield final_metrics, transcription, transcription_file
564
 
565
  except Exception as e:
566
- error_msg = f"An error occurred during transcription: {str(e)}"
567
- logging.error(error_msg)
568
- yield verbose_messages + error_msg, "", None
569
-
570
  finally:
571
- # Clean up temporary audio files
572
- if audio_path and is_temp_file and os.path.exists(audio_path):
573
  os.remove(audio_path)
574
-
575
 
576
- with gr.Blocks() as iface:
577
- gr.Markdown("# Audio Transcription")
578
- gr.Markdown("Transcribe audio using SenseVoice model with multilingual support.")
579
-
 
 
 
 
 
 
 
580
  with gr.Row():
581
  audio_input = gr.Audio(label="Upload or Record Audio", sources=["upload", "microphone"], type="filepath")
582
- audio_url = gr.Textbox(label="Or Enter URL of audio file (direct link only, no YouTube)")
583
-
584
- transcribe_button = gr.Button("Transcribe")
585
-
586
- with gr.Accordion("Advanced Options", open=False):
587
- with gr.Row():
588
- proxy_url = gr.Textbox(label="Proxy URL", placeholder="Enter proxy URL if needed", value="", lines=1)
589
- proxy_username = gr.Textbox(label="Proxy Username", placeholder="Proxy username (optional)", value="", lines=1)
590
- proxy_password = gr.Textbox(label="Proxy Password", placeholder="Proxy password (optional)", value="", lines=1, type="password")
591
-
592
-
593
- with gr.Row():
594
- pipeline_type = gr.Dropdown(
595
- choices=["sensevoice","fun-asr-nano"],
596
- label="Pipeline Type",
597
- value="fun-asr-nano"
598
- )
599
- model_id = gr.Dropdown(
600
- label="Model",
601
- choices=get_model_options("fun-asr-nano"),
602
- value=FUN_ASR_NANO_MODEL_PATH_LIST[0] # Default to official Local Model
603
- )
604
- with gr.Row():
605
- download_method = gr.Dropdown(
606
- choices=["requests", "ffmpeg", "aria2", "wget"],
607
- label="Download Method",
608
- value="requests"
609
- )
610
-
611
- with gr.Row():
612
- start_time = gr.Number(label="Start Time (seconds)", value=None, minimum=0)
613
- end_time = gr.Number(label="End Time (seconds)", value=None, minimum=0)
614
- verbose = gr.Checkbox(label="Verbose Output", value=False)
615
 
616
  with gr.Row():
617
- metrics_output = gr.Textbox(label="Transcription Metrics and Verbose Messages", lines=10)
 
 
 
 
 
 
 
 
 
 
 
618
  transcription_output = gr.Textbox(label="Transcription", lines=10)
619
- transcription_file = gr.File(label="Download Transcription")
620
-
621
- def update_model_dropdown(pipeline_type):
622
- """
623
- Updates the model dropdown choices based on the selected pipeline type.
624
-
625
- Args:
626
- pipeline_type (str): The selected pipeline type.
627
-
628
- Returns:
629
- gr.update: Updated model dropdown component.
630
- """
631
- try:
632
- model_choices = get_model_options(pipeline_type)
633
- logging.info(f"Model choices for {pipeline_type}: {model_choices}")
634
- if model_choices:
635
- return gr.update(choices=model_choices, value=model_choices[0], visible=True)
636
- else:
637
- return gr.update(choices=["No models available"], value=None, visible=False)
638
- except Exception as e:
639
- logging.error(f"Error in update_model_dropdown: {str(e)}")
640
- return gr.update(choices=["Error"], value="Error", visible=True)
641
-
642
- # Event handler for pipeline_type change
643
- pipeline_type.change(update_model_dropdown, inputs=[pipeline_type], outputs=[model_id])
644
-
645
- def transcribe_with_progress(*args):
646
- # The audio_input is now the first argument
647
- for result in transcribe_audio(*args):
648
- yield result
649
-
650
- transcribe_button.click(
651
- transcribe_with_progress,
652
- inputs=[audio_input, audio_url, proxy_url, proxy_username, proxy_password, pipeline_type, model_id, download_method, start_time, end_time, verbose],
653
- outputs=[metrics_output, transcription_output, transcription_file]
654
  )
655
-
656
- # Note: For examples, users should use local audio files or upload their own files
657
- # Examples with specific paths may not work for all users
658
-
659
- gr.Markdown(f"""
660
- ### Usage Examples:
661
- 1. **Upload Audio**: Click the "Upload or Record Audio" button to select your audio file
662
- 2. **Select Pipeline Type**: Choose from available pipelines:
663
- - **Fun-ASR-Nano** (default) - Large language model based ASR model
664
- - **SenseVoice** - CTC-based based ASR model with VAD
665
- 3. **Local Testing**: For development, you can use local paths as shown above
666
-
667
- Supported languages:
668
- - Fun-ASR-Nano: more than 50 languages and Chinese dialects.
669
- - SenseVoiceSmall:Chinese (zh), English (en), Cantonese (yue), Japanese (ja), Korean (ko).
670
  """)
671
 
672
- iface.queue().launch(share=False, debug=True)
 
1
  import os
2
  import spaces
 
 
 
 
3
 
4
+ REPO_TYPE = "hf"
 
 
 
5
 
6
+ from huggingface_hub import snapshot_download
7
 
 
 
8
  MODEL_CACHE_DIR = "./models"
9
  FUN_ASR_NANO_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "Fun-ASR-Nano")
10
  SENSE_VOICE_SMALL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "SenseVoiceSmall")
11
  VAD_MODEL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "fsmn-vad")
12
 
 
13
  os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
14
 
15
+ FUN_ASR_NANO_REPO_ID = "FunAudioLLM/Fun-ASR-Nano-2512"
16
+ SENSE_VOICE_SMALL_REPO_ID = "FunAudioLLM/SenseVoiceSmall"
17
+ VAD_MODEL_REPO_ID = "funasr/fsmn-vad"
 
 
 
18
 
 
 
 
 
 
 
 
 
19
 
 
20
  def download_model_if_not_exists(repo_id, local_path, model_name):
 
21
  if not os.path.exists(local_path):
22
+ print(f"Downloading {model_name} to {local_path} ...")
23
+ snapshot_download(repo_id=repo_id, local_dir=local_path, ignore_patterns=["*.onnx"])
24
+ print(f"{model_name} downloaded.")
 
 
 
 
25
  else:
26
+ print(f"{model_name} found locally, skipping download.")
27
+
28
 
 
29
  download_model_if_not_exists(FUN_ASR_NANO_REPO_ID, FUN_ASR_NANO_LOCAL_PATH, "Fun-ASR-Nano")
30
  download_model_if_not_exists(SENSE_VOICE_SMALL_REPO_ID, SENSE_VOICE_SMALL_LOCAL_PATH, "SenseVoiceSmall")
31
  download_model_if_not_exists(VAD_MODEL_REPO_ID, VAD_MODEL_LOCAL_PATH, "VAD Model")
32
 
 
 
 
33
  import gradio as gr
34
  import time
 
 
35
  import tempfile
 
 
 
 
36
  import logging
37
  import torch
 
38
  from funasr import AutoModel
39
  from funasr.utils.postprocess_utils import rich_transcription_postprocess
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
42
 
43
+ # Lazy model loading - models are loaded on first use inside @spaces.GPU
44
+ loaded_models = {}
45
 
46
 
47
+ def get_or_load_model(pipeline_type):
48
+ if pipeline_type in loaded_models:
49
+ return loaded_models[pipeline_type]
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  if pipeline_type == "fun-asr-nano":
52
+ model = AutoModel(
53
+ model=FUN_ASR_NANO_LOCAL_PATH,
54
+ trust_remote_code=True,
55
+ vad_model=VAD_MODEL_LOCAL_PATH,
56
+ vad_kwargs={"max_single_segment_time": 30000},
57
+ device="cuda",
58
+ disable_update=True,
59
+ hub="hf",
60
+ )
61
  elif pipeline_type == "sensevoice":
62
+ model = AutoModel(
63
+ model=SENSE_VOICE_SMALL_LOCAL_PATH,
64
+ trust_remote_code=False,
65
+ vad_model=VAD_MODEL_LOCAL_PATH,
66
+ vad_kwargs={"max_single_segment_time": 30000},
67
+ device="cuda",
68
+ disable_update=True,
69
+ hub="hf",
70
+ )
71
  else:
72
+ raise ValueError(f"Unknown pipeline type: {pipeline_type}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
+ loaded_models[pipeline_type] = model
75
+ return model
76
 
 
 
77
 
78
+ @spaces.GPU(duration=120)
79
+ def transcribe_audio(audio_input, audio_url, pipeline_type, start_time=None, end_time=None):
80
+ try:
81
+ # Determine audio source
82
  audio_path = None
83
  is_temp_file = False
84
 
85
  if audio_input is not None and len(audio_input) > 0:
 
86
  audio_path = audio_input
 
87
  elif audio_url is not None and len(audio_url.strip()) > 0:
88
+ import requests as req
89
+ response = req.get(audio_url, stream=True, timeout=30)
90
+ if response.status_code == 200:
91
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
92
+ for chunk in response.iter_content(chunk_size=8192):
93
+ f.write(chunk)
94
+ audio_path = f.name
95
+ is_temp_file = True
96
  else:
97
+ return f"Failed to download audio: HTTP {response.status_code}", "", None
 
 
98
  else:
99
+ return "No audio provided. Upload a file, record, or enter a URL.", "", None
 
 
 
 
 
 
 
100
 
101
+ # Trim if needed
102
  if start_time is not None or end_time is not None:
103
+ from pydub import AudioSegment
104
+ audio = AudioSegment.from_file(audio_path)
105
+ duration = len(audio) / 1000
106
+ s = max(0, float(start_time)) if start_time else 0
107
+ e = min(duration, float(end_time)) if end_time else duration
108
+ trimmed = audio[int(s * 1000):int(e * 1000)]
109
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
110
+ trimmed.export(tmp.name, format="wav")
111
+ audio_path = tmp.name
112
+ is_temp_file = True
113
+
114
+ # Load model (lazy, inside GPU context)
115
+ model = get_or_load_model(pipeline_type)
116
+
117
+ # Transcribe
118
+ t0 = time.time()
119
  if pipeline_type == "fun-asr-nano":
120
+ res = model.generate(input=[audio_path], use_itn=True, batch_size=1)
 
 
 
 
121
  else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  res = model.generate(
123
+ input=audio_path, cache={}, language="auto",
124
+ use_itn=True, batch_size_s=60, merge_vad=True, merge_length_s=15,
 
 
 
 
 
125
  )
126
 
127
  transcription = rich_transcription_postprocess(res[0]["text"])
128
+ elapsed = time.time() - t0
129
 
130
+ metrics = f"Transcription time: {elapsed:.2f}s\nPipeline: {pipeline_type}\nDevice: cuda"
 
 
131
 
132
+ # Save transcription file
133
+ txt_file = tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8")
134
+ txt_file.write(transcription)
135
+ txt_file.close()
136
 
137
+ return metrics, transcription, txt_file.name
 
 
 
 
 
138
 
139
  except Exception as e:
140
+ logging.error(f"Transcription error: {e}")
141
+ return f"Error: {str(e)}", "", None
 
 
142
  finally:
143
+ if is_temp_file and audio_path and os.path.exists(audio_path):
 
144
  os.remove(audio_path)
 
145
 
146
+
147
+ with gr.Blocks(title="Fun-ASR-Nano | GPU Demo") as demo:
148
+ gr.Markdown("""
149
+ # Fun-ASR-Nano: LLM-Powered Speech Recognition (GPU)
150
+
151
+ End-to-end ASR model trained on tens of millions of hours, supporting **31 languages** including Chinese dialects.
152
+
153
+ - **GitHub**: [Fun-ASR](https://github.com/FunAudioLLM/Fun-ASR) | [FunASR Toolkit](https://github.com/modelscope/FunASR)
154
+ - **Model**: [Fun-ASR-Nano-2512](https://huggingface.co/FunAudioLLM/Fun-ASR-Nano-2512)
155
+ """)
156
+
157
  with gr.Row():
158
  audio_input = gr.Audio(label="Upload or Record Audio", sources=["upload", "microphone"], type="filepath")
159
+ audio_url = gr.Textbox(label="Or Enter Audio URL", placeholder="https://example.com/audio.wav")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
  with gr.Row():
162
+ pipeline_type = gr.Dropdown(
163
+ choices=["fun-asr-nano", "sensevoice"],
164
+ label="Model",
165
+ value="fun-asr-nano"
166
+ )
167
+ start_time = gr.Number(label="Start Time (s)", value=None, minimum=0)
168
+ end_time = gr.Number(label="End Time (s)", value=None, minimum=0)
169
+
170
+ transcribe_btn = gr.Button("Transcribe", variant="primary")
171
+
172
+ with gr.Row():
173
+ metrics_output = gr.Textbox(label="Metrics", lines=4)
174
  transcription_output = gr.Textbox(label="Transcription", lines=10)
175
+ transcription_file = gr.File(label="Download")
176
+
177
+ transcribe_btn.click(
178
+ transcribe_audio,
179
+ inputs=[audio_input, audio_url, pipeline_type, start_time, end_time],
180
+ outputs=[metrics_output, transcription_output, transcription_file],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  )
182
+
183
+ gr.Markdown("""
184
+ ### Supported Languages
185
+ - **Fun-ASR-Nano**: 31 languages + Chinese dialects (Cantonese, Sichuan, Shanghai, Minnan, etc.)
186
+ - **SenseVoice**: Chinese, English, Cantonese, Japanese, Korean
 
 
 
 
 
 
 
 
 
 
187
  """)
188
 
189
+ demo.queue().launch()
requirements.txt CHANGED
@@ -1,12 +1,6 @@
1
- numpy
2
- requests
3
- ffmpeg-python
4
- pydub
5
  torch
6
- transformers
7
- funasr==1.2.9
8
  torchaudio
9
- modelscope
10
  huggingface_hub
11
- pydantic>=2.12.4
12
- dotenv
 
1
+ funasr
 
 
 
2
  torch
 
 
3
  torchaudio
 
4
  huggingface_hub
5
+ pydub
6
+ requests