Fun-ASR-Nano-GPU-Debug

Running on Zero

App Files Files Community

FFomy committed on Dec 17, 2025

Commit

399aaa2

verified ·

1 Parent(s): 3c53f92

final try

Browse files

Files changed (1) hide show

app.py +45 -45

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import os
 import spaces
-# only debug for hf now
 REPO_TYPE = "hf"
 if REPO_TYPE not in ["hf", "ms"]:
     raise ValueError("REPO_TYPE must be either 'hf' for Hugging Face or 'ms' for ModelScope.")
@@ -13,48 +13,40 @@ else:
 # 1. 定义本地路径和远程仓库ID
-MODEL_CACHE_DIR = "./models"
-FUN_ASR_NANO_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "Fun-ASR-Nano")
-SENSE_VOICE_SMALL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "SenseVoiceSmall")
-VAD_MODEL_LOCAL_PATH = os.path.join(MODEL_CACHE_DIR, "fsmn-vad")
-# 创建模型缓存目录
-os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
-# 设置ModelScope环境变量以使用本地缓存
-os.environ['MODELSCOPE_CACHE'] = MODEL_CACHE_DIR
-# 禁用远程下载，强制使用本地模型（可选，如果想要确保只使用本地模型）
-# os.environ['MODELSCOPE_DISABLE_REMOTE'] = '1'
-print(f"ModelScope缓存目录设置为: {MODEL_CACHE_DIR}")
 if REPO_TYPE == "ms":
     FUN_ASR_NANO_REPO_ID = "FunAudioLLM/Fun-ASR-Nano-2512"
     SENSE_VOICE_SMALL_REPO_ID = "iic/SenseVoiceSmall"
-    VAD_MODEL_REPO_ID = "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
 else:
     FUN_ASR_NANO_REPO_ID = "FunAudioLLM/Fun-ASR-Nano-2512"
     SENSE_VOICE_SMALL_REPO_ID = "FunAudioLLM/SenseVoiceSmall"
-    VAD_MODEL_REPO_ID = "funasr/fsmn-vad"
 # 2. 检查本地是否存在，不存在则下载
-def download_model_if_not_exists(repo_id, local_path, model_name):
-    """如果本地模型不存在，则下载模型"""
-    if not os.path.exists(local_path):
-        print(f"正在下载模型 {model_name} 到 {local_path} ...")
-        snapshot_download(
-            repo_id=repo_id,
-            local_dir=local_path,
-            ignore_patterns=["*.onnx"], # 如果你不需要onnx文件，可以过滤掉以节省时间和空间
-        )
-        print(f"{model_name} 模型下载完毕！")
-    else:
-        print(f"检测到本地 {model_name} 模型文件，跳过下载。")
-# 下载所有需要的模型
-download_model_if_not_exists(FUN_ASR_NANO_REPO_ID, FUN_ASR_NANO_LOCAL_PATH, "Fun-ASR-Nano")
-download_model_if_not_exists(SENSE_VOICE_SMALL_REPO_ID, SENSE_VOICE_SMALL_LOCAL_PATH, "SenseVoiceSmall")
-download_model_if_not_exists(VAD_MODEL_REPO_ID, VAD_MODEL_LOCAL_PATH, "VAD Model")
@@ -74,13 +66,13 @@ import importlib
 from funasr import AutoModel
 from funasr.utils.postprocess_utils import rich_transcription_postprocess
-# Model configurations for local deployment
 FUN_ASR_NANO_MODEL_PATH_LIST = [
-    FUN_ASR_NANO_LOCAL_PATH, # local path
 ]
 SENSEVOICE_MODEL_PATH_LIST = [
-    SENSE_VOICE_SMALL_LOCAL_PATH, # local path
 ]
 class LogCapture(io.StringIO):
@@ -101,8 +93,8 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
 # Check for CUDA availability
-# device = "cuda:0" if torch.cuda.is_available() else "cpu"
-# logging.info(f"Using device: {device}")
 def download_audio(url, method_choice, proxy_url, proxy_username, proxy_password):
     """
@@ -414,7 +406,7 @@ def get_model_options(pipeline_type):
 # Dictionary to store loaded models
 loaded_models = {}
-@spaces.GPU(duration=40)
 def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_password, pipeline_type, model_id, download_method, start_time=None, end_time=None, verbose=False):
     """
     Transcribes audio from a given source using SenseVoice.
@@ -435,9 +427,6 @@ def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_pa
     Yields:
         Tuple[str, str, str or None]: Metrics and messages, transcription text, path to transcription file.
     """
-    current_device = "cuda:0" if torch.cuda.is_available() else "cpu"
-    device = current_device
-    logging.info(f"Using device: {device}")
     try:
         if verbose:
             logging.getLogger().setLevel(logging.INFO)
@@ -489,6 +478,7 @@ def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_pa
         # Model caching
         model_key = (pipeline_type, model_id)
         if model_key in loaded_models:
             model = loaded_models[model_key]
             logging.info("Loaded model from cache")
@@ -498,9 +488,9 @@ def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_pa
                     model=model_id,
                     trust_remote_code=True,
                     remote_code=f"./Fun-ASR/model.py",
-                    vad_model=VAD_MODEL_LOCAL_PATH,  # Use local VAD model path
                     vad_kwargs={"max_single_segment_time": 30000},
-                    device=device,
                     disable_update=True,
                     hub='ms',
                 )
@@ -508,9 +498,9 @@ def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_pa
                 model = AutoModel(
                     model=model_id,
                     trust_remote_code=False,
-                    vad_model=VAD_MODEL_LOCAL_PATH,  # Use local VAD model path
                     vad_kwargs={"max_single_segment_time": 30000},
-                    device=device,
                     disable_update=True,
                     hub='ms',
                 )
@@ -520,6 +510,14 @@ def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_pa
                 yield verbose_messages + error_msg, "", None
                 return
             loaded_models[model_key] = model
         # Perform the transcription
         start_time_perf = time.time()
@@ -547,6 +545,8 @@ def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_pa
                 merge_vad=True,
                 merge_length_s=15,
             )
         transcription = rich_transcription_postprocess(res[0]["text"])
         end_time_perf = time.time()

 import os
 import spaces
 REPO_TYPE = "hf"
 if REPO_TYPE not in ["hf", "ms"]:
     raise ValueError("REPO_TYPE must be either 'hf' for Hugging Face or 'ms' for ModelScope.")
 # 1. 定义本地路径和远程仓库ID
+FUN_ASR_NANO_LOCAL_PATH = "./Fun-ASR/model"
+SENSE_VOICE_SMALL_LOCAL_PATH = "./Fun-ASR/model/SenseVoiceSmall"
 if REPO_TYPE == "ms":
     FUN_ASR_NANO_REPO_ID = "FunAudioLLM/Fun-ASR-Nano-2512"
     SENSE_VOICE_SMALL_REPO_ID = "iic/SenseVoiceSmall"
 else:
     FUN_ASR_NANO_REPO_ID = "FunAudioLLM/Fun-ASR-Nano-2512"
     SENSE_VOICE_SMALL_REPO_ID = "FunAudioLLM/SenseVoiceSmall"
 # 2. 检查本地是否存在，不存在则下载
+if not os.path.exists(FUN_ASR_NANO_LOCAL_PATH):
+    print(f"正在下载模型 Fun-ASR-Nano 到 {FUN_ASR_NANO_LOCAL_PATH} ...")
+    snapshot_download(
+        repo_id=FUN_ASR_NANO_REPO_ID,
+        local_dir=FUN_ASR_NANO_LOCAL_PATH,
+        ignore_patterns=["*.onnx"], # 如果你不需要onnx文件，可以过滤掉以节省时间和空间
+    )
+    print("模型下载完毕！")
+else:
+    print("检测到本地模型文件，跳过下载。")
+if not os.path.exists(SENSE_VOICE_SMALL_LOCAL_PATH):
+    print(f"正在下载模型 {SENSE_VOICE_SMALL_REPO_ID} 到 {SENSE_VOICE_SMALL_LOCAL_PATH} ...")
+    snapshot_download(
+        repo_id=SENSE_VOICE_SMALL_REPO_ID,
+        local_dir=SENSE_VOICE_SMALL_LOCAL_PATH,
+        ignore_patterns=["*.onnx"], # 如果你不需要onnx文件，可以过滤掉以节省时间和空间
+    )
+    print("模型下载完毕！")
+else:
+    print("检测到本地模型文件，跳过下载。")
 from funasr import AutoModel
 from funasr.utils.postprocess_utils import rich_transcription_postprocess
+# Model configurations for Hugging Face deployment
 FUN_ASR_NANO_MODEL_PATH_LIST = [
+    "Fun-ASR/model", # local path, ms
 ]
 SENSEVOICE_MODEL_PATH_LIST = [
+    "Fun-ASR/model/SenseVoiceSmall", # local path together with this hf space
 ]
 class LogCapture(io.StringIO):
 # Check for CUDA availability
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+logging.info(f"Using device: {device}")
 def download_audio(url, method_choice, proxy_url, proxy_username, proxy_password):
     """
 # Dictionary to store loaded models
 loaded_models = {}
+@spaces.GPU()
 def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_password, pipeline_type, model_id, download_method, start_time=None, end_time=None, verbose=False):
     """
     Transcribes audio from a given source using SenseVoice.
     Yields:
         Tuple[str, str, str or None]: Metrics and messages, transcription text, path to transcription file.
     """
     try:
         if verbose:
             logging.getLogger().setLevel(logging.INFO)
         # Model caching
         model_key = (pipeline_type, model_id)
+        model = None
         if model_key in loaded_models:
             model = loaded_models[model_key]
             logging.info("Loaded model from cache")
                     model=model_id,
                     trust_remote_code=True,
                     remote_code=f"./Fun-ASR/model.py",
+                    vad_model="fsmn-vad",
                     vad_kwargs={"max_single_segment_time": 30000},
+                    device='cpu', # 初始化在cpu，然后推理的时候移到GPU，保证利用好zeroGPU？
                     disable_update=True,
                     hub='ms',
                 )
                 model = AutoModel(
                     model=model_id,
                     trust_remote_code=False,
+                    vad_model="fsmn-vad",
                     vad_kwargs={"max_single_segment_time": 30000},
+                    device='cpu',
                     disable_update=True,
                     hub='ms',
                 )
                 yield verbose_messages + error_msg, "", None
                 return
             loaded_models[model_key] = model
+        try:
+            model.to(device)
+            logging.info(f"Model moved to device: {device}")
+        except Exception as e:
+            logging.error(f"Error moving model to device: {str(e)}")
+            yield verbose_messages + f"Error moving model to device: {str(e)}", "", None
+            return
         # Perform the transcription
         start_time_perf = time.time()
                 merge_vad=True,
                 merge_length_s=15,
             )
+        model.to('cpu')  # Move model back to CPU after inference to free GPU memory
         transcription = rich_transcription_postprocess(res[0]["text"])
         end_time_perf = time.time()