Fun-ASR-Nano-GPU-Debug

Running on Zero

App Files Files Community

FFomy committed on Dec 18, 2025

Commit

061cbc3

verified ·

1 Parent(s): a483939

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -35

app.py CHANGED Viewed

@@ -83,6 +83,33 @@ SENSEVOICE_MODEL_PATH_LIST = [
     SENSE_VOICE_SMALL_LOCAL_PATH, # local path
 ]
 class LogCapture(io.StringIO):
     def __init__(self, callback):
         super().__init__()
@@ -101,8 +128,8 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
 # Check for CUDA availability
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-logging.info(f"Using device: {device}")
 def download_audio(url, method_choice, proxy_url, proxy_username, proxy_password):
     """
@@ -491,51 +518,28 @@ def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_pa
             logging.info("Loaded model from cache")
         else:
             if pipeline_type == "fun-asr-nano":
-                model = AutoModel(
-                    model=model_id,
-                    trust_remote_code=True,
-                    remote_code=f"./Fun-ASR/model.py",
-                    vad_model=VAD_MODEL_LOCAL_PATH,  # Use local VAD model path
-                    vad_kwargs={"max_single_segment_time": 30000},
-                    device=device,
-                    disable_update=True,
-                    hub='ms',
-                )
             elif pipeline_type == "sensevoice":
-                model = AutoModel(
-                    model=model_id,
-                    trust_remote_code=False,
-                    vad_model=VAD_MODEL_LOCAL_PATH,  # Use local VAD model path
-                    vad_kwargs={"max_single_segment_time": 30000},
-                    device=device,
-                    disable_update=True,
-                    hub='ms',
-                )
             else:
                 error_msg = "Invalid pipeline type. Only 'sensevoice' is supported."
                 logging.error(error_msg)
                 yield verbose_messages + error_msg, "", None
                 return
             loaded_models[model_key] = model
-        # move seperately?
-        model.model.to(device)
-        model.vad_model.to(device)
         # Perform the transcription
         start_time_perf = time.time()
         if pipeline_type == "fun-asr-nano":
-            system_prompt = "You are a helpful assistant."
-            user_prompt = f"语音转写：<|startofspeech|>!{audio_path}<|endofspeech|>"
-            contents_i = []
-            contents_i.append({"role": "system", "content": system_prompt})
-            contents_i.append({"role": "user", "content": user_prompt})
-            contents_i.append({"role": "assistant", "content": "null"})
-            print(audio_path)
             res = model.generate(
                 input=[audio_path],
                 use_itn=True,
                 batch_size=1,
             )
         elif pipeline_type == "sensevoice":
             res = model.generate(
@@ -547,9 +551,6 @@ def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_pa
                 merge_vad=True,
                 merge_length_s=15,
             )
-        model.model.to("cpu")
-        model.vad_model.to("cpu")
         transcription = rich_transcription_postprocess(res[0]["text"])
         end_time_perf = time.time()

     SENSE_VOICE_SMALL_LOCAL_PATH, # local path
 ]
+# Initialize the models up front like this, since a GPU is available.
+MODEL_FUN_ASR = AutoModel(
+    model=FUN_ASR_NANO_LOCAL_PATH,
+    trust_remote_code=True,
+    remote_code=f"./Fun-ASR/model.py", # 建议：如果本地models目录下没有这个文件，这行会报错。如果不需要魔改代码，去掉这行。
+    vad_model=VAD_MODEL_LOCAL_PATH,
+    vad_kwargs={"max_single_segment_time": 30000},
+    device='cuda', # 直接指定 GPU
+    disable_update=True,
+    hub='ms',
+)
+# 2. 初始化 SenseVoice
+print("Loading SenseVoice...")
+MODEL_SENSE_VOICE = AutoModel(
+    model=SENSE_VOICE_SMALL_LOCAL_PATH,
+    trust_remote_code=False,
+    vad_model=VAD_MODEL_LOCAL_PATH,
+    vad_kwargs={"max_single_segment_time": 30000},
+    device='cuda', # 直接指定 GPU
+    disable_update=True,
+    hub='ms',
+)
+print("所有模型全局初始化完成！")
 class LogCapture(io.StringIO):
     def __init__(self, callback):
         super().__init__()
 # Check for CUDA availability
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# logging.info(f"Using device: {device}")
 def download_audio(url, method_choice, proxy_url, proxy_username, proxy_password):
     """
             logging.info("Loaded model from cache")
         else:
             if pipeline_type == "fun-asr-nano":
+                model = MODEL_FUN_ASR
+                logging.info("Using pre-initialized Fun-ASR-Nano model")
             elif pipeline_type == "sensevoice":
+                model = MODEL_SENSE_VOICE
+                logging.info("Using pre-initialized SenseVoice model")
             else:
                 error_msg = "Invalid pipeline type. Only 'sensevoice' is supported."
                 logging.error(error_msg)
                 yield verbose_messages + error_msg, "", None
                 return
             loaded_models[model_key] = model
         # Perform the transcription
         start_time_perf = time.time()
         if pipeline_type == "fun-asr-nano":
             res = model.generate(
                 input=[audio_path],
                 use_itn=True,
                 batch_size=1,
+                merge_vad=True,
+                merge_length_s=15,
             )
         elif pipeline_type == "sensevoice":
             res = model.generate(
                 merge_vad=True,
                 merge_length_s=15,
             )
         transcription = rich_transcription_postprocess(res[0]["text"])
         end_time_perf = time.time()