FFomy committed on
Commit
061cbc3
·
verified ·
1 Parent(s): a483939

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -35
app.py CHANGED
@@ -83,6 +83,33 @@ SENSEVOICE_MODEL_PATH_LIST = [
83
  SENSE_VOICE_SMALL_LOCAL_PATH, # local path
84
  ]
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  class LogCapture(io.StringIO):
87
  def __init__(self, callback):
88
  super().__init__()
@@ -101,8 +128,8 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
101
 
102
 
103
  # Check for CUDA availability
104
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
105
- logging.info(f"Using device: {device}")
106
 
107
  def download_audio(url, method_choice, proxy_url, proxy_username, proxy_password):
108
  """
@@ -491,51 +518,28 @@ def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_pa
491
  logging.info("Loaded model from cache")
492
  else:
493
  if pipeline_type == "fun-asr-nano":
494
- model = AutoModel(
495
- model=model_id,
496
- trust_remote_code=True,
497
- remote_code=f"./Fun-ASR/model.py",
498
- vad_model=VAD_MODEL_LOCAL_PATH, # Use local VAD model path
499
- vad_kwargs={"max_single_segment_time": 30000},
500
- device=device,
501
- disable_update=True,
502
- hub='ms',
503
- )
504
  elif pipeline_type == "sensevoice":
505
- model = AutoModel(
506
- model=model_id,
507
- trust_remote_code=False,
508
- vad_model=VAD_MODEL_LOCAL_PATH, # Use local VAD model path
509
- vad_kwargs={"max_single_segment_time": 30000},
510
- device=device,
511
- disable_update=True,
512
- hub='ms',
513
- )
514
  else:
515
  error_msg = "Invalid pipeline type. Only 'sensevoice' is supported."
516
  logging.error(error_msg)
517
  yield verbose_messages + error_msg, "", None
518
  return
519
  loaded_models[model_key] = model
520
-
521
- # move separately?
522
- model.model.to(device)
523
- model.vad_model.to(device)
524
  # Perform the transcription
525
  start_time_perf = time.time()
526
 
527
  if pipeline_type == "fun-asr-nano":
528
- system_prompt = "You are a helpful assistant."
529
- user_prompt = f"语音转写:<|startofspeech|>!{audio_path}<|endofspeech|>"
530
- contents_i = []
531
- contents_i.append({"role": "system", "content": system_prompt})
532
- contents_i.append({"role": "user", "content": user_prompt})
533
- contents_i.append({"role": "assistant", "content": "null"})
534
- print(audio_path)
535
  res = model.generate(
536
  input=[audio_path],
537
  use_itn=True,
538
  batch_size=1,
 
 
539
  )
540
  elif pipeline_type == "sensevoice":
541
  res = model.generate(
@@ -547,9 +551,6 @@ def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_pa
547
  merge_vad=True,
548
  merge_length_s=15,
549
  )
550
-
551
- model.model.to("cpu")
552
- model.vad_model.to("cpu")
553
 
554
  transcription = rich_transcription_postprocess(res[0]["text"])
555
  end_time_perf = time.time()
 
83
  SENSE_VOICE_SMALL_LOCAL_PATH, # local path
84
  ]
85
 
86
+
87
+ # Initialize both models once at module load; a CUDA GPU is assumed (device='cuda').
88
+
89
+ MODEL_FUN_ASR = AutoModel(
90
+ model=FUN_ASR_NANO_LOCAL_PATH,
91
+ trust_remote_code=True,
92
+ remote_code=f"./Fun-ASR/model.py", # 建议:如果本地models目录下没有这个文件,这行会报错。如果不需要魔改代码,去掉这行。
93
+ vad_model=VAD_MODEL_LOCAL_PATH,
94
+ vad_kwargs={"max_single_segment_time": 30000},
95
+ device='cuda', # 直接指定 GPU
96
+ disable_update=True,
97
+ hub='ms',
98
+ )
99
+
100
+ # 2. 初始化 SenseVoice
101
+ print("Loading SenseVoice...")
102
+ MODEL_SENSE_VOICE = AutoModel(
103
+ model=SENSE_VOICE_SMALL_LOCAL_PATH,
104
+ trust_remote_code=False,
105
+ vad_model=VAD_MODEL_LOCAL_PATH,
106
+ vad_kwargs={"max_single_segment_time": 30000},
107
+ device='cuda', # 直接指定 GPU
108
+ disable_update=True,
109
+ hub='ms',
110
+ )
111
+ print("所有模型全局初始化完成!")
112
+
113
  class LogCapture(io.StringIO):
114
  def __init__(self, callback):
115
  super().__init__()
 
128
 
129
 
130
  # Check for CUDA availability
131
+ # device = "cuda:0" if torch.cuda.is_available() else "cpu"
132
+ # logging.info(f"Using device: {device}")
133
 
134
  def download_audio(url, method_choice, proxy_url, proxy_username, proxy_password):
135
  """
 
518
  logging.info("Loaded model from cache")
519
  else:
520
  if pipeline_type == "fun-asr-nano":
521
+ model = MODEL_FUN_ASR
522
+ logging.info("Using pre-initialized Fun-ASR-Nano model")
 
 
 
 
 
 
 
 
523
  elif pipeline_type == "sensevoice":
524
+ model = MODEL_SENSE_VOICE
525
+ logging.info("Using pre-initialized SenseVoice model")
 
 
 
 
 
 
 
526
  else:
527
  error_msg = "Invalid pipeline type. Only 'sensevoice' is supported."
528
  logging.error(error_msg)
529
  yield verbose_messages + error_msg, "", None
530
  return
531
  loaded_models[model_key] = model
532
+
 
 
 
533
  # Perform the transcription
534
  start_time_perf = time.time()
535
 
536
  if pipeline_type == "fun-asr-nano":
 
 
 
 
 
 
 
537
  res = model.generate(
538
  input=[audio_path],
539
  use_itn=True,
540
  batch_size=1,
541
+ merge_vad=True,
542
+ merge_length_s=15,
543
  )
544
  elif pipeline_type == "sensevoice":
545
  res = model.generate(
 
551
  merge_vad=True,
552
  merge_length_s=15,
553
  )
 
 
 
554
 
555
  transcription = rich_transcription_postprocess(res[0]["text"])
556
  end_time_perf = time.time()