Spaces:
Running on Zero
Running on Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -83,6 +83,33 @@ SENSEVOICE_MODEL_PATH_LIST = [
|
|
| 83 |
SENSE_VOICE_SMALL_LOCAL_PATH, # local path
|
| 84 |
]
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
class LogCapture(io.StringIO):
|
| 87 |
def __init__(self, callback):
|
| 88 |
super().__init__()
|
|
@@ -101,8 +128,8 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
|
|
| 101 |
|
| 102 |
|
| 103 |
# Check for CUDA availability
|
| 104 |
-
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 105 |
-
logging.info(f"Using device: {device}")
|
| 106 |
|
| 107 |
def download_audio(url, method_choice, proxy_url, proxy_username, proxy_password):
|
| 108 |
"""
|
|
@@ -491,51 +518,28 @@ def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_pa
|
|
| 491 |
logging.info("Loaded model from cache")
|
| 492 |
else:
|
| 493 |
if pipeline_type == "fun-asr-nano":
|
| 494 |
-
model =
|
| 495 |
-
|
| 496 |
-
trust_remote_code=True,
|
| 497 |
-
remote_code=f"./Fun-ASR/model.py",
|
| 498 |
-
vad_model=VAD_MODEL_LOCAL_PATH, # Use local VAD model path
|
| 499 |
-
vad_kwargs={"max_single_segment_time": 30000},
|
| 500 |
-
device=device,
|
| 501 |
-
disable_update=True,
|
| 502 |
-
hub='ms',
|
| 503 |
-
)
|
| 504 |
elif pipeline_type == "sensevoice":
|
| 505 |
-
model =
|
| 506 |
-
|
| 507 |
-
trust_remote_code=False,
|
| 508 |
-
vad_model=VAD_MODEL_LOCAL_PATH, # Use local VAD model path
|
| 509 |
-
vad_kwargs={"max_single_segment_time": 30000},
|
| 510 |
-
device=device,
|
| 511 |
-
disable_update=True,
|
| 512 |
-
hub='ms',
|
| 513 |
-
)
|
| 514 |
else:
|
| 515 |
error_msg = "Invalid pipeline type. Only 'sensevoice' is supported."
|
| 516 |
logging.error(error_msg)
|
| 517 |
yield verbose_messages + error_msg, "", None
|
| 518 |
return
|
| 519 |
loaded_models[model_key] = model
|
| 520 |
-
|
| 521 |
-
# move separately?
|
| 522 |
-
model.model.to(device)
|
| 523 |
-
model.vad_model.to(device)
|
| 524 |
# Perform the transcription
|
| 525 |
start_time_perf = time.time()
|
| 526 |
|
| 527 |
if pipeline_type == "fun-asr-nano":
|
| 528 |
-
system_prompt = "You are a helpful assistant."
|
| 529 |
-
user_prompt = f"语音转写:<|startofspeech|>!{audio_path}<|endofspeech|>"
|
| 530 |
-
contents_i = []
|
| 531 |
-
contents_i.append({"role": "system", "content": system_prompt})
|
| 532 |
-
contents_i.append({"role": "user", "content": user_prompt})
|
| 533 |
-
contents_i.append({"role": "assistant", "content": "null"})
|
| 534 |
-
print(audio_path)
|
| 535 |
res = model.generate(
|
| 536 |
input=[audio_path],
|
| 537 |
use_itn=True,
|
| 538 |
batch_size=1,
|
|
|
|
|
|
|
| 539 |
)
|
| 540 |
elif pipeline_type == "sensevoice":
|
| 541 |
res = model.generate(
|
|
@@ -547,9 +551,6 @@ def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_pa
|
|
| 547 |
merge_vad=True,
|
| 548 |
merge_length_s=15,
|
| 549 |
)
|
| 550 |
-
|
| 551 |
-
model.model.to("cpu")
|
| 552 |
-
model.vad_model.to("cpu")
|
| 553 |
|
| 554 |
transcription = rich_transcription_postprocess(res[0]["text"])
|
| 555 |
end_time_perf = time.time()
|
|
|
|
| 83 |
SENSE_VOICE_SMALL_LOCAL_PATH, # local path
|
| 84 |
]
|
| 85 |
|
| 86 |
+
|
| 87 |
+
# Initialize the models once at module load; a GPU is available, so they are placed on it directly.
|
| 88 |
+
|
| 89 |
+
MODEL_FUN_ASR = AutoModel(
|
| 90 |
+
model=FUN_ASR_NANO_LOCAL_PATH,
|
| 91 |
+
trust_remote_code=True,
|
| 92 |
+
remote_code=f"./Fun-ASR/model.py", # Suggestion: this line raises an error if the file is missing from the local models directory; remove it if no modified model code is needed.
|
| 93 |
+
vad_model=VAD_MODEL_LOCAL_PATH,
|
| 94 |
+
vad_kwargs={"max_single_segment_time": 30000},
|
| 95 |
+
device='cuda', # Specify the GPU directly
|
| 96 |
+
disable_update=True,
|
| 97 |
+
hub='ms',
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
# 2. Initialize SenseVoice
|
| 101 |
+
print("Loading SenseVoice...")
|
| 102 |
+
MODEL_SENSE_VOICE = AutoModel(
|
| 103 |
+
model=SENSE_VOICE_SMALL_LOCAL_PATH,
|
| 104 |
+
trust_remote_code=False,
|
| 105 |
+
vad_model=VAD_MODEL_LOCAL_PATH,
|
| 106 |
+
vad_kwargs={"max_single_segment_time": 30000},
|
| 107 |
+
device='cuda', # Specify the GPU directly
|
| 108 |
+
disable_update=True,
|
| 109 |
+
hub='ms',
|
| 110 |
+
)
|
| 111 |
+
print("所有模型全局初始化完成!")
|
| 112 |
+
|
| 113 |
class LogCapture(io.StringIO):
|
| 114 |
def __init__(self, callback):
|
| 115 |
super().__init__()
|
|
|
|
| 128 |
|
| 129 |
|
| 130 |
# Check for CUDA availability
|
| 131 |
+
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 132 |
+
# logging.info(f"Using device: {device}")
|
| 133 |
|
| 134 |
def download_audio(url, method_choice, proxy_url, proxy_username, proxy_password):
|
| 135 |
"""
|
|
|
|
| 518 |
logging.info("Loaded model from cache")
|
| 519 |
else:
|
| 520 |
if pipeline_type == "fun-asr-nano":
|
| 521 |
+
model = MODEL_FUN_ASR
|
| 522 |
+
logging.info("Using pre-initialized Fun-ASR-Nano model")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 523 |
elif pipeline_type == "sensevoice":
|
| 524 |
+
model = MODEL_SENSE_VOICE
|
| 525 |
+
logging.info("Using pre-initialized SenseVoice model")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 526 |
else:
|
| 527 |
error_msg = "Invalid pipeline type. Only 'sensevoice' is supported."
|
| 528 |
logging.error(error_msg)
|
| 529 |
yield verbose_messages + error_msg, "", None
|
| 530 |
return
|
| 531 |
loaded_models[model_key] = model
|
| 532 |
+
|
|
|
|
|
|
|
|
|
|
| 533 |
# Perform the transcription
|
| 534 |
start_time_perf = time.time()
|
| 535 |
|
| 536 |
if pipeline_type == "fun-asr-nano":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 537 |
res = model.generate(
|
| 538 |
input=[audio_path],
|
| 539 |
use_itn=True,
|
| 540 |
batch_size=1,
|
| 541 |
+
merge_vad=True,
|
| 542 |
+
merge_length_s=15,
|
| 543 |
)
|
| 544 |
elif pipeline_type == "sensevoice":
|
| 545 |
res = model.generate(
|
|
|
|
| 551 |
merge_vad=True,
|
| 552 |
merge_length_s=15,
|
| 553 |
)
|
|
|
|
|
|
|
|
|
|
| 554 |
|
| 555 |
transcription = rich_transcription_postprocess(res[0]["text"])
|
| 556 |
end_time_perf = time.time()
|