Spaces:

ASLP-lab
/

OSUM-EChat

Sleeping

App Files Files Community

xlgeng committed on Aug 21, 2025

Commit

6f7f7cd

1 Parent(s): 2b52af0

开始部署

Browse files

Files changed (2) hide show

app.py +163 -147
wenet/osum_echat/init_llmasr.py +1 -1

app.py CHANGED Viewed

@@ -50,12 +50,15 @@ cosyvoice_model_path="./CosyVoice-300M-25Hz"
 device = torch.device("cuda")
 print("开始加载模型 A...")
 model_a, tokenizer_a = load_model_and_tokenizer(CHECKPOINT_PATH_A, CONFIG_PATH)
 print("\n开始加载模型 B...")
 if CHECKPOINT_PATH_B is not None:
     model_b, tokenizer_b = load_model_and_tokenizer(CHECKPOINT_PATH_B, CONFIG_PATH)
 else:
     model_b, tokenizer_b = None, None
 loaded_models = {
     NAME_A: {"model": model_a, "tokenizer": tokenizer_a},
     NAME_B: {"model": model_b, "tokenizer": tokenizer_b},
@@ -65,6 +68,7 @@ loaded_models = {
 print("\n所有模型已加载完毕。")
 cosyvoice = CosyVoice(cosyvoice_model_path)
 # 将图片转换为 Base64
 with open("./tts/assert/实验室.png", "rb") as image_file:
@@ -109,144 +113,173 @@ for item in prompt_audio_choices:
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')
-@spaces.GPU
-def do_s2t(model, input_wav_path, input_prompt, profile=False):  # 增加 model 参数
-    model.eval().cuda()
-    feat, feat_lens = get_feat_from_wav_path(input_wav_path)
-    print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
-    if is_npu: torch_npu.npu.synchronize()
-    start_time = time.time()
-    res_text = model.generate(wavs=feat, wavs_len=feat_lens, prompt=input_prompt, cache_implementation="static")[0]
-    if is_npu: torch_npu.npu.synchronize()
-    end_time = time.time()
-    print(f"S2T 推理消耗时间: {end_time - start_time:.2f} 秒")
-    return res_text
-@spaces.GPU
-def do_s2t4chat(model, input_wav_path, input_prompt, profile=False):  # 增加 model 参数
-    model.eval().cuda()
-    feat, feat_lens = get_feat_from_wav_path(input_wav_path)
-    print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
-    if is_npu: torch_npu.npu.synchronize()
-    start_time = time.time()
-    res_text = model.generate4chat(wavs=feat, wavs_len=feat_lens, cache_implementation="static")[0]
-    if is_npu: torch_npu.npu.synchronize()
-    end_time = time.time()
-    print(f"S2T4Chat 推理消耗时间: {end_time - start_time:.2f} 秒")
-    return res_text
-@spaces.GPU
-def do_s2t4chat_think(model, input_wav_path, input_prompt, profile=False):  # 增加 model 参数
-    model.eval().cuda()
-    feat, feat_lens = get_feat_from_wav_path(input_wav_path)
-    print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
-    if is_npu: torch_npu.npu.synchronize()
-    start_time = time.time()
-    res_text = model.generate4chat_think(wavs=feat, wavs_len=feat_lens, cache_implementation="static")[0]
-    if is_npu: torch_npu.npu.synchronize()
-    end_time = time.time()
-    print(f"S2T4Chat 推理消耗时间: {end_time - start_time:.2f} 秒")
-    return res_text
-@spaces.GPU
-def do_t2s(model, input_prompt, text_for_tts, profile=False):  # 增加 model 参数
-    model.eval().cuda()
-    if is_npu: torch_npu.npu.synchronize()
-    start_time = time.time()
-    res_tensor = model.generate_tts(device=device, text=text_for_tts, )[0]
-    res_token_list = res_tensor.tolist()
-    res_text = res_token_list[:-1]
-    if is_npu: torch_npu.npu.synchronize()
-    end_time = time.time()
-    print(f"T2S 推理消耗时间: {end_time - start_time:.2f} 秒")
-    return res_text
 @spaces.GPU
-def do_t2t(model, question_txt, profile=False):  # 增加 model 参数
-    model.eval().cuda()
-    if is_npu: torch_npu.npu.synchronize()
-    start_time = time.time()
-    print(f'开始t2t推理, question_txt: {question_txt}')
-    res_text = model.generate_text2text(device=device, text=question_txt)[0]
-    if is_npu: torch_npu.npu.synchronize()
-    end_time = time.time()
-    print(f"T2T 推理消耗时间: {end_time - start_time:.2f} 秒")
-    return res_text
-@spaces.GPU
-def do_s2s(model, input_wav_path, input_prompt, profile=False):  # 增加 model 参数
-    model.eval().cuda()
-    feat, feat_lens = get_feat_from_wav_path(input_wav_path)
-    print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
-    if is_npu: torch_npu.npu.synchronize()
-    start_time = time.time()
-    output_text, text_res, speech_res = model.generate_s2s_no_stream_with_repetition_penalty(wavs=feat, wavs_len=feat_lens,)
-    if is_npu: torch_npu.npu.synchronize()
-    end_time = time.time()
-    print(f"S2S 推理消耗时间: {end_time - start_time:.2f} 秒")
-    return f'{output_text[0]}|{str(speech_res[0].tolist()[1:])}'
-@spaces.GPU
-def do_s2s_think(model, input_wav_path, input_prompt, profile=False):  # 增加 model 参数
-    model.eval().cuda()
-    feat, feat_lens = get_feat_from_wav_path(input_wav_path)
-    print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
-    if is_npu: torch_npu.npu.synchronize()
     start_time = time.time()
-    output_text, text_res, speech_res = model.generate_s2s_no_stream_think_with_repetition_penalty(wavs=feat, wavs_len=feat_lens,)
-    if is_npu: torch_npu.npu.synchronize()
-    end_time = time.time()
-    print(f"S2S 推理消耗时间: {end_time - start_time:.2f} 秒")
-    return f'{output_text[0]}|{str(speech_res[0].tolist()[1:])}'
-@spaces.GPU
-def get_wav_from_token_list(input_list, prompt_speech):
-    cosyvoice.eval().cuda()
-    time_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-    wav_path = f"./tmp/{time_str}.wav"
-    return token_list2wav(input_list, prompt_speech, wav_path, cosyvoice)
-def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt):  # 增加 model 和 tokenizer 参数
-    print(f"wav_path: {input_wav_path}, prompt:{input_prompt}")
-    if input_wav_path is None and not input_prompt.endswith(("_TTS", "_T2T")):
-        print("音频信息未输入，且不是T2S或T2T任务")
-        return "错误：需要音频输入"
-    if input_prompt.endswith("_TTS"):
-        text_for_tts = input_prompt.replace("_TTS", "")
-        prompt = "恳请将如下文本转换为其对应的语音token，力求生成最为流畅、自然的语音。"
-        res_text = do_t2s(model, prompt, text_for_tts)
-    elif input_prompt.endswith("_self_prompt"):
-        prompt = input_prompt.replace("_self_prompt", "")
-        res_text = do_s2t(model, input_wav_path, prompt)
-    elif input_prompt.endswith("_T2T"):
-        question_txt = input_prompt.replace("_T2T", "")
-        res_text = do_t2t(model, question_txt)
-    elif input_prompt in ["识别语音内容，并以文字方式作出回答。",
-                          "请推断对这段语音回答时的情感，标注情感类型，撰写流畅自然的聊天回复，并生成情感语音token。",
-                          "s2s_no_think"]:
-        res_text = do_s2s(model, input_wav_path, input_prompt)
-    elif input_prompt == "THINK":
-        res_text = do_s2s_think(model, input_wav_path, input_prompt)
-    elif input_prompt == "s2t_no_think":
-        res_text = do_s2t4chat(model, input_wav_path, input_prompt)
-    elif input_prompt == "s2t_think":
-        res_text = do_s2t4chat_think(model, input_wav_path, input_prompt)
-    else:
-        res_text = do_s2t(model, input_wav_path, input_prompt)
-        res_text = res_text.replace("<youth>", "<adult>").replace("<middle_age>", "<adult>").replace("<middle>",
-                                                                                                     "<adult>")
-    print("识别结果为：", res_text)
-    return res_text
-def do_decode(model, tokenizer, input_wav_path, input_prompt):  # 增加 model 和 tokenizer 参数
-    print(f'使用模型进行推理: input_wav_path={input_wav_path}, input_prompt={input_prompt}')
-    output_res = true_decode_fuc(model, tokenizer, input_wav_path, input_prompt)
-    return output_res
 def save_to_jsonl(if_correct, wav, prompt, res):
@@ -351,24 +384,7 @@ with gr.Blocks() as demo:
         else:
             input_prompt = TASK_PROMPT_MAPPING.get(task_choice, "未知任务类型")
-        # 3. 调用重构后的推理函数，传入选择的模型
-        output_res = do_decode(model_to_use, tokenizer_to_use, input_wav_path, input_prompt)
-        # 4. 处理输出 (逻辑不变)
-        wav_path_output = input_wav_path
-        if task_choice == "TTS任务" or "empathetic_s2s_dialogue" in task_choice:
-            if isinstance(output_res, list):  # TTS case
-                wav_path_output = get_wav_from_token_list(output_res, prompt_speech_data)
-                output_res = "生成的token: " + str(output_res)
-            elif isinstance(output_res, str) and "|" in output_res:  # S2S case
-                try:
-                    text_res, token_list_str = output_res.split("|")
-                    token_list = json.loads(token_list_str)
-                    wav_path_output = get_wav_from_token_list(token_list, prompt_speech_data)
-                    output_res = text_res
-                except (ValueError, json.JSONDecodeError) as e:
-                    print(f"处理S2S输出时出错: {e}")
-                    output_res = f"错误：无法解析模型输出 - {output_res}"
         return output_res, wav_path_output

 device = torch.device("cuda")
 print("开始加载模型 A...")
 model_a, tokenizer_a = load_model_and_tokenizer(CHECKPOINT_PATH_A, CONFIG_PATH)
+model_a.eval().cuda()
 print("\n开始加载模型 B...")
 if CHECKPOINT_PATH_B is not None:
     model_b, tokenizer_b = load_model_and_tokenizer(CHECKPOINT_PATH_B, CONFIG_PATH)
+    model_b.eval().cuda()
 else:
     model_b, tokenizer_b = None, None
 loaded_models = {
     NAME_A: {"model": model_a, "tokenizer": tokenizer_a},
     NAME_B: {"model": model_b, "tokenizer": tokenizer_b},
 print("\n所有模型已加载完毕。")
 cosyvoice = CosyVoice(cosyvoice_model_path)
+cosyvoice.eval().cuda()
 # 将图片转换为 Base64
 with open("./tts/assert/实验室.png", "rb") as image_file:
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')
+import time
+import datetime
+import torch
+from common_utils.utils4infer import get_feat_from_wav_path, token_list2wav
 @spaces.GPU
def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice, prompt_speech_data):
    """Run one inference request and return ``(output_res, wav_path_output)``.

    The task is selected from ``input_prompt``:

    * ends with ``"_TTS"``          -> text-to-speech-token generation
    * ends with ``"_self_prompt"``  -> speech-to-text with a user-written prompt
    * ends with ``"_T2T"``          -> text-to-text
    * one of the S2S prompts / ``"s2s_no_think"`` -> speech-to-speech
    * ``"THINK"``                   -> speech-to-speech with thinking
    * ``"s2t_no_think"`` / ``"s2t_think"`` -> speech-to-text chat
    * anything else                 -> speech-to-text using ``input_prompt``

    Args:
        model: Object exposing the ``generate*`` methods called by the helpers.
        tokenizer: Not used by this function; kept so the signature is stable.
        input_wav_path: Path of the input audio, or ``None`` for text-only tasks.
        input_prompt: Prompt or task marker string (see the list above).
        task_choice: UI task name. Speech tokens are only turned into a wav
            file when it equals ``"TTS任务"`` or contains
            ``"empathetic_s2s_dialogue"``.
        prompt_speech_data: Forwarded unchanged to ``token_list2wav``.

    Returns:
        Always a 2-tuple. The first item is the text (or an error message
        starting with ``"错误："``); the second is the path of the synthesized
        wav when one was produced, otherwise ``input_wav_path``.
    """
    print(f"wav_path: {input_wav_path}, prompt:{input_prompt}")
    # Unless audio is synthesized below, the input audio path is echoed back.
    wav_path_output = input_wav_path

    # Every task except TTS and T2T needs an audio input.
    if input_wav_path is None and not input_prompt.endswith(("_TTS", "_T2T")):
        print("音频信息未输入，且不是T2S或T2T任务")
        # The caller unpacks two values, so errors must be tuples as well.
        return "错误：需要音频输入", wav_path_output

    start_time = time.time()
    try:
        output_res = _run_inference(model, input_wav_path, input_prompt, start_time)
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing,
        # but keep the traceback in the log for debugging.
        logging.exception("推理过程出错: %s", e)
        return f"错误：{str(e)}", wav_path_output

    # task_choice may be None if nothing was selected; treat it as "no match".
    wants_audio = task_choice == "TTS任务" or "empathetic_s2s_dialogue" in (task_choice or "")
    if wants_audio:
        if isinstance(output_res, list):  # TTS case: a plain token list
            wav_path_output = _tokens_to_wav(output_res, prompt_speech_data)
            output_res = "生成的token: " + str(output_res)
        elif isinstance(output_res, str) and "|" in output_res:  # S2S case
            try:
                # The token list is everything after the LAST "|": the reply
                # text itself may legitimately contain "|".
                text_res, token_list_str = output_res.rsplit("|", 1)
                token_list = json.loads(token_list_str)
                wav_path_output = _tokens_to_wav(token_list, prompt_speech_data)
                output_res = text_res
            except (ValueError, json.JSONDecodeError) as e:
                print(f"处理S2S输出时出错: {e}")
                output_res = f"错误：无法解析模型输出 - {output_res}"
    return output_res, wav_path_output


# Prompts that select the speech-to-speech (no thinking) path.
_S2S_NO_THINK_PROMPTS = (
    "识别语音内容，并以文字方式作出回答。",
    "请推断对这段语音回答时的情感，标注情感类型，撰写流畅自然的聊天回复，并生成情感语音token。",
    "s2s_no_think",
)


def _strip_suffix(text, suffix):
    """Return ``text`` without its trailing ``suffix`` (only the final occurrence)."""
    return text[:-len(suffix)] if suffix and text.endswith(suffix) else text


def _log_elapsed(label, start_time):
    """Print the wall-clock time elapsed since ``start_time`` for task ``label``."""
    print(f"{label} 推理消耗时间: {time.time() - start_time:.2f} 秒")


def _generate_from_wav(generate, input_wav_path, **kwargs):
    """Extract features from ``input_wav_path`` and call ``generate`` on them."""
    feat, feat_lens = get_feat_from_wav_path(input_wav_path)
    print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
    if is_npu: torch_npu.npu.synchronize()
    result = generate(wavs=feat, wavs_len=feat_lens, **kwargs)
    if is_npu: torch_npu.npu.synchronize()
    return result


def _format_s2s(s2s_result):
    """Serialize an S2S result as ``"<text>|<token list>"``."""
    output_text, _text_res, speech_res = s2s_result
    # The first speech token is dropped (presumably a start marker - TODO confirm).
    return f'{output_text[0]}|{str(speech_res[0].tolist()[1:])}'


def _run_inference(model, input_wav_path, input_prompt, start_time):
    """Dispatch on ``input_prompt`` and return the raw task result.

    Returns a token list for TTS, a ``"text|[tokens]"`` string for S2S tasks,
    and whatever the model's method returns at index 0 for the other tasks.
    """
    # 1. TTS
    if input_prompt.endswith("_TTS"):
        text_for_tts = _strip_suffix(input_prompt, "_TTS")
        res_tensor = model.generate_tts(device=device, text=text_for_tts)[0]
        # The last token is dropped (presumably an end marker - TODO confirm).
        res_tokens = res_tensor.tolist()[:-1]
        _log_elapsed("T2S", start_time)
        return res_tokens

    # 2. Speech-to-text with a user-supplied prompt
    if input_prompt.endswith("_self_prompt"):
        prompt = _strip_suffix(input_prompt, "_self_prompt")
        res_text = _generate_from_wav(
            model.generate, input_wav_path,
            prompt=prompt, cache_implementation="static",
        )[0]
        _log_elapsed("S2T", start_time)
        return res_text

    # 3. Text-to-text
    if input_prompt.endswith("_T2T"):
        question_txt = _strip_suffix(input_prompt, "_T2T")
        print(f'开始t2t推理, question_txt: {question_txt}')
        if is_npu: torch_npu.npu.synchronize()
        res_text = model.generate_text2text(device=device, text=question_txt)[0]
        if is_npu: torch_npu.npu.synchronize()
        _log_elapsed("T2T", start_time)
        return res_text

    # 4. Speech-to-speech, no thinking
    if input_prompt in _S2S_NO_THINK_PROMPTS:
        res_text = _format_s2s(_generate_from_wav(
            model.generate_s2s_no_stream_with_repetition_penalty, input_wav_path))
        _log_elapsed("S2S", start_time)
        return res_text

    # 5. Speech-to-speech, with thinking
    if input_prompt == "THINK":
        res_text = _format_s2s(_generate_from_wav(
            model.generate_s2s_no_stream_think_with_repetition_penalty, input_wav_path))
        _log_elapsed("S2S", start_time)
        return res_text

    # 6. Speech-to-text chat, no thinking
    if input_prompt == "s2t_no_think":
        res_text = _generate_from_wav(
            model.generate4chat, input_wav_path, cache_implementation="static")[0]
        _log_elapsed("S2T4Chat", start_time)
        return res_text

    # 7. Speech-to-text chat, with thinking
    if input_prompt == "s2t_think":
        res_text = _generate_from_wav(
            model.generate4chat_think, input_wav_path, cache_implementation="static")[0]
        _log_elapsed("S2T4Chat", start_time)
        return res_text

    # 8. Default: speech-to-text with the prompt as given
    res_text = _generate_from_wav(
        model.generate, input_wav_path,
        prompt=input_prompt, cache_implementation="static",
    )[0]
    _log_elapsed("S2T", start_time)
    # Collapse the finer-grained age tags into a single <adult> tag.
    for age_tag in ("<youth>", "<middle_age>", "<middle>"):
        res_text = res_text.replace(age_tag, "<adult>")
    return res_text


def _tokens_to_wav(token_list, prompt_speech_data):
    """Write ``token_list`` to a timestamped wav under ``./tmp`` via ``token_list2wav``."""
    cosyvoice.eval()
    time_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    wav_path = f"./tmp/{time_str}.wav"
    return token_list2wav(token_list, prompt_speech_data, wav_path, cosyvoice)
 def save_to_jsonl(if_correct, wav, prompt, res):
         else:
             input_prompt = TASK_PROMPT_MAPPING.get(task_choice, "未知任务类型")
+        output_res, wav_path_output = true_decode_fuc(model_to_use, tokenizer_to_use, input_wav_path, input_prompt,task_choice ,prompt_speech_data)
         return output_res, wav_path_output

wenet/osum_echat/init_llmasr.py CHANGED Viewed

@@ -121,7 +121,7 @@ def init_llmasr(args, configs, is_inference=False):
         elif fire_module == "link_and_lora":
             if k.startswith("encoder"):
                 p.requires_grad = False
-        logging.info(f"{k} {p.requires_grad} {p.shape} {p.dtype}")
     logging.info('OSUM-EChat：冻结完毕')
     logging.info(configs)

         elif fire_module == "link_and_lora":
             if k.startswith("encoder"):
                 p.requires_grad = False
+        # logging.info(f"{k} {p.requires_grad} {p.shape} {p.dtype}")
     logging.info('OSUM-EChat：冻结完毕')
     logging.info(configs)