add gradio gui code

Files changed (8) hide show

.gitattributes +2 -0
.gitignore +3 -1
main_api_ax650 +3 -0
main_api_axcl_aarch64 +3 -0
run_api_ax650.sh +23 -0
run_api_axcl_aarch64.sh +24 -0
scripts/gradio_demo.py +161 -0
scripts/requirements.txt +1 -0

.gitattributes CHANGED Viewed

@@ -111,3 +111,5 @@ token2wav-axmodels/rand_noise_1_80_300.txt filter=lfs diff=lfs merge=lfs -text
 token2wav-axmodels/speech_window_2x8x480.txt filter=lfs diff=lfs merge=lfs -text
 token2wav-axmodels/hift_p1_50_first.mnn filter=lfs diff=lfs merge=lfs -text
 main_axcl_aarch64 filter=lfs diff=lfs merge=lfs -text

 token2wav-axmodels/speech_window_2x8x480.txt filter=lfs diff=lfs merge=lfs -text
 token2wav-axmodels/hift_p1_50_first.mnn filter=lfs diff=lfs merge=lfs -text
 main_axcl_aarch64 filter=lfs diff=lfs merge=lfs -text
+main_api_ax650 filter=lfs diff=lfs merge=lfs -text
+main_api_axcl_aarch64 filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED Viewed

@@ -1,2 +1,4 @@
 output*.wav
-__pycache__/

 output*.wav
+__pycache__/
+*.crt
+*.key

main_api_ax650 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ed4d3c19fb9c44ebbc565dff37e6049debec2223302c1d334cbf895df5b39ab9
+size 9653912

main_api_axcl_aarch64 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:308b67412ada48aeec9ed0bf9f5af60426db4f5ca6cd9e3e2a2e8ea108e3b735
+size 4901808

run_api_ax650.sh ADDED Viewed

	@@ -0,0 +1,23 @@

+LLM_DIR=CosyVoice-BlankEN-Ax650-prefill_512/
+TOKEN2WAV_DIR=token2wav-axmodels/
+openssl req -newkey rsa:2048 -new -nodes -x509 -days 365 -keyout server.key -out server.crt -subj "/C=CN/ST=Beijing/L=Beijing/O=YourOrg/CN=localhost"
+rm output*.wav
+./main_api_ax650 \
+--template_filename_axmodel "${LLM_DIR}/qwen2_p128_l%d_together.axmodel" \
+--token2wav_axmodel_dir $TOKEN2WAV_DIR \
+--n_timesteps 10 \
+--axmodel_num 24 \
+--bos 0 --eos 0 \
+--filename_tokenizer_model "http://127.0.0.1:12345" \
+--filename_post_axmodel "${LLM_DIR}/qwen2_post.axmodel" \
+--filename_decoder_axmodel "${LLM_DIR}/llm_decoder.axmodel" \
+--filename_tokens_embed "${LLM_DIR}/model.embed_tokens.weight.bfloat16.bin" \
+--filename_llm_embed "${LLM_DIR}/llm.llm_embedding.float16.bin" \
+--filename_speech_embed "${LLM_DIR}/llm.speech_embedding.float16.bin" \
+--continue 0 \
+--prompt_files prompt_files
+chmod 777 output*.wav

run_api_axcl_aarch64.sh ADDED Viewed

	@@ -0,0 +1,24 @@

+LLM_DIR=CosyVoice-BlankEN-Ax650-prefill_512/
+TOKEN2WAV_DIR=token2wav-axmodels/
+openssl req -newkey rsa:2048 -new -nodes -x509 -days 365 -keyout server.key -out server.crt -subj "/C=CN/ST=Beijing/L=Beijing/O=YourOrg/CN=localhost"
+rm output*.wav
+./main_api_axcl_aarch64 \
+--template_filename_axmodel "${LLM_DIR}/qwen2_p128_l%d_together.axmodel" \
+--token2wav_axmodel_dir $TOKEN2WAV_DIR \
+--n_timesteps 10 \
+--axmodel_num 24 \
+--bos 0 --eos 0 \
+--filename_tokenizer_model "http://127.0.0.1:12345" \
+--filename_post_axmodel "${LLM_DIR}/qwen2_post.axmodel" \
+--filename_decoder_axmodel "${LLM_DIR}/llm_decoder.axmodel" \
+--filename_tokens_embed "${LLM_DIR}/model.embed_tokens.weight.bfloat16.bin" \
+--filename_llm_embed "${LLM_DIR}/llm.llm_embedding.float16.bin" \
+--filename_speech_embed "${LLM_DIR}/llm.speech_embedding.float16.bin" \
+--continue 0 \
+--devices "0," \
+--prompt_files prompt_files
+chmod 777 output*.wav

scripts/gradio_demo.py ADDED Viewed

	@@ -0,0 +1,161 @@

+import argparse
+import shutil
+import gradio as gr
+import numpy as np
+import requests
+import time
+import os
+import torch
+from frontend import CosyVoiceFrontEnd
+import torchaudio
+import logging
+logging.basicConfig(level=logging.WARNING)
+import subprocess
+import re
+def get_all_local_ips():
+    result = subprocess.run(['ip', 'a'], capture_output=True, text=True)
+    output = result.stdout
+    # 匹配所有IPv4
+    ips = re.findall(r'inet (\d+\.\d+\.\d+\.\d+)', output)
+    # 过滤掉回环地址
+    real_ips = [ip for ip in ips if not ip.startswith('127.')]
+    return real_ips
+TTS_URL = "http://0.0.0.0:12346/tts"
+GET_URL = "http://0.0.0.0:12346/get"
+TIMESTEPS_URL = "http://0.0.0.0:12346/timesteps"
+PROMPT_FILES_URL = "http://0.0.0.0:12346/prompt_files"
+args = argparse.ArgumentParser()
+args.add_argument('--model_dir', type=str, default="scripts/CosyVoice-BlankEN", help="tokenizer configuration directionary")
+args.add_argument('--wetext_dir', type=str, default="pengzhendong/wetext", help="path to wetext")
+args.add_argument('--sample_rate', type=int, default=24000, help="Sampling rate for prompt audio")
+args = args.parse_args()
+frontend = CosyVoiceFrontEnd(f"{args.model_dir}",
+                                args.wetext_dir,
+                                "frontend-onnx/campplus.onnx",
+                                "frontend-onnx/speech_tokenizer_v2.onnx",
+                                f"{args.model_dir}/spk2info.pt",
+                                "all")
+def update_audio(audio_input_path, audio_text):
+    def load_wav(wav, target_sr):
+        speech, sample_rate = torchaudio.load(wav, backend='soundfile')
+        speech = speech.mean(dim=0, keepdim=True)
+        if sample_rate != target_sr:
+            assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
+            speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
+        return speech
+    output_dir = './output_temp'
+    # clear output_dir
+    if os.path.exists(output_dir):
+        shutil.rmtree(output_dir)
+    os.makedirs(output_dir, exist_ok=True)
+    zero_shot_spk_id = ""
+    prompt_speech_16k = load_wav(audio_input_path, 16000)
+    prompt_text = audio_text
+    print("prompt_text",prompt_text)
+    model_input = frontend.process_prompt( prompt_text, prompt_speech_16k, args.sample_rate, zero_shot_spk_id)
+    print("prompt speech token size:", model_input["flow_prompt_speech_token"].shape)
+    assert model_input["flow_prompt_speech_token"].shape[1] >=75, f"speech_token length should >= 75, bug get {model_input['flow_prompt_speech_token'].shape[1]}"
+    for k, v in model_input.items():
+        if "_len" in k:
+            continue
+        shapes = [str(s) for s in v.shape]
+        shape_str = "_".join(shapes)
+        if v.dtype in (torch.int32, torch.int64):
+            np.savetxt(f"{output_dir}/{k}.txt", v.detach().cpu().numpy().reshape(-1), fmt="%d", delimiter=",")
+        else:
+            np.savetxt(f"{output_dir}/{k}.txt", v.detach().cpu().numpy().reshape(-1), delimiter=",")
+    try:
+        r = requests.post(PROMPT_FILES_URL, json={"prompt_files": output_dir}, timeout=5)
+        if r.status_code != 200:
+            return None, "❌ TTS 请求失败"
+    except Exception as e:
+        return None, f"❌ TTS 请求异常: {e}"
+def update_timesteps(timesteps):
+    try:
+        r = requests.post(TIMESTEPS_URL, json={"timesteps": timesteps}, timeout=5)
+        if r.status_code != 200:
+            return None, "❌ TTS 请求失败"
+    except Exception as e:
+        return None, f"❌ TTS 请求异常: {e}"
+def run_tts(text):
+    # Step1: 提交 TTS 请求
+    try:
+        r = requests.post(TTS_URL, json={"text": text}, timeout=5)
+        if r.status_code != 200:
+            return None, "❌ TTS 请求失败"
+    except Exception as e:
+        return None, f"❌ TTS 请求异常: {e}"
+    # Step2: 循环调用 /get 获取进度
+    progress = gr.Progress()
+    wav_file = None
+    for i in range(100):  # 最多尝试100次，避免死循环
+        time.sleep(0.5)
+        try:
+            resp = requests.post(GET_URL, data="", timeout=5).json()
+        except Exception as e:
+            return None, f"❌ GET 请求异常: {e}"
+        if resp.get("b_tts_runing", True):
+            progress(i / 100, desc="正在生成语音...")
+        else:
+            wav_file = resp.get("wav_file")
+            break
+    if not wav_file or not os.path.exists(wav_file):
+        return None, "❌ 语音文件未生成"
+    return wav_file, "✅ 生成完成"
+with gr.Blocks() as demo:
+    gr.Markdown("### 🎙️ AXERA CosyVoice2 Demo")
+    with gr.Row():
+        with gr.Column():
+            audio_input = gr.Audio(label="输入音频", type="filepath")
+        with gr.Column():
+            audio_text = gr.Textbox(label="音频文本(自己改一下或者照着念)", value="锄禾日当午，汗滴禾下土。")
+            btn_update = gr.Button("更新音源")
+    with gr.Row():
+        text_input = gr.Textbox(value="琦琦，麻烦你适配一下这个新的模型吧。", label="输入文本")
+        with gr.Column():
+            timesteps = gr.Slider(minimum=4, maximum=30, value=7, step=1, label="Timesteps")
+            run_btn = gr.Button("生成语音")
+    status = gr.Label(label="状态")
+    audio_out = gr.Audio(label="生成结果", type="filepath")
+    run_btn.click(fn=run_tts, inputs=[text_input], outputs=[audio_out, status])
+    timesteps.change(fn=update_timesteps, inputs=timesteps)
+    btn_update.click(fn=update_audio, inputs=[audio_input, audio_text])
+ips = get_all_local_ips()
+for ip in ips:
+    print(f"* Running on local URL:  https://{ip}:7860")
+demo.launch(
+    server_name="0.0.0.0",
+    server_port=7860,
+    ssl_certfile="./server.crt",
+    ssl_keyfile="./server.key",
+    ssl_verify=False
+)

scripts/requirements.txt CHANGED Viewed

@@ -38,3 +38,4 @@ transformers>=4.40.1
 uvicorn==0.30.0
 wetext==0.0.4
 wget==3.2

 uvicorn==0.30.0
 wetext==0.0.4
 wget==3.2
+gradio==5.47.1