---
language:
- ja
base_model:
- webbigdata/VoiceCore
tags:
- tts
- vllm
---
# VoiceCore_smoothquant
[webbigdata/VoiceCore](https://huggingface.co/webbigdata/VoiceCore)をvLLMなどで高速に動かすためにsmoothquant(W8A8)量子化したモデルです
詳細は元モデルを見てください
## Install/Setup
```bash
python3 -m venv VL
source VL/bin/activate
pip install vllm
pip install snac
pip install numpy==1.26.4
```
## Sample script
```python
import torch
import scipy.io.wavfile as wavfile
from transformers import AutoTokenizer
from snac import SNAC
from vllm import LLM, SamplingParams
# --- 1. Configuration ---
QUANTIZED_MODEL_PATH = "webbigdata/VoiceCore_smoothquant"
prompts = [
    "テストです",
    "スムーズクアント、問題なく動いてますかね?圧縮しすぎると別人の声になっちゃう事があるんですよね、ふふふ"
]
chosen_voice = "matsukaze_male[neutral]"

# --- 2. Tokenizer and input preparation ---
print("Loading tokenizer and preparing inputs...")
tokenizer = AutoTokenizer.from_pretrained(QUANTIZED_MODEL_PATH)

# Prefix each prompt with the selected voice tag (skipped if no voice chosen).
voiced_prompts = [f"{chosen_voice}: {p}" if chosen_voice else p for p in prompts]

# Model-specific special-token ids framing every prompt
# (presumably start/end-of-human and start-of-ai markers — see the base model card).
start_token = [128259]
end_tokens = [128009, 128260, 128261]

all_prompt_token_ids = [
    start_token + tokenizer.encode(text) + end_tokens
    for text in voiced_prompts
]
print("Inputs prepared successfully.")
# --- 3. Load the quantized model with vLLM (runs on GPU) ---
print(f"Loading SmoothQuant model with vLLM from: {QUANTIZED_MODEL_PATH}")

engine_kwargs = dict(
    model=QUANTIZED_MODEL_PATH,
    trust_remote_code=True,
    max_model_len=10000,  # lower this if you run out of GPU memory
    # gpu_memory_utilization=0.9,  # fraction of total GPU memory vLLM may reserve; tune as needed
)
llm = LLM(**engine_kwargs)

sampling_params = SamplingParams(
    temperature=0.6,
    top_p=0.90,
    repetition_penalty=1.1,
    max_tokens=8192,  # must satisfy: max_tokens + prompt length <= max_model_len
    stop_token_ids=[128258],  # generation stops when this id is produced
)
print("vLLM model loaded.")
# --- 4. Inference with vLLM ---
print("Generating audio tokens with vLLM...")
outputs = llm.generate(prompt_token_ids=all_prompt_token_ids, sampling_params=sampling_params)
print("Generation complete.")
# --- 5. Prepare the SNAC decoder (on CPU) --- GPU would be faster, but tends to fail while vLLM holds most of the GPU memory
print("Loading SNAC decoder to CPU...")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
snac_model.to("cpu")  # explicitly keep the decoder on the CPU
print("SNAC model loaded.")
# --- 6. Post-processing and audio decoding ---
print("Decoding tokens to audio...")
audio_start_token = 128257  # id marking the start of the audio-token stream in the output
def redistribute_codes(code_list):
    """Reshape a flat sequence of 7-token frames into SNAC's three code
    layers and decode it to a waveform tensor.

    Within each 7-token frame, token 0 feeds layer 1, tokens 1 and 4 feed
    layer 2, and tokens 2, 3, 5 and 6 feed layer 3; each position k has a
    k * 4096 codebook offset subtracted. Any trailing partial frame is
    ignored. Uses the module-level ``snac_model`` (on CPU).
    """
    layer_1, layer_2, layer_3 = [], [], []
    for frame in range(len(code_list) // 7):
        base = 7 * frame
        layer_1.append(code_list[base])
        layer_2.append(code_list[base + 1] - 4096)
        layer_3.append(code_list[base + 2] - 2 * 4096)
        layer_3.append(code_list[base + 3] - 3 * 4096)
        layer_2.append(code_list[base + 4] - 4 * 4096)
        layer_3.append(code_list[base + 5] - 5 * 4096)
        layer_3.append(code_list[base + 6] - 6 * 4096)
    codes = [
        torch.tensor(layer).unsqueeze(0)
        for layer in (layer_1, layer_2, layer_3)
    ]
    return snac_model.decode(codes)
# Extract the raw SNAC code list from each generation result.
code_lists = []
for result in outputs:
    ids = torch.tensor([result.outputs[0].token_ids])
    # Keep only the tokens after the LAST audio-start marker, if one exists;
    # otherwise use the whole sequence.
    marker_positions = (ids == audio_start_token).nonzero(as_tuple=True)
    if len(marker_positions[1]) > 0:
        ids = ids[:, marker_positions[1][-1].item() + 1:]
    row = ids.squeeze()
    # Trim to a whole number of 7-token frames, then remove the
    # audio-token id offset so values land in SNAC's codebook range.
    usable = (row.size(0) // 7) * 7
    code_lists.append([t.item() - 128266 for t in row[:usable]])
# --- 7. Save the audio files ---
for i, code_list in enumerate(code_lists):
    # Guard against more outputs than prompts.
    if i >= len(prompts):
        break
    print(f"Processing audio for prompt: '{prompts[i]}'")
    samples = redistribute_codes(code_list)
    sample_np = samples.detach().squeeze().numpy()
    # Build a filesystem-safe name from the prompt text (keep only
    # alphanumerics, spaces and underscores, then truncate).
    safe_prompt = "".join(c for c in prompts[i] if c.isalnum() or c in (' ', '_')).rstrip()
    filename = f"audio_final_{i}_{safe_prompt[:20].replace(' ', '_')}.wav"
    wavfile.write(filename, 24000, sample_np)  # SNAC model outputs 24 kHz audio
    # BUGFIX: the original printed the literal text "(unknown)" instead of
    # interpolating the actual file name.
    print(f"Saved audio to: {filename}")
```