webbigdata
/

VoiceCore_smoothquant

 - vllm
 ---
+# VoiceCore_smoothquant
+[webbigdata/VoiceCore](https://huggingface.co/webbigdata/VoiceCore)をvLLMなどで高速に動かすために、SmoothQuant(W8A8)で量子化したモデルです。
+詳細は元モデルのページを参照してください。
+## Install/Setup
+```
+python3 -m venv VL
+source VL/bin/activate
+pip install vllm
+pip install snac
+pip install numpy==1.26.4
+```
+## Sample script
+```
+import torch
+import scipy.io.wavfile as wavfile
+from transformers import AutoTokenizer
+from snac import SNAC
+from vllm import LLM, SamplingParams
+# --- 1. 設定項目 ---
+QUANTIZED_MODEL_PATH = "webbigdata/VoiceCore_smoothquant"
+prompts = [
+     "テストです",
+     "スムーズクアント、問題なく動いてますかね？圧縮しすぎると別人の声になっちゃう事があるんですよね、ふふふ"
+]
+chosen_voice = "matsukaze_male[neutral]"
+# --- 2. トークナイザーと入力の準備 ---
+print("Loading tokenizer and preparing inputs...")
+tokenizer = AutoTokenizer.from_pretrained(QUANTIZED_MODEL_PATH)
+prompts_ = [(f"{chosen_voice}: " + p) if chosen_voice else p for p in prompts]
+start_token, end_tokens = [128259], [128009, 128260, 128261]
+all_prompt_token_ids = []
+for prompt in prompts_:
+  input_ids = tokenizer.encode(prompt)
+  final_token_ids = start_token + input_ids + end_tokens
+  all_prompt_token_ids.append(final_token_ids)
+print("Inputs prepared successfully.")
+# --- 3. vLLMモデルの読み込み (GPUで実行) ---
+print(f"Loading SmoothQuant model with vLLM from: {QUANTIZED_MODEL_PATH}")
+llm = LLM(
+    model=QUANTIZED_MODEL_PATH,
+    trust_remote_code=True,
+    max_model_len=10000,    # メモリ不足の場合は削ってください
+    #gpu_memory_utilization=0.9 # 最大GPUメモリの何割使うか？なので、適宜調整してください
+)
+sampling_params = SamplingParams(
+    temperature=0.6,
+    top_p=0.90,
+    repetition_penalty=1.1,
+    max_tokens=8192, # max_tokens + input_prompt <= max_model_len
+    stop_token_ids=[128258]
+)
+print("vLLM model loaded.")
+# --- 4. vLLMによる推論 ---
+print("Generating audio tokens with vLLM...")
+outputs = llm.generate(prompt_token_ids=all_prompt_token_ids, sampling_params=sampling_params)
+print("Generation complete.")
+# --- 5. SNACデコーダーの準備 (CPUで実行) --- GPUの方が早いがvllmが大きく確保していると失敗するため
+print("Loading SNAC decoder to CPU...")
+snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
+snac_model.to("cpu") # 明示的にCPUに配置
+print("SNAC model loaded.")
+# --- 6. 後処理と音声デコード ---
+print("Decoding tokens to audio...")
+audio_start_token = 128257
+def redistribute_codes(code_list):
+  """SNACデコーダー用のフォーマットにコードを再構成する関数"""
+  layer_1, layer_2, layer_3 = [], [], []
+  for i in range(len(code_list) // 7):
+    layer_1.append(code_list[7*i])
+    layer_2.append(code_list[7*i+1] - 4096)
+    layer_3.append(code_list[7*i+2] - (2*4096))
+    layer_3.append(code_list[7*i+3] - (3*4096))
+    layer_2.append(code_list[7*i+4] - (4*4096))
+    layer_3.append(code_list[7*i+5] - (5*4096))
+    layer_3.append(code_list[7*i+6] - (6*4096))
+  codes = [torch.tensor(layer).unsqueeze(0)
+           for layer in [layer_1, layer_2, layer_3]]
+  audio_hat = snac_model.decode(codes)
+  return audio_hat
+code_lists = []
+for output in outputs:
+    generated_token_ids = output.outputs[0].token_ids
+    generated_tensor = torch.tensor([generated_token_ids])
+    token_indices = (generated_tensor == audio_start_token).nonzero(as_tuple=True)
+    if len(token_indices[1]) > 0:
+        cropped_tensor = generated_tensor[:, token_indices[1][-1].item() + 1:]
+    else:
+        cropped_tensor = generated_tensor
+    masked_row = cropped_tensor.squeeze()
+    row_length = masked_row.size(0)
+    new_length = (row_length // 7) * 7
+    trimmed_row = masked_row[:new_length]
+    code_list = [t.item() - 128266 for t in trimmed_row]
+    code_lists.append(code_list)
+# --- 7. 音声ファイルの保存 ---
+for i, code_list in enumerate(code_lists):
+    if i >= len(prompts): break
+    print(f"Processing audio for prompt: '{prompts[i]}'")
+    samples = redistribute_codes(code_list)
+    sample_np = samples.detach().squeeze().numpy()
+    safe_prompt = "".join(c for c in prompts[i] if c.isalnum() or c in (' ', '_')).rstrip()
+    filename = f"audio_final_{i}_{safe_prompt[:20].replace(' ', '_')}.wav"
+    wavfile.write(filename, 24000, sample_np)
+    print(f"Saved audio to: {filename}")
+```