File size: 4,586 Bytes
1a48690
 
 
 
 
 
 
 
 
 
9a5d0d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
---
language:
- ja
base_model:
- webbigdata/VoiceCore
tags:
- tts
- vllm
---

# VoiceCore_smoothquant

[webbigdata/VoiceCore](https://huggingface.co/webbigdata/VoiceCore)をvLLMなどで高速に動かすためにsmoothquant(W8A8)量子化したモデルです  
詳細は元モデルを見てください  



## Install/Setup

```bash
python3 -m venv VL
source VL/bin/activate
pip install vllm
pip install snac
pip install numpy==1.26.4
```

## Sample script
```python
import torch
import scipy.io.wavfile as wavfile
from transformers import AutoTokenizer
from snac import SNAC
from vllm import LLM, SamplingParams

# --- 1. Settings ---
QUANTIZED_MODEL_PATH = "webbigdata/VoiceCore_smoothquant"
prompts = [
    "テストです",
    "スムーズクアント、問題なく動いてますかね?圧縮しすぎると別人の声になっちゃう事があるんですよね、ふふふ",
]
chosen_voice = "matsukaze_male[neutral]"

# --- 2. Tokenizer and input preparation ---
print("Loading tokenizer and preparing inputs...")
tokenizer = AutoTokenizer.from_pretrained(QUANTIZED_MODEL_PATH)

# When a voice is selected, every prompt is prefixed with "<voice>: ".
voice_prefix = f"{chosen_voice}: " if chosen_voice else ""

# Each encoded prompt is wrapped in the fixed start / end token ids.
start_token, end_tokens = [128259], [128009, 128260, 128261]
all_prompt_token_ids = [
    start_token + tokenizer.encode(voice_prefix + text) + end_tokens
    for text in prompts
]
print("Inputs prepared successfully.")

# --- 3. Load the model with vLLM (runs on the GPU) ---
print(f"Loading SmoothQuant model with vLLM from: {QUANTIZED_MODEL_PATH}")
engine_options = dict(
    model=QUANTIZED_MODEL_PATH,
    trust_remote_code=True,
    max_model_len=10000,  # lower this if you run out of memory
    # gpu_memory_utilization=0.9,  # fraction of the GPU memory to use; adjust as needed
)
llm = LLM(**engine_options)

decoding_options = {
    "temperature": 0.6,
    "top_p": 0.90,
    "repetition_penalty": 1.1,
    "max_tokens": 8192,  # max_tokens + input_prompt <= max_model_len
    "stop_token_ids": [128258],
}
sampling_params = SamplingParams(**decoding_options)
print("vLLM model loaded.")

# --- 4. Inference with vLLM ---
print("Generating audio tokens with vLLM...")
# Pre-tokenized inputs are handed over as {"prompt_token_ids": [...]} entries
# of `prompts`. The separate `prompt_token_ids=` keyword is a deprecated legacy
# form that newer vLLM releases no longer accept.
token_prompts = [{"prompt_token_ids": token_ids} for token_ids in all_prompt_token_ids]
outputs = llm.generate(prompts=token_prompts, sampling_params=sampling_params)
print("Generation complete.")

# --- 5. Prepare the SNAC decoder (runs on the CPU) ---
# The GPU would be faster, but it fails when vLLM has already reserved a large
# share of the GPU memory.
print("Loading SNAC decoder to CPU...")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cpu")  # explicitly placed on the CPU
print("SNAC model loaded.")

# --- 6. Post-processing and audio decoding ---
print("Decoding tokens to audio...")
audio_start_token = 128257

def redistribute_codes(code_list):
    """Rearrange a flat code list into SNAC layers and decode it to audio.

    ``code_list`` is consumed in frames of 7 values. Within a frame, the value
    at position ``k`` has ``k * 4096`` subtracted from it, and the values are
    routed to three layers: position 0 goes to layer 1, positions 1 and 4 go
    to layer 2, and positions 2, 3, 5 and 6 go to layer 3. Trailing values
    that do not fill a whole frame are ignored.

    Args:
        code_list: Sequence of integer codes.

    Returns:
        Whatever ``snac_model.decode`` returns for the three layer tensors.

    Raises:
        ValueError: If ``code_list`` holds fewer than 7 values, i.e. not a
            single complete frame.
    """
    frame_count = len(code_list) // 7
    if frame_count == 0:
        # Fail early with a clear message instead of handing empty tensors
        # to the decoder.
        raise ValueError(
            f"code_list must contain at least 7 codes, got {len(code_list)}"
        )

    layer_1, layer_2, layer_3 = [], [], []
    for frame_start in range(0, frame_count * 7, 7):
        c0, c1, c2, c3, c4, c5, c6 = code_list[frame_start:frame_start + 7]
        layer_1.append(c0)
        layer_2.extend((c1 - 4096, c4 - 4 * 4096))
        layer_3.extend((
            c2 - 2 * 4096,
            c3 - 3 * 4096,
            c5 - 5 * 4096,
            c6 - 6 * 4096,
        ))

    # One (1, length) integer tensor per layer; dtype is pinned explicitly.
    codes = [
        torch.tensor(layer, dtype=torch.long).unsqueeze(0)
        for layer in (layer_1, layer_2, layer_3)
    ]

    # Decoding is inference only, so gradient tracking is switched off.
    with torch.no_grad():
        audio_hat = snac_model.decode(codes)
    return audio_hat

# Turn every generated token sequence into a flat list of audio codes.
END_OF_SPEECH_TOKEN = 128258  # the id used as the generation stop token
CODE_TOKEN_OFFSET = 128266    # subtracted from each token id to get its code value
FRAME_SIZE = 7                # codes are consumed in groups of 7 when decoding

code_lists = []
for output in outputs:
    generated_token_ids = list(output.outputs[0].token_ids)

    # Keep only what follows the last audio-start marker; if the marker is
    # absent, keep the whole sequence.
    start_positions = [
        position
        for position, token_id in enumerate(generated_token_ids)
        if token_id == audio_start_token
    ]
    if start_positions:
        generated_token_ids = generated_token_ids[start_positions[-1] + 1:]

    # The stop token may be part of the returned ids. It is not an audio code
    # and must not end up inside a frame.
    audio_token_ids = [
        token_id
        for token_id in generated_token_ids
        if token_id != END_OF_SPEECH_TOKEN
    ]

    # Drop the incomplete trailing frame. Plain list slicing also handles
    # sequences of length 0 or 1.
    usable_length = (len(audio_token_ids) // FRAME_SIZE) * FRAME_SIZE
    code_list = [
        token_id - CODE_TOKEN_OFFSET
        for token_id in audio_token_ids[:usable_length]
    ]
    code_lists.append(code_list)

# --- 7. Save the audio files ---
# zip() stops at the shorter of the two sequences, so code lists without a
# matching prompt are never processed.
for i, (prompt_text, code_list) in enumerate(zip(prompts, code_lists)):
    if len(code_list) < 7:
        # Without a complete 7-code frame there is nothing to decode. Skip
        # this prompt so the remaining ones are still written.
        print(f"No audio generated for prompt: '{prompt_text}', skipping.")
        continue

    print(f"Processing audio for prompt: '{prompt_text}'")
    samples = redistribute_codes(code_list)
    sample_np = samples.detach().squeeze().numpy()

    # Build a file name from the prompt: keep alphanumerics, spaces and
    # underscores, cap at 20 characters and turn spaces into underscores.
    safe_prompt = "".join(c for c in prompt_text if c.isalnum() or c in (' ', '_')).rstrip()
    filename = f"audio_final_{i}_{safe_prompt[:20].replace(' ', '_')}.wav"

    wavfile.write(filename, 24000, sample_np)
    print(f"Saved audio to: {filename}")
```