dahara1 committed · Commit 9a5d0d8 · verified · 1 Parent(s): 1a48690

Update README.md

Files changed (1): README.md (+126 -0)

README.md CHANGED

# VoiceCore_smoothquant

This is a SmoothQuant (W8A8) quantized version of [webbigdata/VoiceCore](https://huggingface.co/webbigdata/VoiceCore), intended for fast inference with engines such as vLLM.
See the original model card for details.

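Checkpoints in this W8A8 format are usually produced with [llm-compressor](https://github.com/vllm-project/llm-compressor). The snippet below is only a minimal sketch of such a SmoothQuant + GPTQ one-shot run, not the actual recipe used for this model; the calibration dataset, sample count, and smoothing strength are placeholder assumptions.

```
# Hypothetical quantization sketch (NOT the recipe used to build this repo).
# pip install llmcompressor
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.modifiers.quantization import GPTQModifier

recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),                        # assumed strength
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),  # int8 weights and activations
]

oneshot(
    model="webbigdata/VoiceCore",   # base model
    dataset="open_platypus",        # placeholder calibration set, not the one actually used
    recipe=recipe,
    output_dir="VoiceCore_smoothquant",
    max_seq_length=2048,            # assumed
    num_calibration_samples=512,    # assumed
)
```
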
## Install/Setup

```
python3 -m venv VL
source VL/bin/activate
pip install vllm
pip install snac
pip install numpy==1.26.4
```

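Optionally, before running the full sample below, you can check that the environment sees the checkpoint's quantization metadata. This is an illustrative snippet, not part of the original card; the exact contents of `quantization_config` depend on how the checkpoint was exported.

```
# Optional sanity check: print the quantization metadata stored in config.json.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("webbigdata/VoiceCore_smoothquant")
print(getattr(cfg, "quantization_config", None))  # expected to describe an int8 (W8A8) scheme
```
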
## Sample script
```
import torch
import scipy.io.wavfile as wavfile
from transformers import AutoTokenizer
from snac import SNAC
from vllm import LLM, SamplingParams

# --- 1. Configuration ---
QUANTIZED_MODEL_PATH = "webbigdata/VoiceCore_smoothquant"
prompts = [
    "テストです",
    "スムーズクアント、問題なく動いてますかね?圧縮しすぎると別人の声になっちゃう事があるんですよね、ふふふ"
]
chosen_voice = "matsukaze_male[neutral]"

# --- 2. Prepare the tokenizer and inputs ---
print("Loading tokenizer and preparing inputs...")
tokenizer = AutoTokenizer.from_pretrained(QUANTIZED_MODEL_PATH)
prompts_ = [(f"{chosen_voice}: " + p) if chosen_voice else p for p in prompts]
start_token, end_tokens = [128259], [128009, 128260, 128261]
all_prompt_token_ids = []
for prompt in prompts_:
    input_ids = tokenizer.encode(prompt)
    final_token_ids = start_token + input_ids + end_tokens
    all_prompt_token_ids.append(final_token_ids)
print("Inputs prepared successfully.")

# --- 3. Load the model with vLLM (runs on the GPU) ---
print(f"Loading SmoothQuant model with vLLM from: {QUANTIZED_MODEL_PATH}")
llm = LLM(
    model=QUANTIZED_MODEL_PATH,
    trust_remote_code=True,
    max_model_len=10000,  # reduce this if you run out of GPU memory
    #gpu_memory_utilization=0.9  # fraction of total GPU memory vLLM may use; adjust as needed
)
sampling_params = SamplingParams(
    temperature=0.6,
    top_p=0.90,
    repetition_penalty=1.1,
    max_tokens=8192,  # max_tokens + input_prompt <= max_model_len
    stop_token_ids=[128258]
)
print("vLLM model loaded.")

# --- 4. Inference with vLLM ---
print("Generating audio tokens with vLLM...")
outputs = llm.generate(prompt_token_ids=all_prompt_token_ids, sampling_params=sampling_params)
print("Generation complete.")

# --- 5. Prepare the SNAC decoder (on the CPU) ---
# The GPU would be faster, but decoding tends to fail once vLLM has already
# reserved most of the GPU memory.
print("Loading SNAC decoder to CPU...")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
snac_model.to("cpu")  # place it on the CPU explicitly
print("SNAC model loaded.")

# --- 6. Post-processing and audio decoding ---
print("Decoding tokens to audio...")
audio_start_token = 128257

def redistribute_codes(code_list):
    """Re-arrange a flat code list into the layered format expected by the SNAC decoder."""
    layer_1, layer_2, layer_3 = [], [], []
    for i in range(len(code_list) // 7):
        layer_1.append(code_list[7*i])
        layer_2.append(code_list[7*i+1] - 4096)
        layer_3.append(code_list[7*i+2] - (2*4096))
        layer_3.append(code_list[7*i+3] - (3*4096))
        layer_2.append(code_list[7*i+4] - (4*4096))
        layer_3.append(code_list[7*i+5] - (5*4096))
        layer_3.append(code_list[7*i+6] - (6*4096))

    codes = [torch.tensor(layer).unsqueeze(0)
             for layer in [layer_1, layer_2, layer_3]]

    audio_hat = snac_model.decode(codes)
    return audio_hat

code_lists = []
for output in outputs:
    generated_token_ids = output.outputs[0].token_ids
    generated_tensor = torch.tensor([generated_token_ids])
    # keep only the tokens after the last audio-start marker
    token_indices = (generated_tensor == audio_start_token).nonzero(as_tuple=True)
    if len(token_indices[1]) > 0:
        cropped_tensor = generated_tensor[:, token_indices[1][-1].item() + 1:]
    else:
        cropped_tensor = generated_tensor

    masked_row = cropped_tensor.squeeze()
    row_length = masked_row.size(0)
    new_length = (row_length // 7) * 7  # trim to a multiple of 7 codes
    trimmed_row = masked_row[:new_length]
    code_list = [t.item() - 128266 for t in trimmed_row]
    code_lists.append(code_list)

# --- 7. Save the audio files ---
for i, code_list in enumerate(code_lists):
    if i >= len(prompts): break

    print(f"Processing audio for prompt: '{prompts[i]}'")
    samples = redistribute_codes(code_list)
    sample_np = samples.detach().squeeze().numpy()

    safe_prompt = "".join(c for c in prompts[i] if c.isalnum() or c in (' ', '_')).rstrip()
    filename = f"audio_final_{i}_{safe_prompt[:20].replace(' ', '_')}.wav"

    wavfile.write(filename, 24000, sample_np)
    print(f"Saved audio to: {filename}")
```
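
In step 5 above, SNAC runs on the CPU because vLLM usually reserves most of the GPU memory first. If you have VRAM to spare, an untested variant along the following lines keeps decoding on the GPU; the `gpu_memory_utilization=0.8` value is an assumption and should be tuned per card. The code layout itself is unchanged: every group of 7 generated codes is split 1/2/4 across SNAC's three layers, with a 4096 offset per slot, exactly as in `redistribute_codes`.

```
# Sketch: GPU-side SNAC decoding (reuses the imports and names from the script above).
llm = LLM(
    model=QUANTIZED_MODEL_PATH,
    trust_remote_code=True,
    max_model_len=10000,
    gpu_memory_utilization=0.8,  # assumed value; leaves some VRAM free for SNAC
)

snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")

def redistribute_codes_gpu(code_list):
    """Same layout as redistribute_codes above, but with tensors on the GPU."""
    layer_1, layer_2, layer_3 = [], [], []
    for i in range(len(code_list) // 7):
        layer_1.append(code_list[7*i])
        layer_2.append(code_list[7*i+1] - 4096)
        layer_3.append(code_list[7*i+2] - (2*4096))
        layer_3.append(code_list[7*i+3] - (3*4096))
        layer_2.append(code_list[7*i+4] - (4*4096))
        layer_3.append(code_list[7*i+5] - (5*4096))
        layer_3.append(code_list[7*i+6] - (6*4096))
    codes = [torch.tensor(layer, device="cuda").unsqueeze(0)
             for layer in [layer_1, layer_2, layer_3]]
    with torch.inference_mode():
        return snac_model.decode(codes).cpu()
```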