taresh18 committed
Commit fd01f1b · verified · 1 Parent(s): 2d0019b

Update README.md

Files changed (1)
  1. README.md +1 -192
README.md CHANGED
@@ -93,197 +93,6 @@ The model was fine-tuned on the following 10 voice IDs from the AniSpeech dataset

  ## Usage

- First, install the necessary libraries:
- pip install torch transformers scipy tqdm unsloth snac
-
- Save the following code as a Python file (e.g., generate_speech.py) and run it. This script will generate audio for the specified prompts using each of the available voices.
-
- ```python
- import torch
- from unsloth import FastLanguageModel
- from snac import SNAC
- from scipy.io.wavfile import write as write_wav
- import os
- from tqdm import tqdm
-
- MODEL_NAME = "taresh18/orpheus-3B-animespeech-ft"
- SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
- MAX_SEQ_LENGTH = 2048
- LOAD_IN_4BIT = False
- DTYPE = None
- DEVICE = "cuda"
- OUTPUT_DIR = "outputs-animespeech-ft"
-
- PROMPTS = [
-     "Rain tapped the tin roof as Mira whispered secrets to the dusk. Shadows danced between the lantern’s glow, weaving memories of laughter and loss.",
- ]
- VOICES = ["107", "125", "145", "16", "163", "179", "180", "183", "185", "187"]
-
- # Special token IDs
- START_TOKEN_ID = 128259
- END_TOKENS_IDS = [128009, 128260]
- PAD_TOKEN_ID = 128263
- CROP_START_TOKEN_ID = 128257
- REMOVE_TOKEN_ID = 128258
- AUDIO_CODE_OFFSET = 128266
-
-
- def redistribute_codes(code_list, device):
-     """Redistributes flat token list into SNAC layers directly on the specified device."""
-     layer_1 = []
-     layer_2 = []
-     layer_3 = []
-     num_frames = len(code_list) // 7
-     for i in range(num_frames):
-         base_idx = 7 * i
-         if base_idx + 6 >= len(code_list): break
-         layer_1.append(code_list[base_idx])
-         layer_2.append(code_list[base_idx + 1] - 4096)
-         layer_3.append(code_list[base_idx + 2] - (2 * 4096))
-         layer_3.append(code_list[base_idx + 3] - (3 * 4096))
-         layer_2.append(code_list[base_idx + 4] - (4 * 4096))
-         layer_3.append(code_list[base_idx + 5] - (5 * 4096))
-         layer_3.append(code_list[base_idx + 6] - (6 * 4096))
-
-     codes = [torch.tensor(layer_1, dtype=torch.long, device=device).unsqueeze(0),
-              torch.tensor(layer_2, dtype=torch.long, device=device).unsqueeze(0),
-              torch.tensor(layer_3, dtype=torch.long, device=device).unsqueeze(0)]
-     return codes
-
-
- def load_models():
-     """Loads the language model and the SNAC vocoder."""
-     model, tokenizer = FastLanguageModel.from_pretrained(
-         model_name=MODEL_NAME,
-         max_seq_length=MAX_SEQ_LENGTH,
-         dtype=DTYPE,
-         load_in_4bit=LOAD_IN_4BIT,
-     )
-     FastLanguageModel.for_inference(model)
-
-     snac_model = SNAC.from_pretrained(SNAC_MODEL_NAME)
-     snac_model.to(DEVICE)
-     snac_model.eval()
-     print("Models loaded.")
-     return model, tokenizer, snac_model
-
- def generate_audio_from_prompts(model, tokenizer, snac_model, prompts, chosen_voice):
-     """Generates audio tensors from text prompts."""
-     prompts_with_voice = [(f"{chosen_voice}: " + p) if chosen_voice else p for p in prompts]
-     all_input_ids = [tokenizer(p, return_tensors="pt").input_ids for p in prompts_with_voice]
-
-     start_token = torch.tensor([[START_TOKEN_ID]], dtype=torch.int64)
-     end_tokens = torch.tensor([END_TOKENS_IDS], dtype=torch.int64)
-
-     all_modified_input_ids = [torch.cat([start_token, ids, end_tokens], dim=1) for ids in all_input_ids]
-
-     max_length = max([mod_ids.shape[1] for mod_ids in all_modified_input_ids])
-     all_padded_tensors = []
-     all_attention_masks = []
-     for mod_ids in all_modified_input_ids:
-         padding_length = max_length - mod_ids.shape[1]
-         padding_tensor = torch.full((1, padding_length), PAD_TOKEN_ID, dtype=torch.int64)
-         padded_tensor = torch.cat([padding_tensor, mod_ids], dim=1)
-         mask_padding = torch.zeros((1, padding_length), dtype=torch.int64)
-         mask_real = torch.ones((1, mod_ids.shape[1]), dtype=torch.int64)
-         attention_mask = torch.cat([mask_padding, mask_real], dim=1)
-         all_padded_tensors.append(padded_tensor)
-         all_attention_masks.append(attention_mask)
-
-     batch_input_ids = torch.cat(all_padded_tensors, dim=0).to(DEVICE)
-     batch_attention_mask = torch.cat(all_attention_masks, dim=0).to(DEVICE)
-
-     print("Generating tokens...")
-     with torch.no_grad():
-         generated_ids = model.generate(
-             input_ids=batch_input_ids,
-             attention_mask=batch_attention_mask,
-             max_new_tokens=1200,
-             do_sample=True,
-             temperature=0.6,
-             top_p=0.95,
-             repetition_penalty=1.1,
-             num_return_sequences=1,
-             eos_token_id=REMOVE_TOKEN_ID,
-             pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else PAD_TOKEN_ID,
-             use_cache=True
-         )
-     generated_ids = generated_ids.to("cpu")
-     print("Token generation complete.")
-
-     token_indices = (generated_ids == CROP_START_TOKEN_ID).nonzero(as_tuple=True)
-     cropped_tensors = []
-     if len(token_indices[0]) > 0:
-         for i in range(generated_ids.shape[0]):
-             seq_indices = token_indices[1][token_indices[0] == i]
-             if len(seq_indices) > 0:
-                 last_occurrence_idx = seq_indices[-1].item()
-                 cropped_tensors.append(generated_ids[i, last_occurrence_idx + 1:].unsqueeze(0))
-             else:
-                 cropped_tensors.append(generated_ids[i, batch_input_ids.shape[1]:].unsqueeze(0))
-     else:
-         cropped_tensors = [generated_ids[i, batch_input_ids.shape[1]:].unsqueeze(0) for i in range(generated_ids.shape[0])]
-
-
-     processed_rows = []
-     for row_tensor in cropped_tensors:
-         if row_tensor.numel() > 0:
-             row_1d = row_tensor.squeeze(0)
-             mask = row_1d != REMOVE_TOKEN_ID
-             processed_rows.append(row_1d[mask])
-         else:
-             processed_rows.append(row_tensor.squeeze(0))
-
-     code_lists = []
-     for row in processed_rows:
-         if row.numel() >= 7:
-             row_length = row.size(0)
-             new_length = (row_length // 7) * 7
-             trimmed_row = row[:new_length]
-             adjusted_code_list = [(t.item() - AUDIO_CODE_OFFSET) for t in trimmed_row]
-             code_lists.append(adjusted_code_list)
-         else:
-             code_lists.append([])
-
-     print("Decoding audio with SNAC...")
-     all_audio_samples = []
-     for i, code_list in enumerate(code_lists):
-         if code_list:
-             codes_for_snac = redistribute_codes(code_list, DEVICE)
-             with torch.no_grad():
-                 audio_hat = snac_model.decode(codes_for_snac)
-             all_audio_samples.append(audio_hat.detach().cpu())
-         else:
-             all_audio_samples.append(torch.tensor([[]]))
-
-     return all_audio_samples
-
-
- def main():
-     model, tokenizer, snac_model = load_models()
-
-     for voice in tqdm(VOICES):
-         my_samples = generate_audio_from_prompts(model, tokenizer, snac_model, PROMPTS, voice)
-
-         if len(PROMPTS) != len(my_samples):
-             print("Error: Mismatch between number of prompts and generated samples.")
-         else:
-             os.makedirs(OUTPUT_DIR, exist_ok=True)
-
-             for i, samples in enumerate(my_samples):
-                 if samples.numel() > 0:
-                     audio_data = samples.squeeze().numpy()
-                     if audio_data.ndim == 0:
-                         audio_data = audio_data.reshape(1)
-                     output_filename = os.path.join(OUTPUT_DIR, f"voice_{voice}_{i}.wav")
-                     write_wav(output_filename, 24000, audio_data)
-                     print(f"Saved audio to: {output_filename}")
-                 else:
-                     print(f"Skipping save for sample {i} as no audio data was generated.")
-
-
- if __name__ == "__main__":
-     main()
- ```
+ Refer to `https://github.com/taresh18/orpheus-streaming`.
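The loading calls that the removed script relied on are unchanged by this commit; below is a minimal sketch that reuses only the `unsloth` and `snac` calls visible in the removed lines above (it does not reflect the API of the linked orpheus-streaming repo, which is not documented here):

```python
# Minimal loading sketch based on the removed example script; the streaming
# pipeline itself lives in https://github.com/taresh18/orpheus-streaming.
from unsloth import FastLanguageModel
from snac import SNAC

# Same checkpoint names and arguments as in the removed script.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="taresh18/orpheus-3B-animespeech-ft",
    max_seq_length=2048,
    load_in_4bit=False,
)
FastLanguageModel.for_inference(model)  # switch the Unsloth model to inference mode

snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda").eval()
```

From here, prompt formatting, generation, and SNAC decoding follow the steps shown in the removed script; treat the linked repo as the authoritative usage reference.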