taresh18
/

orpheus-animespeech

@@ -93,197 +93,6 @@ The model was fine-tuned on the following 10 voice IDs from the AniSpeech datase
 ## Usage
-First, install the necessary libraries:
-pip install torch transformers scipy tqdm unsloth snac
-Save the following code as a Python file (e.g., generate_speech.py) and run it. This script will generate audio for the specified prompts using each of the available voices.
-```python
-import torch
-from unsloth import FastLanguageModel
-from snac import SNAC
-from scipy.io.wavfile import write as write_wav
-import os
-from tqdm import tqdm
-MODEL_NAME = "taresh18/orpheus-3B-animespeech-ft"  # model_name passed to FastLanguageModel.from_pretrained
-SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"  # identifier passed to SNAC.from_pretrained
-MAX_SEQ_LENGTH = 2048  # forwarded as max_seq_length when loading the language model
-LOAD_IN_4BIT = False  # forwarded as load_in_4bit when loading the language model
-DTYPE = None  # forwarded as dtype; presumably lets the loader pick one -- TODO confirm
-DEVICE = "cuda"  # device for the SNAC model, the generation inputs and the decoded code tensors
-OUTPUT_DIR = "outputs-animespeech-ft"  # directory main() creates and writes the .wav files into
-PROMPTS = [  # every prompt is synthesized once per entry in VOICES
-    "Rain tapped the tin roof as Mira whispered secrets to the dusk. Shadows danced between the lantern’s glow, weaving memories of laughter and loss.",
-]
-VOICES = ["107", "125", "145", "16", "163", "179", "180", "183", "185", "187"]  # each ID is prepended to a prompt as "<voice>: " before tokenization
-# Special token IDs
-START_TOKEN_ID = 128259  # prepended to every tokenized prompt
-END_TOKENS_IDS = [128009, 128260]  # appended after every tokenized prompt
-PAD_TOKEN_ID = 128263  # left-padding value; also the fallback pad_token_id for generate()
-CROP_START_TOKEN_ID = 128257  # generated rows are cropped to what follows its last occurrence
-REMOVE_TOKEN_ID = 128258  # passed as eos_token_id and filtered out of the cropped rows
-AUDIO_CODE_OFFSET = 128266  # subtracted from each remaining token ID before redistribute_codes
-def redistribute_codes(code_list, device):  # reads code_list in frames of 7 consecutive values
-  """Redistributes flat token list into SNAC layers directly on the specified device."""
-  layer_1 = []  # receives 1 value per frame: frame position 0, unshifted
-  layer_2 = []  # receives 2 values per frame: positions 1 and 4
-  layer_3 = []  # receives 4 values per frame: positions 2, 3, 5 and 6
-  num_frames = len(code_list) // 7  # values past the last complete frame are ignored
-  for i in range(num_frames):
-    base_idx = 7 * i
-    if base_idx + 6 >= len(code_list): break  # NOTE(review): never true, num_frames is already floored to whole frames
-    layer_1.append(code_list[base_idx])
-    layer_2.append(code_list[base_idx + 1] - 4096)  # frame position k has k * 4096 subtracted
-    layer_3.append(code_list[base_idx + 2] - (2 * 4096))
-    layer_3.append(code_list[base_idx + 3] - (3 * 4096))
-    layer_2.append(code_list[base_idx + 4] - (4 * 4096))
-    layer_3.append(code_list[base_idx + 5] - (5 * 4096))
-    layer_3.append(code_list[base_idx + 6] - (6 * 4096))
-  codes = [torch.tensor(layer_1, dtype=torch.long, device=device).unsqueeze(0),  # each layer becomes a torch.long tensor on `device` with a leading dim of 1
-           torch.tensor(layer_2, dtype=torch.long, device=device).unsqueeze(0),
-           torch.tensor(layer_3, dtype=torch.long, device=device).unsqueeze(0)]
-  return codes  # list ordered [layer_1, layer_2, layer_3]
-def load_models():  # takes no arguments; configuration comes from the module-level constants
-    """Loads the language model and the SNAC vocoder."""
-    model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name=MODEL_NAME,
-        max_seq_length=MAX_SEQ_LENGTH,
-        dtype=DTYPE,
-        load_in_4bit=LOAD_IN_4BIT,
-    )
-    FastLanguageModel.for_inference(model)  # presumably switches the model to unsloth's inference mode -- TODO confirm
-    snac_model = SNAC.from_pretrained(SNAC_MODEL_NAME)
-    snac_model.to(DEVICE)  # only the SNAC model is moved explicitly; the language model is not moved here
-    snac_model.eval()
-    print("Models loaded.")
-    return model, tokenizer, snac_model  # 3-tuple, unpacked in this order by main()
-def generate_audio_from_prompts(model, tokenizer, snac_model, prompts, chosen_voice):  # returns one tensor per prompt, in prompt order
-    """Generates audio tensors from text prompts."""
-    prompts_with_voice = [(f"{chosen_voice}: " + p) if chosen_voice else p for p in prompts]  # prefix "<voice>: " unless chosen_voice is falsy
-    all_input_ids = [tokenizer(p, return_tensors="pt").input_ids for p in prompts_with_voice]
-    start_token = torch.tensor([[START_TOKEN_ID]], dtype=torch.int64)
-    end_tokens = torch.tensor([END_TOKENS_IDS], dtype=torch.int64)
-    all_modified_input_ids = [torch.cat([start_token, ids, end_tokens], dim=1) for ids in all_input_ids]  # wrap each prompt: start token + prompt ids + end tokens
-    max_length = max([mod_ids.shape[1] for mod_ids in all_modified_input_ids])  # length of the longest wrapped prompt
-    all_padded_tensors = []
-    all_attention_masks = []
-    for mod_ids in all_modified_input_ids:
-        padding_length = max_length - mod_ids.shape[1]
-        padding_tensor = torch.full((1, padding_length), PAD_TOKEN_ID, dtype=torch.int64)
-        padded_tensor = torch.cat([padding_tensor, mod_ids], dim=1)  # padding goes on the left, real tokens stay at the end of the row
-        mask_padding = torch.zeros((1, padding_length), dtype=torch.int64)
-        mask_real = torch.ones((1, mod_ids.shape[1]), dtype=torch.int64)
-        attention_mask = torch.cat([mask_padding, mask_real], dim=1)  # 0 over padding, 1 over real tokens
-        all_padded_tensors.append(padded_tensor)
-        all_attention_masks.append(attention_mask)
-    batch_input_ids = torch.cat(all_padded_tensors, dim=0).to(DEVICE)  # one row per prompt
-    batch_attention_mask = torch.cat(all_attention_masks, dim=0).to(DEVICE)
-    print("Generating tokens...")
-    with torch.no_grad():
-        generated_ids = model.generate(
-            input_ids=batch_input_ids,
-            attention_mask=batch_attention_mask,
-            max_new_tokens=1200,
-            do_sample=True,
-            temperature=0.6,
-            top_p=0.95,
-            repetition_penalty=1.1,
-            num_return_sequences=1,
-            eos_token_id=REMOVE_TOKEN_ID,  # REMOVE_TOKEN_ID doubles as the end-of-sequence token
-            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else PAD_TOKEN_ID,  # PAD_TOKEN_ID is used only when the tokenizer defines no pad token
-            use_cache=True
-        )
-    generated_ids = generated_ids.to("cpu")  # all post-processing below runs on CPU
-    print("Token generation complete.")
-    token_indices = (generated_ids == CROP_START_TOKEN_ID).nonzero(as_tuple=True)  # (row indices, column indices) of every CROP_START_TOKEN_ID
-    cropped_tensors = []
-    if len(token_indices[0]) > 0:
-        for i in range(generated_ids.shape[0]):
-            seq_indices = token_indices[1][token_indices[0] == i]  # marker columns belonging to row i
-            if len(seq_indices) > 0:
-                last_occurrence_idx = seq_indices[-1].item()
-                cropped_tensors.append(generated_ids[i, last_occurrence_idx + 1:].unsqueeze(0))  # keep only what follows the last marker in this row
-            else:
-                cropped_tensors.append(generated_ids[i, batch_input_ids.shape[1]:].unsqueeze(0))  # no marker in this row: keep everything after the prompt columns
-    else:
-         cropped_tensors = [generated_ids[i, batch_input_ids.shape[1]:].unsqueeze(0) for i in range(generated_ids.shape[0])]  # no marker in any row: same prompt-length crop for every row
-    processed_rows = []
-    for row_tensor in cropped_tensors:
-         if row_tensor.numel() > 0:
-            row_1d = row_tensor.squeeze(0)
-            mask = row_1d != REMOVE_TOKEN_ID  # NOTE(review): only REMOVE_TOKEN_ID is filtered; if generate() pads rows that finish early, those pad IDs would remain -- confirm for multi-prompt batches
-            processed_rows.append(row_1d[mask])
-         else:
-            processed_rows.append(row_tensor.squeeze(0))
-    code_lists = []
-    for row in processed_rows:
-        if row.numel() >= 7:
-            row_length = row.size(0)
-            new_length = (row_length // 7) * 7  # truncate to a whole number of 7-token frames
-            trimmed_row = row[:new_length]
-            adjusted_code_list = [(t.item() - AUDIO_CODE_OFFSET) for t in trimmed_row]  # shift token IDs down by AUDIO_CODE_OFFSET
-            code_lists.append(adjusted_code_list)
-        else:
-            code_lists.append([])  # fewer than 7 tokens: no complete frame to decode
-    print("Decoding audio with SNAC...")
-    all_audio_samples = []
-    for i, code_list in enumerate(code_lists):
-        if code_list:
-            codes_for_snac = redistribute_codes(code_list, DEVICE)
-            with torch.no_grad():
-                audio_hat = snac_model.decode(codes_for_snac)
-            all_audio_samples.append(audio_hat.detach().cpu())  # decoded audio is moved back to CPU
-        else:
-            all_audio_samples.append(torch.tensor([[]]))  # zero-element placeholder keeps the output aligned with the prompts
-    return all_audio_samples
-def main():  # synthesizes every prompt with every voice and writes one .wav per (voice, prompt index)
-    model, tokenizer, snac_model = load_models()
-    for voice in tqdm(VOICES):
-        my_samples = generate_audio_from_prompts(model, tokenizer, snac_model, PROMPTS, voice)
-        if len(PROMPTS) != len(my_samples):  # sanity check: one sample is expected per prompt
-            print("Error: Mismatch between number of prompts and generated samples.")
-        else:
-            os.makedirs(OUTPUT_DIR, exist_ok=True)  # repeated for every voice; exist_ok=True makes the repeat harmless
-            for i, samples in enumerate(my_samples):
-                if samples.numel() > 0:  # zero-element tensors mark prompts that produced no audio
-                    audio_data = samples.squeeze().numpy()
-                    if audio_data.ndim == 0:  # a single-sample result squeezes to 0-d; restore a 1-D array
-                        audio_data = audio_data.reshape(1)
-                    output_filename = os.path.join(OUTPUT_DIR, f"voice_{voice}_{i}.wav")
-                    write_wav(output_filename, 24000, audio_data)  # written with a sample rate of 24000
-                    print(f"Saved audio to: {output_filename}")
-                else:
-                    print(f"Skipping save for sample {i} as no audio data was generated.")
-if __name__ == "__main__":  # run main() only when executed as a script
-    main()
-```


93
94	## Usage
95
96	+ Refer to `https://github.com/taresh18/orpheus-streaming` for usage instructions.































































































































































































97
98