File size: 1,450 Bytes
c5e3ece
 
d000c57
c5e3ece
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d000c57
c5e3ece
d000c57
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from transformers import BarkModel, AutoProcessor
import torch
import scipy 

def text_to_speech(text, voice_preset="v2/hi_speaker_2"):
    """
    Convert text to speech using the Bark model and save it as a WAV file.

    Args:
        text (str): Text to convert to speech
        voice_preset (str): Voice preset to use for the speech synthesis

    Returns:
        str: Path of the WAV file the generated audio was written to
            ("output_audio.wav", relative to the current working directory).
    """
    # Import the submodule explicitly: a bare `import scipy` is not guaranteed
    # to make `scipy.io.wavfile` available on every SciPy version.
    from scipy.io import wavfile

    path = "output_audio.wav"

    # Check if CUDA is available and set device accordingly
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Load the model and processor.
    # NOTE(review): the model comes from "suno/bark-small" while the processor
    # comes from "suno/bark" - confirm this checkpoint mix is intentional.
    model = BarkModel.from_pretrained("suno/bark-small").to(device)
    processor = AutoProcessor.from_pretrained("suno/bark")

    # Prepare the inputs once and move every entry to the model's device.
    # A new dict is built instead of mutating the processor output in place.
    inputs = processor(text=text, voice_preset=voice_preset)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate speech
    speech_output = model.generate(**inputs)
    sampling_rate = model.generation_config.sample_rate

    # Save the first generated waveform; it must be on the CPU to become a
    # NumPy array for the WAV writer.
    wavfile.write(path, rate=sampling_rate, data=speech_output[0].cpu().numpy())

    return path