File size: 2,395 Bytes
9676240
 
1d41d99
9676240
 
 
1d41d99
d6245c2
 
 
 
 
1d41d99
d6245c2
9676240
 
d6245c2
 
9676240
 
78b083f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6245c2
 
 
9676240
 
 
 
 
 
 
d6245c2
9676240
 
d6245c2
9676240
d6245c2
9676240
 
 
d6245c2
9676240
 
 
d6245c2
9676240
 
 
 
 
d6245c2
 
 
9676240
 
 
d6245c2
 
 
 
 
 
 
 
 
 
9676240
 
d6245c2
 
9676240
 
d6245c2
 
 
9676240
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForTextToWaveform
import scipy.io.wavfile as wavfile
import numpy as np
import tempfile

# -----------------------------
# Load Bark Model + Processor
# -----------------------------
# Pick the execution device and the matching weight precision in one place:
# half precision on CUDA, full precision on CPU.
if torch.cuda.is_available():
    device = "cuda"
    _weights_dtype = torch.float16
else:
    device = "cpu"
    _weights_dtype = torch.float32

# The processor turns (text, voice_preset) into model inputs.
processor = AutoProcessor.from_pretrained("suno/bark")

# Load the checkpoint in the chosen precision, then place it on the device.
model = AutoModelForTextToWaveform.from_pretrained(
    "suno/bark",
    torch_dtype=_weights_dtype,
)
model = model.to(device)


# -----------------------------
# Voice Presets (Bark v2 English speakers 0-9)
# -----------------------------
# Preset names follow the pattern "v2/en_speaker_<n>"; the list is kept in
# sorted order for display in the dropdown.
voices = sorted(f"v2/en_speaker_{speaker_id}" for speaker_id in range(10))



# -----------------------------
# Audio Generation Function
# -----------------------------
def generate_audio(text, voice):
    """Synthesize ``text`` with Bark and return the path of a WAV file.

    Args:
        text: Prompt to speak or sing. ``None`` and blank/whitespace-only
            prompts mean "nothing to generate".
        voice: Voice preset name forwarded to the processor as
            ``voice_preset`` (e.g. ``"v2/en_speaker_6"``).

    Returns:
        Path of a temporary ``.wav`` file containing the generated audio, or
        ``None`` when the prompt is empty. The file is created with
        ``delete=False``, so it is not removed automatically.
    """
    # Guard against None as well as blank strings so `.strip()` never fails.
    if text is None or not text.strip():
        return None

    # Prepare inputs
    inputs = processor(text, voice_preset=voice)

    # Move every top-level tensor onto the device the model was placed on.
    for key, value in inputs.items():
        if torch.is_tensor(value):
            inputs[key] = value.to(device)

    # Generate waveform (inference only, so no gradients are tracked)
    with torch.no_grad():
        audio = model.generate(**inputs)

    # The model runs in float16 on CUDA, and float16 is not a sample format
    # scipy's WAV writer supports, so upcast to float32 before leaving torch.
    # On CPU the tensor is already float32 and `.float()` is a no-op.
    waveform = audio.cpu().float().numpy().squeeze()

    # Prefer the rate advertised by the loaded model; fall back to 24 kHz,
    # the value this function used previously.
    generation_config = getattr(model, "generation_config", None)
    sample_rate = getattr(generation_config, "sample_rate", None) or 24000

    # Reserve a unique path, then close our own handle before scipy opens the
    # file by name, so only one handle ever writes to it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        wav_path = fp.name
    wavfile.write(wav_path, sample_rate, waveform)
    return wav_path


# -----------------------------
# Gradio Interface
# -----------------------------
# Components are built in the order they are wired into the interface:
# the prompt box and voice selector feed generate_audio(text, voice), and
# the audio player shows whatever it returns.
_prompt_input = gr.Textbox(
    lines=3,
    placeholder="Type what you want Bark to say or sing...",
    label="Prompt",
)
_voice_input = gr.Dropdown(
    choices=voices,
    value="v2/en_speaker_6",
    label="Voice Preset",
)
_audio_output = gr.Audio(label="Generated Audio")

demo = gr.Interface(
    fn=generate_audio,
    inputs=[_prompt_input, _voice_input],
    outputs=_audio_output,
    title="🎤 Bark Text-to-Audio (Suno, via HuggingFace Transformers)",
    description="Generates speech, singing, music, and sound effects using the open-source Bark model.",
)

# -----------------------------
# Launch
# -----------------------------
# Starts the web UI as soon as this module is executed.
demo.launch()