File size: 2,395 Bytes
9676240 1d41d99 9676240 1d41d99 d6245c2 1d41d99 d6245c2 9676240 d6245c2 9676240 78b083f d6245c2 9676240 d6245c2 9676240 d6245c2 9676240 d6245c2 9676240 d6245c2 9676240 d6245c2 9676240 d6245c2 9676240 d6245c2 9676240 d6245c2 9676240 d6245c2 9676240 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForTextToWaveform
import scipy.io.wavfile as wavfile
import numpy as np
import tempfile
# -----------------------------
# Bark checkpoint: device, processor and model
# -----------------------------
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor is loaded before the model, both from the same checkpoint id.
processor = AutoProcessor.from_pretrained("suno/bark")

# Weights are requested in float16 on CUDA and float32 on CPU, then the
# model is moved to the selected device.
model = AutoModelForTextToWaveform.from_pretrained(
    "suno/bark",
    torch_dtype={"cuda": torch.float16, "cpu": torch.float32}[device],
)
model = model.to(device)
# -----------------------------
# Voice Presets (Bark v2 manual list)
# -----------------------------
# English speaker presets 0 through 9, kept in sorted order for display.
voices = sorted(f"v2/en_speaker_{index}" for index in range(10))
# -----------------------------
# Audio Generation Function
# -----------------------------
def generate_audio(text, voice):
    """Synthesize ``text`` with the loaded Bark model and save it as a WAV file.

    Args:
        text: Prompt to render. ``None``, an empty string, or a
            whitespace-only string produces no audio.
        voice: Voice preset identifier forwarded to the processor as
            ``voice_preset`` (for example ``"v2/en_speaker_6"``).

    Returns:
        The path of a temporary ``.wav`` file containing the generated
        audio, or ``None`` when there is nothing to synthesize. The file is
        created with ``delete=False``, so it is not removed automatically.
    """
    # Treat a missing prompt like a blank one instead of failing on
    # ``None.strip()``.
    if not text or not text.strip():
        return None

    # Tokenize the prompt and attach the selected voice preset.
    inputs = processor(text, voice_preset=voice)

    # Move every top-level tensor onto the device the model was placed on.
    for key, value in inputs.items():
        if torch.is_tensor(value):
            inputs[key] = value.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        waveform = model.generate(**inputs)

    # The model is loaded in float16 on CUDA, so the waveform may come back
    # as float16. SciPy documents 32-bit float as its floating-point WAV
    # format, so upcast before writing (a no-op when already float32).
    samples = waveform.cpu().to(torch.float32).numpy().squeeze()
    sample_rate = 24000

    # Reserve a unique path, close the handle, then write by name so the
    # file is never opened twice at the same time.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        wav_path = fp.name
    wavfile.write(wav_path, sample_rate, samples)
    return wav_path
# -----------------------------
# Gradio Interface
# -----------------------------
# Components are created in the order prompt, voice, audio; the two inputs
# map positionally onto generate_audio(text, voice).
_prompt_input = gr.Textbox(
    lines=3,
    placeholder="Type what you want Bark to say or sing...",
    label="Prompt",
)
_voice_input = gr.Dropdown(
    choices=voices,
    value="v2/en_speaker_6",
    label="Voice Preset",
)
_audio_output = gr.Audio(label="Generated Audio")

demo = gr.Interface(
    fn=generate_audio,
    inputs=[_prompt_input, _voice_input],
    outputs=_audio_output,
    title="🎤 Bark Text-to-Audio (Suno, via HuggingFace Transformers)",
    description="Generates speech, singing, music, and sound effects using the open-source Bark model.",
)

# -----------------------------
# Launch
# -----------------------------
# Start the web app as soon as the script is executed.
demo.launch()
|