|
|
from functools import lru_cache

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from diffusers import DiffusionPipeline
from transformers import AutoTokenizer, AutoModel
|
|
|
|
|
|
|
|
@lru_cache(maxsize=1)
def load_text_processor():
    """Load the UMT5 tokenizer and text encoder from the local checkpoint.

    Cached with ``lru_cache`` so repeated generations reuse the already
    loaded model instead of re-reading the checkpoint from disk on every
    call (``generate_music`` invokes this per request).

    Returns:
        tuple: ``(tokenizer, text_model)`` — the tokenizer and the fp16
        text-encoder model, device-placed via ``device_map="auto"``.
    """
    tokenizer = AutoTokenizer.from_pretrained("./umt5-base")
    text_model = AutoModel.from_pretrained(
        "./umt5-base",
        use_safetensors=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return tokenizer, text_model
|
|
|
|
|
|
|
|
@lru_cache(maxsize=1)
def load_transformer():
    """Load the PhantomStep transformer diffusion pipeline.

    Cached so repeated generations reuse the loaded pipeline rather than
    re-reading the checkpoint from disk on every call.

    Returns:
        DiffusionPipeline: fp16 transformer stage, device-placed via
        ``device_map="auto"``.
    """
    transformer = DiffusionPipeline.from_pretrained(
        "./phantomstep_transformer",
        use_safetensors=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return transformer
|
|
|
|
|
|
|
|
@lru_cache(maxsize=1)
def load_dcae():
    """Load the PhantomStep DCAE diffusion pipeline.

    Cached so repeated generations reuse the loaded pipeline rather than
    re-reading the checkpoint from disk on every call.

    Returns:
        DiffusionPipeline: fp16 DCAE stage, device-placed via
        ``device_map="auto"``.
    """
    dcae = DiffusionPipeline.from_pretrained(
        "./phantomstep_dcae",
        use_safetensors=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return dcae
|
|
|
|
|
|
|
|
@lru_cache(maxsize=1)
def load_vocoder():
    """Load the PhantomStep vocoder diffusion pipeline.

    Cached so repeated generations reuse the loaded pipeline rather than
    re-reading the checkpoint from disk on every call.

    Returns:
        DiffusionPipeline: fp16 vocoder stage, device-placed via
        ``device_map="auto"``.
    """
    vocoder = DiffusionPipeline.from_pretrained(
        "./phantomstep_vocoder",
        use_safetensors=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return vocoder
|
|
|
|
|
|
|
|
def generate_music(prompt, duration=20, seed=42):
    """Generate a music clip from a text prompt and save it as a WAV file.

    Runs a three-stage pipeline: text embedding -> transformer ->
    DCAE -> vocoder, then writes the resulting waveform to disk.

    Args:
        prompt (str): Text description of the desired music.
        duration (int): Target clip length in seconds.
        seed (int): Random seed for reproducible generation.

    Returns:
        str: Path to the generated WAV file ("output.wav").
    """
    # Gradio's Number component can deliver a float; torch.manual_seed
    # requires an integer seed.
    torch.manual_seed(int(seed))

    tokenizer, text_model = load_text_processor()
    transformer = load_transformer()
    dcae = load_dcae()
    vocoder = load_vocoder()

    # Encode the prompt and mean-pool token embeddings into one vector
    # per sequence; no_grad since this is inference only.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(text_model.device) for k, v in inputs.items()}
    with torch.no_grad():
        embeddings = text_model(**inputs).last_hidden_state.mean(dim=1)

    # Stage 1: transformer turns the text embedding into an audio latent.
    transformer_output = transformer(
        embeddings,
        num_inference_steps=50,
        audio_length_in_s=duration,
    ).audios[0]

    # Stage 2: DCAE refines/decodes the latent.
    dcae_output = dcae(
        transformer_output,
        num_inference_steps=50,
        audio_length_in_s=duration,
    ).audios[0]

    # Stage 3: vocoder renders the final waveform.
    audio = vocoder(
        dcae_output,
        num_inference_steps=50,
        audio_length_in_s=duration,
    ).audios[0]

    # The fp16 pipelines may yield float16 samples, which libsndfile
    # cannot write; upcast to float32 at the save boundary.
    # NOTE(review): 22050 Hz is assumed here — confirm it matches the
    # vocoder's output sample rate.
    output_path = "output.wav"
    sf.write(output_path, np.asarray(audio, dtype=np.float32), 22050)
    return output_path
|
|
|
|
|
|
|
|
# Gradio UI: prompt/duration/seed inputs wired to generate_music,
# with the resulting WAV rendered in an Audio component.
with gr.Blocks(title="PhantomStep: Text-to-Music Generation 🎵") as demo:
    gr.Markdown("# PhantomStep by GhostAI 🚀")
    gr.Markdown("Enter a text prompt to generate music! 🎶")

    prompt_input = gr.Textbox(label="Text Prompt", placeholder="A jazzy piano melody with a fast tempo")
    duration_input = gr.Slider(label="Duration (seconds)", minimum=10, maximum=60, value=20, step=1)
    seed_input = gr.Number(label="Random Seed", value=42, precision=0)
    generate_button = gr.Button("Generate Music")

    audio_output = gr.Audio(label="Generated Music")

    generate_button.click(
        fn=generate_music,
        inputs=[prompt_input, duration_input, seed_input],
        outputs=audio_output,
    )

# Guard the server start so the module can be imported (e.g. for testing
# or deployment tooling) without launching the web app as a side effect.
if __name__ == "__main__":
    demo.launch()