import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModel
from diffusers import DiffusionPipeline
import soundfile as sf
import numpy as np
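# NOTE: the float16 weights below generally assume a CUDA device. A minimal
# fallback sketch, assuming CPU runs should drop to float32, would be:
#   DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
# and then torch_dtype=DTYPE in each from_pretrained call.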
# Load text tokenizer and embedding model (umt5-base)
def load_text_processor():
    tokenizer = AutoTokenizer.from_pretrained("./umt5-base")
    text_model = AutoModel.from_pretrained(
        "./umt5-base",
        use_safetensors=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return tokenizer, text_model
# Load the transformer backbone (phantomstep_transformer)
def load_transformer():
    transformer = DiffusionPipeline.from_pretrained(
        "./phantomstep_transformer",
        use_safetensors=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return transformer
# Load the DCAE for audio encoding/decoding (phantomstep_dcae)
def load_dcae():
    dcae = DiffusionPipeline.from_pretrained(
        "./phantomstep_dcae",
        use_safetensors=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return dcae
# Load the vocoder for audio synthesis (phantomstep_vocoder)
def load_vocoder():
    vocoder = DiffusionPipeline.from_pretrained(
        "./phantomstep_vocoder",
        use_safetensors=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return vocoder
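# The four loaders above re-read weights from disk on every call. A minimal
# caching sketch (assuming all components fit in memory together) keeps them
# resident after the first request:
from functools import lru_cache

@lru_cache(maxsize=1)
def load_all_components():
    # Cached so repeated Gradio requests skip the expensive from_pretrained step.
    tokenizer, text_model = load_text_processor()
    return tokenizer, text_model, load_transformer(), load_dcae(), load_vocoder()
# generate_music below could then call load_all_components() in place of the
# four individual loaders.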
# Generate music from a text prompt
def generate_music(prompt, duration=20, seed=42):
    torch.manual_seed(seed)
    # Load all components
    tokenizer, text_model = load_text_processor()
    transformer = load_transformer()
    dcae = load_dcae()
    vocoder = load_vocoder()
    # Step 1: Process text prompt to embeddings
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(text_model.device) for k, v in inputs.items()}
    with torch.no_grad():
        embeddings = text_model(**inputs).last_hidden_state.mean(dim=1)
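    # Mean pooling over all token positions is one simple choice; a
    # padding-aware alternative (an assumption, not part of the original
    # pipeline) would weight by the attention mask:
    #   out = text_model(**inputs)  # inside torch.no_grad()
    #   mask = inputs["attention_mask"].unsqueeze(-1).to(out.last_hidden_state.dtype)
    #   embeddings = (out.last_hidden_state * mask).sum(1) / mask.sum(1)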
    # Step 2: Pass embeddings through transformer
    transformer_output = transformer(
        embeddings,
        num_inference_steps=50,
        audio_length_in_s=duration,
    ).audios[0]
    # Step 3: Decode audio features with DCAE
    dcae_output = dcae(
        transformer_output,
        num_inference_steps=50,
        audio_length_in_s=duration,
    ).audios[0]
    # Step 4: Synthesize final audio with vocoder
    audio = vocoder(
        dcae_output,
        num_inference_steps=50,
        audio_length_in_s=duration,
    ).audios[0]
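    # soundfile expects float samples in [-1, 1]; if the vocoder can overshoot
    # that range (an assumption about its output), a defensive clip would be:
    #   audio = np.clip(audio, -1.0, 1.0)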
    # Save audio to a file
    audio = np.asarray(audio, dtype=np.float32)  # soundfile expects a NumPy array
    output_path = "output.wav"
    sf.write(output_path, audio, 22050)  # 22.05 kHz sample rate
    return output_path
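# Quick smoke test without the UI (writes output.wav in the working directory):
#   generate_music("A jazzy piano melody with a fast tempo", duration=10, seed=0)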
# Gradio interface
with gr.Blocks(title="PhantomStep: Text-to-Music Generation 🎵") as demo:
    gr.Markdown("# PhantomStep by GhostAI 🚀")
    gr.Markdown("Enter a text prompt to generate music! 🎶")
    prompt_input = gr.Textbox(label="Text Prompt", placeholder="A jazzy piano melody with a fast tempo")
    duration_input = gr.Slider(label="Duration (seconds)", minimum=10, maximum=60, value=20, step=1)
    seed_input = gr.Number(label="Random Seed", value=42, precision=0)
    generate_button = gr.Button("Generate Music")
    audio_output = gr.Audio(label="Generated Music")
    generate_button.click(
        fn=generate_music,
        inputs=[prompt_input, duration_input, seed_input],
        outputs=audio_output,
    )
demo.launch()
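# launch() also accepts standard Gradio networking options, e.g.:
#   demo.launch(server_name="0.0.0.0", server_port=7860)  # serve on the LAN
#   demo.launch(share=True)  # temporary public gradio.live link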