# NOTE: import order is preserved from the original file; `spaces` stays first.
import spaces
import torch
import torchaudio
from einops import rearrange
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond
import gradio as gr
import os
from huggingface_hub import login

# Authenticate against the Hugging Face Hub when a token is provided.
if os.environ.get("HUGGING_FACE_HUB_TOKEN"):
    login(token=os.environ["HUGGING_FACE_HUB_TOKEN"])


def _to_int16_pcm(audio):
    """Peak-normalize a float audio tensor and convert it to int16 on the CPU.

    Args:
        audio: Tensor of audio samples (any floating dtype, any device).

    Returns:
        A CPU ``torch.int16`` tensor with the same shape, scaled so the
        loudest sample maps to +/-32767.
    """
    audio = audio.to(torch.float32)
    # Floor the peak so a completely silent result yields zeros instead of
    # 0/0 = NaN, which has no defined int16 representation.
    peak = audio.abs().max().clamp(min=1e-8)
    return audio.div(peak).clamp(-1, 1).mul(32767).to(torch.int16).cpu()


@spaces.GPU(duration=180)
def generate_audio(prompt, duration=10, steps=50, cfg_scale=7):
    """Generate audio for a text prompt with Stable Audio Open 1.0.

    Args:
        prompt: Text description of the desired sound.
        duration: Value passed to the model as ``seconds_total`` conditioning.
            NOTE(review): the number of generated samples is fixed by the
            model config's ``sample_size``, not by this value -- confirm
            whether the result should be trimmed to ``duration`` seconds.
        steps: Number of diffusion steps.
        cfg_scale: Classifier-free guidance scale.

    Returns:
        Tuple ``(audio, sample_rate)`` where ``audio`` is a CPU int16 tensor
        laid out as ``(channels, samples)`` and ``sample_rate`` comes from
        the model config.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the model and move it to the selected device.
    model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
    model = model.to(device)

    sample_rate = model_config["sample_rate"]
    sample_size = model_config["sample_size"]

    # Set up the conditioning for the diffusion model.
    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": duration,
    }]

    # Generate audio with the user-adjustable parameters.
    output = generate_diffusion_cond(
        model,
        steps=steps,
        cfg_scale=cfg_scale,
        conditioning=conditioning,
        sample_size=sample_size,
        sigma_min=0.3,
        sigma_max=500,
        sampler_type="dpmpp-3m-sde",
        device=device,
    )

    # Collapse the batch into one sequence: (batch, channels, n) -> (channels, batch * n).
    output = rearrange(output, "b d n -> d (b n)")

    # Peak-normalize, clip, and convert to int16.
    return _to_int16_pcm(output), sample_rate


def generate(prompt, duration=10, steps=50, cfg_scale=7):
    """Gradio callback: run generation and return ``(sample_rate, samples)``.

    ``generate_audio`` returns audio as ``(channels, samples)``, but Gradio's
    Audio component expects ``(samples, channels)`` for numpy output, so the
    tensor is transposed before conversion.
    """
    audio, sr = generate_audio(prompt, duration, steps, cfg_scale)
    return (sr, audio.t().contiguous().numpy())


# User interface definition.
iface = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Beschreiben Sie den gewünschten Sound..."),
        gr.Slider(minimum=1, maximum=30, value=10, step=1, label="Dauer (Sekunden)"),
        gr.Slider(minimum=20, maximum=100, value=50, step=5, label="Anzahl der Schritte"),
        gr.Slider(minimum=1, maximum=15, value=7, step=0.5, label="CFG Scale"),
    ],
    outputs=gr.Audio(label="Generiertes Audio"),
    title="Stable Audio Generator",
    description="Generieren Sie Audio aus Textbeschreibungen mit Stable Audio 1.0",
)

if __name__ == "__main__":
    iface.launch()