Spaces:
Sleeping
Sleeping
| import spaces | |
| import torch | |
| import torchaudio | |
| from einops import rearrange | |
| from stable_audio_tools import get_pretrained_model | |
| from stable_audio_tools.inference.generation import generate_diffusion_cond | |
| import gradio as gr | |
| import os | |
| from huggingface_hub import login | |
# Authentication: log in to the Hugging Face Hub only when a non-empty token
# is present in the environment.
_hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
if _hf_token:
    login(token=_hf_token)
# Loaded (model, model_config) pairs keyed by device string, so the checkpoint
# is loaded and moved to the device once per process instead of once per call.
_MODEL_CACHE = {}


def _load_model(device):
    """Return the ``(model, model_config)`` pair for *device*, loading it on first use."""
    if device not in _MODEL_CACHE:
        model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
        _MODEL_CACHE[device] = (model.to(device), model_config)
    return _MODEL_CACHE[device]


def generate_audio(prompt, duration=10, steps=50, cfg_scale=7):
    """Generate audio from a text prompt with the Stable Audio Open 1.0 model.

    Args:
        prompt: Text placed in the ``"prompt"`` field of the conditioning.
        duration: Value placed in the ``"seconds_total"`` field of the
            conditioning (``"seconds_start"`` is always 0).
        steps: Forwarded as ``steps`` to ``generate_diffusion_cond``.
        cfg_scale: Forwarded as ``cfg_scale`` to ``generate_diffusion_cond``.

    Returns:
        A tuple ``(audio, sample_rate)``. ``audio`` is a 2-D ``torch.int16``
        tensor on the CPU with the batch folded into the last axis;
        ``sample_rate`` is read from the model configuration.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Reuse the already-loaded model when this device has been used before.
    model, model_config = _load_model(device)
    sample_rate = model_config["sample_rate"]
    sample_size = model_config["sample_size"]

    # Set up the conditioning.
    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": duration,
    }]

    # Generate audio with the caller-adjustable parameters.
    # NOTE(review): sample_size comes from the model config, not from
    # `duration`, so the returned clip length presumably does not shrink with
    # `duration` — confirm whether trimming is wanted.
    output = generate_diffusion_cond(
        model,
        steps=steps,
        cfg_scale=cfg_scale,
        conditioning=conditioning,
        sample_size=sample_size,
        sigma_min=0.3,
        sigma_max=500,
        sampler_type="dpmpp-3m-sde",
        device=device,
    )

    # Fold the batch into one sequence along the last axis.
    output = rearrange(output, "b d n -> d (b n)")

    # Peak-normalise, clip, and convert to int16.
    output = output.to(torch.float32)
    # Floor the peak so an all-zero result yields zeros instead of NaNs from a
    # division by zero.
    peak = output.abs().max().clamp(min=1e-8)
    output = output.div(peak).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
    return output, sample_rate
def generate(prompt, duration=10, steps=50, cfg_scale=7):
    """Gradio callback that turns the UI inputs into an audio clip.

    Args:
        prompt: Text prompt from the textbox.
        duration: Requested duration from the slider, forwarded unchanged.
        steps: Step count from the slider; coerced to ``int`` before use.
        cfg_scale: CFG scale from the slider, forwarded unchanged.

    Returns:
        A tuple ``(sample_rate, samples)`` where ``samples`` is the NumPy
        array from ``generate_audio`` transposed so the sample axis is first.
    """
    # Slider values may arrive as floats; pass the step count as an integer.
    audio, sr = generate_audio(prompt, duration, int(steps), cfg_scale)
    # generate_audio returns a 2-D tensor with the sample axis last, while
    # Gradio's Audio component expects (samples,) or (samples, channels).
    return (sr, audio.numpy().T)
# Input widgets, listed in the positional order of generate()'s parameters.
_prompt_input = gr.Textbox(
    label="Prompt",
    placeholder="Beschreiben Sie den gewünschten Sound...",
)
_duration_input = gr.Slider(
    minimum=1, maximum=30, value=10, step=1, label="Dauer (Sekunden)"
)
_steps_input = gr.Slider(
    minimum=20, maximum=100, value=50, step=5, label="Anzahl der Schritte"
)
_cfg_input = gr.Slider(
    minimum=1, maximum=15, value=7, step=0.5, label="CFG Scale"
)

# User interface wiring the widgets to the generate() callback.
iface = gr.Interface(
    fn=generate,
    inputs=[_prompt_input, _duration_input, _steps_input, _cfg_input],
    outputs=gr.Audio(label="Generiertes Audio"),
    title="Stable Audio Generator",
    description="Generieren Sie Audio aus Textbeschreibungen mit Stable Audio 1.0",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()