# Stable Audio

Stable Audio was proposed in [Stable Audio Open](https://huggingface.co/papers/2407.14358) by Zach Evans et al. It takes a text prompt as input and predicts the corresponding sound or music sample.

Stable Audio Open generates variable-length (up to 47s) stereo audio at 44.1kHz from text prompts. It comprises three components: an autoencoder that compresses waveforms into a manageable sequence length, a T5-based text embedding for text conditioning, and a transformer-based diffusion (DiT) model that operates in the latent space of the autoencoder.

Stable Audio is trained on a corpus of around 48k audio recordings, where around 47k are from Freesound and the rest are from the Free Music Archive (FMA). All audio files are licensed under CC0, CC BY, or CC Sampling+. This data is used to train the autoencoder and the DiT.

The abstract of the paper is the following:

*Open generative models are vitally important for the community, allowing for fine-tunes and serving as baselines when presenting new models. However, most current text-to-audio models are private and not accessible for artists and researchers to build upon. Here we describe the architecture and training process of a new open-weights text-to-audio model trained with Creative Commons data. Our evaluation shows that the model's performance is competitive with the state-of-the-art across various metrics. Notably, the reported FDopenl3 results (measuring the realism of the generations) showcase its potential for high-quality stereo sound synthesis at 44.1kHz.*

This pipeline was contributed by [Yoach Lacombe](https://huggingface.co/ylacombe). The original codebase can be found at [Stability-AI/stable-audio-tools](https://github.com/Stability-AI/stable-audio-tools).

The example below shows basic text-to-audio generation with `StableAudioPipeline`:

```py
import torch
import soundfile as sf
from diffusers import StableAudioPipeline

repo_id = "stabilityai/stable-audio-open-1.0"
pipe = StableAudioPipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# define the prompts
prompt = "The sound of a hammer hitting a wooden surface."
negative_prompt = "Low quality."

# set the seed for the generator
generator = torch.Generator("cuda").manual_seed(0)

# run the generation
audio = pipe(
    prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=200,
    audio_end_in_s=10.0,
    num_waveforms_per_prompt=3,
    generator=generator,
).audios

# the first waveform is the highest-ranked one; write it to disk at the model's sampling rate
output = audio[0].T.float().cpu().numpy()
sf.write("hammer.wav", output, pipe.vae.sampling_rate)
```
## Tips

When constructing a prompt, keep in mind:

* Descriptive prompt inputs work best; use adjectives to describe the sound (for example, "high quality" or "clear") and make the prompt context specific where possible (e.g. "melodic techno with a fast beat and synths" works better than "techno").
* Using a *negative prompt* can significantly improve the quality of the generated audio. Try using a negative prompt of "low quality, average quality".

During inference:

* The *quality* of the generated audio sample can be controlled by the `num_inference_steps` argument; higher steps give higher quality audio at the expense of slower inference.
* Multiple waveforms can be generated in one go: set `num_waveforms_per_prompt` to a value greater than 1 to enable this. Automatic scoring is performed between the generated waveforms and the prompt text, and the audios are ranked from best to worst accordingly (a short sketch of saving the ranked waveforms follows this list).
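The ranking behaviour is easiest to see by writing out every returned waveform. The sketch below is a minimal illustration, not part of the original examples; it reuses the checkpoint and output handling shown elsewhere on this page, and the prompt and file names are only placeholders.

```py
import torch
import soundfile as sf
from diffusers import StableAudioPipeline

pipe = StableAudioPipeline.from_pretrained(
    "stabilityai/stable-audio-open-1.0", torch_dtype=torch.float16
).to("cuda")

audios = pipe(
    "melodic techno with a fast beat and synths",
    negative_prompt="low quality, average quality",
    num_inference_steps=200,
    audio_end_in_s=10.0,
    num_waveforms_per_prompt=3,
    generator=torch.Generator("cuda").manual_seed(0),
).audios

# waveforms are returned best-to-worst according to the automatic scoring,
# so index 0 is the top-ranked candidate
for rank, waveform in enumerate(audios):
    sf.write(f"techno_rank_{rank}.wav", waveform.T.float().cpu().numpy(), pipe.vae.sampling_rate)
```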
## Quantization

Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have a varying impact on audio quality depending on the model.

Refer to the [Quantization](../../quantization/overview) overview to learn more about supported quantization backends and selecting a quantization backend that supports your use case. The example below demonstrates how to load a quantized [StableAudioPipeline](/docs/diffusers/pr_12403/en/api/pipelines/stable_audio#diffusers.StableAudioPipeline) for inference with bitsandbytes.
```py
import torch
import soundfile as sf
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, StableAudioDiTModel, StableAudioPipeline
from transformers import BitsAndBytesConfig, T5EncoderModel

# load the T5 text encoder in 8-bit with the transformers BitsAndBytesConfig
quant_config = BitsAndBytesConfig(load_in_8bit=True)
text_encoder_8bit = T5EncoderModel.from_pretrained(
    "stabilityai/stable-audio-open-1.0",
    subfolder="text_encoder",
    quantization_config=quant_config,
    torch_dtype=torch.float16,
)

# load the DiT in 8-bit with the diffusers BitsAndBytesConfig
quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
transformer_8bit = StableAudioDiTModel.from_pretrained(
    "stabilityai/stable-audio-open-1.0",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.float16,
)

# assemble the pipeline around the quantized components
pipeline = StableAudioPipeline.from_pretrained(
    "stabilityai/stable-audio-open-1.0",
    text_encoder=text_encoder_8bit,
    transformer=transformer_8bit,
    torch_dtype=torch.float16,
    device_map="balanced",
)

prompt = "The sound of a hammer hitting a wooden surface."
negative_prompt = "Low quality."
generator = torch.Generator("cuda").manual_seed(0)

audio = pipeline(
    prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=200,
    audio_end_in_s=10.0,
    num_waveforms_per_prompt=3,
    generator=generator,
).audios

output = audio[0].T.float().cpu().numpy()
sf.write("hammer.wav", output, pipeline.vae.sampling_rate)
```
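bitsandbytes also supports 4-bit loading through the same two config classes. The snippet below is only a sketch of how the configs above could be switched to 4-bit; it is not taken from the original example, and whether the extra compression is acceptable for audio quality is worth verifying on your own prompts.

```py
import torch
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
from transformers import BitsAndBytesConfig

# hypothetical 4-bit variants of the 8-bit configs used above
text_encoder_quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
transformer_quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
```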
## StableAudioPipeline

**class diffusers.StableAudioPipeline**(vae: AutoencoderOobleck, text_encoder: T5EncoderModel, projection_model: StableAudioProjectionModel, tokenizer: Union[T5Tokenizer, T5TokenizerFast], transformer: StableAudioDiTModel, scheduler: EDMDPMSolverMultistepScheduler)

Pipeline for text-to-audio generation using StableAudio.

This model inherits from [DiffusionPipeline](/docs/diffusers/pr_12403/en/api/pipelines/overview#diffusers.DiffusionPipeline). Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.).

Parameters:

* **vae** ([AutoencoderOobleck](/docs/diffusers/pr_12403/en/api/models/autoencoder_oobleck#diffusers.AutoencoderOobleck)) — Variational Auto-Encoder (VAE) model to encode and decode audio waveforms to and from latent representations.
* **text_encoder** ([T5EncoderModel](https://huggingface.co/docs/transformers/main/en/model_doc/t5#transformers.T5EncoderModel)) — Frozen text-encoder. StableAudio uses the encoder of [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the [google-t5/t5-base](https://huggingface.co/google-t5/t5-base) variant.
* **projection_model** (`StableAudioProjectionModel`) — A trained model used to linearly project the hidden-states from the text encoder model and the start and end seconds. The projected hidden-states from the encoder and the conditional seconds are concatenated to give the input to the transformer model.
* **tokenizer** ([T5Tokenizer](https://huggingface.co/docs/transformers/main/en/model_doc/t5#transformers.T5Tokenizer)) — Tokenizer to tokenize text for the frozen text-encoder.
* **transformer** ([StableAudioDiTModel](/docs/diffusers/pr_12403/en/api/models/stable_audio_transformer#diffusers.StableAudioDiTModel)) — A `StableAudioDiTModel` to denoise the encoded audio latents.
* **scheduler** ([EDMDPMSolverMultistepScheduler](/docs/diffusers/pr_12403/en/api/schedulers/edm_multistep_dpm_solver#diffusers.EDMDPMSolverMultistepScheduler)) — A scheduler to be used in combination with `transformer` to denoise the encoded audio latents.
**__call__**(prompt=None, audio_end_in_s=None, audio_start_in_s=0.0, num_inference_steps=100, guidance_scale=7.0, negative_prompt=None, num_waveforms_per_prompt=1, eta=0.0, generator=None, latents=None, initial_audio_waveforms=None, initial_audio_sampling_rate=None, prompt_embeds=None, negative_prompt_embeds=None, attention_mask=None, negative_attention_mask=None, return_dict=True, callback=None, callback_steps=1, output_type="pt")

The call function to the pipeline for generation.

Parameters:

* **prompt** (`str` or `List[str]`, *optional*) — The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`.
* **audio_end_in_s** (`float`, *optional*, defaults to 47.55) — Audio end index in seconds.
* **audio_start_in_s** (`float`, *optional*, defaults to 0) — Audio start index in seconds.
* **num_inference_steps** (`int`, *optional*, defaults to 100) — The number of denoising steps. More denoising steps usually lead to higher quality audio at the expense of slower inference.
* **guidance_scale** (`float`, *optional*, defaults to 7.0) — A higher guidance scale value encourages the model to generate audio that is closely linked to the text `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`.
* **negative_prompt** (`str` or `List[str]`, *optional*) — The prompt or prompts to guide what to not include in audio generation. If not defined, you need to pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
* **num_waveforms_per_prompt** (`int`, *optional*, defaults to 1) — The number of waveforms to generate per prompt.
* **eta** (`float`, *optional*, defaults to 0.0) — Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies to the [DDIMScheduler](/docs/diffusers/pr_12403/en/api/schedulers/ddim#diffusers.DDIMScheduler), and is ignored in other schedulers.
* **generator** (`torch.Generator` or `List[torch.Generator]`, *optional*) — A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic.
* **latents** (`torch.Tensor`, *optional*) — Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for audio generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`.
* **initial_audio_waveforms** (`torch.Tensor`, *optional*) — Optional initial audio waveforms to use as the initial audio waveform for generation. Must be of shape `(batch_size, num_channels, audio_length)` or `(batch_size, audio_length)`, where `batch_size` corresponds to the number of prompts passed to the model (see the sketch after this parameter list).
* **initial_audio_sampling_rate** (`int`, *optional*) — Sampling rate of the `initial_audio_waveforms`, if they are provided. Must be the same as the model.
* **prompt_embeds** (`torch.Tensor`, *optional*) — Pre-computed text embeddings from the text encoder model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be computed from the `prompt` input argument.
* **negative_prompt_embeds** (`torch.Tensor`, *optional*) — Pre-computed negative text embeddings from the text encoder model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, `negative_prompt_embeds` will be computed from the `negative_prompt` input argument.
* **attention_mask** (`torch.LongTensor`, *optional*) — Pre-computed attention mask to be applied to the `prompt_embeds`. If not provided, the attention mask will be computed from the `prompt` input argument.
* **negative_attention_mask** (`torch.LongTensor`, *optional*) — Pre-computed attention mask to be applied to the `negative_text_audio_duration_embeds`.
* **return_dict** (`bool`, *optional*, defaults to `True`) — Whether or not to return a [StableDiffusionPipelineOutput](/docs/diffusers/pr_12403/en/api/pipelines/stable_diffusion/text2img#diffusers.pipelines.stable_diffusion.StableDiffusionPipelineOutput) instead of a plain tuple.
* **callback** (`Callable`, *optional*) — A function that is called every `callback_steps` steps during inference. The function is called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
* **callback_steps** (`int`, *optional*, defaults to 1) — The frequency at which the `callback` function is called. If not specified, the callback is called at every step.
* **output_type** (`str`, *optional*, defaults to `"pt"`) — The output format of the generated audio. Choose between `"np"` to return a NumPy `np.ndarray` or `"pt"` to return a PyTorch `torch.Tensor` object. Set to `"latent"` to return the latent diffusion model (LDM) output.
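The shape and sampling-rate requirements for `initial_audio_waveforms` can be illustrated with a short sketch. This is an assumption-laden example rather than an official recipe: `input.wav` is a placeholder file assumed to already be at the model's sampling rate (`pipe.vae.sampling_rate`), and whether a given initial waveform produces a useful variation is something to verify for your use case.

```py
import soundfile as sf
import torch
from diffusers import StableAudioPipeline

pipe = StableAudioPipeline.from_pretrained(
    "stabilityai/stable-audio-open-1.0", torch_dtype=torch.float16
).to("cuda")

# "input.wav" is a placeholder; it is assumed to already be sampled at
# pipe.vae.sampling_rate, since the sampling rate must match the model
waveform, sr = sf.read("input.wav", dtype="float32", always_2d=True)  # (audio_length, num_channels)
waveform = torch.from_numpy(waveform).T.unsqueeze(0)                  # (1, num_channels, audio_length)
waveform = waveform.to("cuda", torch.float16)

audio = pipe(
    "The same rhythm, played on a music box.",
    initial_audio_waveforms=waveform,
    initial_audio_sampling_rate=sr,
    num_inference_steps=200,
    audio_end_in_s=10.0,
).audios
```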
**Returns:** [StableDiffusionPipelineOutput](/docs/diffusers/pr_12403/en/api/pipelines/stable_diffusion/text2img#diffusers.pipelines.stable_diffusion.StableDiffusionPipelineOutput) or `tuple`

If `return_dict` is `True`, a StableDiffusionPipelineOutput is returned, otherwise a `tuple` is returned where the first element is a list with the generated audio.

Examples: see the text-to-audio generation example at the top of this page.

**enable_vae_slicing**()

Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.

**disable_vae_slicing**()

Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to computing decoding in one step.
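A minimal sketch of how the two methods above might be used around a larger batch; the checkpoint and call arguments are reused from the examples earlier on this page, and the prompts are only placeholders.

```py
import torch
from diffusers import StableAudioPipeline

pipe = StableAudioPipeline.from_pretrained(
    "stabilityai/stable-audio-open-1.0", torch_dtype=torch.float16
).to("cuda")

# decode the latents slice by slice to lower peak memory, which helps when
# raising num_waveforms_per_prompt or batching several prompts
pipe.enable_vae_slicing()
audio = pipe(
    ["The sound of rain on a tin roof.", "A crowd applauding in a large hall."],
    num_inference_steps=200,
    audio_end_in_s=10.0,
    num_waveforms_per_prompt=4,
).audios

# restore single-step decoding afterwards
pipe.disable_vae_slicing()
```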